Transcoding audio files with FFmpeg using the libav* libraries

I am writing an application for transcoding sound using ffmpeg libraries. Here is my code

/* * File: main.cpp * Author: vinod * Compile with "g++ -std=c++11 -o audiotranscode main.cpp -lavformat -lavcodec -lavutil -lavfilter" * */ #if !defined PRId64 || PRI_MACROS_BROKEN #undef PRId64 #define PRId64 "lld" #endif #define __STDC_FORMAT_MACROS #ifdef __cplusplus extern "C" { #endif #include <stdio.h> #include <stdlib.h> #include <sys/types.h> #include <stdint.h> #include <libavutil/imgutils.h> #include <libavutil/samplefmt.h> #include <libavutil/frame.h> #include <libavutil/timestamp.h> #include <libavformat/avformat.h> #include <libavfilter/avfilter.h> #include <libavfilter/buffersrc.h> #include <libavfilter/buffersink.h> #include <libswscale/swscale.h> #include <libavutil/opt.h> #ifdef __cplusplus } #endif #include <iostream> using namespace std; int select_stream, got_frame, got_packet; AVFormatContext *in_fmt_ctx = NULL, *out_fmt_ctx = NULL; AVCodec *dec_codec = NULL, * enc_codec = NULL; AVStream *audio_st = NULL; AVCodecContext *enc_ctx = NULL, *dec_ctx = NULL; AVFrame *pFrame = NULL, * pFrameFiltered = NULL; AVFilterGraph *filter_graph = NULL; AVFilterContext *buffersrc_ctx = NULL; AVFilterContext *buffersink_ctx = NULL; AVPacket packet; string inFileName = "/home/vinod/vinod/Media/univac.webm"; string outFileName = "audio_extracted.m4a"; int target_bit_rate = 128000, sample_rate = 22050, channels = 1; AVSampleFormat sample_fmt = AV_SAMPLE_FMT_S16; string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono"; int log_averror(int errcode) { char *errbuf = (char *) calloc(AV_ERROR_MAX_STRING_SIZE, sizeof(char)); av_strerror(errcode, errbuf, AV_ERROR_MAX_STRING_SIZE); std::cout << "Error - " << errbuf << std::endl; delete [] errbuf; return -1; } /** * Initialize conversion filter */ int initialize_audio_filter() { char args[512]; int ret; AVFilter *buffersrc = avfilter_get_by_name("abuffer"); AVFilter *buffersink = avfilter_get_by_name("abuffersink"); AVFilterInOut *outputs = avfilter_inout_alloc(); AVFilterInOut *inputs = 
avfilter_inout_alloc(); filter_graph = avfilter_graph_alloc(); const enum AVSampleFormat out_sample_fmts[] = {sample_fmt, AV_SAMPLE_FMT_NONE}; const int64_t out_channel_layouts[] = {av_get_default_channel_layout(out_fmt_ctx -> streams[0] -> codec -> channels), -1}; const int out_sample_rates[] = {out_fmt_ctx -> streams[0] -> codec -> sample_rate, -1}; if (!dec_ctx->channel_layout) dec_ctx->channel_layout = av_get_default_channel_layout(dec_ctx->channels); snprintf(args, sizeof(args), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64, in_fmt_ctx -> streams[select_stream] -> time_base.num, in_fmt_ctx -> streams[select_stream] -> time_base.den, dec_ctx->sample_rate, av_get_sample_fmt_name(dec_ctx->sample_fmt), dec_ctx->channel_layout); ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in", args, NULL, filter_graph); if (ret < 0) { av_log(NULL, AV_LOG_ERROR, "Cannot create buffer source\n"); return -1; } ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out", NULL, NULL, filter_graph); if (ret < 0) { av_log(NULL, AV_LOG_ERROR, "Cannot create buffer sink\n"); return ret; } ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1, AV_OPT_SEARCH_CHILDREN); if (ret < 0) { av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n"); return ret; } ret = av_opt_set_int_list(buffersink_ctx, "channel_layouts", out_channel_layouts, -1, AV_OPT_SEARCH_CHILDREN); if (ret < 0) { av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n"); return ret; } ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1, AV_OPT_SEARCH_CHILDREN); if (ret < 0) { av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n"); return ret; } /* Endpoints for the filter graph. */ outputs -> name = av_strdup("in"); outputs -> filter_ctx = buffersrc_ctx; outputs -> pad_idx = 0; outputs -> next = NULL; /* Endpoints for the filter graph. 
*/ inputs -> name = av_strdup("out"); inputs -> filter_ctx = buffersink_ctx; inputs -> pad_idx = 0; inputs -> next = NULL; string filter_desc = filter_description; if ((ret = avfilter_graph_parse_ptr(filter_graph, filter_desc.c_str(), &inputs, &outputs, NULL)) < 0) { log_averror(ret); exit(1); } if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) { log_averror(ret); exit(1); } /* Print summary of the sink buffer * Note: args buffer is reused to store channel layout string */ AVFilterLink *outlink = buffersink_ctx->inputs[0]; av_get_channel_layout_string(args, sizeof(args), -1, outlink->channel_layout); av_log(NULL, AV_LOG_INFO, "Output: srate:%dHz fmt:%s chlayout:%s\n", (int) outlink->sample_rate, (char *) av_x_if_null(av_get_sample_fmt_name((AVSampleFormat) outlink->format), "?"), args); return 0; } /* * */ int main(int argc, char **argv) { int ret; cout << "Hello World" << endl; printf("abcd"); avcodec_register_all(); av_register_all(); avfilter_register_all(); /* open input file, and allocate format context */ if (avformat_open_input(&in_fmt_ctx, inFileName.c_str(), NULL, NULL) < 0) { std::cout << "error opening input file - " << inFileName << std::endl; return -1; } /* retrieve stream information */ if (avformat_find_stream_info(in_fmt_ctx, NULL) < 0) { std::cerr << "Could not find stream information in the input file " << inFileName << std::endl; } /* Dump format details */ printf("\n ---------------------------------------------------------------------- \n"); av_dump_format(in_fmt_ctx, 0, inFileName.c_str(), 0); printf("\n ---------------------------------------------------------------------- \n"); /* Choose a audio stream */ select_stream = av_find_best_stream(in_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec_codec, 0); if (select_stream == AVERROR_STREAM_NOT_FOUND) { std::cerr << "No audio stream found" << std::endl; return -1; } if (select_stream == AVERROR_DECODER_NOT_FOUND) { std::cerr << "No suitable decoder found" << std::endl; return -1; } dec_ctx = 
in_fmt_ctx -> streams[ select_stream] -> codec; av_opt_set_int(dec_ctx, "refcounted_frames", 1, 0); /* init the audio decoder */ if ((ret = avcodec_open2(dec_ctx, dec_codec, NULL)) < 0) { av_log(NULL, AV_LOG_ERROR, "Cannot open audio decoder\n"); return ret; } /* allocate output context */ ret = avformat_alloc_output_context2(&out_fmt_ctx, NULL, NULL, outFileName.c_str()); if (ret < 0) { std::cerr << "Could not create output context for the file " << outFileName << std::endl; return -1; } /* find the encoder */ enum AVCodecID codec_id = out_fmt_ctx -> oformat -> audio_codec; enc_codec = avcodec_find_encoder(codec_id); if (!(enc_codec)) { std::cerr << "Could not find encoder for - " << avcodec_get_name(codec_id) << std::endl; return -1; } /* add a new stream */ audio_st = avformat_new_stream(out_fmt_ctx, enc_codec); if (!audio_st) { std::cerr << "Could not add audio stream - " << std::endl; } /* Initialise audio codec */ audio_st -> id = out_fmt_ctx -> nb_streams - 1; enc_ctx = audio_st -> codec; enc_ctx -> codec_id = codec_id; enc_ctx -> codec_type = AVMEDIA_TYPE_AUDIO; enc_ctx -> bit_rate = target_bit_rate; enc_ctx -> sample_rate = sample_rate; enc_ctx -> sample_fmt = sample_fmt; enc_ctx -> channels = channels; enc_ctx -> channel_layout = av_get_default_channel_layout(enc_ctx -> channels); /* Some formats want stream headers to be separate. 
*/ if (out_fmt_ctx -> oformat -> flags & AVFMT_GLOBALHEADER) { enc_ctx -> flags |= CODEC_FLAG_GLOBAL_HEADER; } ret = avcodec_open2(out_fmt_ctx -> streams[0] -> codec, enc_codec, NULL); if (ret < 0) { std::cerr << "Could not create codec context for the file " << outFileName << std::endl; return -1; } /* Initialize filter */ initialize_audio_filter(); if (!(out_fmt_ctx -> oformat -> flags & AVFMT_NOFILE)) { int ret = avio_open(& out_fmt_ctx -> pb, outFileName.c_str(), AVIO_FLAG_WRITE); if (ret < 0) { log_averror(ret); return -1; } } /* Write header */ if (avformat_write_header(out_fmt_ctx, NULL) < 0) { if (ret < 0) { log_averror(ret); return -1; } } /* Allocate frame */ pFrame = av_frame_alloc(); if (!pFrame) { std::cerr << "Could not allocate frame\n"; return -1; } pFrameFiltered = av_frame_alloc(); if (!pFrameFiltered) { std::cerr << "Could not allocate frame\n"; return -1; } av_init_packet(&packet); packet.data = NULL; packet.size = 0; /* Read packet from the stream */ while (av_read_frame(in_fmt_ctx, &packet) >= 0) { if (packet.stream_index == select_stream) { avcodec_get_frame_defaults(pFrame); ret = avcodec_decode_audio4(dec_ctx, pFrame, &got_frame, &packet); if (ret < 0) { log_averror(ret); return ret; } printf("Decoded packet pts : %ld ", packet.pts); printf("Frame Best Effor pts : %ld \n", pFrame->best_effort_timestamp); /* Set frame pts */ pFrame -> pts = av_frame_get_best_effort_timestamp(pFrame); if (got_frame) { /* push the decoded frame into the filtergraph */ ret = av_buffersrc_add_frame_flags(buffersrc_ctx, pFrame, AV_BUFFERSRC_FLAG_KEEP_REF); if (ret < 0) { log_averror(ret); return ret; } /* pull filtered frames from the filtergraph */ while (1) { ret = av_buffersink_get_frame(buffersink_ctx, pFrameFiltered); if ((ret == AVERROR(EAGAIN)) || (ret == AVERROR_EOF)) { break; } if (ret < 0) { printf("Error while getting filtered frames from filtergraph\n"); log_averror(ret); return -1; } /* Initialize the packets */ AVPacket encodedPacket = {0}; 
av_init_packet(&encodedPacket); ret = avcodec_encode_audio2(out_fmt_ctx -> streams[0] -> codec, &encodedPacket, pFrameFiltered, &got_packet); if (!ret && got_packet && encodedPacket.size) { /* Set correct pts and dts */ if (encodedPacket.pts != AV_NOPTS_VALUE) { encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx -> inputs[0] -> time_base, out_fmt_ctx -> streams[0] -> time_base); } if (encodedPacket.dts != AV_NOPTS_VALUE) { encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx -> inputs[0] -> time_base, out_fmt_ctx -> streams[0] -> time_base); } printf("Encoded packet pts %ld\n", encodedPacket.pts); /* Write the compressed frame to the media file. */ ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket); if (ret < 0) { log_averror(ret); return -1; } } else if (ret < 0) { log_averror(ret); return -1; } av_frame_unref(pFrameFiltered); } av_frame_unref(pFrame); } } } /* Flush delayed frames from encoder*/ got_packet=1; while (got_packet) { AVPacket encodedPacket = {0}; av_init_packet(&encodedPacket); ret = avcodec_encode_audio2(out_fmt_ctx -> streams[0] -> codec, &encodedPacket, NULL, &got_packet); if (!ret && got_packet && encodedPacket.size) { /* Set correct pts and dts */ if (encodedPacket.pts != AV_NOPTS_VALUE) { encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx -> inputs[0] -> time_base, out_fmt_ctx -> streams[0] -> time_base); } if (encodedPacket.dts != AV_NOPTS_VALUE) { encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx -> inputs[0] -> time_base, out_fmt_ctx -> streams[0] -> time_base); } printf("Encoded packet pts %ld\n", encodedPacket.pts); /* Write the compressed frame to the media file. 
*/ ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket); if (ret < 0) { log_averror(ret); return -1; } } else if (ret < 0) { log_averror(ret); return -1; } } /* Write Trailer */ av_write_trailer(out_fmt_ctx); avfilter_graph_free(&filter_graph); if (dec_ctx) avcodec_close(dec_ctx); avformat_close_input(&in_fmt_ctx); av_frame_free(&pFrame); av_frame_free(&pFrameFiltered); if (!(out_fmt_ctx -> oformat -> flags & AVFMT_NOFILE)) avio_close(out_fmt_ctx -> pb); avcodec_close(out_fmt_ctx->streams[0]->codec); avformat_free_context(out_fmt_ctx); return 0; } 

After transcoding, the audio file has the same length as the input, but it is completely noisy. Can someone tell me what I'm doing wrong here?

+6
source share
1 answer

I found out where the problem was, and it was resolved.

When the output file was opened in Audacity, it was clear that unwanted silence had been inserted into the audio signal. The problem was the number of samples per frame being fed to the encoder.

Different codecs expect different frame sizes for encoding. And the aac encoder expects a size of 1024. This can be seen by observing enc_ctx->frame_size after executing avcodec_open2() .

The filter graph must therefore hand the encoder frames containing exactly 1024 samples per channel. In my code, this means pFrameFiltered should hold exactly 1024 samples per channel. If it holds fewer than 1024, the encoder pads it with zeros up to 1024 samples and then encodes it, which is where the inserted silence comes from.

This can be resolved either with our own FIFO queue or with one of the audio filters that ships with FFmpeg. We need to use the filter asetnsamples=n=1024:p=0, as described in the FFmpeg filters documentation. Therefore, you need to change the filter description to:

 `string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono,asetnsamples=n=1024:p=0";` 

Play around with the n value in the filter to get a better understanding. Check the enc_ctx->frame_size field set by avcodec_open2() and set n accordingly.

+12
source

All Articles