If you are not particularly familiar with C/C++ but need to use the FFmpeg API for some simple audio/video work, org.bytedeco:ffmpeg-platform is a good option. This post documents how to mux audio and video data (including pixel-format conversion and audio resampling) with ffmpeg-platform.
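To follow along, add the ffmpeg-platform dependency to your build. The version below is an assumption for illustration; pick the current release from Maven Central (the code in this post uses the AVChannelLayout API, so it needs a build based on FFmpeg 5.1 or later):

```xml
<!-- version is an assumption; use the current release from Maven Central -->
<dependency>
    <groupId>org.bytedeco</groupId>
    <artifactId>ffmpeg-platform</artifactId>
    <version>6.0-1.5.9</version>
</dependency>
```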
1. Basic workflow
Muxing audio and video involves the following main steps (a sketch tying them together follows the list):
- Prepare the audio and video streams
- Build the output AVFormatContext
- Add the audio/video AVStreams to the AVFormatContext
- Open the output with avio_open
- Write the header with avformat_write_header
- Write the audio and video frames
- Write the trailer with av_write_trailer
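Put together, the whole pipeline looks roughly like the sketch below. The constants and the `mux` wrapper are assumptions modeled on FFmpeg's muxing.c example, which this post follows; the helper methods are developed section by section in the rest of the post:

```java
// constants as in muxing.c (assumed; the post does not show them)
static final int STREAM_DURATION = 10;                        // seconds
static final int STREAM_FRAME_RATE = 25;                      // fps
static final int STREAM_PIX_FMT = avutil.AV_PIX_FMT_YUV420P;  // default pixel format
static final int SCALE_FLAGS = swscale.SWS_BICUBIC;

// a minimal outline of the whole flow (hypothetical wrapper; helpers follow below)
static void mux(String output) throws IOException {
    AVFormatContext oc = new AVFormatContext(null);                     // section 3
    avformat.avformat_alloc_output_context2(oc, null, null, output);
    AVOutputFormat fmt = oc.oformat();
    OutputStream video_st = new OutputStream(), audio_st = new OutputStream();
    AVCodec video_codec = add_stream(video_st, oc, fmt.video_codec());  // section 4
    AVCodec audio_codec = add_stream(audio_st, oc, fmt.audio_codec());
    open_video(video_codec, video_st);                                  // section 5
    open_audio(audio_codec, audio_st);
    // section 6: avio_open; section 7: header, interleaved frames, trailer
}
```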
2. Output stream structure
Each output stream (audio or video) is wrapped in an inner class with the following structure:
```java
// a wrapper around a single output AVStream
class OutputStream {
    AVStream st;
    AVCodecContext enc;

    /* pts of the next frame that will be generated */
    long next_pts;
    int samples_count;

    AVFrame frame;
    AVFrame tmp_frame;

    AVPacket tmp_pkt;

    /* state of the synthetic signal generator for the audio */
    float t, tincr, tincr2;

    SwsContext sws_ctx;
    SwrContext swr_ctx;

    /* native buffers backing the generated YUV planes and audio samples */
    BytePointer y;
    BytePointer u;
    BytePointer v;
    BytePointer a;
}
```
3. Building the AVFormatContext
Use avformat_alloc_output_context2 to build the output AVFormatContext:
```java
int ret = avformat.avformat_alloc_output_context2(oc, null, null, output);
if (ret < 0) {
    // could not deduce the output format from the file extension: fall back to MPEG
    ret = avformat.avformat_alloc_output_context2(oc, null, "mpeg", output);
}
if (ret < 0) {
    throw new IOException(ret + ":avformat_alloc_output_context2 error");
}
```
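With the bytedeco bindings, `oc` is an output parameter: it is created as an empty wrapper first and filled in by the call above:

```java
// the wrapper starts out empty; avformat_alloc_output_context2 fills it in
AVFormatContext oc = new AVFormatContext(null);
```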
4. Adding output streams
Audio and video share the same function; different parameters are set depending on the codec type:
```java
private static AVCodec add_stream(OutputStream ost, AVFormatContext oc, int codec_id) throws IOException {
    AVCodec codec = avcodec.avcodec_find_encoder(codec_id);
    if (Objects.isNull(codec)) {
        throw new IOException("avcodec_find_encoder error");
    }
    ost.tmp_pkt = avcodec.av_packet_alloc();
    if (Objects.isNull(ost.tmp_pkt)) {
        throw new IOException("av_packet_alloc error");
    }
    ost.st = avformat.avformat_new_stream(oc, null);
    if (Objects.isNull(ost.st)) {
        throw new IOException("avformat_new_stream error");
    }
    ost.st.id(oc.nb_streams() - 1);
    AVCodecContext c = avcodec.avcodec_alloc_context3(codec);
    if (Objects.isNull(c)) {
        throw new IOException("avcodec_alloc_context3 error");
    }
    ost.enc = c;
    switch (codec.type()) {
        case avutil.AVMEDIA_TYPE_AUDIO:
            c.sample_fmt(Objects.nonNull(codec.sample_fmts())
                    ? codec.sample_fmts().get()
                    : avutil.AV_SAMPLE_FMT_FLTP);
            c.bit_rate(64000);
            c.sample_rate(44100);
            if (Objects.nonNull(codec.supported_samplerates())) {
                c.sample_rate(codec.supported_samplerates().get());
                for (int i = 0; codec.supported_samplerates().get(i) != 0; i++) {
                    if (codec.supported_samplerates().get(i) == 44100) {
                        c.sample_rate(44100);
                    }
                }
            }
            // @see libavutil/channel_layout.h
            // #define AV_CHANNEL_LAYOUT_MASK(nb, m) \
            //     { .order = AV_CHANNEL_ORDER_NATIVE, .nb_channels = (nb), .u = { .mask = (m) }}
            // #define AV_CHANNEL_LAYOUT_STEREO AV_CHANNEL_LAYOUT_MASK(2, AV_CH_LAYOUT_STEREO)
            c.ch_layout().nb_channels(2);
            c.ch_layout().order(avutil.AV_CHANNEL_ORDER_NATIVE);
            c.ch_layout().u_mask(avutil.AV_CH_LAYOUT_STEREO);
            AVRational ar = new AVRational();
            ar.num(1);
            ar.den(c.sample_rate());
            ost.st.time_base(ar);
            break;
        case avutil.AVMEDIA_TYPE_VIDEO:
            c.codec_id(codec_id);
            c.bit_rate(400000);
            /* Resolution must be a multiple of two. */
            c.width(352);
            c.height(288);
            /*
             * timebase: This is the fundamental unit of time (in seconds) in terms of which
             * frame timestamps are represented. For fixed-fps content, timebase should be
             * 1/framerate and timestamp increments should be identical to 1.
             */
            AVRational vr = new AVRational();
            vr.num(1);
            vr.den(STREAM_FRAME_RATE);
            ost.st.time_base(vr);
            c.time_base(ost.st.time_base());
            /* emit one intra frame every twelve frames at most */
            c.gop_size(12);
            c.pix_fmt(STREAM_PIX_FMT);
            if (c.codec_id() == avcodec.AV_CODEC_ID_MPEG2VIDEO) {
                /* just for testing, we also add B-frames */
                c.max_b_frames(2);
            }
            if (c.codec_id() == avcodec.AV_CODEC_ID_MPEG1VIDEO) {
                /*
                 * Needed to avoid using macroblocks in which some coeffs overflow. This does
                 * not happen with normal video, it just happens here as the motion of the
                 * chroma plane does not match the luma plane.
                 */
                c.mb_decision(2);
            }
            break;
    }
    /* Some formats want stream headers to be separate. */
    if ((oc.oformat().flags() & avformat.AVFMT_GLOBALHEADER) != 0) {
        c.flags(c.flags() | avcodec.AV_CODEC_FLAG_GLOBAL_HEADER);
    }
    return codec;
}
```
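The post does not show how add_stream gets called. A sketch following muxing.c: streams are only created if the container format defines a default codec, and the resulting flags drive the write loop in section 7 (`fmt` is `oc.oformat()`, `video_st`/`audio_st` as in the outline in section 1):

```java
// create only the streams the container supports (sketch after muxing.c)
boolean encode_video = false, encode_audio = false;
AVCodec video_codec = null, audio_codec = null;
if (fmt.video_codec() != avcodec.AV_CODEC_ID_NONE) {
    video_codec = add_stream(video_st, oc, fmt.video_codec());
    encode_video = true;
}
if (fmt.audio_codec() != avcodec.AV_CODEC_ID_NONE) {
    audio_codec = add_stream(audio_st, oc, fmt.audio_codec());
    encode_audio = true;
}
```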
5. Setting parameters
Set the video stream parameters:
```java
private static void open_video(AVCodec codec, OutputStream ost) throws IOException {
    AVCodecContext c = ost.enc;
    // open the codec
    int ret = avcodec.avcodec_open2(c, codec, (AVDictionary) null);
    if (ret < 0) {
        throw new IOException(ret + ":avcodec_open2 error");
    }
    /* allocate and init a re-usable frame */
    ost.frame = alloc_picture(c.pix_fmt(), c.width(), c.height());
    if (Objects.isNull(ost.frame)) {
        throw new IOException("alloc_picture error");
    }
    /* if the output format is not YUV420P, a temporary YUV420P picture is needed too */
    if (c.pix_fmt() != avutil.AV_PIX_FMT_YUV420P) {
        ost.tmp_frame = alloc_picture(avutil.AV_PIX_FMT_YUV420P, c.width(), c.height());
        if (Objects.isNull(ost.tmp_frame)) {
            throw new IOException("alloc_picture error");
        }
    }
    // copy the stream parameters to the muxer
    ret = avcodec.avcodec_parameters_from_context(ost.st.codecpar(), c);
    if (ret < 0) {
        throw new IOException(ret + ":avcodec_parameters_from_context error");
    }
}
```
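The alloc_picture helper referenced above is not shown in the post; here is a minimal sketch following muxing.c, assuming the data buffers come from av_frame_get_buffer:

```java
// minimal alloc_picture sketch (after muxing.c)
private static AVFrame alloc_picture(int pix_fmt, int width, int height) {
    AVFrame picture = avutil.av_frame_alloc();
    if (Objects.isNull(picture)) {
        return null;
    }
    picture.format(pix_fmt);
    picture.width(width);
    picture.height(height);
    // allocate the buffers for the frame data
    if (avutil.av_frame_get_buffer(picture, 0) < 0) {
        return null;
    }
    return picture;
}
```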
Set the audio stream parameters:
```java
private static void open_audio(AVCodec codec, OutputStream ost) throws IOException {
    AVCodecContext c = ost.enc;
    int ret = avcodec.avcodec_open2(c, codec, (AVDictionary) null);
    if (ret < 0) {
        throw new IOException(ret + ":avcodec_open2 error");
    }
    // init the signal generator
    ost.t = 0;
    ost.tincr = (float) (2 * Math.PI * 100.0 / c.sample_rate());
    ost.tincr2 = (float) (2 * Math.PI * 100.0 / c.sample_rate() / c.sample_rate());
    int nb_samples;
    if ((c.codec().capabilities() & avcodec.AV_CODEC_CAP_VARIABLE_FRAME_SIZE) != 0) {
        nb_samples = 10000;
    } else {
        nb_samples = c.frame_size();
    }
    ost.frame = alloc_audio_frame(c.sample_fmt(), c.ch_layout(), c.sample_rate(), nb_samples);
    ost.tmp_frame = alloc_audio_frame(avutil.AV_SAMPLE_FMT_S16, c.ch_layout(), c.sample_rate(), nb_samples);
    // copy the stream parameters to the muxer
    ret = avcodec.avcodec_parameters_from_context(ost.st.codecpar(), c);
    if (ret < 0) {
        throw new IOException(ret + ":avcodec_parameters_from_context error");
    }
    // create the resampler context
    ost.swr_ctx = swresample.swr_alloc();
    if (Objects.isNull(ost.swr_ctx)) {
        throw new IOException("swr_alloc error");
    }
    // set options: interleaved S16 in, the codec's sample format out
    avutil.av_opt_set_chlayout(ost.swr_ctx, "in_chlayout", c.ch_layout(), 0);
    avutil.av_opt_set_int(ost.swr_ctx, "in_sample_rate", c.sample_rate(), 0);
    avutil.av_opt_set_sample_fmt(ost.swr_ctx, "in_sample_fmt", avutil.AV_SAMPLE_FMT_S16, 0);
    avutil.av_opt_set_chlayout(ost.swr_ctx, "out_chlayout", c.ch_layout(), 0);
    avutil.av_opt_set_int(ost.swr_ctx, "out_sample_rate", c.sample_rate(), 0);
    avutil.av_opt_set_sample_fmt(ost.swr_ctx, "out_sample_fmt", c.sample_fmt(), 0);
    ret = swresample.swr_init(ost.swr_ctx);
    if (ret < 0) {
        throw new IOException(ret + ":swr_init error");
    }
}
```
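alloc_audio_frame is likewise not shown in the post; a sketch following muxing.c:

```java
// minimal alloc_audio_frame sketch (after muxing.c)
private static AVFrame alloc_audio_frame(int sample_fmt, AVChannelLayout ch_layout,
        int sample_rate, int nb_samples) {
    AVFrame frame = avutil.av_frame_alloc();
    if (Objects.isNull(frame)) {
        return null;
    }
    frame.format(sample_fmt);
    avutil.av_channel_layout_copy(frame.ch_layout(), ch_layout);
    frame.sample_rate(sample_rate);
    frame.nb_samples(nb_samples);
    // allocate the sample buffers
    if (nb_samples != 0 && avutil.av_frame_get_buffer(frame, 0) < 0) {
        return null;
    }
    return frame;
}
```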
6. Opening the output
Open the output with the AVIO_FLAG_WRITE flag:
```java
// open the output file, if needed (fmt is oc.oformat())
if ((fmt.flags() & avformat.AVFMT_NOFILE) == 0) {
    AVIOContext pb = new AVIOContext(null);
    ret = avformat.avio_open(pb, output, avformat.AVIO_FLAG_WRITE);
    if (ret < 0) {
        throw new IOException(ret + ":avio_open error");
    }
    oc.pb(pb);
}
```
7. Writing stream data
Write the header:
```java
// Write the stream header, if any
ret = avformat.avformat_write_header(oc, (AVDictionary) null);
if (ret < 0) {
    // av_err2str(ret)
    throw new IOException(ret + ":avformat_write_header error");
}
```
Write the audio and video data. On each iteration the loop compares the two streams' next timestamps with av_compare_ts and encodes whichever is behind, so packets come out roughly interleaved:
```java
while (encode_video || encode_audio) {
    if (encode_video
            && (!encode_audio || avutil.av_compare_ts(video_st.next_pts, video_st.enc.time_base(),
                    audio_st.next_pts, audio_st.enc.time_base()) <= 0)) {
        System.out.println(">>> encode_video >>>");
        encode_video = write_video_frame(oc, video_st) == 0;
    } else {
        System.out.println("<<< encode_audio <<<");
        encode_audio = write_audio_frame(oc, audio_st) == 0;
    }
}
```
The concrete write methods are shown below. Note that once a frame generator returns null (the configured duration has been reached), write_frame sends a null frame, which puts the encoder into draining mode; avcodec_receive_packet then eventually returns AVERROR_EOF, the method returns 1, and the loop above ends:
```java
// encode one video frame and send it to the muxer; return 1 when encoding is finished, 0 otherwise
private static int write_video_frame(AVFormatContext oc, OutputStream ost) throws IOException {
    return write_frame(oc, ost.enc, ost.st, get_video_frame(ost), ost.tmp_pkt);
}

// encode one audio frame and send it to the muxer; return 1 when encoding is finished, 0 otherwise
private static int write_audio_frame(AVFormatContext oc, OutputStream ost) throws IOException {
    AVFrame frame = get_audio_frame(ost);
    AVCodecContext c = ost.enc;
    if (Objects.nonNull(frame)) {
        // convert samples from native format to destination codec format, using the resampler
        // compute destination number of samples
        long dst_nb_samples = avutil.av_rescale_rnd(
                swresample.swr_get_delay(ost.swr_ctx, c.sample_rate()) + frame.nb_samples(),
                c.sample_rate(), c.sample_rate(), avutil.AV_ROUND_UP);
        // when we pass a frame to the encoder, it may keep a reference to it internally;
        // make sure we do not overwrite it here
        int ret = avutil.av_frame_make_writable(ost.frame);
        if (ret < 0) {
            throw new IOException(ret + ":av_frame_make_writable error");
        }
        // convert to destination format
        ret = swresample.swr_convert(ost.swr_ctx, ost.frame.data(), (int) dst_nb_samples,
                frame.data(), frame.nb_samples());
        if (ret < 0) {
            throw new IOException(ret + ":swr_convert error");
        }
        frame = ost.frame;
        AVRational rational = new AVRational();
        rational.num(1);
        rational.den(c.sample_rate());
        frame.pts(avutil.av_rescale_q(ost.samples_count, rational, c.time_base()));
        ost.samples_count += dst_nb_samples;
    }
    return write_frame(oc, c, ost.st, frame, ost.tmp_pkt);
}

private static int write_frame(AVFormatContext fmt_ctx, AVCodecContext c, AVStream st,
        AVFrame frame, AVPacket pkt) throws IOException {
    // send the frame to the encoder; a null frame flushes it
    int ret = avcodec.avcodec_send_frame(c, frame);
    if (ret < 0) {
        throw new IOException(ret + ":avcodec_send_frame error");
    }
    while (true) {
        ret = avcodec.avcodec_receive_packet(c, pkt);
        if (ret == avutil.AVERROR_EAGAIN() || ret == avutil.AVERROR_EOF()) {
            break;
        } else if (ret < 0) {
            throw new IOException(ret + ":avcodec_receive_packet error");
        }
        // rescale output packet timestamp values from codec to stream timebase
        avcodec.av_packet_rescale_ts(pkt, c.time_base(), st.time_base());
        pkt.stream_index(st.index());
        // Write the compressed frame to the media file.
        ret = avformat.av_interleaved_write_frame(fmt_ctx, pkt);
        if (ret < 0) {
            throw new IOException(ret + ":av_interleaved_write_frame error");
        }
    }
    return ret == avutil.AVERROR_EOF() ? 1 : 0;
}
```
The audio and video data written here are generated by the program itself:
```java
private static AVFrame get_video_frame(OutputStream ost) throws IOException {
    // check if we want to generate more frames
    AVRational rational = new AVRational();
    rational.num(1);
    rational.den(1);
    if (avutil.av_compare_ts(ost.next_pts, ost.enc.time_base(), STREAM_DURATION, rational) > 0) {
        return null;
    }
    // the encoder may keep an internal reference to the frame;
    // make sure we do not overwrite it here
    int ret = avutil.av_frame_make_writable(ost.frame);
    if (ret < 0) {
        throw new IOException(ret + ":av_frame_make_writable error");
    }
    AVCodecContext c = ost.enc;
    if (c.pix_fmt() != avutil.AV_PIX_FMT_YUV420P) {
        // as we only generate a YUV420P picture, we must convert it
        // to the codec pixel format if needed
        if (Objects.isNull(ost.sws_ctx)) {
            ost.sws_ctx = swscale.sws_getContext(c.width(), c.height(), avutil.AV_PIX_FMT_YUV420P,
                    c.width(), c.height(), c.pix_fmt(), SCALE_FLAGS, null, null, (DoublePointer) null);
            if (Objects.isNull(ost.sws_ctx)) {
                throw new IOException("sws_getContext error");
            }
        }
        // fill the temporary YUV420P frame, then convert it into ost.frame
        fill_yuv_image(ost, ost.tmp_frame, (int) ost.next_pts, c.width(), c.height());
        swscale.sws_scale(ost.sws_ctx, ost.tmp_frame.data(), ost.tmp_frame.linesize(), 0, c.height(),
                ost.frame.data(), ost.frame.linesize());
    } else {
        fill_yuv_image(ost, ost.frame, (int) ost.next_pts, c.width(), c.height());
    }
    ost.frame.pts(ost.next_pts++);
    return ost.frame;
}

/* Prepare a dummy image. */
private static void fill_yuv_image(OutputStream ost, AVFrame pict, int frame_index, int width, int height) {
    int x, y, i;
    i = frame_index;
    if (Objects.isNull(ost.y)) {
        // allocate the planes once and attach them to the frame; the chroma buffers are
        // generously sized (YUV420P only needs width/2 x height/2 per chroma plane)
        ost.y = new BytePointer(new byte[width * height]);
        ost.u = new BytePointer(new byte[width * height * 2 / 3]);
        ost.v = new BytePointer(new byte[width * height * 2 / 3]);
        pict.data(0, ost.y);
        pict.data(1, ost.u);
        pict.data(2, ost.v);
    }
    // Y
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            pict.data(0).put(y * pict.linesize(0) + x, (byte) (x + y + i * 3));
        }
    }
    // Cb and Cr
    for (y = 0; y < height / 2; y++) {
        for (x = 0; x < width / 2; x++) {
            pict.data(1).put(y * pict.linesize(1) + x, (byte) (128 + y + i * 2));
            pict.data(2).put(y * pict.linesize(2) + x, (byte) (64 + x + i * 5));
        }
    }
}

private static AVFrame get_audio_frame(OutputStream ost) {
    // check if we want to generate more frames
    AVRational rational = new AVRational();
    rational.num(1);
    rational.den(1);
    if (avutil.av_compare_ts(ost.next_pts, ost.enc.time_base(), STREAM_DURATION, rational) > 0) {
        return null;
    }
    AVFrame frame = ost.tmp_frame;
    if (Objects.isNull(ost.a)) {
        // interleaved S16 samples: 2 bytes per sample per channel
        ost.a = new BytePointer(new byte[frame.nb_samples() * 2 * ost.enc.ch_layout().nb_channels()]);
        frame.data(0, ost.a);
    }
    int j, i, v, off = 0;
    for (j = 0; j < frame.nb_samples(); j++) {
        v = (int) (Math.sin(ost.t) * 10000);
        for (i = 0; i < ost.enc.ch_layout().nb_channels(); i++) {
            frame.data(0).put(off++, (byte) (v & 0xff));
            frame.data(0).put(off++, (byte) ((v >> 8) & 0xff));
        }
        ost.t += ost.tincr;
        ost.tincr += ost.tincr2;
    }
    frame.pts(ost.next_pts);
    ost.next_pts += frame.nb_samples();
    return frame;
}
```
Write the trailer:
```java
avformat.av_write_trailer(oc);
```
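muxing.c additionally releases everything it allocated after writing the trailer; the post omits this step. A cleanup sketch using the same field names as above:

```java
// release per-stream resources, then close the output (sketch after muxing.c)
for (OutputStream ost : new OutputStream[] { video_st, audio_st }) {
    avcodec.avcodec_free_context(ost.enc);
    avutil.av_frame_free(ost.frame);
    avutil.av_frame_free(ost.tmp_frame);
    avcodec.av_packet_free(ost.tmp_pkt);
    swscale.sws_freeContext(ost.sws_ctx);  // null-safe
    swresample.swr_free(ost.swr_ctx);      // null-safe
}
if ((fmt.flags() & avformat.AVFMT_NOFILE) == 0) {
    // close the output file
    avformat.avio_close(oc.pb());
}
avformat.avformat_free_context(oc);
```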
8. Results
The final generated output:
Full code: available via the mini-program in the profile avatar, or by private message.