Audio output with video processing with opencv

I think ffmpeg should be used to play audio and SDL for video in this case.

After opening the file with OpenCV and processing the frame, you can use OpenCV -> SDL to display it while retrieving the audio frames through ffmpeg and playing them with SDL.

Here is a nice collection of ffmpeg/SDL tutorials!

I also found a nice post that shows how to capture frames from a video file using ffmpeg, store them in OpenCV cv::Mat and display the result in a OpenCV window. But this way you can’t play audio since OpenCV doesn’t deal with that.

You might be interested in reading this post as well: How to avoid a growing delay with ffmpeg between sound and raw video data ?

EDIT:

I spent the last 4hrs coding a prototype to demonstrate how it’s done. This demo reads video frames through OpenCV (so you can process them) and audio through ffmpeg, and SDL is used to play both! There are 2 limitations in this demo you must be aware: 1 – it assumes you are working with an OpenCV image packed as BGR (24bits), and 2 – audio and video are not being sync! Yes, I left have some work for you to do (yeeeey). But don’t panic, page 6 has some ideas!

It’s important to sync audio and video because you will be doing some processing on the frames, and that will certainly make the video and audio go out of sync real fast since they are being played independently of each other.

The ffmpeg tutorials I suggested above are very very important to understand the code, a lot of code from this demo came from there. They show how to deal with SDL, and how to read packets of audio/video streams.

#include <highgui.h>
#include <cv.h>

extern "C"
{
#include <SDL.h>
#include <SDL_thread.h>
#include <avcodec.h>
#include <avformat.h>
}

#include <iostream>
#include <stdio.h>
//#include <malloc.h>

using namespace cv;    

#define SDL_AUDIO_BUFFER_SIZE 1024

typedef struct PacketQueue 
{
  AVPacketList *first_pkt, *last_pkt;
  int nb_packets;
  int size;
  SDL_mutex *mutex;
  SDL_cond *cond;
} PacketQueue;
PacketQueue audioq;

int audioStream = -1;
int videoStream = -1;
int quit = 0;

SDL_Surface* screen = NULL; 
SDL_Surface* surface = NULL;

AVFormatContext* pFormatCtx = NULL;
AVCodecContext* aCodecCtx = NULL;
AVCodecContext* pCodecCtx = NULL;  

void show_frame(IplImage* img)
{
    if (!screen) 
    {
        screen = SDL_SetVideoMode(img->width, img->height, 0, 0);
        if (!screen) 
        {
            fprintf(stderr, "SDL: could not set video mode - exiting\n");
            exit(1);
        }
    }

    // Assuming IplImage packed as BGR 24bits
    SDL_Surface* surface = SDL_CreateRGBSurfaceFrom((void*)img->imageData,
                img->width,
                img->height,
                img->depth * img->nChannels,
                img->widthStep,
                0xff0000, 0x00ff00, 0x0000ff, 0
                );

    SDL_BlitSurface(surface, 0, screen, 0);

    SDL_Flip(screen);
}

void packet_queue_init(PacketQueue *q) 
{
    memset(q, 0, sizeof(PacketQueue));
    q->mutex = SDL_CreateMutex();
    q->cond = SDL_CreateCond();
}

int packet_queue_put(PacketQueue *q, AVPacket *pkt) 
{
    AVPacketList *pkt1;
    if (av_dup_packet(pkt) < 0) 
    {
        return -1;
    }

    //pkt1 = (AVPacketList*) av_malloc(sizeof(AVPacketList));
    pkt1 = (AVPacketList*) malloc(sizeof(AVPacketList));
    if (!pkt1) return -1;
    pkt1->pkt = *pkt;
    pkt1->next = NULL;

    SDL_LockMutex(q->mutex);

    if (!q->last_pkt)
        q->first_pkt = pkt1;
    else
        q->last_pkt->next = pkt1;

    q->last_pkt = pkt1;
    q->nb_packets++;
    q->size += pkt1->pkt.size;
    SDL_CondSignal(q->cond);

    SDL_UnlockMutex(q->mutex);
    return 0;
}

static int packet_queue_get(PacketQueue *q, AVPacket *pkt, int block)
{
    AVPacketList *pkt1;
    int ret;

    SDL_LockMutex(q->mutex);

    for (;;) 
    {      
        if( quit) 
        {
            ret = -1;
            break;
        }

        pkt1 = q->first_pkt;
        if (pkt1) 
        {
            q->first_pkt = pkt1->next;
            if (!q->first_pkt)
                q->last_pkt = NULL;

                q->nb_packets--;
            q->size -= pkt1->pkt.size;
            *pkt = pkt1->pkt;
            //av_free(pkt1);
            free(pkt1);
            ret = 1;
            break;
        } 
        else if (!block) 
        {
            ret = 0;
            break;
        } 
        else 
        {
            SDL_CondWait(q->cond, q->mutex);
        }
    }

    SDL_UnlockMutex(q->mutex);
    return ret;
}

int audio_decode_frame(AVCodecContext *aCodecCtx, uint8_t *audio_buf, int buf_size) 
{
    static AVPacket pkt;
    static uint8_t *audio_pkt_data = NULL;
    static int audio_pkt_size = 0;

    int len1, data_size;

    for (;;) 
    {
        while (audio_pkt_size > 0) 
        {
            data_size = buf_size;
            len1 = avcodec_decode_audio2(aCodecCtx, (int16_t*)audio_buf, &data_size, 
                                         audio_pkt_data, audio_pkt_size);
            if (len1 < 0) 
            {
                /* if error, skip frame */
                audio_pkt_size = 0;
                break;
            }
            audio_pkt_data += len1;
            audio_pkt_size -= len1;
            if (data_size <= 0) 
            {
                /* No data yet, get more frames */
                continue;
            }
            /* We have data, return it and come back for more later */
            return data_size;
        }

            if (pkt.data)
            av_free_packet(&pkt);

        if (quit) return -1;

        if (packet_queue_get(&audioq, &pkt, 1) < 0) return -1;

        audio_pkt_data = pkt.data;
        audio_pkt_size = pkt.size;
    }
}

void audio_callback(void *userdata, Uint8 *stream, int len) 
{
    AVCodecContext *aCodecCtx = (AVCodecContext *)userdata;
    int len1, audio_size;

    static uint8_t audio_buf[(AVCODEC_MAX_AUDIO_FRAME_SIZE * 3) / 2];
    static unsigned int audio_buf_size = 0;
    static unsigned int audio_buf_index = 0;

    while (len > 0) 
    {
        if (audio_buf_index >= audio_buf_size) 
        {
            /* We have already sent all our data; get more */
            audio_size = audio_decode_frame(aCodecCtx, audio_buf, sizeof(audio_buf));
            if(audio_size < 0) 
            {
                /* If error, output silence */
                audio_buf_size = 1024; // arbitrary?
                memset(audio_buf, 0, audio_buf_size);
            } 
            else 
            {
                audio_buf_size = audio_size;
            }
            audio_buf_index = 0;
        }

        len1 = audio_buf_size - audio_buf_index;
        if (len1 > len)
            len1 = len;

            memcpy(stream, (uint8_t *)audio_buf + audio_buf_index, len1);
        len -= len1;
        stream += len1;
        audio_buf_index += len1;
    }
}

void setup_ffmpeg(char* filename)
{
    if (av_open_input_file(&pFormatCtx, filename, NULL, 0, NULL) != 0)
    {
        fprintf(stderr, "FFmpeg failed to open file %s!\n", filename);
        exit(-1); 
    }

    if (av_find_stream_info(pFormatCtx) < 0)
    {
        fprintf(stderr, "FFmpeg failed to retrieve stream info!\n");
        exit(-1); 
    }

    // Dump information about file onto standard error
    dump_format(pFormatCtx, 0, filename, 0);

    // Find the first video stream
    int i = 0;
    for (i; i < pFormatCtx->nb_streams; i++)
    {
        if (pFormatCtx->streams[i]->codec->codec_type == CODEC_TYPE_VIDEO && videoStream < 0) 
        {
            videoStream = i;
        }

        if (pFormatCtx->streams[i]->codec->codec_type == CODEC_TYPE_AUDIO && audioStream < 0) 
        {
            audioStream = i;
        }
    }

    if (videoStream == -1)
    {
        fprintf(stderr, "No video stream found in %s!\n", filename);
        exit(-1); 
    }  

    if (audioStream == -1)
    {
        fprintf(stderr, "No audio stream found in %s!\n", filename);
        exit(-1); 
    }  

    // Get a pointer to the codec context for the audio stream
    aCodecCtx = pFormatCtx->streams[audioStream]->codec;

    // Set audio settings from codec info
    SDL_AudioSpec wanted_spec;
    wanted_spec.freq = aCodecCtx->sample_rate;
    wanted_spec.format = AUDIO_S16SYS;
    wanted_spec.channels = aCodecCtx->channels;
    wanted_spec.silence = 0;
    wanted_spec.samples = SDL_AUDIO_BUFFER_SIZE;
    wanted_spec.callback = audio_callback;
    wanted_spec.userdata = aCodecCtx;

    SDL_AudioSpec spec;
    if (SDL_OpenAudio(&wanted_spec, &spec) < 0) 
    {
        fprintf(stderr, "SDL_OpenAudio: %s\n", SDL_GetError());
        exit(-1);
    }

    AVCodec* aCodec = avcodec_find_decoder(aCodecCtx->codec_id);
    if (!aCodec) 
    {
        fprintf(stderr, "Unsupported codec!\n");
        exit(-1);
    }
    avcodec_open(aCodecCtx, aCodec);

    // audio_st = pFormatCtx->streams[index]
    packet_queue_init(&audioq);
    SDL_PauseAudio(0); 

    // Get a pointer to the codec context for the video stream
    pCodecCtx = pFormatCtx->streams[videoStream]->codec;

    // Find the decoder for the video stream
    AVCodec* pCodec = avcodec_find_decoder(pCodecCtx->codec_id);
    if (pCodec == NULL) 
    {
        fprintf(stderr, "Unsupported codec!\n");
        exit(-1); // Codec not found
    }

    // Open codec
    if (avcodec_open(pCodecCtx, pCodec) < 0)
    {
        fprintf(stderr, "Unsupported codec!\n");
        exit(-1); // Could not open codec
    }
}


int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        std::cout << "Usage: " << argv[0] << " <video>" << std::endl;
        return -1;
    }

    av_register_all();

    // Init SDL
    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_TIMER)) 
    {
        fprintf(stderr, "Could not initialize SDL - %s\n", SDL_GetError());
        return -1;
    }

    // Init ffmpeg and setup some SDL stuff related to Audio
    setup_ffmpeg(argv[1]);

    VideoCapture cap(argv[1]); // open the default camera
    if (!cap.isOpened())  // check if we succeeded
    {
        std::cout << "Failed to load file!" << std::endl;
            return -1;
    }

    AVPacket packet;
    while (av_read_frame(pFormatCtx, &packet) >= 0) 
    {
        if (packet.stream_index == videoStream) 
        {
              // Actually this is were SYNC between audio/video would happen.
              // Right now I assume that every VIDEO packet contains an entire video frame, and that's not true. A video frame can be made by multiple packets!
              // But for the time being, assume 1 video frame == 1 video packet,
              // so instead of reading the frame through ffmpeg, I read it through OpenCV. 

              Mat frame;
              cap >> frame; // get a new frame from camera

          // do some processing on the frame, either as a Mat or as IplImage.
              // For educational purposes, applying a lame grayscale conversion
          IplImage ipl_frame = frame;
          for (int i = 0; i < ipl_frame.width * ipl_frame.height * ipl_frame.nChannels; i += ipl_frame.nChannels)
          {
                ipl_frame.imageData[i] = (ipl_frame.imageData[i] + ipl_frame.imageData[i+1] + ipl_frame.imageData[i+2])/3;   //B
                ipl_frame.imageData[i+1] = (ipl_frame.imageData[i] + ipl_frame.imageData[i+1] + ipl_frame.imageData[i+2])/3; //G
                ipl_frame.imageData[i+2] = (ipl_frame.imageData[i] + ipl_frame.imageData[i+1] + ipl_frame.imageData[i+2])/3; //R
            }

            // Display it on SDL window
            show_frame(&ipl_frame); 

            av_free_packet(&packet);
        }   
        else if (packet.stream_index == audioStream) 
        {
            packet_queue_put(&audioq, &packet);
        } 
        else 
        {
            av_free_packet(&packet);
        }

        SDL_Event event;
        SDL_PollEvent(&event);
        switch (event.type) 
        {
            case SDL_QUIT:
                SDL_FreeSurface(surface);
                SDL_Quit();
                break;

            default:
                break;  
        }
    }

    // the camera will be deinitialized automatically in VideoCapture destructor

    // Close the codec
    avcodec_close(pCodecCtx);

    // Close the video file
    av_close_input_file(pFormatCtx);

    return 0;
}

On my Mac I compiled it with:

g++ ffmpeg_snd.cpp -o ffmpeg_snd -D_GNU_SOURCE=1 -D_THREAD_SAFE -I/usr/local/include/opencv -I/usr/local/include -I/usr/local/include/SDL  -Wl,-framework,Cocoa -L/usr/local/lib -lopencv_core -lopencv_imgproc -lopencv_highgui -lopencv_ml -lopencv_video -lopencv_features2d -lopencv_calib3d -lopencv_objdetect -lopencv_contrib -lopencv_legacy -lopencv_flann -lSDLmain -lSDL -L/usr/local/lib -lavfilter -lavcodec -lavformat -I/usr/local/Cellar/ffmpeg/HEAD/include/libavcodec -I/usr/local/Cellar/ffmpeg/HEAD/include/libavformat

Leave a Comment