[audio and video] camera acquisition (2-1)

Keywords: C++ Windows

Starting from this article, the author's journey of audio and video programming is about to enter the second stage. On the basis of collecting the playable files such as desktop, microphone and speaker written as mp4|mkv, the acquisition of camera is added. The camera can replace the desktop as the video input source, or the two video streams of camera and desktop can be mixed as one video input source.

Technical introduction

When it comes to capturing from a camera, dshow probably comes to mind immediately. That is indeed what is used here, but through ffmpeg: ffmpeg integrates dshow, which makes it more convenient to use. However, to obtain more detailed camera device information, you still need some COM knowledge.

Using modules (Libraries)

ffmpeg format, device library and windows system library are mainly used

Main processes and codes

1. Initialize the camera acquisition module. First, be sure to call avdevice_register_all and avformat_network_init so that the dshow AVInputFormat is registered and can be found

int CameraCaptor::init(const std::string& deviceId, const int fps)
{
	int err = ERROR_CODE_OK;
	if (m_inited) {
		return err;
	}

	do {
		m_deviceId = deviceId;

		avdevice_register_all();
		avformat_network_init();
		m_inputFmt = av_find_input_format("dshow");
		if (m_inputFmt == nullptr) {
			err = ERROR_CODE_FFMPEG_FIND_INPUT_FORMAT_FAILED;
			break;
		}

		std::string camera;
		int number;
		err = findCameraById(camera, number);
		HCMDR_ERROR_CODE_BREAK(err);

		std::map<std::string, std::multimap<AVCodecID, DEVICE::VIDEO_MEDIADATA>> mediadataList;
		err = DEVICE::VideoDevice::getCameraMediadataList(mediadataList);
		HCMDR_ERROR_CODE_BREAK(err);

		size_t captureId;
		std::multimap<AVCodecID, DEVICE::VIDEO_MEDIADATA>::const_iterator it;
		auto mediadatas = mediadataList.at(m_deviceId);
		for (captureId = 0; captureId < m_captureCodecs.size(); captureId++) {
			auto itBeg = mediadatas.lower_bound(m_captureCodecs.at(captureId));
			auto itEnd = mediadatas.upper_bound(m_captureCodecs.at(captureId));
			for (it = itBeg; it != itEnd; it++) {
				if (it->second.isDefault) {
					std::string resolution;
					resolution.assign(std::to_string(it->second.width)).append("x").append(std::to_string(it->second.height));
					err = initCamera(camera, number, resolution, it->second.fps, it->first);
					break;
				}
			}
			if (err == ERROR_CODE_OK) {
				break;
			}
		}
		if (err != ERROR_CODE_OK) {
			break;
		}

		m_fps = it->second.fps;
		m_rect = { 0, 0, it->second.width, it->second.height };
		m_pixelFmt = m_capturePixFmts.at(captureId);
		m_inited = true;
	} while (0);

	if (err != ERROR_CODE_OK) {
		LOGGER::Logger::log(LOGGER::LOG_TYPE_ERROR, "[%s] init camera captor error: %s, last error: %lu",
			__FUNCTION__, HCMDR_GET_ERROR_DESC(err), GetLastError());
		cleanup();
	}

	return err;
}

Then find the camera device by its device id (see the earlier article "[audio and video] audio acquisition equipment (8)" for how to obtain a device id). Here we again fetch the device list and compare each entry against the device id to get the camera name (camera) and the camera ordinal (number). The ordinal distinguishes cameras that share the same name.

/**
 * Resolve m_deviceId to a dshow-usable camera identity.
 *
 * @param camera  [out] camera friendly name.
 * @param number  [out] ordinal among cameras sharing that name (dshow's
 *                video_device_number), i.e. how many same-named devices
 *                precede this one in enumeration order.
 * @return ERROR_CODE_OK, or ERROR_CODE_DEVICE_FIND_CAMERA_FAILED when the id
 *         is not in the enumerated device list.
 */
int CameraCaptor::findCameraById(std::string& camera, int& number)
{
	int err = ERROR_CODE_OK;

	do {
		std::list<DEVICE::VIDEO_DEVICE> devices;
		err = DEVICE::VideoDevice::getCameraDevices(devices);
		HCMDR_ERROR_CODE_BREAK(err);

		// Single pass: count same-named predecessors while searching for the
		// matching id (replaces the previous find-then-rescan double loop and
		// the size_t -> int narrowing it relied on).
		bool found = false;
		std::map<std::string, int> nameCounts;
		for (auto it = devices.cbegin(); it != devices.cend(); ++it) {
			if (it->id == m_deviceId) {
				camera = it->name;
				number = nameCounts[it->name];
				found = true;
				break;
			}
			nameCounts[it->name]++;
		}
		if (!found) {
			err = ERROR_CODE_DEVICE_FIND_CAMERA_FAILED;
			break;
		}
	} while (0);

	if (err != ERROR_CODE_OK) {
		LOGGER::Logger::log(LOGGER::LOG_TYPE_ERROR, "[%s] find camera error: %s",
			__FUNCTION__, HCMDR_GET_ERROR_DESC(err));
	}
	return err;
}

Then get the detailed media data list of the camera. You can get the capability set information such as data type, width, height and frame rate that the camera can collect through com

int VideoDevice::getCameraMediadataList(std::map<std::string, std::multimap<AVCodecID, VIDEO_MEDIADATA>>& mediadataList)
{
	int err = ERROR_CODE_OK;
	HELPER::ComUtil com;
	do {
		ICreateDevEnum* devEnum = nullptr;
		HRESULT hr = CoCreateInstance(CLSID_SystemDeviceEnum, nullptr, CLSCTX_INPROC_SERVER,
			IID_ICreateDevEnum, (void**)&devEnum);
		if (FAILED(hr)) {
			err = ERROR_CODE_COM_CREATE_INSTANCE_FAILED;
			break;
		}

		IEnumMoniker* enumMoniker = nullptr;
		const GUID device_guid[2] = { CLSID_VideoInputDeviceCategory, CLSID_AudioInputDeviceCategory };
		hr = devEnum->CreateClassEnumerator(device_guid[0], (IEnumMoniker**)&enumMoniker, 0);
		if (FAILED(hr)) {
			err = ERROR_CODE_COM_CREATE_CLASS_ENUM_FAILED;
			break;
		}

		IMoniker* moniker = nullptr;
		while (enumMoniker->Next(1, &moniker, nullptr) == S_OK) {
			IBindCtx* bindCtx = nullptr;
			LPOLESTR displayName = nullptr;
			hr = moniker->GetDisplayName(bindCtx, nullptr, &displayName);
			if (FAILED(hr)) {
				continue;
			}
#ifdef UNICODE
			std::string deviceId = HELPER::StringConverter::convertUnicodeToUtf8(displayName);
#else
			std::string deviceId = HELPER::StringConverter::convertAsciiToUtf8(displayName);
#endif
			CoTaskMemFree(displayName);

			IBaseFilter* filter = nullptr;
			HRESULT hr = moniker->BindToObject(nullptr, nullptr, IID_IBaseFilter, reinterpret_cast<void**>(&filter));
			if (FAILED(hr)) {
				continue;
			}

			IEnumPins* enumPins = nullptr;
			IPin* pin = nullptr;
			hr = filter->EnumPins(&enumPins);
			if (FAILED(hr)) {
				filter->Release();
				continue;
			}

			std::multimap<AVCodecID, VIDEO_MEDIADATA> mediadatas;
			ULONG pinFetched = 0;
			enumPins->Reset();
			while (SUCCEEDED(enumPins->Next(1, &pin, &pinFetched)) && pinFetched > 0) {
				if (pin == nullptr) {
					continue;
				}
				PIN_INFO pinInfo;
				if (FAILED(pin->QueryPinInfo(&pinInfo)) || pinInfo.dir != PINDIR_OUTPUT) {
					pin->Release();
					continue;
				}

				IEnumMediaTypes* enumMt = nullptr;
				if (FAILED(pin->EnumMediaTypes(&enumMt))) {
					pin->Release();
					break;
				}

				AM_MEDIA_TYPE* mt = nullptr;
				ULONG mtFetched = 0;
				enumMt->Reset();
				while (SUCCEEDED(enumMt->Next(1, &mt, &mtFetched)) && mtFetched) {
					if (mt->formattype == FORMAT_VideoInfo2 &&
						mt->majortype == MEDIATYPE_Video &&
						mt->cbFormat >= sizeof(VIDEOINFOHEADER2)) {
						VIDEOINFOHEADER2* vih = reinterpret_cast<VIDEOINFOHEADER2*>(mt->pbFormat);
						VIDEO_MEDIADATA mediadata;
						AVCodecID codecId = AV_CODEC_ID_NONE;
						if (mt->subtype == MEDIASUBTYPE_H264) {
							codecId = AV_CODEC_ID_H264;
						}
						else if (mt->subtype == MEDIASUBTYPE_MJPG) {
							codecId = AV_CODEC_ID_MJPEG;
						}
						else if (mt->subtype == MEDIASUBTYPE_YUY2) {
							codecId = AV_CODEC_ID_RAWVIDEO;
						}
						mediadata.width = vih->bmiHeader.biWidth;
						mediadata.height = vih->bmiHeader.biHeight;
						mediadata.fps = static_cast<unsigned int>((1000 * 1000 * 1000) / (vih->AvgTimePerFrame * 100));
						mediadata.pictAspectX = vih->dwPictAspectRatioX;
						mediadata.pictAspectY = vih->dwPictAspectRatioY;
						if (mediadatas.find(codecId) == mediadatas.cend()) {
							mediadata.isDefault = 1;
						}
						mediadatas.insert(std::make_pair(codecId, mediadata));
					}
				}

				pin->Release();
			}
			if (!mediadatas.empty()) {
				mediadataList.insert(std::make_pair(deviceId, mediadatas));
			}					

			enumPins->Release();
			filter->Release();
		}
	} while (0);
	return err;
}

Finally, select the camera capability used to initialize the camera according to the obtained camera media capability data list and user-defined rules (preferred codec list m_captureCodecs = {AV_CODEC_ID_MJPEG, AV_CODEC_ID_H264, AV_CODEC_ID_RAWVIDEO}).

int CameraCaptor::initCamera(const std::string& cameraName, int number, const std::string& resolution, int fps, AVCodecID codecId)
{
	int err = ERROR_CODE_OK;

	do {
		if (m_formatCtx == nullptr) {
			m_formatCtx = avformat_alloc_context();
		}
		if (m_formatCtx == nullptr) {
			err = ERROR_CODE_FFMPEG_ALLOC_CONTEXT_FAILED;
			break;
		}
		if (m_decodeCtx == nullptr) {
			m_decodeCtx = avcodec_alloc_context3(nullptr);
		}
		if (m_decodeCtx == nullptr) {
			err = ERROR_CODE_FFMPEG_ALLOC_CONTEXT_FAILED;
			break;
		}

		// Set framerate, video_size, video_device_number properties.
		AVDictionary* dict = nullptr;
		av_dict_set_int(&dict, "video_device_number", number, 0);
		av_dict_set_int(&dict, "framerate", fps, 0);
		av_dict_set(&dict, "video_size", resolution.c_str(), 0);
		std::string url("video=" + cameraName);
		int ret = avformat_open_input(&m_formatCtx, url.c_str(), m_inputFmt, &dict);
		av_dict_free(&dict);
		if (ret != 0 || m_formatCtx == nullptr) {
			err = ERROR_CODE_FFMPEG_OPEN_INPUT_FAILED;
			break;
		}

		int index = -1;
		for (int i = 0; i < m_formatCtx->nb_streams; i++) {
			if (m_formatCtx->streams[i] != nullptr &&
				m_formatCtx->streams[i]->codecpar != nullptr &&
				m_formatCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO &&
				m_formatCtx->streams[i]->codecpar->codec_id == codecId) {
				index = i;
				break;
			}
		}
		if (index < 0) {
			err = ERROR_CODE_FFMPEG_FIND_BEST_STREAM_FAILED;
			break;
		}

		ret = avcodec_parameters_to_context(m_decodeCtx, m_formatCtx->streams[index]->codecpar);
		if (ret < 0) {
			err = ERROR_CODE_FFMPEG_PARAMS_TO_CONTEXT_FAILED;
			break;
		}
		m_decodeCtx->pkt_timebase = m_formatCtx->streams[index]->time_base;

		AVCodec* codec = avcodec_find_decoder(m_decodeCtx->codec_id);
		if (codec == 0) {
			err = ERROR_CODE_FFPMEG_FIND_DECODER_FAILED;
			break;
		}
		m_decodeCtx->flags2 |= AV_CODEC_FLAG2_FAST;

		AVDictionary* opts = nullptr;
		av_dict_set(&opts, "threads", "auto", 0);
		av_dict_set(&opts, "refcounted_frames", "0", 0);
		ret = avcodec_open2(m_decodeCtx, codec, &opts);
		av_dict_free(&opts);
		if (ret < 0) {
			err = ERROR_CODE_FFMPEG_OPEN_CODEC_FAILED;
			break;
		}
		m_formatCtx->streams[index]->discard = AVDISCARD_DEFAULT;
	} while (0);

	return err;
}

Another key point is m_pixelFmt: the pixel format is also obtained according to the user-defined rules (pixel format list m_capturePixFmts = {AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUYV422}, positionally matching m_captureCodecs).
2. Start the acquisition camera and start the acquisition thread

/**
 * Start the capture thread. Requires a prior successful init(); calling start
 * while already running is a no-op (logged as a warning).
 */
int CameraCaptor::start()
{
	if (m_running) {
		LOGGER::Logger::log(LOGGER::LOG_TYPE_WARN, "[%s] camera captor already running", __FUNCTION__);
		return ERROR_CODE_OK;
	}

	if (!m_inited) {
		const int err = ERROR_CODE_UNINITIALIZED;
		LOGGER::Logger::log(LOGGER::LOG_TYPE_ERROR, "[%s] camera captor not yet initialized: %s",
			__FUNCTION__, HCMDR_GET_ERROR_DESC(err));
		return err;
	}

	// Flip the flag before spawning so the capture loop sees it immediately.
	m_running = true;
	m_thread = std::thread(std::bind(&CameraCaptor::captureProcess, this));

	return ERROR_CODE_OK;
}

The camera capture function passes the collected frames down to the next layer. Note that each frame's timestamp must be set; otherwise the generated file will play too slowly.

/**
 * Capture loop (runs on m_thread): read packets from the camera, decode them,
 * stamp each frame with a wall-clock pts (otherwise the muxed file plays too
 * slowly) and hand frames to m_onVideoCaptureData. Decode errors are reported
 * through m_onVideoCaptureError. Runs until m_running is cleared by stop().
 */
void CameraCaptor::captureProcess()
{
	int err = ERROR_CODE_OK;

	AVPacket packet;
	AVFrame* frame = av_frame_alloc();
	if (frame == nullptr) {
		// Previously a null frame would have been passed to
		// avcodec_receive_frame and crashed.
		LOGGER::Logger::log(LOGGER::LOG_TYPE_ERROR, "[%s] alloc frame failed", __FUNCTION__);
		return;
	}
	while (m_running) {
		// Transient read failures are retried; a device that keeps failing
		// will spin here until stop() is called.
		int ret = av_read_frame(m_formatCtx, &packet);
		if (ret < 0) {
			continue;
		}

		ret = avcodec_send_packet(m_decodeCtx, &packet);
		while (ret >= 0) {
			ret = avcodec_receive_frame(m_decodeCtx, frame);
			if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
				// Decoder needs more input / is fully drained.
				break;
			}
			else if (ret < 0) {
				err = ERROR_CODE_FFMPEG_DECODE_FAILED;
				LOGGER::Logger::log(LOGGER::LOG_TYPE_ERROR, "[%s] decode packet error: %s", __FUNCTION__,
					HCMDR_GET_ERROR_DESC(err));
				if (m_onVideoCaptureError != nullptr) {
					m_onVideoCaptureError(err, m_index);
				}
				break;
			}

			// Stamp with the capture time; downstream muxing relies on it.
			frame->pts = frame->pkt_dts = av_gettime_relative();
			if (m_onVideoCaptureData != nullptr) {
				m_onVideoCaptureData(frame, m_index);
			}
		}
		if (ret == AVERROR_EOF) {
			avcodec_flush_buffers(m_decodeCtx);
		}
		av_packet_unref(&packet);
	}
	av_frame_free(&frame);
}

3. Call stop to stop the collection

/**
 * Stop the capture loop and join the capture thread. Safe to call when not
 * running (returns immediately).
 */
int CameraCaptor::stop()
{
	if (!m_running) {
		return ERROR_CODE_OK;
	}

	// Signal the loop to exit, then wait for the thread to finish.
	m_running = false;
	if (m_thread.joinable()) {
		m_thread.join();
	}

	return ERROR_CODE_OK;
}

In this way, the camera acquisition function is completed. To sum up, we mainly use the dshow of ffmpeg and COM component technology, and then open the camera with corresponding ability to collect according to our own needs.

Posted by kwong on Tue, 21 Sep 2021 16:19:34 -0700