yolov5_trt.h
#pragma once
#ifndef YOLOV5_TRT_H
#define YOLOV5_TRT_H
#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda.hpp>
#include <string>
#include <vector>
struct Configuration
{
float confThreshold; // Confidence threshold
float nmsThreshold; // Non-maximum suppression threshold
float objThreshold; //Object Confidence threshold
std::string modelpath;
};
typedef struct BoxInfo
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class YOLOv5
{
public:
YOLOv5(Configuration config);
~YOLOv5();
void UnInit();
std::string detect(cv::cuda::GpuMat& frame);
private:
float confThreshold;
float nmsThreshold;
float objThreshold;
int inpWidth;
int inpHeight;
const int max_objects = 1000;
const int NUM_BOX_ELEMENT = 7;
std::vector<BoxInfo> newBbox;
std::string classes[2] = {"D", "x"};
const bool keep_ratio = true;
void normalize_(cv::cuda::GpuMat img);
void nms(std::vector<BoxInfo>& input_boxes);
cv::cuda::GpuMat resize_image(cv::cuda::GpuMat srcimg, int* newh, int* neww, int* top, int* left);
void loadTrt(const std::string strName);
int m_iInputIndex;
int m_iOutputIndex;
int m_iClassNums;
int m_iBoxNums;
cv::Size m_InputSize;
void* m_ArrayDevMemory[2]{ 0 };
float* output_device = nullptr;
float* output_host = nullptr;
int m_ArraySize[2]{ 0 };
std::vector<cv::cuda::GpuMat> m_InputWrappers;
};
#endif
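A minimal single-image usage sketch of the class declared above (main.cpp further down runs a threaded pipeline). The thresholds, engine path and image paths here are placeholders, not values from the project.

#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda.hpp>
#include "yolov5_trt.h"

int main() {
    // Placeholder thresholds and engine path
    Configuration cfg{ 0.45f, 0.5f, 0.45f, "best.engine" };
    YOLOv5 model(cfg);

    cv::Mat img = cv::imread("test.jpg");
    cv::cuda::GpuMat gpuImg;
    gpuImg.upload(img);

    // detect() draws the boxes onto gpuImg and returns a warning string
    std::string warn = model.detect(gpuImg);

    cv::Mat result;
    gpuImg.download(result);
    cv::imwrite("result.jpg", result);
    return 0;
}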
yolov5_trt.cpp
#include <fstream>
#include <iostream>
#include <sys/stat.h>
#include <string>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <cuda_runtime.h>
#include "yolov5_trt.h"
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <cuda_runtime_api.h>
#include "stdio.h"
extern "C"
{
#include "postprocess.cuh"
};
using namespace std;
using namespace cv;
using namespace nvinfer1;
nvinfer1::ICudaEngine* m_CudaEngine;
nvinfer1::IRuntime* m_CudaRuntime;
nvinfer1::IExecutionContext* m_CudaContext;
cudaStream_t m_CudaStream;
// Logger for TRT info/warning/errors, https://github.com/onnx/onnx-tensorrt/blob/main/onnx_trt_backend.cpp
class TRT_Logger : public nvinfer1::ILogger
{
nvinfer1::ILogger::Severity _verbosity;
std::ostream* _ostream;
public:
TRT_Logger(Severity verbosity = Severity::kWARNING, std::ostream& ostream = std::cout)
: _verbosity(verbosity)
, _ostream(&ostream)
{
}
void log(Severity severity, const char* msg) noexcept override
{
if (severity <= _verbosity)
{
time_t rawtime = std::time(0);
char buf[256];
strftime(&buf[0], 256, "%Y-%m-%d %H:%M:%S", std::gmtime(&rawtime));
const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? "    BUG"
: severity == Severity::kERROR ? "  ERROR"
: severity == Severity::kWARNING ? "WARNING"
: severity == Severity::kINFO ? "   INFO"
: "UNKNOWN");
(*_ostream) << "[" << buf << " " << sevstr << "] " << msg << std::endl;
}
}
};
static bool ifFileExists(const char* FileName)
{
struct stat my_stat;
return (stat(FileName, &my_stat) == 0);
}
void YOLOv5::loadTrt(const std::string strName)
{
TRT_Logger gLogger;
m_CudaRuntime = createInferRuntime(gLogger);
std::ifstream fin(strName, std::ios::in | std::ios::binary);
std::string cached_engine = "";
while (fin.peek() != EOF)
{
std::stringstream buffer;
buffer << fin.rdbuf();
cached_engine.append(buffer.str());
}
fin.close();
m_CudaEngine = m_CudaRuntime->deserializeCudaEngine(cached_engine.data(), cached_engine.size(), nullptr);
m_CudaContext = m_CudaEngine->createExecutionContext();
m_CudaRuntime->destroy();
}
YOLOv5::YOLOv5(Configuration config)
{
confThreshold = config.confThreshold;
nmsThreshold = config.nmsThreshold;
objThreshold = config.objThreshold;
inpHeight = 640;
inpWidth = 640;
std::string model_path = config.modelpath;
std::string strTrtName = config.modelpath;
size_t sep_pos = model_path.find_last_of(".");
strTrtName = model_path.substr(0, sep_pos) + ".engine"; // ".trt"
if (ifFileExists(strTrtName.c_str()))
{
loadTrt(strTrtName);
}
else
{
//loadOnnx(config.modelpath);
}
m_iInputIndex = m_CudaEngine->getBindingIndex("images");
m_iOutputIndex = m_CudaEngine->getBindingIndex("output0");
Dims dims_i = m_CudaEngine->getBindingDimensions(m_iInputIndex);
int size1 = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3];
m_InputSize = cv::Size(dims_i.d[3], dims_i.d[2]);
Dims dims_o = m_CudaEngine->getBindingDimensions(m_iOutputIndex);
int size2 = dims_o.d[0] * dims_o.d[1] * dims_o.d[2];
m_iClassNums = dims_o.d[2] - 5; // [,,classes+5]
m_iBoxNums = dims_o.d[1]; // [b,num_pre_boxes,classes+5]
cudaMalloc(&m_ArrayDevMemory[m_iInputIndex], size1 * sizeof(float));
/*m_ArrayHostMemory[m_iInputIndex] = (float*)malloc(size1 * sizeof(float));*/
m_ArraySize[m_iInputIndex] = size1 * sizeof(float);
cudaMalloc(&m_ArrayDevMemory[m_iOutputIndex], size2 * sizeof(float));
//m_ArrayHostMemory[m_iOutputIndex] = (float*)malloc(size2 * sizeof(float));
m_ArraySize[m_iOutputIndex] = size2 * sizeof(float);
// Allocate one GPU buffer large enough for the decoded output; the first float is the box count
cudaMalloc(&output_device, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float));
// Allocate the matching host-side buffer
/*cudaMallocHost(&output_host, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float));*/
output_host = (float*)malloc(sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float));
m_InputWrappers.emplace_back(dims_i.d[2], dims_i.d[3], CV_32FC1, (float*)m_ArrayDevMemory[m_iInputIndex]);
m_InputWrappers.emplace_back(dims_i.d[2], dims_i.d[3], CV_32FC1, (float*)m_ArrayDevMemory[m_iInputIndex] + dims_i.d[2] * dims_i.d[3]);
m_InputWrappers.emplace_back(dims_i.d[2], dims_i.d[3], CV_32FC1, (float*)m_ArrayDevMemory[m_iInputIndex] + 2 * dims_i.d[2] * dims_i.d[3]);
cudaStreamCreate(&m_CudaStream);
}
void YOLOv5::UnInit()
{
for (auto& p : m_ArrayDevMemory)
{
cudaFree(p);
p = nullptr;
}
/*for (auto& p : m_ArrayHostMemory)
{
free(p);
p = nullptr;
}*/
cudaStreamDestroy(m_CudaStream);
cudaFree(output_device); // free the GPU buffer holding the decoded output
free(output_host); // free the host-side output buffer
output_host = nullptr;
m_CudaContext->destroy();
m_CudaEngine->destroy();
}
YOLOv5::~YOLOv5()
{
UnInit();
}
cuda::GpuMat YOLOv5::resize_image(cuda::GpuMat srcimg, int* newh, int* neww, int* top, int* left)
{
int srch = srcimg.rows, srcw = srcimg.cols;
*newh = this->inpHeight;
*neww = this->inpWidth;
/*Mat cpusrcimg;
Mat cpudstimg;
srcimg.download(cpusrcimg);*/
cuda::GpuMat dstimg;
if (this->keep_ratio && srch != srcw) {
float hw_scale = (float)srch / srcw;
if (hw_scale > 1) {
*newh = this->inpHeight;
*neww = int(this->inpWidth / hw_scale);
cuda::resize(srcimg, dstimg, Size(*neww, *newh), 0, 0, INTER_AREA);
*left = int((this->inpWidth - *neww) * 0.5);
cuda::copyMakeBorder(dstimg, dstimg, 0, 0, *left, this->inpWidth - *neww - *left, BORDER_CONSTANT, Scalar(114, 114, 114));
}
else {
*newh = int(this->inpHeight * hw_scale);
*neww = this->inpWidth;
cuda::resize(srcimg, dstimg, Size(*neww, *newh), 0, 0, INTER_AREA);
*top = int((this->inpHeight - *newh) * 0.5);
cuda::copyMakeBorder(dstimg, dstimg, *top, this->inpHeight - *newh - *top, 0, 0, BORDER_CONSTANT, Scalar(114, 114, 114));
}
}
else {
cuda::resize(srcimg, dstimg, Size(*neww, *newh), 0, 0, INTER_AREA);
}
return dstimg;
}
string YOLOv5::detect(cuda::GpuMat& frame)
{
int newh = 0, neww = 0, padh = 0, padw = 0;
cuda::GpuMat dstimg = this->resize_image(frame, &newh, &neww, &padh, &padw);
/*cuda::Stream stream1;*/
cuda::cvtColor(dstimg, dstimg, cv::COLOR_BGR2RGB, 0);
cuda::GpuMat m_Normalized;
dstimg.convertTo(m_Normalized, CV_32FC3, 1 / 255.);
cuda::split(m_Normalized, m_InputWrappers);
//void * aa = m_InputWrappers1.data();
/*auto ret = cudaMemcpyAsync(m_ArrayDevMemory[m_iInputIndex], m_ArrayHostMemory[m_iInputIndex], m_ArraySize[m_iInputIndex], cudaMemcpyHostToDevice, m_CudaStream);*/
auto ret1 = m_CudaContext->enqueueV2(m_ArrayDevMemory, m_CudaStream, nullptr);
/*auto ret1 = m_CudaContext->executeV2(m_ArrayDevMemory);*/
/*auto ret = cudaMemcpyAsync(m_ArrayHostMemory[m_iOutputIndex], m_ArrayDevMemory[m_iOutputIndex], m_ArraySize[m_iOutputIndex], cudaMemcpyDeviceToHost, m_CudaStream);*/
/*ret = cudaStreamSynchronize(m_CudaStream);*/
// Raw network output on the GPU: m_iBoxNums rows of (m_iClassNums + 5) floats,
// laid out as cx, cy, width, height, objness, then one score per class
float* pdata = (float*)m_ArrayDevMemory[m_iOutputIndex];
std::vector<BoxInfo> generate_boxes;
float ratioh = (float)frame.rows / newh, ratiow = (float)frame.cols / neww;
cudaMemset(output_device, 0, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float));
memset(output_host, 0, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float));
decode_kernel_invoker(
pdata, m_iBoxNums, m_iClassNums, this->confThreshold,
this->nmsThreshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, m_CudaStream, padh, padw, ratioh, ratiow);
cudaMemcpyAsync(output_host, output_device,
sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float),
cudaMemcpyDeviceToHost, m_CudaStream);
cudaStreamSynchronize(m_CudaStream);
int num_boxes = min((int)output_host[0], max_objects);
for (int i = 0; i < num_boxes; i++)
{
float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
int keep_flag = ptr[6]; // the last element is the keep flag
if (keep_flag) // True
{
generate_boxes.emplace_back(
BoxInfo{ ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5] });
}
}
if (generate_boxes.size() < 2)
{
return "";
}
// Download once, draw all boxes on the CPU, then upload the annotated frame back
Mat showFrame;
frame.download(showFrame);
for (size_t i = 0; i < generate_boxes.size(); ++i)
{
int xmin = int(generate_boxes[i].x1);
int ymin = int(generate_boxes[i].y1);
rectangle(showFrame, Point(xmin, ymin), Point(int(generate_boxes[i].x2), int(generate_boxes[i].y2)), Scalar(0, 0, 255), 2);
std::string label = format("%.2f", generate_boxes[i].score);
label = this->classes[int(generate_boxes[i].label)] + ":" + label;
if (classes[int(generate_boxes[i].label)] == "x") {
// Assumed intent: report the box width in pixels as its length
int x_length = int(generate_boxes[i].x2 - generate_boxes[i].x1);
label = "length_" + to_string(x_length);
}
putText(showFrame, label, Point(xmin, ymin - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
}
frame.upload(showFrame);
return "warn"; // assumed: a non-empty string marks a frame with at least two detections
}
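The constructor above falls back to a commented-out loadOnnx() when no .engine file is found. Below is a minimal sketch of what that build step might look like, using the same TensorRT 7-era API style (createInferBuilder, buildEngineWithConfig, destroy()) as the rest of this file; the helper name buildEngineFromOnnx and the workspace size are assumptions, not the author's implementation.

// Hypothetical helper: parse an ONNX file and serialize a TensorRT engine.
static void buildEngineFromOnnx(const std::string& onnxPath, const std::string& enginePath)
{
    TRT_Logger logger;
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    const auto explicitBatch =
        1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
    parser->parseFromFile(onnxPath.c_str(),
                          static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 28); // 256 MB, an arbitrary choice

    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    nvinfer1::IHostMemory* serialized = engine->serialize();

    // Write the serialized engine next to the ONNX model
    std::ofstream out(enginePath, std::ios::binary);
    out.write(reinterpret_cast<const char*>(serialized->data()), serialized->size());

    serialized->destroy();
    engine->destroy();
    config->destroy();
    parser->destroy();
    network->destroy();
    builder->destroy();
}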
postprocess.cuh
#pragma once
#include <iostream>
#include <cassert>
#include "yolov5_trt.h"
#include <vector>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#ifdef __cplusplus
#ifndef checkRuntime
#define checkRuntime(callstr)\
{\
cudaError_t error_code = callstr;\
if (error_code != cudaSuccess) {\
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__ << std::endl;\
assert(0);\
}\
}
#endif // checkRuntime
extern "C"
{
#endif
static __global__ void decode_kernel(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, int padh, int padw, float ratioh, float ratiow);
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT);
static __device__ float box_iou(
float aleft, float atop, float aright, float abottom,
float bleft, float btop, float bright, float bbottom);
/*std::vector<BoxInfo> gpu_decode(float* predict, int rows, int cols, cudaStream_t& stream, int padh, int padw, float ratioh, float ratiow,
float confidence_threshold = 0.45f, float nms_threshold = 0.45f);*/
void decode_kernel_invoker(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream, int padh, int padw, float ratioh, float ratiow);
#ifdef __cplusplus
}
#endif
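The checkRuntime macro above is defined but never used in this listing. A short hypothetical example of wrapping the CUDA runtime calls with it; the helper name is made up, and the buffer size mirrors the one allocated in the YOLOv5 constructor.

#include "postprocess.cuh"

// Hypothetical example: the same decoded-output allocation the constructor makes, but checked.
static void allocateDecodedOutput(float** output_device, int max_objects, int num_box_element)
{
    size_t bytes = sizeof(float) + max_objects * num_box_element * sizeof(float);
    checkRuntime(cudaMalloc(reinterpret_cast<void**>(output_device), bytes));
    checkRuntime(cudaMemset(*output_device, 0, bytes));
}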
postprocess.cu
#include <iostream>
#include <string>
#include "postprocess.cuh"
using namespace std;
void decode_kernel_invoker(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream, int padh, int padw, float ratioh, float ratiow) {
/*
Parameters:
predict: raw network output (the unfiltered prediction tensor)
num_bboxes: number of rows in the (n x (num_classes + 5)) tensor, i.e. number of candidate boxes
num_classes: number of classes
confidence_threshold: confidence threshold
nms_threshold: NMS IoU threshold
invert_affine_matrix: pointer to the inverse affine matrix (unused here)
parray: output array
max_objects: maximum number of boxes; set to 1000 here just to reserve enough memory
NUM_BOX_ELEMENT: elements per box: left, top, right, bottom, confidence, class, keepflag (7 in total)
stream: CUDA stream
*/
// Launch enough threads so that every candidate box gets its own thread,
// while keeping each block at no more than 512 threads.
int block = num_bboxes > 512 ? 512 : num_bboxes;
int grid = (num_bboxes + block - 1) / block;
/* The IDE may underline the kernel launch syntax; that is harmless. */
decode_kernel<<<grid, block, 0, stream>>>(
predict, num_bboxes, num_classes, confidence_threshold,
invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT,
padh, padw, ratioh, ratiow);
//auto code1 = cudaPeekAtLastError();
//cout << cudaGetErrorString(code1) << endl;
// Synchronizing here would wait for the kernel to finish and surface any launch error:
//auto code2 = cudaDeviceSynchronize();
//cout << cudaGetErrorString(code2) << endl;
// For the NMS pass, launch enough threads so every decoded box is handled,
// again keeping each block at no more than 512 threads.
block = max_objects > 512 ? 512 : max_objects;
grid = (max_objects + block - 1) / block;
fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
}
static __global__ void decode_kernel(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, int padh, int padw, float ratioh, float ratiow)
{
// One thread per candidate box; threads whose index exceeds the number of
// boxes return immediately, so each prediction is processed exactly once.
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= num_bboxes) {
return;
}
/*
predict is the base address of the n x (num_classes + 5) output tensor;
pitem points at one row: pitem[0]..pitem[3] are the box geometry, pitem[4] is objectness.
*/
/*printf("index: %d\n", index1);*/
float* pitem = predict + (num_classes + 5) * position;
/*printf("pitem: %f\n", *pitem);*/
float objectness = pitem[4];
if (objectness < confidence_threshold) {
return;
}
// From this element onward the row holds the per-class confidences
float* class_confidence = pitem + 5;
// Start with the confidence of class 0
float confidence = *class_confidence++;
// Loop over the remaining classes to find the best one
int label = 0;
for (int i = 1; i < num_classes; i++, ++class_confidence)
{
if (*class_confidence > confidence)
{
// A higher class confidence wins
confidence = *class_confidence;
label = i; // remember the best label
}
}
/*
The class confidence found above is only a conditional probability.
The final box confidence = objectness (probability of an object) x class_confidence
(conditional probability), using the largest class confidence.
*/
confidence *= objectness;
if (confidence < confidence_threshold) {
return;
}
/*
Recover the bounding box: read the center (cx, cy), width and height,
then map the corners back to original image coordinates (undo padding and scaling).
*/
float cx = *pitem++;
float cy = *pitem++;
float width = *pitem++;
float height = *pitem++;
float left = (cx - padw - 0.5 * width) * ratiow;
float top = (cy - padh - 0.5 * height) * ratioh;
float right = (cx - padw + 0.5 * width) * ratiow;
float bottom = (cy - padh + 0.5 * height) * ratioh;
/*affine_project(invert_affine_matrix, left, top, &left, &top);
affine_project(invert_affine_matrix, right, bottom, &right, &bottom);*/
/*
atomicAdd() in brief:
int atomicAdd(int* address, int val);
Adds val to the value at address and writes the result back. The operation is atomic,
so concurrent writes do not interfere with each other and the data stays correct.
Since every thread writes one bounding box to the output, an atomic counter is needed
so that each thread gets a unique output slot.
*/
/*
Output layout: [count, box1, box2, box3, ...]
Because GPU decoding is multi-threaded, count records how many boxes have been written;
a single-threaded CPU decoder would not need it, but on the GPU it guarantees that no
box is written twice or dropped.
atomicAdd -> count += 1, returning the old count.
Here parray is output_device, whose first element is the counter.
*/
int index = atomicAdd(parray, 1);
// If the slot index exceeds max_objects, this thread drops its box
if (index >= max_objects)
return;
// left, top, right, bottom, confidence, class, keepflag
float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
*pout_item++ = left;
*pout_item++ = top;
*pout_item++ = right;
*pout_item++ = bottom;
*pout_item++ = confidence;
*pout_item++ = label;
*pout_item++ = 1; // 1 = keep, 0 = ignore
}
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT)
{
/*
Parameters:
bboxes: flat array holding all decoded boxes;
max_objects: maximum number of output boxes; 1000 here, assuming a frame never exceeds that;
threshold: IoU threshold used to decide whether two boxes overlap;
NUM_BOX_ELEMENT: number of elements stored per box,
typically: left, top, right, bottom, confidence, class, keepflag
*/
// One thread per decoded box; threads beyond the decoded count have nothing to do
int position = blockDim.x * blockIdx.x + threadIdx.x;
int count = min((int)*bboxes, max_objects);
if (position >= count) {
return;
}
/*
A box is suppressed when another box of the same class overlaps it strongly and has a
higher confidence. In rare cases this can suppress a valid box; use CPU NMS when
measuring mAP, but this GPU NMS is fine for everyday inference.
Box layout: left, top, right, bottom, confidence, class, keepflag
*/
// Pointer to the box handled by this thread
float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
// Compare against every other decoded box
for (int i = 0; i < count; ++i) {
float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
// NMS only compares boxes of the same class
if (i == position || pcurrent[5] != pitem[5]) {
continue;
}
// If the other box has higher (or equal) confidence, it may suppress pcurrent
if (pitem[4] >= pcurrent[4]) {
// On an exact tie, keep the box with the smaller index
if (pitem[4] == pcurrent[4] && i < position) {
continue;
}
// Compute IoU from the first four elements
float iou = box_iou(
pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
pitem[0], pitem[1], pitem[2], pitem[3]);
if (iou > threshold) {
pcurrent[6] = 0; // pitem overlaps pcurrent beyond the threshold, so pcurrent is discarded
return;
}
}
}
}
static __device__ float box_iou(
float aleft, float atop, float aright, float abottom,
float bleft, float btop, float bright, float bbottom)
{
float cleft = max(aleft, bleft);
float ctop = max(atop, btop);
float cright = min(aright, bright);
float cbottom = min(abottom, bbottom);
float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
if (c_area == 0.0f)
return 0.0f;
float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
return c_area / (a_area + b_area - c_area);
}
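A small, hypothetical host-side check for decode_kernel_invoker, compiled with nvcc alongside postprocess.cu. It feeds one hand-made candidate row (one class, no letterbox padding, scale 1) through the decode pass and prints how many boxes survive; all values are made up for illustration.

#include <iostream>
#include <cuda_runtime.h>
#include "postprocess.cuh"

int main()
{
    const int num_classes = 1, num_bboxes = 1;
    const int num_box_element = 7, max_objects = 10;
    const int out_floats = 1 + max_objects * num_box_element;

    // One candidate row: cx, cy, w, h, objectness, class-0 score
    float h_pred[6] = { 320.f, 320.f, 100.f, 60.f, 0.9f, 0.8f };
    float h_out[out_floats] = { 0 };

    float *d_pred = nullptr, *d_out = nullptr;
    cudaMalloc(&d_pred, sizeof(h_pred));
    cudaMalloc(&d_out, out_floats * sizeof(float));
    cudaMemcpy(d_pred, h_pred, sizeof(h_pred), cudaMemcpyHostToDevice);
    cudaMemset(d_out, 0, out_floats * sizeof(float));

    // No padding, ratio 1:1, default stream
    decode_kernel_invoker(d_pred, num_bboxes, num_classes, 0.45f, 0.5f,
                          nullptr, d_out, max_objects, num_box_element,
                          0, 0, 0, 1.f, 1.f);

    cudaDeviceSynchronize();
    cudaMemcpy(h_out, d_out, out_floats * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "decoded boxes: " << (int)h_out[0] << std::endl; // expect 1

    cudaFree(d_pred);
    cudaFree(d_out);
    return 0;
}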
main.cpp
#include <iostream>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <string>
#include <filesystem>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <fstream>
#include <queue>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <ctime>
namespace fs = std::filesystem;
#include <windows.h>
using namespace cv;
using namespace std;
#include "yolov5_trt.h"
std::condition_variable r_cond;
std::condition_variable w_cond;
queue<Mat> srcImageQ;
queue<cuda::GpuMat> wImageQ;
clock_t startTime, endTime;
bool stop = false;
bool dstop = false;
bool sstop = false;
int totalFrame = 0;
// Maximum queue (buffer) size
const int MAX_CACHEDATA_LENGTH = 50;
mutex m1;
mutex m2;
int nnum = 0;
Configuration yolo_nets = { 0.45, 0.5, 0.45,"C:/0WORK/project/FB/exp2/weights/best.engine" };
YOLOv5 yolo_model(yolo_nets);
void writeImageToFile(const std::string& filepath, const std::vector<uchar>& imgData, size_t length) {
// Open the output file
std::ofstream outputFile(filepath, std::ios::out | std::ios::binary);
// Make sure the file opened successfully
if (!outputFile.is_open()) {
std::cerr << "Failed to open file for writing." << std::endl;
return;
}
// Write the image data to the file
outputFile.write(reinterpret_cast<const char*>(imgData.data()), static_cast<int>(length));
// Close the file
outputFile.close();
// Check whether the write succeeded
/*if (!outputFile) {
std::cerr << "Failed to write image data to file." << std::endl;
return;
}*/
/*std::cout << "Image data successfully written to file: " << filepath << std::endl;*/
}
void readVideo(string dirString) {
VideoCapture cap(dirString);
if (!cap.isOpened()) {
std::cerr << "Error opening video file" << std::endl;
return;
}
//cv::namedWindow("YOLOv5 Object Detection", cv::WINDOW_NORMAL);
int totalFrames = cap.get(cv::CAP_PROP_FRAME_COUNT);
totalFrame = totalFrames;
/*cv::Mat frame;*/
/*int count = 0;*/
while (true) {
cv::Mat frame;
if (!cap.read(frame)) {
// Exit the loop when no more frames can be read
break;
}
int width = frame.cols;
int height = frame.rows;
int channels = frame.channels();
/*if ((count % 3) != 0) {
count += 1;
continue;}*/
unique_lock<mutex> rl(m1);
if (srcImageQ.size() > MAX_CACHEDATA_LENGTH) {
r_cond.wait(rl);
}
srcImageQ.push(frame);
}
//while (cap.read(frame)) {
// /*if ((count % 3) != 0) {
// count += 1;
// continue;*/
// m1.lock();
// srcImageQ.push(frame);
// m1.unlock();
// Sleep(0.5);
//}
stop = true;
}
void readImage(string searchPath) {
for (const auto& entry : fs::directory_iterator(searchPath)) {
fs::path dirPath = entry.path();
string dirString = dirPath.string();
int length = dirString.length();
if (dirString.substr(length - 4) != ".jpg") { continue; }
Mat srcImage = imread(dirString);
unique_lock<mutex> rl(m1);
if (srcImageQ.size() > MAX_CACHEDATA_LENGTH) {
r_cond.wait(rl);
}
/*m1.lock();*/
srcImageQ.push(srcImage);
/*m1.unlock();*/
/*cout << "read image queue " << srcImageQ.size() << endl;*/
/*Sleep(0.5);*/
}
stop = true;
}
void detImage() {
while (true) {
cout << "srcImageQ " << srcImageQ.size() << endl;
if (!srcImageQ.empty()) {
unique_lock<mutex> rl(m1);
Mat srcImage = srcImageQ.front();
srcImageQ.pop();
rl.unlock();
r_cond.notify_one();
cuda::GpuMat gpuFrame;
gpuFrame.upload(srcImage);
yolo_model.detect(gpuFrame);
unique_lock<mutex> r2(m2);
if (wImageQ.size() > MAX_CACHEDATA_LENGTH) {
w_cond.wait(r2);
}
wImageQ.push(gpuFrame);
}
if (srcImageQ.empty() && stop) {
dstop = true;
break;
}
}
}
void writeImage() {
while (true) {
if (!wImageQ.empty()) {
cuda::GpuMat image = wImageQ.front();
Mat wImage;
image.download(wImage);
/*m2.lock();*/
wImageQ.pop();
/*m2.unlock();*/
w_cond.notify_one();
// Alternative: encode the image into a memory buffer and write it with writeImageToFile
//std::vector<uchar> buffer;
//cv::imencode(".jpg", wImage, buffer);
//writeImageToFile("C:/0WORK/project/FB/mp4/warn/" + to_string(count) + ".jpg", buffer, buffer.size());
imwrite("C:/0WORK/project/FB/mp4/warn2/" + to_string(nnum) + ".jpg", wImage);
/*cout << "imwrite image:" << count << endl;*/
nnum++;
}
cout << "read image queue " << srcImageQ.size() << " write image queue " << wImageQ.size() << endl;
/*if (srcImageQ.size() > 2000) {
int a = 0;
}*/
if (wImageQ.empty() && dstop) {
break;
}
}
}
int main() {
/*string searchPath = "C:/0WORK/project/FB/fbData/JPEGImages";*/
string dirString = "C:/0WORK/project/FB/mp4/FB/";
for (const auto& entry : fs::directory_iterator(dirString)) {
fs::path dirPath = entry.path();
string dirString = dirPath.string();
startTime = clock();
thread read(readVideo, dirString);
read.detach();
endTime = clock();
cout << "detection infer time:" << double(endTime - startTime) / totalFrame << "ms" << endl;
cout << "totalFrame:" << totalFrame << endl;
}
thread detect(detImage);
thread write(writeImage);
detect.detach();
write.join();
}