YOLOv3处理图片优化——cuda bilinear resize
YOLOv3中处理一张1080P的图片,resize到输入416*416尺寸,调用内部接口做cpu resize,可能80%~90%的时间耗在图像解码、resize上,对比推理时间耗时严重。尝试用cuda做外部resize。
修改下工程用于Ubuntu16.04,1080ti显卡,提供个包其中需要cmakelist修改下opencv路径。
下载:https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w
resizeGPU.cu
#include "resizeGPU.cuh" //#define _DEBUG #define BLOCK_DIM 64 #define threadNum 1024 #define WARP_SIZE 32 #define elemsPerThread 1 int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU int32_t* deviceData; //оригинальное изображение в памяти GPU int32_t* hostOriginalImage; int32_t* hostResizedImage; void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource) { cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t)); return; } void freePinned() { cudaFreeHost(hostOriginalImage); cudaFreeHost(hostResizedImage); return; } void initGPU(const int maxResolutionX, const int maxResolutionY) { cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t)); cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t)); return; } void deinitGPU() { cudaFree(deviceData); cudaFree(deviceDataResized); return; } __global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/) { __shared__ int32_t tile[1024]; const float x_ratio = ((float)(w - 1)) / w2; const float y_ratio = ((float)(h - 1)) / h2; //const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM; //unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x; unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread; //__shared__ float result[threadNum*elemsPerThread]; unsigned int shift = 0; //int32_t a, b, c, d, x, y, index; while((threadId < w2*h2 && shift<elemsPerThread)) { const int32_t i = threadId / w2; const int32_t j = threadId - (i*w2); //float x_diff, y_diff, blue, red, green; const int32_t x = (int)(x_ratio * j); const int32_t y = (int)(y_ratio * i); const float x_diff = (x_ratio * j) - x; const float y_diff = (y_ratio * i) - y; const int32_t index = (y*w + x); const int32_t a = originalImage[index]; const int32_t b = originalImage[index + 1]; const int32_t c = originalImage[index + w]; const int32_t d = originalImage[index + w + 1]; // blue element // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh) const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) + (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff); // green element // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh) const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) + ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff); // red element // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh) const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) + ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff); /* resizedImage[threadId] = 0xff000000 | ((((int32_t)red) << 16) & 0xff0000) | ((((int32_t)green) << 8) & 0xff00) | ((int32_t)blue); */ tile[threadIdx.x] = 0xff000000 | ((((int32_t)red) << 16) & 0xff0000) | ((((int32_t)green) << 8) & 0xff00) | ((int32_t)blue); threadId++; //threadId+= WARP_SIZE; shift++; } __syncthreads(); threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread; resizedImage[threadId] = tile[threadIdx.x]; /* shift--; threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift; while (shift >= 0) { resizedImage[threadId] = tile[shift]; shift--; threadId--; } */ } int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2) { #ifdef _DEBUG cudaError_t error; //store cuda error codes #endif int length = w2 * h2; // Копирование исходных данных в GPU для обработки cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice); //cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice); //error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice); #ifdef _DEBUG if (error != cudaSuccess) { printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); exit(EXIT_FAILURE); } #endif dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1); //printf("Blockdim.x %d\n", blocks.x); //printf("thrdim.x %d\n", threads.x); // Запуск ядра из (length / 256) блоков по 256 потоков, // предполагая, что length кратно 256 SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/); cudaDeviceSynchronize(); // Считывание результата из GPU cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost); return hostResizedImage; }
converter.cpp
#include "converter.hpp" int32_t* cvtMat2Int32(const cv::Mat& srcImage) { int32_t *result = new int32_t[srcImage.cols*srcImage.rows]; int offset = 0; for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3) { int32_t blue = srcImage.data[i]; int32_t green = srcImage.data[i + 1]; int32_t red = srcImage.data[i + 2]; result[offset++] = 0xff000000 | ((((int32_t)red) << 16) & 0xff0000) | ((((int32_t)green) << 8) & 0xff00) | ((int32_t)blue); } return result; } void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage) { int offset = 0; for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3) { int32_t a = pxArray[offset++]; int32_t blue = a & 0xff; int32_t green = ((a >> 8) & 0xff); int32_t red = ((a >> 16) & 0xff); outImage.data[i] = blue; outImage.data[i + 1] = green; outImage.data[i + 2] = red; } return; }
resizeCPU.cpp
#include "resizeCPU.hpp" int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2) { int32_t* temp = new int32_t[w2*h2]; int32_t a, b, c, d, x, y, index; float x_ratio = ((float)(w - 1)) / w2; float y_ratio = ((float)(h - 1)) / h2; float x_diff, y_diff, blue, red, green; int offset = 0; for (int i = 0; i<h2; i++) { for (int j = 0; j<w2; j++) { x = (int)(x_ratio * j); y = (int)(y_ratio * i); x_diff = (x_ratio * j) - x; y_diff = (y_ratio * i) - y; index = (y*w + x); a = pixels[index]; b = pixels[index + 1]; c = pixels[index + w]; d = pixels[index + w + 1]; // blue element // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh) blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) + (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff); // green element // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh) green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) + ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff); // red element // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh) red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) + ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff); temp[offset++] = 0xff000000 | ((((int32_t)red) << 16) & 0xff0000) | ((((int32_t)green) << 8) & 0xff00) | ((int32_t)blue); } } return temp; }
对比下结果,在1080ti下,resize 1080P图片到416*416尺寸,cuda resize 1.6ms,cpu resize 3.8ms,
darknet内部接口cpu resize 8.0ms。
cpu resize相比darknet resize 接口主要是移位操作有提速,cuda resize处理时间减少很多,但是需要做数据类型Mat与Int32相互转换。
酒是穿肠毒药,色是刮骨钢刀,财是惹祸根苗,气是雷烟火炮。
不过,无酒毕竟不成席,无色世上人渐稀,无财何人早早起,无气处处惹人欺。
饮酒不醉量为高,见色不迷真英豪,不义之财君莫取,忍气饶人祸自消。
酒色财气四堵墙,人人都在里边藏,谁若跳到墙外边,不是神仙也寿长。
君听我一言:做人,量体裁衣。