YOLOv3处理图片优化——cuda bilinear resize

YOLOv3中处理一张1080P的图片，resize到输入416*416尺寸，调用内部接口做cpu resize，可能80%~90%的时间耗在图像解码、resize上，对比推理时间耗时严重。尝试用cuda做外部resize。
修改下工程用于Ubuntu16.04,1080ti显卡，提供个包其中需要cmakelist修改下opencv路径。

下载：https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w

resizeGPU.cu

 
#include "resizeGPU.cuh"
//#define _DEBUG
 
#define BLOCK_DIM 64
#define threadNum 1024
#define WARP_SIZE 32
#define elemsPerThread 1
 
int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU
int32_t* deviceData; //оригинальное изображение в памяти GPU
int32_t* hostOriginalImage;
int32_t* hostResizedImage;
 
void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource)
{
    cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned
    cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned
    memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t));
 
    return;
}
 
void freePinned()
{
    cudaFreeHost(hostOriginalImage);
    cudaFreeHost(hostResizedImage);
 
    return;
}
 
void initGPU(const int maxResolutionX, const int maxResolutionY)
{
    cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t));
    cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t));
 
    return;
}
 
void deinitGPU()
{
    cudaFree(deviceData);
    cudaFree(deviceDataResized);
 
    return;
}
 
__global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/)
{
    __shared__ int32_t tile[1024];
    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;
    //const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM;
    //unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x;
    unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
    //__shared__ float result[threadNum*elemsPerThread];
    unsigned int shift = 0;
    //int32_t a, b, c, d, x, y, index;
    while((threadId < w2*h2 && shift<elemsPerThread))
    {
        const int32_t i = threadId / w2;
        const int32_t j = threadId - (i*w2);
        //float x_diff, y_diff, blue, red, green;
        
        const int32_t x = (int)(x_ratio * j);
        const int32_t y = (int)(y_ratio * i);
        const float x_diff = (x_ratio * j) - x;
        const float y_diff = (y_ratio * i) - y;
        const int32_t index = (y*w + x);
        const int32_t a = originalImage[index];
        const int32_t b = originalImage[index + 1];
        const int32_t c = originalImage[index + w];
        const int32_t d = originalImage[index + w + 1];
        // blue element
        // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
        const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
            (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
 
        // green element
        // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
        const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
            ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
 
        // red element
        // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
        const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
            ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
 
        /*
        resizedImage[threadId] =
            0xff000000 |
            ((((int32_t)red) << 16) & 0xff0000) |
            ((((int32_t)green) << 8) & 0xff00) |
            ((int32_t)blue);
        */
        tile[threadIdx.x] =
            0xff000000 |
            ((((int32_t)red) << 16) & 0xff0000) |
            ((((int32_t)green) << 8) & 0xff00) |
            ((int32_t)blue);
 
        threadId++;
        //threadId+= WARP_SIZE;
        shift++;
    }
    
    __syncthreads();
    threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
    resizedImage[threadId] = tile[threadIdx.x];
    /*
    shift--;
    threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift;
    while (shift >= 0)
    {
        resizedImage[threadId] = tile[shift];
        shift--;
        threadId--;
    }
    */
}
 
 
 
int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2)
{
#ifdef _DEBUG
    cudaError_t error; //store cuda error codes
#endif
    int length = w2 * h2;
 
    // Копирование исходных данных в GPU для обработки
    cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice);
    //cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice);
    //error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice);
#ifdef _DEBUG
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)
", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
#endif
 
    dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x
    dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1);
    //printf("Blockdim.x %d
", blocks.x);
    //printf("thrdim.x %d
", threads.x);
 
    // Запуск ядра из (length / 256) блоков по 256 потоков,
    // предполагая, что length кратно 256
    SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/);
 
 
    cudaDeviceSynchronize();
    // Считывание результата из GPU
    cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost);
 
    return hostResizedImage;
}

converter.cpp

#include "converter.hpp"
 
int32_t* cvtMat2Int32(const cv::Mat& srcImage)
{
    int32_t *result = new int32_t[srcImage.cols*srcImage.rows];
    int offset = 0;
 
    for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3)
    {
        int32_t blue = srcImage.data[i];
        int32_t green = srcImage.data[i + 1];
        int32_t red = srcImage.data[i + 2];
        result[offset++] =
            0xff000000 |
            ((((int32_t)red) << 16) & 0xff0000) |
            ((((int32_t)green) << 8) & 0xff00) |
            ((int32_t)blue);
    }
 
    return result;
}
 
void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage)
{
    int offset = 0;
    for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3)
    {
        int32_t a = pxArray[offset++];
        int32_t blue = a & 0xff;
        int32_t green = ((a >> 8) & 0xff);
        int32_t red = ((a >> 16) & 0xff);
        outImage.data[i] = blue;
        outImage.data[i + 1] = green;
        outImage.data[i + 2] = red;
    }
    return;
}

resizeCPU.cpp

 
#include "resizeCPU.hpp"
 
int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2)
{
    int32_t* temp = new int32_t[w2*h2];
    int32_t a, b, c, d, x, y, index;
    float x_ratio = ((float)(w - 1)) / w2;
    float y_ratio = ((float)(h - 1)) / h2;
    float x_diff, y_diff, blue, red, green;
    int offset = 0;
    for (int i = 0; i<h2; i++)
    {
        for (int j = 0; j<w2; j++)
        {
            x = (int)(x_ratio * j);
            y = (int)(y_ratio * i);
            x_diff = (x_ratio * j) - x;
            y_diff = (y_ratio * i) - y;
            index = (y*w + x);
            a = pixels[index];
            b = pixels[index + 1];
            c = pixels[index + w];
            d = pixels[index + w + 1];
 
            // blue element
            // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
            blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
                (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
 
            // green element
            // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
            green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
                ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
 
            // red element
            // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
            red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
                ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
 
            temp[offset++] =
                0xff000000 |
                ((((int32_t)red) << 16) & 0xff0000) |
                ((((int32_t)green) << 8) & 0xff00) |
                ((int32_t)blue);
        }
    }
    return temp;
}

对比下结果，在1080ti下，resize 1080P图片到416*416尺寸，cuda resize 1.6ms，cpu resize 3.8ms，

darknet内部接口cpu resize 8.0ms。

cpu resize相比darknet resize 接口主要是移位操作有提速，cuda resize处理时间减少很多，但是需要做数据类型Mat与Int32相互转换。

酒是穿肠毒药，色是刮骨钢刀，财是惹祸根苗，气是雷烟火炮。不过，无酒毕竟不成席，无色世上人渐稀，无财何人早早起，无气处处惹人欺。饮酒不醉量为高，见色不迷真英豪，不义之财君莫取，忍气饶人祸自消。酒色财气四堵墙，人人都在里边藏，谁若跳到墙外边，不是神仙也寿长。君听我一言：做人，量体裁衣。