cuda 编程1

本文参考链接：
《CUDA C Programming Guide》(《CUDA C 编程指南》)导读 https://zhuanlan.zhihu.com/p/53773183?from_voters_page=true

/* main.cu */
#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp"   
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;

// Kernel: converts a packed 3-channel 8-bit image to a 1-channel grayscale image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel. The x dimension
// maps to the column and the y dimension to the row. The grid may overshoot the
// image; threads that fall outside it do nothing.
//
// d_in      : imgheight * imgwidth pixels, addressed as d_in[row * imgwidth + col]
//             (rows are assumed to be tightly packed, i.e. no row padding).
// d_out     : one byte per pixel, same addressing as d_in.
// imgheight : number of rows.
// imgwidth  : number of columns.
//
// The weights 0.299 / 0.587 / 0.114 are applied to .x / .y / .z respectively,
// so the caller has to supply the channels in R, G, B order to get true luma.
// No shared memory is used.
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out, 
                                uint imgheight, uint imgwidth)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;  // column
    const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;  // row

    // Threads of the last blocks can land outside the image; they just exit.
    if (idx >= imgwidth || idy >= imgheight)
    {
        return;
    }

    // Widen before multiplying so the flat index cannot wrap around in 32 bits
    // for very large images.
    const size_t pixel = static_cast<size_t>(idy) * imgwidth + idx;

    const uchar3 rgb = d_in[pixel];
    // The float result is truncated (not rounded) when narrowed to a byte.
    d_out[pixel] = static_cast<unsigned char>(0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z);
}

// Serial CPU reference implementation, used for comparison with the GPU kernel.
//
// d_in      : imgheight * imgwidth pixels, 3 interleaved bytes per pixel,
//             read as d_in[3 * p], d_in[3 * p + 1], d_in[3 * p + 2] for pixel p
//             (rows are assumed to be tightly packed, i.e. no row padding).
// d_out     : receives one byte per pixel.
// imgheight : number of rows.
// imgwidth  : number of columns.
//
// The weights 0.299 / 0.587 / 0.114 are applied to bytes 0 / 1 / 2 of each
// pixel, so the input has to be in R, G, B order to get true luma.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,
                                uint imgheight, uint imgwidth)
{
    // Computed in size_t so neither the pixel count nor the byte offset can
    // wrap around in 32 bits; this also removes the signed/unsigned loop compare.
    const size_t pixelCount = static_cast<size_t>(imgheight) * imgwidth;

    const unsigned char *src = d_in;
    for (size_t p = 0; p < pixelCount; ++p, src += 3)
    {
        // The float result is truncated (not rounded) when narrowed to a byte.
        d_out[p] = static_cast<unsigned char>(0.299f * src[0]
                                              + 0.587f * src[1]
                                              + 0.114f * src[2]);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaStepOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Loads an image, converts it to grayscale on the GPU and/or the CPU (selected
// by the two #if toggles below), prints the time spent in the conversion itself
// and shows the result. Returns 0 on success, 1 on any failure.
int main(void)
{
    Mat srcImage = imread("/data_2/dog2.jpg");
    if (srcImage.empty())
    {
        // imread returns an empty Mat when the file is missing or unreadable.
        cerr << "failed to load image /data_2/dog2.jpg" << endl;
        return 1;
    }
    imshow("srcImage", srcImage);
    waitKey(0);

    // imread delivers the channels in B, G, R order, while both converters
    // weight channel 0 as red. Reorder once up front so the luma weights hit
    // the right channels. The converted Mat is freshly allocated, so its rows
    // are tightly packed, which is what both converters expect.
    Mat rgbImage;
    cvtColor(srcImage, rgbImage, COLOR_BGR2RGB);

    const uint imgheight = rgbImage.rows;
    const uint imgwidth = rgbImage.cols;
    const size_t pixelCount = static_cast<size_t>(imgheight) * imgwidth;

    Mat grayImage(imgheight, imgwidth, CV_8UC1, Scalar(0));

#if 0 //cuda
    {
        uchar3 *d_in = NULL;   // vector type: 3 x unsigned char
        unsigned char *d_out = NULL;

        // Allocate device memory and upload the input image.
        bool ok = cudaStepOk(cudaMalloc((void**)&d_in, pixelCount * sizeof(uchar3)),
                             "cudaMalloc(d_in)")
               && cudaStepOk(cudaMalloc((void**)&d_out, pixelCount * sizeof(unsigned char)),
                             "cudaMalloc(d_out)")
               && cudaStepOk(cudaMemcpy(d_in, rgbImage.data, pixelCount * sizeof(uchar3),
                                        cudaMemcpyHostToDevice),
                             "cudaMemcpy(host to device)");

        if (ok)
        {
            // One thread per pixel; round the grid up so it covers the whole image.
            dim3 threadsPerBlock(32, 32);
            dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

            const clock_t start = clock();
            rgb2grayincuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
            // A launch is asynchronous: cudaGetLastError catches a bad launch
            // configuration, and the synchronize both waits for completion (so
            // the measured time is meaningful) and reports execution errors.
            ok = cudaStepOk(cudaGetLastError(), "kernel launch")
              && cudaStepOk(cudaDeviceSynchronize(), "kernel execution");
            const clock_t end = clock();

            if (ok)
            {
                printf("cuda exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);
                // Download the result.
                ok = cudaStepOk(cudaMemcpy(grayImage.data, d_out,
                                           pixelCount * sizeof(unsigned char),
                                           cudaMemcpyDeviceToHost),
                                "cudaMemcpy(device to host)");
            }
        }

        // Always release device memory; cudaFree on a null pointer is a no-op.
        cudaFree(d_in);
        cudaFree(d_out);

        if (!ok)
        {
            return 1;
        }
    }
#endif

#if 1 //cpu
    {
        // Separate timestamps so this measurement stays correct even when the
        // CUDA section above is enabled as well.
        const clock_t start = clock();
        rgb2grayincpu(rgbImage.data, grayImage.data, imgheight, imgwidth);
        const clock_t end = clock();
        printf("cpu exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);
    }
#endif

    imshow("grayImage", grayImage);
    waitKey(0);
    return 0;
}

//CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
project(testcuda)

# The CUDA package provides cuda_add_executable(), which compiles .cu sources with nvcc.
find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED)

# Use the include directories reported by the OpenCV package that was just found
# instead of a machine-specific absolute path. If OpenCV lives in a non-standard
# prefix, point CMake at it with -DOpenCV_DIR=<dir containing OpenCVConfig.cmake>.
include_directories(${OpenCV_INCLUDE_DIRS})

cuda_add_executable(testcuda main.cu)
target_link_libraries(testcuda ${OpenCV_LIBS})

运行结果（单位：秒；只统计灰度转换本身的耗时，不包含显存分配和主机与设备之间的数据拷贝）：
cuda 运行：cuda exec time is 0.00005800
cpu 运行：cpu exec time is 0.00115700

例子2：
参考链接
https://zhuanlan.zhihu.com/p/34587739

#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp"   
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;

// Queries the properties of CUDA device 0 and prints a few of them to stdout.
// Returns 0 on success, 1 if the properties could not be read.
int main(void)
{
    int dev = 0;
    cudaDeviceProp devProp;

    // Without this check a failed query would leave devProp uninitialised and
    // the program would print garbage.
    const cudaError_t status = cudaGetDeviceProperties(&devProp, dev);
    if (status != cudaSuccess)
    {
        std::cerr << "cudaGetDeviceProperties failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    std::cout << "使用GPU device " << dev << ": " << devProp.name << std::endl;
    std::cout << "SM的数量：" << devProp.multiProcessorCount << std::endl;
    // sharedMemPerBlock is reported in bytes; shown here in KB.
    std::cout << "每个线程块的共享内存大小：" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
    std::cout << "每个线程块的最大线程数：" << devProp.maxThreadsPerBlock << std::endl;
    // NOTE: the two labels below say "EM", but the values printed are the
    // per-multiprocessor (SM) limits.
    std::cout << "每个EM的最大线程数：" << devProp.maxThreadsPerMultiProcessor << std::endl;
    // Warps per multiprocessor = threads per multiprocessor / warp size; the warp
    // size is taken from the device instead of being hard-coded to 32.
    std::cout << "每个EM的最大线程束数：" << devProp.maxThreadsPerMultiProcessor / devProp.warpSize << std::endl;
    return 0;
}

输出如下：
使用GPU device 0: GeForce GTX 1080
SM的数量：20
每个线程块的共享内存大小：48 KB
每个线程块的最大线程数：1024
每个EM的最大线程数：2048
每个EM的最大线程束数：64

cuda编程，10 篇博客，深入浅出谈CUDA

https://blog.csdn.net/sunmc1204953974/category_6156113.html