cuda_opencv 矩阵相加

使用 CUDA 核函数实现 OpenCV 矩阵（cv::Mat，CV_32S）的逐元素相加，示例代码如下：
 1 #include <stdlib.h>
 2 #include <stdio.h>
 3 #include <opencv/cv.h>
 4 #include <opencv/highgui.h>
 5 #include <opencv2/opencv.hpp>
 6 
 7 #include "cuda_runtime.h"
 8 #include "device_launch_parameters.h"
 9 using namespace std;
10 using namespace cv;
11 
/*
 * Element-wise addition of two int2 buffers: d_C[i] = d_A[i] + d_B[i].
 *
 * Launch layout: 2D grid of 2D blocks. Each thread handles at most one int2
 * element, located at column x / row y of a width x height extent; threads
 * that land outside that extent do nothing. No shared memory is used.
 *
 * d_A, d_B : inputs, read at index x + y*width
 * d_C      : output, written at the same index
 * width    : number of int2 elements per row (NOT the number of ints)
 * height   : number of rows
 *
 * All three buffers are indexed up to width*height - 1, so each must hold at
 * least width*height int2 elements.
 */
__global__ void Add_kernel(const int2* d_A, const int2* d_B,int2*  d_C,int width, int height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch may cover more threads than elements.
    if (col >= width || row >= height)
    {
        return;
    }

    const int idx = row * width + col;
    const int2 lhs = d_A[idx];
    const int2 rhs = d_B[idx];

    d_C[idx].x = lhs.x + rhs.x;
    d_C[idx].y = lhs.y + rhs.y;
}
/* Print a readable message and terminate when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo: builds a 3x4 CV_32S matrix with img(i,j) = i + j, uploads it twice to
 * the device, adds the two copies with Add_kernel, copies the sum back into
 * img and prints it.
 *
 * Returns 0 on success; exits with a failure status if a CUDA call fails or
 * the matrix layout cannot be handed to the int2 kernel.
 */
int main()
{
    Mat img(3, 4, CV_32S, Scalar_<int>(0));

    cout<<img<<endl;
    cout<<endl;

    for(int i = 0 ; i < img.rows; i++)
    {
        for(int j = 0 ; j < img.cols; j++)
        {
            img.at<int>(i,j)=i+j;
        }
    }
    cout<<endl;

    cout<<img<<endl;

    // Add_kernel reads int2 pairs and addresses rows as y*width, so the matrix
    // must have no row padding and an even number of ints per row.
    if(!img.isContinuous() || img.cols % 2 != 0)
    {
        fprintf(stderr, "matrix must be continuous with an even number of columns\n");
        return EXIT_FAILURE;
    }

    // Kernel extent is measured in int2 elements: two ints per element along x.
    // Passing the int counts here would make the kernel touch twice the
    // allocated memory.
    const int width  = img.cols / 2;
    const int height = img.rows;

    size_t memSize = img.step * img.rows;
    int2* d_A = NULL;
    int2* d_B = NULL;
    int2* d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, memSize), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, memSize), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C, memSize), "cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A,img.data,memSize, cudaMemcpyHostToDevice), "cudaMemcpy d_A");
    checkCuda(cudaMemcpy(d_B,img.data,memSize, cudaMemcpyHostToDevice), "cudaMemcpy d_B");

    // Grid x covers the row width, grid y covers the number of rows (ceil-div).
    dim3 threads(16, 16);
    dim3 grids((width + threads.x - 1)/threads.x,(height + threads.y - 1)/threads.y);
    Add_kernel<<<grids,threads>>>(d_A, d_B, d_C, width, height);
    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "Add_kernel launch");

    // Blocking copy: also surfaces any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(img.data, d_C,memSize,cudaMemcpyDeviceToHost), "cudaMemcpy d_C");
    cout<<"GPU"<<endl;
    cout<<img<<endl;
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

#ifdef _WIN32
    // "pause" is a Windows shell built-in; keep the console open only there.
    system("pause");
#endif
    return 0;
}