0_Simple__matrixMul + 0_Simple__matrixMul_nvrtc

Matrix multiplication using two-dimensional (BLOCK_SIZE × BLOCK_SIZE) thread blocks and shared-memory tiling, implemented both with statically compiled kernels and with runtime compilation (NVRTC).
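Each thread block computes one BLOCK_SIZE × BLOCK_SIZE tile of C; the kernel's main loop accumulates the products of matching tiles of A and B staged in shared memory. In block-index notation:

C_{by,\,bx} \;=\; \sum_{k=0}^{wA/\mathrm{BLOCK\_SIZE}-1} A_{by,\,k} \, B_{k,\,bx}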

▶ Source code: static compilation

#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

template <int BLOCK_SIZE> __global__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int aBegin = wA * BLOCK_SIZE * by;  // first index of A traversed by this block
    int aEnd   = aBegin + wA - 1;       // last index of A traversed by this block
    int aStep  = BLOCK_SIZE;            // step through A (the block walks a strip BLOCK_SIZE wide; each thread handles one element)
    int bBegin = BLOCK_SIZE * bx;       // first index of B traversed by this block
    int bStep  = BLOCK_SIZE * wB;       // step through B (the block walks a strip BLOCK_SIZE tall; each thread handles one element)
    float Csub = 0;

    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        __syncthreads();

#pragma unroll  // unroll into BLOCK_SIZE multiply-add statements for efficiency
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];
        __syncthreads();
    }

    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
        data[i] = val;
}

int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    const float valB = 0.01f;           // needed below when verifying the result
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = (float *) malloc(mem_size_C);
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **) &d_A, mem_size_A);
    cudaMalloc((void **) &d_B, mem_size_B);
    cudaMalloc((void **) &d_C, mem_size_C);
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    // Warm-up run
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    if (block_size == 16)
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    else
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    printf("done\n");
    cudaDeviceSynchronize();

    printf("Computing result using CUDA Kernel...\n");
    cudaEvent_t start;
    cudaEventCreate(&start);
    cudaEvent_t stop;
    cudaEventCreate(&stop);
    cudaEventRecord(start, NULL);

    int nIter = 300;
    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        else
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    cudaEventRecord(stop, NULL);
    cudaEventSynchronize(stop);

    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    // Verify the result, requiring relative error |<x,y>_cpu - <x,y>_gpu| / <|x|,|y|> < eps
    printf("Checking computed result for correctness: ");
    bool correct = true;
    double eps = 1.e-6; // machine zero
    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
    if (correct)
        return EXIT_SUCCESS;
    else
        return EXIT_FAILURE;
}

int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
    {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
        exit(EXIT_SUCCESS);
    }

    int devID = 0; // device to use; defaults to device 0
    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        cudaSetDevice(devID);
    }
    cudaDeviceProp deviceProp;
    cudaGetDevice(&devID);
    cudaGetDeviceProperties(&deviceProp, devID);

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        exit(EXIT_SUCCESS);
    }

    int block_size = (deviceProp.major < 2) ? 16 : 32;

    dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
    dim3 dimsB(5*4*block_size, 5*2*block_size, 1);

    // Dimensions of A and B from the command line
    if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    if (dimsA.x != dimsB.y)
    {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
               dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

    getchar();
    exit(matrix_result);
}

▶ Source code: runtime compilation (NVRTC)

/* matrixMul_kernel.cu */
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int aBegin = wA * BLOCK_SIZE * by;
    int aEnd   = aBegin + wA - 1;
    int aStep  = BLOCK_SIZE;
    int bBegin = BLOCK_SIZE * bx;
    int bStep  = BLOCK_SIZE * wB;
    float Csub = 0;
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        __syncthreads();
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];
        __syncthreads();
    }
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

// The module loader looks kernels up by name, so export plain C entry points
// for the two template instantiations.
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
{
    matrixMulCUDA<16>(C, A, B, wA, wB);
}

extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
{
    matrixMulCUDA<32>(C, A, B, wA, wB);
}

/* matrixMul.cpp */
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h>

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
        data[i] = val;
}

int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);
    CUdeviceptr d_A, d_B, d_C;

    // Compile the kernel source to PTX at run time and load it as a module
    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
    compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
    CUmodule module = loadPTX(ptx, argc, argv);

    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = (float *) malloc(mem_size_C);
    cuMemAlloc(&d_A, mem_size_A);
    cuMemAlloc(&d_B, mem_size_B);
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    printf("Computing result using CUDA Kernel...\n");

    CUfunction kernel_addr;
    if (block_size == 16)
        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16");
    else
        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32");

    void *arr[] = { (void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x };

    // Execute the kernel
    int nIter = 300;
    for (int j = 0; j < nIter; j++)
    {
        cuLaunchKernel(kernel_addr,
            grid.x, grid.y, grid.z,
            threads.x, threads.y, threads.z,
            0, 0, &arr[0], 0);
        cuCtxSynchronize();
    }
    cuMemcpyDtoH(h_C, d_C, mem_size_C);

    printf("Checking computed result for correctness: ");
    bool correct = true;
    double eps = 1.e-6;
    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    if (correct)
        return EXIT_SUCCESS;
    else
        return EXIT_FAILURE;
}

int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
    {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
        exit(EXIT_SUCCESS);
    }

    int block_size = 32;
    dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
    dim3 dimsB(5*4*block_size, 5*2*block_size, 1);

    if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    if (dimsA.x != dimsB.y)
    {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

    getchar();
    exit(matrix_result);
}
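compileFileToPTX and loadPTX come from the sample's nvrtc_helper.h. Roughly, they wrap the NVRTC and driver APIs as sketched below (a hedged reading of the helpers, not their exact code; error checking is omitted, and kernelSource is assumed to hold the contents of matrixMul_kernel.cu):

#include <nvrtc.h>
#include <cuda.h>

// Compile CUDA source held in a string to PTX...
nvrtcProgram prog;
nvrtcCreateProgram(&prog, kernelSource, "matrixMul_kernel.cu", 0, NULL, NULL);
nvrtcCompileProgram(prog, 0, NULL);   // options could pin an arch, e.g. --gpu-architecture=...

size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
char *ptx = new char[ptxSize];
nvrtcGetPTX(prog, ptx);
nvrtcDestroyProgram(&prog);

// ...then load it with the driver API.
CUdevice cuDevice;
CUcontext cuContext;
CUmodule module;
cuInit(0);
cuDeviceGet(&cuDevice, 0);
cuCtxCreate(&cuContext, 0, cuDevice);
cuModuleLoadDataEx(&module, ptx, 0, NULL, NULL);
// cuModuleGetFunction / cuLaunchKernel then proceed as in matrixMul.cpp above.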

▶ Output:

[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

MatrixA(320,320), MatrixB(640,320)
Computing result using CUDA Kernel...
done
Performance= 22.95 GFlop/s, Time= 5.712 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ Takeaways:

● The sample is poorly written: declarations and initialization are jumbled together throughout.

● A function that returns a textual description for a given CUDA error code:

extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
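A minimal sketch of how this is typically used; the CHECK_CUDA macro below is our own illustration, not part of the CUDA toolkit:

// Wrap CUDA runtime calls and print a readable message on failure.
// CHECK_CUDA is an illustrative name; cudaGetErrorString is the real call shown above.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",           \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage: CHECK_CUDA(cudaMalloc((void **)&d_A, mem_size_A));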

● Unrolling loops with a compiler directive

#pragma unroll
for (i = 0; i < m; i++)
    c[i] = a[i] + b[i];

is equivalent to

c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
...
c[m-1] = a[m-1] + b[m-1];

#pragma unroll may be followed by an integer giving the unroll factor, e.g. #pragma unroll 4; a sketch follows.
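An illustrative kernel (names are ours) where each thread adds four consecutive elements and the numeric form asks the compiler to unroll the loop by a factor of 4:

// Each thread handles 4 consecutive elements; the trip count is a
// compile-time constant, so the loop unrolls completely.
__global__ void vecAdd4(float *c, const float *a, const float *b, int n)
{
    int base = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
#pragma unroll 4
    for (int k = 0; k < 4; ++k)
        if (base + k < n)
            c[base + k] = a[base + k] + b[base + k];
}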

● Generic (template) kernels. A compile-time constant can be passed as a template argument at the launch site, a workaround that lets shared-memory array sizes vary per call:

template <int BLOCK_SIZE> __global__ void functionName(void)
{
    __shared__ int shareArray[BLOCK_SIZE];
    ...
}

functionName<16><<< blocksize, threadsize >>>();
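For sizes known only at run time, CUDA also provides dynamic shared memory via extern __shared__ plus the third launch-configuration argument; a minimal sketch (kernel name is illustrative):

__global__ void dynamicSharedKernel(int n)
{
    extern __shared__ int shareArray[];   // size fixed at launch, not at compile time
    if (threadIdx.x < n)
        shareArray[threadIdx.x] = threadIdx.x;
    __syncthreads();
    // ... use shareArray ...
}

// Launch with room for n ints of dynamic shared memory:
// dynamicSharedKernel<<< blocks, threadsPerBlock, n * sizeof(int) >>>(n);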

● Warm-up: run the computation once before the timed repetitions. This primes the caches and noticeably reduces the variance of the measured runtime.
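The measurement pattern from matrixMultiply above, reduced to its skeleton (kernel, grid, threads, and nIter are placeholders):

kernel<<< grid, threads >>>(/* args */);   // warm-up launch; result discarded
cudaDeviceSynchronize();                   // make sure the warm-up finished

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, NULL);
for (int j = 0; j < nIter; j++)            // timed region
    kernel<<< grid, threads >>>(/* args */);
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);                // wait for the last launch

float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);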

Original post (in Chinese): https://www.cnblogs.com/cuancuancuanhao/p/7745509.html