0_Simple__matrixMulCUBLAS

使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则，关于乘法函数中详细的参数说明和调用规则见另一篇随笔。

▶ 源代码：

  1 #include <assert.h>
  2 #include <helper_string.h>
  3 #include <cuda_runtime.h>
  4 #include <cublas_v2.h>
  5 #include <helper_functions.h>
  6 #include <helper_cuda.h>
  7 
  8 #ifndef min
  9 #define min(a,b) ((a < b) ? a : b)
 10 #endif
 11 #ifndef max
 12 #define max(a,b) ((a > b) ? a : b)
 13 #endif
 14 
 15 // 存放各矩阵维数的结构体
 16 typedef struct _matrixSize
 17 {
 18     unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
 19 } sMatrixSize;
 20 
 21 // CPU 计算矩阵乘法。三个参数分别用于行定位、行程定位、列定位，没有查错机制。
 22 void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 23 {
 24     for (unsigned int i = 0; i < hA; ++i)           // 从上往下数 i 行
 25     {
 26         for (unsigned int j = 0; j < wB; ++j)       // 从左往右数 j 列
 27         {
 28             double sum = 0;
 29             for (unsigned int k = 0; k < wA; ++k)   // 行程长
 30             {
 31                 double a = A[i * wA + k];           // 中间过程用 double，结果输出 float
 32                 double b = B[k * wB + j];
 33                 sum += a * b;
 34             }
 35             C[i * wB + j] = (float)sum;
 36         }
 37     }
 38 }
 39 
 40 // 初始化数组
 41 void randomInit(float *data, int size)
 42 {
 43     for (int i = 0; i < size; ++i)
 44         data[i] = rand() / (float)RAND_MAX;
 45 }
 46 
 47 //输出两个矩阵的不相等的值及其位置，允许容差为 fListTol ，最多输出 iListLength 个
 48 void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
 49 {
 50     printf("Listing first %d Differences > %.6f...
", iListLength, fListTol);
 51     int i, j, k;
 52     int error_count = 0;
 53 
 54     for (j = 0; j < height; j++)
 55     {
 56         if (error_count < iListLength)
 57             printf("
  Row %d:
", j);
 58 
 59         for (i = 0; i < width; i++)
 60         {
 61             k = j * width + i;
 62             float fDiff = fabs(data1[k] - data2[k]);
 63             if (fDiff > fListTol)
 64             {
 65                 if (error_count < iListLength)
 66                     printf("    Loc(%d,%d)	CPU=%.5f	GPU=%.5f	Diff=%.6f
", i, j, data1[k], data2[k], fDiff);
 67                 error_count++;
 68             }
 69         }
 70     }
 71     printf(" 
  Total Errors = %d
", error_count);
 72 }
 73 
 74 // 初始化设备。包括选择设备，设定维数
 75 void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
 76 {
 77     cudaError_t error;
 78 
 79     // 选择设备，略去了检查错误部分
 80     devID = 0;
 81     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
 82     {
 83         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
 84         error = cudaSetDevice(devID);
 85     }
 86     error = cudaGetDevice(&devID);
 87     if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
 88         iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
 89     iSizeMultiple = max(min(iSizeMultiple, 10), 1);
 90     cudaDeviceProp deviceProp;
 91     error = cudaGetDeviceProperties(&deviceProp, devID);
 92     printf("GPU Device %d: "%s" with compute capability %d.%d

", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
 93 
 94     // Fermi 以上构架的计算机使用更大的线程块（blockDim），这里用了32
 95     int block_size = (deviceProp.major < 2) ? 16 : 32;
 96 
 97     matrix_size.uiWA = 3 * block_size * iSizeMultiple;
 98     matrix_size.uiHA = 4 * block_size * iSizeMultiple;
 99     matrix_size.uiWB = 2 * block_size * iSizeMultiple;
100     matrix_size.uiHB = 3 * block_size * iSizeMultiple;
101     matrix_size.uiWC = 2 * block_size * iSizeMultiple;
102     matrix_size.uiHC = 4 * block_size * iSizeMultiple;
103 
104     printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)
",                                                       
105         matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC);
106     if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC)
107     {
108        printf("ERROR: Matrix sizes do not match!
");
109        exit(-1);
110     }
111 }
112 
113 // 计算矩阵乘法部分
114 int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
115 {
116     cudaDeviceProp deviceProp;
117     cudaGetDeviceProperties(&deviceProp, devID);
118     int block_size = (deviceProp.major < 2) ? 16 : 32;
119 
120     unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
121     unsigned int mem_size_A = sizeof(float) * size_A;
122     float *h_A = (float *)malloc(mem_size_A);
123     unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
124     unsigned int mem_size_B = sizeof(float) * size_B;
125     float *h_B = (float *)malloc(mem_size_B);
126 
127     srand(2006);
128     randomInit(h_A, size_A);
129     randomInit(h_B, size_B);
130 
131     float *d_A, *d_B, *d_C;
132     unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
133     unsigned int mem_size_C = sizeof(float) * size_C;
134     //float *h_C      = (float *) malloc(mem_size_C);           // TM 没有用！
135     float *h_CUBLAS = (float *) malloc(mem_size_C);             // 保存 d_C 回传的结果
136     cudaMalloc((void **) &d_A, mem_size_A);
137     cudaMalloc((void **) &d_B, mem_size_B);
138     cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
139     cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
140     cudaMalloc((void **) &d_C, mem_size_C);
141 
142     dim3 threads(block_size, block_size);
143     dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
144 
145     printf("Computing result using CUBLAS...");
146     int nIter = 30;
147 
148     //cuBLAS代码块
149     {
150         const float alpha = 1.0f;
151         const float beta  = 0.0f;
152         cublasHandle_t handle;
153         cudaEvent_t start, stop;
154 
155         cublasCreate(&handle);
156 
157         //热身，注意转置的问题，不采用 <<< >>> 调用核函数
158         cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 
159             &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
160 
161         cudaEventCreate(&start);
162         cudaEventCreate(&stop);
163 
164         cudaEventRecord(start, NULL);
165 
166         for (int j = 0; j < nIter; j++)
167         {
168             cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
169                 &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
170         }
171         printf("done.
");
172         cudaEventRecord(stop, NULL);
173         cudaEventSynchronize(stop);
174         float msecTotal = 0.0f;
175         cudaEventElapsedTime(&msecTotal, start, stop);
176 
177         // 计算了耗时、操作数以及操作速度
178         float msecPerMatrixMul = msecTotal / nIter;
179         double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
180         double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
181         printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops
", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);
182 
183         cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
184         cublasDestroy(handle);
185     }
186 
187     printf("Computing result using host CPU...");
188     float *reference = (float *)malloc(mem_size_C);
189     matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
190     printf("done.
");
191 
192     bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
193 
194     if (resCUBLAS != true)
195         printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
196 
197     printf("Comparing CUBLAS Matrix Multiply with CPU results: %s
", (true == resCUBLAS) ? "PASS" : "FAIL");
198     printf("
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
");
199 
200     free(h_A);
201     free(h_B);
202     free(h_CUBLAS);
203     free(reference);
204     cudaFree(d_A);
205     cudaFree(d_B);
206     cudaFree(d_C);
207 
208     if (resCUBLAS == true)
209         return EXIT_SUCCESS;
210     else
211         return EXIT_FAILURE;
212 }
213 
214 int main(int argc, char **argv)
215 {
216     printf("[Matrix Multiply CUBLAS] - Starting...
");
217 
218     int devID = 0, sizeMult = 5;
219     sMatrixSize matrix_size;
220 
221     initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
222 
223     int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
224 
225     getchar();
226     return matrix_result;
227 }

▶ 输出结果：

[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

MatrixA(640,480), MatrixB(480,320), MatrixC(640,320)
Computing result using CUBLAS...done.
Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= 196608000 Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势：

● 代码依然很烂。
多用了一个 h_C 根本没有用上，其作用被 h_CUBLAS 取代了，而且源代码中有free(h_C)却没有free(h_CUBLAS)。

1 float *h_C = (float *)malloc(mem_size_C);
2 ...
3 free(h_C);

● 句柄的创造与销毁，定义于cublas_api.h 中

1 /*cublas_api.h*/
2 typedef struct cublasContext *cublasHandle_t;
3 
4 /*cublas_v2.h*/
5 #define cublasCreate cublasCreate_v2
6 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle);
7  
8 #define cublasDestroy cublasDestroy_v2
9 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);

● 矩阵乘法计算核心函数。实际上该函数不是用来专门计算矩阵乘法的，而且对应不同的数据类型（实数、复数）和数据精度（单精度、双精度）一共有四个函数。

 1 #define cublasSgemm cublasSgemm_v2
 2 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2
 3 (
 4     cublasHandle_t handle,
 5     cublasOperation_t transa, cublasOperation_t transb,
 6     int m, int n, int k,
 7     const float *alpha,
 8     const float *A, int lda,
 9     const float *B, int ldb,
10     const float *beta,
11     float *C, int ldc
12 );

● 定义在 helper_image.h 中的一个函数，用于比较两个长为 len 数组是否相等，允许容差为epsilon

1 inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)

● 调用cublas计算矩阵乘法的过程摘要（详细的参数说明和调用规则见另一篇随笔）

 1 ...// 准备d_A，d_B，d_C，其中 d_A 和 d_B 中存放了需要相乘的两个矩阵，d_C初始化自动为零矩阵
 2 
 3 // 规定使用的线程块和线程尺寸
 4 dim3 threads(1, 1);
 5 dim3 grid(1, 1);
 6 
 7 // 常数因子，计算 d_C = d_A * d_B 时设定为 α = 1.0, β = 0.0
 8 const float alpha = 1.0f;
 9 const float beta = 0.0f;
10 
11 // 创建句柄，需要在计算完成后销毁
12 cublasHandle_t handle;
13 cublasCreate(&handle);
14 
15 // 调用计算函数，注意参数顺序
16 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB);
17 
18 cublasDestroy(handle);
19 
20 ...// 回收计算结果，顺序可以和销毁句柄交换