cuda(2) --- Square Matrix Multiplication

  In GPU-accelerated computing, matrix multiplication is an unavoidable topic, so I am starting my CUDA learning with the simplest case, square matrix multiplication, and have written a small CUDA kernel for it.

  I won't restate how matrix multiplication works here; anyone who has studied linear algebra should already be familiar with it. Let's go straight to the code:
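As a one-line reminder of what both the GPU kernel and the CPU loop below compute, each element of the product $P = MN$ of two width×width matrices is

$$P_{ij} = \sum_{k=0}^{\mathrm{width}-1} M_{ik}\,N_{kj}.$$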

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <time.h>

typedef float DataType;

#define WIDTH 32
#define Matrix_h 2
#define Matrix_w 2

#define Arr_Size(arr)  (sizeof(arr)/sizeof(arr[0][0]))

__global__ void MatrixMulKernel(DataType *M, DataType *N, DataType *P, int width) {
    //calculate the row index of the P element and M
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    //calculate the col index of the P element and N
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    if (row < width && col < width) {
        DataType pvalue = 0;
        //each thread computes one element of the output matrix
        for (int k = 0; k < width; ++k) {
            pvalue += M[row*width + k] * N[k*width + col];
        }
        P[row*width + col] = pvalue;
    }
}

void MatrixMulCPU(DataType M[][Matrix_h], DataType N[][Matrix_h], DataType P[][Matrix_h], int width) {

    for (int i = 0; i < width; ++i) {
        for (int j = 0; j < width; ++j) {
            DataType pvalue = 0;
            for (int k = 0; k < width; ++k) {
                pvalue += M[i][k] * N[k][j];
            }
            P[i][j] = pvalue;
        }
    }
}


int main(void) {

    DataType A[Matrix_w][Matrix_h] = {
        {1,2},
        {3,4}
    };

    DataType B[Matrix_w][Matrix_h] = {
        {5,6},
        {7,8}
    };
    DataType C[Matrix_w][Matrix_h] = { 0 };

    DataType *d_A, *d_B, *d_C;

    //allocate device buffers and copy the input matrices to the GPU
    cudaMalloc((void**)&d_A, sizeof(DataType)*Arr_Size(A));
    cudaMalloc((void**)&d_B, sizeof(DataType)*Arr_Size(B));
    cudaMalloc((void**)&d_C, sizeof(DataType)*Arr_Size(C));
    cudaMemcpy(d_A, A, sizeof(DataType)*Arr_Size(A), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, sizeof(DataType)*Arr_Size(B), cudaMemcpyHostToDevice);

    dim3 Block(WIDTH, WIDTH, 1);
    dim3 Grid((Matrix_w - 1) / WIDTH + 1, (Matrix_h - 1) / WIDTH + 1, 1);

    clock_t time_start, time_end;
    time_start = clock();
    MatrixMulKernel<<<Grid, Block>>>(d_A, d_B, d_C, Matrix_w);
    //wait for the kernel to finish before stopping the timer
    cudaDeviceSynchronize();
    time_end = clock();
    cudaMemcpy(C, d_C, sizeof(DataType)*Arr_Size(C), cudaMemcpyDeviceToHost);
    std::cout << "GPU time=" << (double)(time_end - time_start) / CLOCKS_PER_SEC * 1000 << "ms" << std::endl;
    std::cout << "GPU_result:" << std::endl;
    for (int i = 0; i < Matrix_w; ++i) {
        for (int j = 0; j < Matrix_h; ++j) {
            printf("C[%d][%d]=%f ", i, j, C[i][j]);
        }
        printf("\n");
    }

    time_start = clock();
    MatrixMulCPU(A, B, C, Matrix_w);
    time_end = clock();

    std::cout << "CPU time=" << (double)(time_end - time_start) / CLOCKS_PER_SEC * 1000 << "ms" << std::endl;
    std::cout << "CPU_result:" << std::endl;
    for (int i = 0; i < Matrix_w; ++i) {
        for (int j = 0; j < Matrix_h; ++j) {
            printf("C[%d][%d]=%f ", i, j, C[i][j]);
        }
        printf("\n");
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
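One thing the listing leaves out is error checking: cudaMalloc, cudaMemcpy and the kernel launch can all fail silently. Below is a minimal sketch of a checking helper; the macro name CUDA_CHECK is my own choice, not something from the original post.

#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>

//evaluate a CUDA runtime call; on failure print the error string and abort
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err = (call);                                           \
        if (err != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

//usage (hypothetical), wrapping the calls from the listing above:
//    CUDA_CHECK(cudaMalloc((void**)&d_A, sizeof(DataType)*Arr_Size(A)));
//    MatrixMulKernel<<<Grid, Block>>>(d_A, d_B, d_C, Matrix_w);
//    CUDA_CHECK(cudaGetLastError());        //catches launch-configuration errors
//    CUDA_CHECK(cudaDeviceSynchronize());   //catches errors raised while the kernel runs

Assuming the source is saved as matmul.cu, it can be built with something like nvcc matmul.cu -o matmul.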

  The results of a run are shown in the figure below:
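For comparison, the expected product of the two 2×2 input matrices defined in main, worked out by hand, is

$$
C = AB
  = \begin{pmatrix}1 & 2\\ 3 & 4\end{pmatrix}\begin{pmatrix}5 & 6\\ 7 & 8\end{pmatrix}
  = \begin{pmatrix}1\cdot 5+2\cdot 7 & 1\cdot 6+2\cdot 8\\ 3\cdot 5+4\cdot 7 & 3\cdot 6+4\cdot 8\end{pmatrix}
  = \begin{pmatrix}19 & 22\\ 43 & 50\end{pmatrix},
$$

so both the GPU and the CPU printouts should show these four values. For a matrix this small the CPU will usually come out faster, since the GPU measurement is dominated by kernel-launch and synchronization overhead.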

Original article: https://www.cnblogs.com/xuelanga000/p/12490233.html