CUDA Practice Exercises (12): Matrix Multiplication

This example multiplies two MATRIX_SIZE x MATRIX_SIZE matrices of random floats on the GPU. Each thread computes one element of the result, deriving its row and column from a flat 1-D thread index, and the kernel's execution time is measured with CUDA events.

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdlib.h>
#include <time.h>

#define THREAD_NUM 256
#define MATRIX_SIZE 1000

// Enough blocks to cover all MATRIX_SIZE * MATRIX_SIZE output elements (rounded up)
const int blocks_num = (MATRIX_SIZE * MATRIX_SIZE + THREAD_NUM - 1) / THREAD_NUM;
void matgen(float *a, int n);

// Fill an n x n matrix with random values
void matgen(float *a, int n){
    int i, j;

    for (i = 0; i < n; i++){
        for (j = 0; j < n; j++){
            a[i * n + j] = (float)rand();
            //printf("%f ", a[i*n + j]);
        }
    }
}

// __global__ kernel: compute the matrix product in parallel
__global__ static void matMult(const float *a, const float *b, float *c, int n){
    // Index of this thread within its block (counted from 0)
    const int tid = threadIdx.x;

    // Index of the block this thread belongs to (counted from 0)
    const int bid = blockIdx.x;

    // Derive the row and column this thread is responsible for from bid and tid
    const int idx = bid * THREAD_NUM + tid;
    const int row = idx / n;
    const int column = idx % n;

    // Compute one element of the product
    if (row < n && column < n){
        float t = 0;
        for (int i = 0; i < n; i++){
            t += a[row * n + i] * b[i * n + column];
        }
        c[row * n + column] = t;
    }
}

int main(void){
    cudaEvent_t stop, start;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Host matrices
    float *a, *b, *c;
    int n = MATRIX_SIZE;
    float elapsedTime = 0;

    // Allocate host memory
    a = (float *)malloc(sizeof(float) * n * n);
    b = (float *)malloc(sizeof(float) * n * n);
    c = (float *)malloc(sizeof(float) * n * n);

    // Seed the random number generator
    srand(0);

    // Generate random input matrices
    matgen(a, n);
    matgen(b, n);

    // Allocate GPU memory and copy the inputs to the device
    float *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, sizeof(float) * n * n);
    cudaMalloc((void**)&d_b, sizeof(float) * n * n);
    cudaMalloc((void**)&d_c, sizeof(float) * n * n);
    cudaMemcpy(d_a, a, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * n * n, cudaMemcpyHostToDevice);

    // Time the kernel with CUDA events
    cudaEventRecord(start, 0);
    matMult<<<blocks_num, THREAD_NUM, 0>>>(d_a, d_b, d_c, n);
    cudaDeviceSynchronize();   // cudaThreadSynchronize() is deprecated
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("%f\n", elapsedTime);   // elapsed time in milliseconds

    // Copy the result back to the host
    cudaMemcpy(c, d_c, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < 100; i++){
        printf("%f ", c[i]);
    }*/

    // Release device memory, host memory, and the timing events
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(a);
    free(b);
    free(c);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}
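To sanity-check the kernel, the device result can be compared against a plain CPU implementation. The following is a minimal sketch and not part of the original post; the names matMultCPU and cpu_c are introduced here for illustration, and the code assumes the same row-major layout used by the kernel.

// CPU reference implementation (sketch), same row-major layout as matMult
void matMultCPU(const float *a, const float *b, float *c, int n){
    for (int row = 0; row < n; row++){
        for (int column = 0; column < n; column++){
            float t = 0;
            for (int i = 0; i < n; i++){
                t += a[row * n + i] * b[i * n + column];
            }
            c[row * n + column] = t;
        }
    }
}

/* Example usage in main(), after copying d_c back into c:
   float *cpu_c = (float *)malloc(sizeof(float) * n * n);
   matMultCPU(a, b, cpu_c, n);
   for (int i = 0; i < 10; i++){
       printf("gpu %f  cpu %f\n", c[i], cpu_c[i]);
   }
   free(cpu_c);
*/

Because the inputs are raw rand() values, the accumulated sums are large, and the single-precision results will differ slightly between CPU and GPU due to summation order, so only an approximate comparison is meaningful.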
Original article: https://www.cnblogs.com/zhangshuwen/p/7423164.html
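The CUDA API calls in the program above are never checked for errors. As a sketch of one common approach (the CHECK_CUDA macro name is introduced here and is not from the original code), each call's cudaError_t return value can be tested and reported with cudaGetErrorString; calling cudaGetLastError() right after the launch catches configuration errors, and errors raised by the kernel itself surface at the next synchronization.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Hypothetical error-checking macro (sketch): print the error string and abort
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* Example usage:
   CHECK_CUDA(cudaMalloc((void**)&d_a, sizeof(float) * n * n));
   matMult<<<blocks_num, THREAD_NUM, 0>>>(d_a, d_b, d_c, n);
   CHECK_CUDA(cudaGetLastError());        // launch configuration errors
   CHECK_CUDA(cudaDeviceSynchronize());   // errors from the kernel itself
*/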