CUDA in Practice 2

1. clock() is a timing function in C/C++, and the associated data type is clock_t. With clock() you can measure how long a piece of code takes to run. The program below measures the time needed to decrement 10000000 one by one down to 0.

Note: the measured time may vary from run to run.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>

int main()
{
    // Demonstrate the use of clock_t
    clock_t start, end;
    long n = 10000000L;
    double duration;
    printf("Time needed to decrement %ld down to 0: ", n);
    // Start time
    start = clock();
    // Decrement loop
    while (n--);
    // End time
    end = clock();
    // Elapsed time = end time minus start time.
    // CLOCKS_PER_SEC is a constant defined in <time.h>;
    // it is the number of clock ticks per second.
    duration = (double)(end - start) / CLOCKS_PER_SEC;
    printf("%f seconds\n", duration);
    return 0;
}

 

 

2.

CLOCKS_PER_SEC gives the number of clock ticks in one second. On this platform (MSVC) it is defined as:
#define CLOCKS_PER_SEC ((clock_t)1000)
so the value returned by clock() increases by 1 every thousandth of a second (1 millisecond); on other platforms CLOCKS_PER_SEC may have a different value.
The formula clock() / CLOCKS_PER_SEC can be used to compute the running time of the process itself in seconds.
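Since cudaEventElapsedTime() in the programs below reports elapsed time in milliseconds, it is convenient to convert the clock() difference to milliseconds as well. A minimal sketch of that conversion (it does not depend on the concrete value of CLOCKS_PER_SEC):

#include <stdio.h>
#include <time.h>

int main(void)
{
    clock_t start = clock();
    /* ... code to be timed ... */
    clock_t end = clock();
    // Convert clock ticks to milliseconds, valid for any CLOCKS_PER_SEC
    double elapsed_ms = 1000.0 * (double)(end - start) / CLOCKS_PER_SEC;
    printf("elapsed: %f ms\n", elapsed_ms);
    return 0;
}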
 

Vector summation

Suppose we have two arrays of data and we want to add the corresponding elements pairwise, storing the results in a third array.

// CUDA headers
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// C headers
#include <stdio.h>
#include <time.h>

#define N 6000
#define thread_num 1024

// GPU kernel declaration
__global__ void add(int *a, int *b, int *c);
// CPU function declaration
void add_CPU(int *a, int *b, int *c);

int main()
{
    // Event-based (GPU) timing
    float time_GPU;
    cudaEvent_t start_GPU, stop_GPU;
    // clock()-based (host) timing
    float time_cpu, time_gpu;
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    int a[N], b[N], c[N], c_CPU[N];
    int *dev_a, *dev_b, *dev_c;

    int block_num;
    block_num = (N + thread_num - 1) / thread_num;

    // Allocate memory on the GPU
    cudaMalloc((void**)&dev_a, N * sizeof(int));
    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));

    // Fill the input arrays on the CPU
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    // CPU computation, timed with clock()
    start_cpu = clock();
    add_CPU(a, b, c_CPU);
    stop_cpu = clock();
    printf("The time for CPU:\t%f(ms)\n", (float)(stop_cpu - start_cpu) * 1000 / CLOCKS_PER_SEC);

    // Print the CPU results
    printf("\nResult from CPU:\n");
    for (int i = 0; i < N; i++)
    {
        printf("CPU:\t%d+%d=%d\n", a[i], b[i], c_CPU[i]);
    }

    // GPU computation: copy the inputs to the device
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // Create the events
    cudaEventCreate(&start_GPU);
    cudaEventCreate(&stop_GPU);

    // Record the start time
    cudaEventRecord(start_GPU, 0);
    start_gpu = clock();
    // Launch the kernel
    add<<<block_num, thread_num>>>(dev_a, dev_b, dev_c);
    // Kernel launches are asynchronous; wait for completion so the
    // host-side clock() measurement covers the whole kernel
    cudaDeviceSynchronize();
    stop_gpu = clock();
    // Record the stop time
    cudaEventRecord(stop_GPU, 0);
    cudaEventSynchronize(stop_GPU);
    cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU);
    printf("\nThe time from GPU:\t%f(ms)\n", time_GPU);

    // Copy the result from device to host
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the GPU results
    printf("\nResult from GPU:\n");
    for (int i = 0; i < N; i++)
    {
        printf("GPU:\t%d+%d=%d\n", a[i], b[i], c[i]);
    }
    cudaEventDestroy(start_GPU);
    cudaEventDestroy(stop_GPU);

    // Free device memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    printf("\nThe time for GPU by event:\t%f(ms)\n", time_GPU);

    time_cpu = (float)(stop_cpu - start_cpu) * 1000 / CLOCKS_PER_SEC;
    time_gpu = (float)(stop_gpu - start_gpu) * 1000 / CLOCKS_PER_SEC;
    printf("\nThe time for CPU by host:\t%f(ms)\n", time_cpu);
    printf("The time for GPU by host:\t%f(ms)\n", time_gpu);

    return 0;
}

// GPU kernel
__global__ void add(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x; // index of the element handled by this thread
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}

// CPU function
void add_CPU(int *a, int *b, int *c)
{
    for (int i = 0; i < N; i++)
    {
        c[i] = a[i] + b[i];
    }
}
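The program prints both result arrays but never compares them. A minimal check that could be added in main() before the final printfs (it reuses the arrays c and c_CPU from the listing above):

    // Compare the GPU result against the CPU reference
    int errors = 0;
    for (int i = 0; i < N; i++)
    {
        if (c[i] != c_CPU[i])
        {
            errors++;
        }
    }
    printf("Mismatched elements: %d\n", errors);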

3. Matrix multiplication in CUDA

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define M 1024
#define K 1024
#define N 1024

// Fill a row*col matrix with random values
void initial(double *list, int row, int col)
{
    double *num = list;
//    srand((unsigned)time(NULL));
    for (int i = 0; i < row * col; i++)
    {
        num[i] = rand() % 10;
    }
}

// CPU matrix multiplication: C (M x N) = A (M x K) * B (K x N)
void CpuMatrix(double *A, double *B, double *C)
{
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            double sum = 0;
            for (int k = 0; k < K; k++)
            {
                sum += A[i * K + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// GPU matrix multiplication: each thread computes one element of C
__global__ void GpuMatrix(double *dev_A, double *dev_B, double *dev_C)
{
    int ix = threadIdx.x + blockDim.x * blockIdx.x;   // column index
    int iy = threadIdx.y + blockDim.y * blockIdx.y;   // row index

    if (ix < N && iy < M)
    {
        double sum = 0;
        for (int k = 0; k < K; k++)
        {
            sum += dev_A[iy * K + k] * dev_B[k * N + ix];
        }
        dev_C[iy * N + ix] = sum;
    }
}

void printMatrix(double *list, int row, int col)
{
    double *p = list;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%10lf", p[j]);
        }
        p = p + col;
        printf("\n");
    }
}

int main(int argc, char **argv)
{
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    double time_cpu, time_gpu;

    float time_GPU;
    cudaEvent_t start_GPU, stop_GPU;

    int Axy = M * K;
    int Abytes = Axy * sizeof(double);

    int Bxy = K * N;
    int Bbytes = Bxy * sizeof(double);

    int nxy = M * N;
    int nbytes = nxy * sizeof(double);

    double *host_A, *host_B, *host_C, *c_CPU;
    host_A = (double*)malloc(Abytes);
    host_B = (double*)malloc(Bbytes);
    host_C = (double*)malloc(nbytes);   // result computed on the CPU
    c_CPU  = (double*)malloc(nbytes);   // result copied back from the GPU

    // Initialize the inputs
    initial(host_A, M, K);
    printf("A:(%d,%d):\n", M, K);
//    printMatrix(host_A, M, K);

    initial(host_B, K, N);
    printf("B:(%d,%d):\n", K, N);
//    printMatrix(host_B, K, N);

    // CPU computation, timed with clock()
    start_cpu = clock();
    CpuMatrix(host_A, host_B, host_C);
    stop_cpu = clock();

    printf("The time from CPU is \t%f(ms)\n", (float)(stop_cpu - start_cpu) * 1000 / CLOCKS_PER_SEC);
    printf("Host_C:(%d,%d):\n", M, N);
//    printMatrix(host_C, M, N);

    // GPU computation
    double *dev_A, *dev_B, *dev_C;
    cudaMalloc((void**)&dev_A, Abytes);
    cudaMalloc((void**)&dev_B, Bbytes);
    cudaMalloc((void**)&dev_C, nbytes);
    // A block may hold at most 1024 threads, so use 32x32 threads per block
    // and enough blocks to cover the M x N output matrix
    dim3 block(32, 32);
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);
    cudaMemcpy(dev_A, host_A, Abytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, host_B, Bbytes, cudaMemcpyHostToDevice);

    // Create the events
    cudaEventCreate(&start_GPU);
    cudaEventCreate(&stop_GPU);
    cudaEventRecord(start_GPU, 0);
    start_gpu = clock();

    GpuMatrix<<<grid, block>>>(dev_A, dev_B, dev_C);

    // Kernel launches are asynchronous; wait for completion so the
    // host-side clock() measurement covers the whole kernel
    cudaDeviceSynchronize();
    stop_gpu = clock();
    cudaEventRecord(stop_GPU, 0);
    cudaEventSynchronize(stop_GPU);
    // Elapsed time between the two events
    cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU);
    printf("The time from GPU is \t%f(ms)\n", time_GPU);
    // Destroy the events
    cudaEventDestroy(start_GPU);
    cudaEventDestroy(stop_GPU);

    cudaMemcpy(c_CPU, dev_C, nbytes, cudaMemcpyDeviceToHost);
    printf("device_C:(%d,%d):\n", M, N);
//    printMatrix(c_CPU, M, N);

    // Free memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    free(host_A);
    free(host_B);
    free(host_C);
    free(c_CPU);

    time_cpu = (float)(stop_cpu - start_cpu) * 1000 / CLOCKS_PER_SEC;
    time_gpu = (float)(stop_gpu - start_gpu) * 1000 / CLOCKS_PER_SEC;
    printf("\nThe time for CPU by host:\t%f(ms)\n", time_cpu);
    printf("The time for GPU by host:\t%f(ms)\n", time_gpu);
    return 0;
}
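A kernel launch can fail silently, for example when the requested block size exceeds the 1024-threads-per-block limit, and the program above never checks for that. A minimal sketch of a launch-error check one might add right after the GpuMatrix<<<grid,block>>> call (cudaGetLastError() and cudaGetErrorString() are standard CUDA runtime calls):

    // Check whether the kernel launch itself reported an error
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
        return -1;
    }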

 
 
Original article: https://www.cnblogs.com/lin1216/p/12677841.html