0_Simple__asyncAPI

▶ CPU - GPU 异步操作

▶ 源代码

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>
 6 
 7 __global__ void increment_kernel(int *g_data, int inc_value)
 8 {
 9     int idx = blockIdx.x * blockDim.x + threadIdx.x;
10     g_data[idx] = g_data[idx] + inc_value;
11 }
12 
/**
 * Check that every element of a host array equals the reference value.
 *
 * Stops at the first mismatch and prints its index, the value found and the
 * expected value to stdout.
 *
 * @param data  host array to verify (only read)
 * @param n     number of elements to check; n <= 0 checks nothing
 * @param x     reference value every element must equal
 * @return true if all n elements equal x (also when n <= 0), false otherwise
 */
bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    }
    return true;
}
25 
/**
 * Demonstrate that GPU work queued on the default stream runs asynchronously
 * with respect to the CPU.
 *
 * The host enqueues an H2D copy, a kernel and a D2H copy between two events,
 * measures how long the enqueueing itself took on the CPU, then spins on
 * cudaEventQuery() counting iterations until the GPU reaches the stop event.
 *
 * @param argc  argument count, forwarded to findCudaDevice()
 * @param argv  argument vector, forwarded to findCudaDevice() for device selection
 * @return EXIT_SUCCESS if every element came back equal to `value`,
 *         EXIT_FAILURE otherwise (CUDA API failures exit via checkCudaErrors)
 */
int main(int argc, char *argv[])
{
    printf("Start.\n");
    // Pick the device from the command-line arguments (they may be empty).
    int devID = findCudaDevice(argc, (const char **)argv);
    cudaDeviceProp deviceProps;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    const int n = 16 * 1024 * 1024;
    const int nbytes = n * sizeof(int);
    const int value = 26;

    int *a = NULL, *d_a = NULL;
    // Host buffer comes from cudaMallocHost so it must be released with cudaFreeHost.
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    memset(a, 0, nbytes);
    // Fill the device buffer with 0xFF bytes; it is overwritten by the H2D copy below.
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    cudaEvent_t start, stop;                // GPU-side timing events
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;       // CPU-side timer
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    // The kernel has no bounds check: n must be an exact multiple of threads.x
    // (16M / 512 here) so that the grid covers the array exactly.
    dim3 threads = dim3(512, 1, 1);
    dim3 blocks = dim3(n / threads.x, 1, 1);

    // The GPU events are recorded inside the CPU-timed region, but every call
    // below only enqueues work, so the CPU timer measures submission cost only.
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    checkCudaErrors(cudaGetLastError());    // surfaces launch-configuration errors
    checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    checkCudaErrors(cudaEventRecord(stop, 0));
    sdkStopTimer(&timer);

    // Count how many times the CPU spins before the GPU reaches the stop event.
    unsigned long int counter = 0;
    cudaError_t status;
    while ((status = cudaEventQuery(stop)) == cudaErrorNotReady)
        counter++;
    // Anything other than cudaSuccess here is a real failure, not "finished".
    checkCudaErrors(status);

    // The stop event has completed, so the elapsed time can now be read.
    float gpu_time = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    const bool passed = correct_output(a, n, value);

    printf("time spent by GPU: %.2f\n", gpu_time);
    printf("time spent by CPU: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
    printf("\n\tFinish: %s.", passed ? "Pass" : "Fail");

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));
    sdkDeleteTimer(&timer);
    getchar();                              // keep the console window open until a key is pressed
    return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}

● 输出结果：

GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

CUDA device [GeForce GTX 1070]
time spent by GPU: 11.50
time spent by CPU: 0.05
CPU executed 3026 iterations while waiting for GPU to finish

        Finish!

▶ 新姿势：

● 调用主函数时的第0个参数作为程序名字符串，可以用于输出。

1 int main(int argc, char *argv[])
2 ...
3 printf("%s", argv[0]);

● 使用 cudaMallocHost 申请页锁定（pinned）主机内存（相当于不带附加 flag 的 cudaHostAlloc），这类内存才能让 cudaMemcpyAsync 真正异步执行；注意必须使用 cudaFreeHost 释放

1 int *a, nbytes = n * sizeof(int);
2 cudaMallocHost((void **)&a, nbytes);
3 ...
4 cudaFreeHost(a);

● 记录 CPU 调用 CUDA 所用的时间

1 StopWatchInterface *timer = NULL;
2 sdkCreateTimer(&timer);
3 sdkResetTimer(&timer);
4 sdkStartTimer(&timer);
5     
6 ...// 核函数调用
7     
8 sdkStopTimer(&timer);
9 printf("%.2f ms", sdkGetTimerValue(&timer));

● 查看GPU队列状态的函数

extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);

■ stop为放置到流中的一个事件，cudaEventQuery(stop)返回该事件的状态，等于cudaSuccess（值等于0）表示已经发生；等于cudaErrorNotReady表示尚未发生（该枚举的数值随 CUDA 版本而变：旧版本为34，CUDA 10.1 起为600，因此应始终用枚举名而不是数值进行比较）。源代码中利用这段时间让CPU空转，记录了迭代次数。

while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;

● stdlib.h 中关于返回成功和失败的宏

1 #define EXIT_SUCCESS 0
2 #define EXIT_FAILURE  1

● 示例文件中的错误检查函数（定义在helper_cuda.h中），出错时向 stderr 报告出错文件、行号、错误码及被调用的表达式，然后调用 cudaDeviceReset 重置设备并以 EXIT_FAILURE 退出程序。

 1 #define checkCudaErrors(val)  check((val), #val, __FILE__, __LINE__)
 2 
 3 template< typename T >
 4 void check(T result, char const *const func, const char *const file, int const line)
 5 {
 6     if (result)
 7     {
 8         fprintf(stderr, "CUDA error at %s:%d code=%d(%s) "%s" 
",
 9             file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
10         DEVICE_RESET// Make sure we call CUDA Device Reset before exiting
11         exit(EXIT_FAILURE);
12     }
13 }
14 
15 #define DEVICE_RESET  cudaDeviceReset();