CUDA实例练习（八）：原子操作（直方图）

直方图概念：给定一个包含一组元素的数据集，直方图表示每个元素的出现频率。

一、在CPU上计算直方图

 1 #include "book.h"
 2 #include <stdio.h>
 3 #include <cuda_runtime.h>
 4 #include <device_launch_parameters.h>
 5 #include <time.h>
 6 
 7 #define SIZE    (100*1024*1024)
 8 
 9 int main(void) {
10     unsigned char *buffer =
11         (unsigned char*)big_random_block(SIZE);
12 
13     // capture the start time
14     clock_t         start, stop;
15     start = clock();
16 
17     unsigned int    histo[256];
18     for (int i = 0; i<256; i++)
19         histo[i] = 0;
20 
21     for (int i = 0; i < SIZE; i++)
22         histo[buffer[i]]++;
23     stop = clock();
24     float   elapsedTime = (float)(stop - start) /
25         (float)CLOCKS_PER_SEC * 1000.0f;
26     printf("Time to generate:  %3.1f ms
", elapsedTime);
27 
28     long histoCount = 0;
29     for (int i = 0; i<256; i++) {
30         histoCount += histo[i];
31     }
32     
33     printf("Histogram Sum:  %ld
", histoCount);
34 
35     free(buffer);
36     return 0;
37 }

二、在GPU上使用全局内存原子操作计算直方图

 1 #include <stdio.h>
 2 #include <cuda_runtime.h>
 3 #include <device_launch_parameters.h>
 4 #include "book.h"
 5 #include "gpu_anim.h"
 6 #define SIZE (100*1024*1024)
 7 
 8 __global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
 9     int i = threadIdx.x + blockIdx.x * blockDim.x;
10     int stride = blockDim.x * gridDim.x;
11     while (i < size){
12         atomicAdd(&histo[buffer[i]], 1);
13         i += stride;
14     }
15 }
/*
 * Host driver for the global-memory-atomics histogram.
 *
 * Copies a SIZE-byte buffer from big_random_block() (book.h) to the device,
 * runs histo_kernel, copies the 256 bins back, prints the elapsed time
 * (device allocation and copies included) and the sum of all bins, then
 * verifies the bins against a recount on the host.
 *
 * Returns 0; CUDA failures are handled by the HANDLE_ERROR macro from book.h.
 */
int main(void) {
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

    /* Create the timing events and record the start point */
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Allocate device memory for the input data and the histogram bins
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    unsigned int histo[256];
    HANDLE_ERROR(cudaMalloc((void**)&dev_buffer, SIZE));
    HANDLE_ERROR(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc((void**)&dev_histo, sizeof(histo)));
    HANDLE_ERROR(cudaMemset(dev_histo, 0, sizeof(histo)));

    // Launch two blocks per multiprocessor of device 0, 256 threads each
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
    // A kernel launch returns no status itself; pick up launch errors here
    HANDLE_ERROR(cudaGetLastError());

    HANDLE_ERROR(cudaMemcpy(histo, dev_histo, sizeof(histo), cudaMemcpyDeviceToHost));

    // Record the stop event and report the elapsed time
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    long histoCount = 0;
    for (int i = 0; i < 256; i++) {
        histoCount += histo[i];
    }
    // histoCount is a long, so the conversion must be %ld
    printf("Histogram Sum: %ld\n", histoCount);

    // Verify against the CPU: subtracting one per input byte must bring
    // every bin back to exactly zero
    for (int i = 0; i < SIZE; i++)
        histo[buffer[i]]--;
    for (int i = 0; i < 256; i++) {
        if (histo[i] != 0)
            printf("Failure at %d!\n", i);
    }

    // Release the CUDA events, the device memory and the host buffer
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFree(dev_histo));
    HANDLE_ERROR(cudaFree(dev_buffer));
    free(buffer);
    return 0;
}

在GPU上运行时间比在CPU上运行时间长，性能不理想。原因是成千上万个线程同时对全局内存中仅有的256个地址执行原子操作，竞争非常激烈，这些原子操作实际上被串行化执行。

三、在GPU上使用共享内存原子操作计算直方图

 1 #include <stdio.h>
 2 #include <cuda_runtime.h>
 3 #include <device_launch_parameters.h>
 4 #include "book.h"
 5 #include "gpu_anim.h"
 6 #define SIZE (100*1024*1024)
 7 
 8 __global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
 9     __shared__ unsigned int temp[256];
10     temp[threadIdx.x] = 0;
11     __syncthreads();
12     int i = threadIdx.x + blockIdx.x * blockDim.x;
13     int offset = blockDim.x *gridDim.x;
14     while (i<size){
15         atomicAdd(&temp[buffer[i]], 1);
16         i += offset;
17     }
18     __syncthreads();
19     atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
20 }
/*
 * Host driver for the shared-memory-atomics histogram.
 *
 * Copies a SIZE-byte buffer from big_random_block() (book.h) to the device,
 * runs histo_kernel, copies the 256 bins back, prints the elapsed time
 * (device allocation and copies included) and the sum of all bins, then
 * verifies the bins against a recount on the host.
 *
 * Returns 0; CUDA failures are handled by the HANDLE_ERROR macro from book.h.
 */
int main(void) {
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

    /* Create the timing events and record the start point */
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Allocate device memory for the input data and the histogram bins
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    unsigned int histo[256];
    HANDLE_ERROR(cudaMalloc((void**)&dev_buffer, SIZE));
    HANDLE_ERROR(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc((void**)&dev_histo, sizeof(histo)));
    HANDLE_ERROR(cudaMemset(dev_histo, 0, sizeof(histo)));

    // Launch two blocks per multiprocessor of device 0, 256 threads each
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
    // A kernel launch returns no status itself; pick up launch errors here
    HANDLE_ERROR(cudaGetLastError());

    HANDLE_ERROR(cudaMemcpy(histo, dev_histo, sizeof(histo), cudaMemcpyDeviceToHost));

    // Record the stop event and report the elapsed time
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    long histoCount = 0;
    for (int i = 0; i < 256; i++) {
        histoCount += histo[i];
    }
    // histoCount is a long, so the conversion must be %ld
    printf("Histogram Sum: %ld\n", histoCount);

    // Verify against the CPU: subtracting one per input byte must bring
    // every bin back to exactly zero
    for (int i = 0; i < SIZE; i++)
        histo[buffer[i]]--;
    for (int i = 0; i < 256; i++) {
        if (histo[i] != 0)
            printf("Failure at %d!\n", i);
    }

    // Release the CUDA events, the device memory and the host buffer
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFree(dev_histo));
    HANDLE_ERROR(cudaFree(dev_buffer));
    free(buffer);
    return 0;
}

运行时间缩短很多，性能提升明显。这是因为每个线程块先在共享内存中累加自己的局部直方图，原子操作的竞争只发生在同一线程块内的线程之间；最后每个线程块只需对全局内存执行256次原子加法，把局部结果合并到最终的直方图中。