0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc

介绍了线程束表决函数的实例（其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html），并在静态和运行时编译两种条件下进行使用。

▶ 源代码：静态

 1 // simpleVote_kernel.cuh
 2 #ifndef SIMPLEVOTE_KERNEL_CU
 3 #define SIMPLEVOTE_KERNEL_CU
 4 
 5 __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值
 6 {
 7     int tx = threadIdx.x;
 8     int mask = 0xffffffff;
 9     result[tx] = __any_sync(mask, input[tx]);
10 }
11 
12 __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值
13 {
14     int tx = threadIdx.x;
15     int mask = 0xffffffff;
16     result[tx] = __all_sync(mask, input[tx]);
17 }
18 
19 __global__ void vote3(bool *info, int warp_size)// 跨线程束检查
20 {
21     int tx = threadIdx.x;
22     unsigned int mask = 0xffffffff;
23     bool *offs = info + (tx * 3);// 将每个线程指向等距间隔的元素，表明表决函数的运算结果可以进行分发
24 
25     // 第一组 “下标模 3 得 0” 的元素为 0，第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素
26     *offs = __any_sync(mask, tx >= warp_size * 3 / 2);    
27     // 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0，第二组后半段和第三组 “下标模 3 得 1” 的元素为 1    
28     *(offs + 1) = (tx >= warp_size * 3 / 2)? true: false;
29     // 第一组和第二组 “下标模 3 得 2” 的元素为 0，第三组 “下标模 3 得 2” 的元素为 1         
30     *(offs + 2) = all(tx >= warp_size * 3 / 2) ? true : false;
31     // 最终结果应该是：
32     //   1   2   3   4      15  16  17  18      30  31  32
33     // 000 000 000 000 ... 000 000 000 000 ... 000 000 000 
34     // 100 100 100 100 ... 100 100 110 110 ... 110 110 110
35     // 111 111 111 111 ... 111 111 111 111 ... 111 111 111
36 }
37 #endif

  1 // simpleVoteIntrinsics.cu
  2 #include <stdio.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <helper_functions.h>
  6 #include <helper_cuda.h>
  7 #include "simpleVote_kernel.cuh"
  8 
  9 #define WARP_SIZE   32
 10 #define GROUP       4
 11 
 12 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组，size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}
 13 {
 14     for (int i = 0; i < size / 4; i++)
 15         VOTE_PATTERN[i] = 0x00000000; 
 16 
 17     for (int i = size / 4; i < size / 2; i++)
 18         VOTE_PATTERN[i] = (i & 0x01) ? i : 0; 
 19 
 20     for (int i = size / 2; i < 3 * size / 4; i++)
 21         VOTE_PATTERN[i] = (i & 0x01) ? 0 : i; 
 22 
 23     for (int i = 3 * size / 4; i < size; i++)
 24         VOTE_PATTERN[i] = 0xffffffff; 
 25 }
 26 // 数组检查函数，type == 1：把数组元素全部加起来，结果非零就报错；type == 0：把数组元素全部加起来，结果不等于 WARP_SIZE 就报错
 27 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
 28 {
 29     int i, sum;
 30     for (sum = 0, i = start; i < end; i++)
 31         sum += h_result[i];
 32     if (type&&sum > 0 || !type&& sum != WARP_SIZE)
 33     {
 34         printf("
	<%s>[%d - %d]:", name, start, end-1);
 35         for (i = start; i < end; i++)
 36             printf("%d,", h_result[i]);
 37         printf("
");
 38     }
 39     return type?(sum > 0):(sum != WARP_SIZE);
 40 }
 41 
 42 // 数组检查的中间函数，type == 1：使用(1,0,0,0)的模式调用数组检查函数；type == 0：使用(1,1,1,0)的模式调用数组检查函数
 43 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
 44 {
 45     int error_count = 0;
 46 
 47     error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, type?1:1,"Vote.Any");
 48     error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type?0:1,"Vote.Any");
 49     error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type?0:1,"Vote.Any");
 50     error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, type?0:0,"Vote.Any");
 51 
 52     printf("%s
", !error_count ? "Passed" : "Failed");
 53     return error_count;
 54 }
 55 int checkResultsVoteKernel(bool *hinfo, int totalThread)
 56 {
 57     int i, error_count;
 58     for (i = error_count = 0; i < totalThread * 3; i++)
 59     {
 60         switch (i % 3)
 61         {
 62             case 0:
 63                 if (hinfo[i] != (i >= totalThread * 1))     // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)
 64                     error_count++;
 65                 break;
 66             case 1:
 67                 if (hinfo[i] != (i >= totalThread * 3 / 2)) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)
 68                     error_count++;
 69                 break;
 70             case 2:
 71                 if (hinfo[i] != (i >= totalThread * 2))     // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)
 72                     error_count++;
 73                 break;
 74         }
 75     }
 76     printf("%s
", !error_count ? "Passed" : "Failed");
 77     return error_count;
 78 }
 79 
 80 int main()
 81 {
 82     printf("
	Start.
");
 83     int totalElement;
 84     unsigned int *h_input, *h_result;
 85     unsigned int *d_input, *d_result;
 86     bool *dinfo = NULL, *hinfo = NULL;
 87     int error_count[3] = { 0, 0, 0 };
 88     cudaSetDevice(0);
 89 
 90     //使用长度为 4 个线程束的数组，刚好分为 4 个组（全零，后交替非零，前交替非零，全非零）进行表决
 91     totalElement = WARP_SIZE * GROUP;
 92     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int)); 
 93     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
 94     cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int)); 
 95     cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
 96     genVoteTestPattern(h_input, totalElement); 
 97     cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice);
 98 
 99     //测试一，any
100     printf("
	Test 1: ");
101     voteAny << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result); 
102     cudaDeviceSynchronize();
103     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);    
104     error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);
105 
106     // 测试二，all
107     printf("
	Test 2: ");
108     voteAll << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result);
109     cudaDeviceSynchronize();
110     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
111     error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);
112 
113     // 测试三，使用长度为 9 个线程束的数组，但调用内核时只使用数量为 3 个线程束的线程，即分为 3 组，每组 WARP_SIZE * 3 个元素
114     printf("
	Test 3: ");
115     totalElement = WARP_SIZE * 3 * 3;
116     hinfo = (bool *)calloc(totalElement, sizeof(bool));
117     cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
118     cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
119     vote3 << <1, totalElement / 3 >> > (dinfo, WARP_SIZE); 
120     cudaDeviceSynchronize(); 
121     cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
122     error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);
123 
124     // 清理工作
125     cudaFree(d_input);
126     cudaFree(d_result);
127     free(h_input);
128     free(h_result);
129     free(hinfo);
130     cudaFree(dinfo);
131     printf("	
Finish.
");
132     getchar();
133     return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
134 }

▶ 输出结果：

    Start.

    Test 1: Passed

    Test 2: Passed

    Test 3: Passed

    Finish.

▶ 源代码：运行时编译（删掉了相同的注释）

 1 // simpleVote_kernel.cuh
 2 #ifndef SIMPLEVOTE_KERNEL_CU
 3 #define SIMPLEVOTE_KERNEL_CU
 4 
 5 extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
 6 {
 7     int tx = threadIdx.x;
 8     int mask = 0xffffffff;
 9     result[tx] = __any_sync(mask, input[tx]);
10 }
11 
12 extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
13 {
14     int tx = threadIdx.x;
15     int mask = 0xffffffff;
16     result[tx] = __all_sync(mask, input[tx]);
17 }
18 
19 extern "C" __global__ void vote3(bool *info, int warp_size)
20 {
21     int tx = threadIdx.x;
22     unsigned int mask = 0xffffffff;
23     bool *offs = info + (tx * 3);
24     *offs = __any_sync(mask, tx >= warp_size * 3 / 2);    
25     *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;        
26     *(offs + 2) = all(tx >= warp_size * 3 / 2) ? true : false;
27 }
28 #endif

  1 // simpleVoteIntrinsics.cu
  2 #include <stdio.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include "nvrtc_helper.h"
  6 #include <helper_functions.h>
  7 
  8 #define WARP_SIZE   32
  9 #define GROUP       4
 10 
 11 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
 12 {
 13     for (int i = 0; i < size / 4; i++)
 14         VOTE_PATTERN[i] = 0x00000000;
 15 
 16     for (int i = size / 4; i < size / 2; i++)
 17         VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
 18 
 19     for (int i = size / 2; i < 3 * size / 4; i++)
 20         VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
 21 
 22     for (int i = 3 * size / 4; i < size; i++)
 23         VOTE_PATTERN[i] = 0xffffffff;
 24 }
 25 
 26 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
 27 {
 28     int i, sum;
 29     for (sum = 0, i = start; i < end; i++)
 30         sum += h_result[i];
 31     if (type&&sum > 0 || !type&& sum != WARP_SIZE)
 32     {
 33         printf("
	<%s>[%d - %d]:", name, start, end - 1);
 34         for (i = start; i < end; i++)
 35             printf("%d,", h_result[i]);
 36         printf("
");
 37     }
 38     return type ? (sum > 0) : (sum != WARP_SIZE);
 39 }
 40 
 41 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
 42 {
 43     int error_count = 0;
 44 
 45     error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, type ? 1 : 1, "Vote.Any");
 46     error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any");
 47     error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any");
 48     error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, type ? 0 : 0, "Vote.Any");
 49 
 50     printf("%s
", !error_count ? "Passed" : "Failed");
 51     return error_count;
 52 }
 53 int checkResultsVoteKernel(bool *hinfo, int totalThread)
 54 {
 55     int i, error_count;
 56     for (i = error_count = 0; i < totalThread * 3; i++)
 57     {
 58         switch (i % 3)
 59         {
 60         case 0:
 61             if (hinfo[i] != (i >= totalThread * 1))
 62                 error_count++;
 63             break;
 64         case 1:
 65             if (hinfo[i] != (i >= totalThread * 3 / 2))
 66                 error_count++;
 67             break;
 68         case 2:
 69             if (hinfo[i] != (i >= totalThread * 2))
 70                 error_count++;
 71             break;
 72         }
 73     }
 74     printf("%s
", !error_count ? "Passed" : "Failed");
 75     return error_count;
 76 }
 77 
 78 int main()
 79 {
 80     printf("
	Start.
");
 81     int totalElement;
 82     unsigned int *h_input, *h_result;
 83     CUdeviceptr d_input, d_result;// unsigned long long
 84     bool *hinfo = NULL;
 85     CUdeviceptr dinfo;
 86     int error_count[3] = { 0, 0, 0 };
 87     //cudaSetDevice(0); 
 88 
 89     // 编译 PTX
 90     char *ptx, *kernel_file;
 91     size_t ptxSize; 
 92     kernel_file = "D:\Program\CUDA9.0\Samples\0_Simple\simpleVoteIntrinsics_nvrtc\simpleVote_kernel.cuh";
 93     compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);// (1, NULL) 为主函数接受的参数个数和参数
 94     CUmodule module = loadPTX(ptx, 1, NULL);
 95 
 96     totalElement = WARP_SIZE * GROUP;
 97     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
 98     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
 99     cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
100     cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
101     genVoteTestPattern(h_input, totalElement);
102     cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int));
103 
104     //测试一，any
105     printf("
	Test 1: ");
106     dim3 gridBlock(1, 1);
107     dim3 threadBlock(totalElement, 1);
108     CUfunction kernel_addr;
109     cuModuleGetFunction(&kernel_addr, module, "voteAny");
110     void *arr1[] = { (void *)&d_input, (void *)&d_result };
111     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
112     cuCtxSynchronize();
113     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
114     error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);
115 
116     // 测试二，all
117     printf("
	Test 2: ");
118     cuModuleGetFunction(&kernel_addr, module, "voteAll");
119     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
120     cuCtxSynchronize();
121     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
122     error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);
123 
124     // 测试三
125     printf("
	Test 3: ");
126     totalElement = WARP_SIZE * 3 * 3;
127     hinfo = (bool *)calloc(totalElement, sizeof(bool));
128     cuMemAlloc(&dinfo, totalElement * sizeof(bool));
129     cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
130     threadBlock = dim3(totalElement / 3, 1);                    // 改变线程块尺寸
131     cuModuleGetFunction(&kernel_addr, module, "vote3");
132     int size = WARP_SIZE;
133     void *arr2[] = { (void *)&dinfo, (void *)&size };
134     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr2[0], 0);
135     cuCtxSynchronize();
136     cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
137     error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);
138 
139     // 清理工作
140     cuMemFree(d_input);
141     cuMemFree(d_result);
142     free(h_input);
143     free(h_result);
144     free(hinfo);
145     cuMemFree(dinfo);
146     printf("	
Finish.
");
147     getchar();
148     return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
149 }

▶ 输出结果：

        Start.
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability

        Test 1: Passed

        Test 2: Passed

        Test 3: Passed

Finish.

▶ 涨姿势

● 线程表决函数见另一篇博客，注意 CUDA9.0 改进了部分函数，废弃了旧的部分函数。