0_Simple__cdpSimplePrint + 0_Simple_

0_Simple__cdpSimplePrint + 0_Simple__cdpSimpleQuicksort

▶ CUDA 动态并行实现快排算法（单线程的递归调用）

▶ 源代码：动态并行递归调用线程块。要点：添加 -rdc=true 选项（生成 relocatable device code，相当于执行分离编译），以及链接库 cudadevrt.lib （用于动态并行，不同于运行时库 cudart.lib）

  1 #include <stdio.h>
  2 #include <cuda.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <helper_cuda.h> 
  6 #include <helper_string.h>
  7 
  8 __device__ int g_blockId = 0;                                       // 线程块的全局编号，供所有线程读写
  9 
 10 __device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId)  // 打印当前线程块信息，包括深度，当前块号，
 11 {
 12     if (threadIdx.x == 0)
 13     {
 14         if (depth == 0)
 15             printf("BLOCK %d launched by the host
", blockId);
 16         else
 17         {
 18             char buffer[32];
 19             for (int i = 0; i < depth; ++i)                         // 对应更多层级，每层前面都有相应层数的 "|  "
 20             {
 21                 buffer[3 * i + 0] = '+';
 22                 buffer[3 * i + 1] = ' ';
 23                 buffer[3 * i + 2] = ' ';
 24             }
 25             buffer[3 * depth] = '';
 26             printf("%sBLOCK %d launched by thread %d of block %d
", buffer, blockId, parent_threadId, parent_blockId);
 27         }
 28     }
 29     __syncthreads();
 30 }
 31 
 32 __global__ void cdp_kernel(int max_depth, int depth, int parent_threadId, int parent_blockId)// 线程块递归
 33 {
 34     __shared__ int s_blockId;                                       // 当前线程块的编号
 35 
 36     if (threadIdx.x == 0)                                           // 读取当前 g_blockId 到 s_blockId 中，并将 g_blockId 加一
 37         s_blockId = atomicAdd(&g_blockId, 1);
 38     __syncthreads();
 39 
 40     print_info(depth, s_blockId, parent_threadId, parent_blockId);  // 打印当前线程块信息，
 41 
 42     if (++depth >= max_depth)                                       // 达到最大递归深度则退出，否则继续调用 cdp_kernel()
 43         return;
 44     cdp_kernel << <gridDim.x, blockDim.x >> >(max_depth, depth, threadIdx.x, s_blockId);
 45 }
 46 
 47 int main(int argc, char **argv)
 48 {
 49     printf("CUDA Dynamic Parallelism
");
 50     int max_depth = 3;
 51     int device_count = 0, device = -1;
 52 
 53     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))// 帮助模式
 54     {
 55         printf("Usage: %s depth=<max_depth>	(where max_depth is a value between 1 and 8).
", argv[0]);
 56         exit(EXIT_SUCCESS);
 57     }
 58     if (checkCmdLineFlag(argc, (const char **)argv, "depth"))       // 手动设置递归深度
 59     {
 60         max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
 61         if (max_depth < 1 || max_depth > 8)
 62         {
 63             printf("depth parameter has to be between 1 and 8
");
 64             exit(EXIT_FAILURE);
 65         }
 66     }
 67     if (checkCmdLineFlag(argc, (const char **)argv, "device"))      // 命令行指定设备
 68     {
 69         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
 70         cudaDeviceProp properties;
 71         cudaGetDeviceProperties(&properties, device);
 72         if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
 73             printf("Running on GPU %d (%s)
", device, properties.name);
 74         else
 75         {
 76             printf("ERROR: required GPU with compute SM 3.5 or higher.
Current GPU compute SM %d.%d
", properties.major, properties.minor);
 77             exit(EXIT_FAILURE);
 78         }
 79     }
 80     else
 81     {
 82         cudaGetDeviceCount(&device_count);
 83         for (int i = 0; i < device_count; ++i)
 84         {
 85             cudaDeviceProp properties;
 86             cudaGetDeviceProperties(&properties, i);
 87             if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
 88             {
 89                 device = i;
 90                 printf("Running on GPU %d (%s)", i, properties.name);
 91                 break;
 92             }
 93             printf("Running on GPU %d (%s) does not support CUDA Dynamic Parallelism", i, properties.name);
 94         }
 95     }
 96     if (device == -1)
 97     {
 98         printf("required GPU with compute SM 3.5 or higher.");
 99         exit(EXIT_WAIVED);
100     }
101     cudaSetDevice(device);
102 
103     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
104     printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:

");
105     cdp_kernel << <2, 2 >> >(max_depth, 0, 0, -1);
106     cudaGetLastError();
107     cudaDeviceSynchronize();
108 
109     getchar();
110     exit(EXIT_SUCCESS);
111 }

● 输出结果：主机调用 2 个线程块，每个线程块 2 个线程，每个线程按同样规模递归调用，共 2*4 个二级核函数，2*4*4 个三级核函数，一共 42 个线程块

CUDA Dynamic Parallelism
Running on GPU 0 (GeForce GTX 1070)Launching cdp_kernel() with CUDA Dynamic Parallelism:

BLOCK 0 launched by the host
BLOCK 1 launched by the host
+  BLOCK 2 launched by thread 0 of block 1
+  BLOCK 3 launched by thread 0 of block 1
+  BLOCK 4 launched by thread 0 of block 0
+  BLOCK 5 launched by thread 0 of block 0
+  +  BLOCK 10 launched by thread 0 of block 3
+  +  BLOCK 11 launched by thread 0 of block 3
+  +  BLOCK 7 launched by thread 0 of block 2
+  +  BLOCK 6 launched by thread 0 of block 2
+  +  BLOCK 12 launched by thread 0 of block 5
+  +  BLOCK 13 launched by thread 0 of block 5
+  +  BLOCK 8 launched by thread 0 of block 4
+  +  BLOCK 9 launched by thread 0 of block 4
+  +  BLOCK 15 launched by thread 1 of block 3
+  +  BLOCK 14 launched by thread 1 of block 3
+  +  BLOCK 19 launched by thread 1 of block 2
+  +  BLOCK 16 launched by thread 1 of block 5
+  +  BLOCK 17 launched by thread 1 of block 5
+  +  BLOCK 18 launched by thread 1 of block 2
+  +  BLOCK 21 launched by thread 1 of block 4
+  +  BLOCK 20 launched by thread 1 of block 4
+  BLOCK 22 launched by thread 1 of block 1
+  BLOCK 23 launched by thread 1 of block 1
+  BLOCK 24 launched by thread 1 of block 0
+  BLOCK 25 launched by thread 1 of block 0
+  +  BLOCK 28 launched by thread 0 of block 23
+  +  BLOCK 27 launched by thread 0 of block 22
+  +  BLOCK 29 launched by thread 0 of block 24
+  +  BLOCK 26 launched by thread 0 of block 23
+  +  BLOCK 31 launched by thread 0 of block 24
+  +  BLOCK 30 launched by thread 0 of block 22
+  +  BLOCK 33 launched by thread 0 of block 25
+  +  BLOCK 32 launched by thread 0 of block 25
+  +  BLOCK 34 launched by thread 1 of block 23
+  +  BLOCK 35 launched by thread 1 of block 23
+  +  BLOCK 36 launched by thread 1 of block 22
+  +  BLOCK 37 launched by thread 1 of block 22
+  +  BLOCK 38 launched by thread 1 of block 25
+  +  BLOCK 39 launched by thread 1 of block 25
+  +  BLOCK 40 launched by thread 1 of block 24
+  +  BLOCK 41 launched by thread 1 of block 24

▶ 涨姿势：

● 在核函数中递归地调用核函数，注意函数调用的格式

▶ 源代码：动态并行实现快排算法，输出结果只有 Finish!

  1 #include <stdio.h>
  2 #include <cuda.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <helper_cuda.h> 
  6 #include <helper_string.h>
  7 
  8 #define MAX_DEPTH       16
  9 #define INSERTION_SORT  32
 10 
 11 __device__ void selection_sort(unsigned int *data, int left, int right) //选择排序，单线程完成
 12 {
 13     for (int i = left; i <= right; ++i)
 14     {
 15         unsigned min_val = data[i];
 16         int min_idx = i;
 17         for (int j = i + 1; j <= right; ++j)                            // 找最小元素及其下标
 18         {
 19             unsigned val_j = data[j];
 20             if (val_j < min_val)
 21             {
 22                 min_idx = j;
 23                 min_val = val_j;
 24             }
 25         }
 26         if (i != min_idx)                                               // 交换第 i 号元素到指定的位置上
 27         {
 28             data[min_idx] = data[i];
 29             data[i] = min_val;
 30         }
 31     }
 32 }
 33 
 34 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)    // 快排主体，内含递归调用
 35 {
 36     if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)           // 递归深度达到 MAX_DEPTH 或者 数组中元素个数不多于 INSERTION_SORT 时使用选排
 37     {
 38         selection_sort(data, left, right);
 39         return;
 40     }
 41     unsigned int *lptr = data + left, *rptr = data + right, pivot = data[(left + right) / 2];
 42     while (lptr <= rptr)
 43     {
 44         unsigned int lval = *lptr, rval = *rptr;                        // 指定左指针指向的值和右指针指向的值
 45         while (lval < pivot)                                            // 递增左指针，等价于 lptr++; lval = *lptr;        
 46             lval = *(++lptr);
 47         while (rval > pivot)                                            // 递减右指针        
 48             rval = *(--rptr);
 49         if (lptr <= rptr)                                               // 交换左右指针指向的值
 50         {
 51             *lptr++ = rval;
 52             *rptr-- = lval;
 53         }
 54     }
 55     if (left < rptr - data)                                             // 将左右分区放到两个不同的流中
 56     {
 57         cudaStream_t s0;
 58         cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);          // 指定该流不与 0 号流进行同步
 59         cdp_simple_quicksort << < 1, 1, 0, s0 >> >(data, left, rptr - data, depth + 1);
 60         cudaStreamDestroy(s0);
 61     }
 62     if (lptr - data < right)
 63     {
 64         cudaStream_t s1;
 65         cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
 66         cdp_simple_quicksort << < 1, 1, 0, s1 >> >(data, lptr - data, right, depth + 1);
 67         cudaStreamDestroy(s1);
 68     }
 69 }
 70 
 71 void run_qsort(unsigned int *data, unsigned int n)                      // 快排入口
 72 {
 73     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH);        // 设置最大递归深度    
 74     cdp_simple_quicksort << < 1, 1 >> >(data, 0, n - 1, 0);
 75     cudaDeviceSynchronize();
 76 }
 77 
 78 int main(int argc, char **argv)
 79 {
 80     cudaSetDevice(0);
 81     const int n = 1024;
 82 
 83     unsigned int *h_data = (unsigned int *)malloc(sizeof(unsigned int) * n);    
 84     srand(2047);
 85     for (unsigned i = 0; i < n; i++)
 86         h_data[i] = rand() % n;
 87 
 88 
 89     unsigned int *d_data;
 90     cudaMalloc((void **)&d_data, n * sizeof(unsigned int));
 91     cudaMemcpy(d_data, h_data, n * sizeof(unsigned int), cudaMemcpyHostToDevice);
 92 
 93     run_qsort(d_data, n);
 94     
 95     cudaMemcpy(h_data, d_data, n * sizeof(unsigned), cudaMemcpyDeviceToHost);
 96 
 97     for (int i = 1; i < n; ++i)
 98     {
 99         if (h_data[i - 1] > h_data[i])
100         {
101             printf("Error at i == %d, h_data[i-1] == %d, h_data[i] == %d
", h_data[i - 1], h_data[i]);
102             break;
103         }
104     }
105     printf("Finish!
");
106 
107     free(h_data);
108     cudaFree(d_data);
109     getchar();
110     exit(EXIT_SUCCESS);
111 }

▶ 新姿势：

● checkCmdLineFlag 用于检验函数参数 argv 是否等于字符串 string_ref（定义于 helper_string.h 中）

 1 inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
 2 {
 3     bool bFound = false;
 4     if (argc >= 1)
 5     {
 6         for (int i = 1; i < argc; i++)
 7         {
 8             int string_start = stringRemoveDelimiter('-', argv[i]);
 9             const char *string_argv = &argv[i][string_start], 
10             const char*equal_pos = strchr(string_argv, '=');
11             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
12             int length = (int)strlen(string_ref);
13             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
14             {
15                 bFound = true;
16                 continue;
17             }
18         }
19     }
20     return bFound;
21 }
22 
23 inline int stringRemoveDelimiter(char delimiter, const char *string)    // 去除特定的符号，上述函数的中用于去除参数前面的 - 或 --
24 {
25     int string_start = 0;
26     while (string[string_start] == delimiter)
27         string_start++;
28     if (string_start >= (int)strlen(string) - 1)
29         return 0;
30     return string_start;
31 }
32 
33 #define STRNCASECMP _strnicmp                                           // 比较字符串（定义于string.h中）
34 
35 _ACRTIMP int __cdecl _strnicmp(_In_reads_or_z_(_MaxCount) char const* _String1, _In_reads_or_z_(_MaxCount) char const* _String2, _In_ size_t _MaxCount);

● getCmdLineArgumentInt 用于提取函数参数 argv 中的整数（定义于 helper_string.h 中）

 1 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
 2 {
 3     bool bFound = false;
 4     int value = -1;
 5     if (argc >= 1)
 6     {
 7         for (int i = 1; i < argc; i++)
 8         {
 9             int string_start = stringRemoveDelimiter('-', argv[i]);
10             const char *string_argv = &argv[i][string_start];
11             int length = (int)strlen(string_ref);
12             if (!STRNCASECMP(string_argv, string_ref, length))
13             {
14                 if (length + 1 <= (int)strlen(string_argv))
15                 {
16                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
17                     value = atoi(&string_argv[length + auto_inc]);
18                 }
19                 else
20                     value = 0;
21                 bFound = true;
22                 continue;
23             }
24         }
25     }
26     if (bFound)
27         return value;
28     return 0;
29 }

● 指定最大递归深度

extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);

● 带有标识符的 cudaStreamCreateWithFlags ，设置流的优先级

extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);

■ 对比 cudaStreamCreate

extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);