0_Simple__cppOverload

▶ 使用 cuda 内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量

▶ 源代码

 1 // cppOverload_kernel.cu
 2 __global__ void simple_kernel(const int *pIn, int *pOut, int a)
 3 {
 4     __shared__ int sData[THREAD_N];
 5     int tid = threadIdx.x + blockDim.x * blockIdx.x;
 6 
 7     sData[threadIdx.x] = pIn[tid];
 8     __syncthreads();
 9     pOut[tid] = sData[threadIdx.x] * a + tid;
10 }
11 
// Overload 2: out[i] = (in[i].x + in[i].y) * a + i for int2 input, again
// staged through shared memory. Same launch contract as overload 1: the grid
// exactly covers the input, so indices are not range-checked.
__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 stage[THREAD_N];
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    stage[threadIdx.x] = pIn[gid];
    __syncthreads();   // barrier between the shared-memory write and read
    const int2 pair = stage[threadIdx.x];
    pOut[gid] = (pair.x + pair.y) * a + gid;
}
21 
// Overload 3: out[i] = (in1[i] + in2[i]) * a + i, with both inputs staged in
// shared memory. Uses 2 * THREAD_N * sizeof(int) bytes of shared memory —
// twice overload 1, which is visible in the printed sharedSizeBytes.
__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int lhs[THREAD_N];
    __shared__ int rhs[THREAD_N];
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    lhs[threadIdx.x] = pIn1[gid];
    rhs[threadIdx.x] = pIn2[gid];
    __syncthreads();   // both staging writes must complete before the sum
    pOut[gid] = (lhs[threadIdx.x] + rhs[threadIdx.x]) * a + gid;
}
  1 // cppOverload.cu
  2 #include <stdio.h>
  3 #include <helper_cuda.h>
  4 #include <helper_math.h>
  5 #include <helper_string.h>
  6 
  7 #define THREAD_N            256
  8 #include "cppOverload_kernel.cu"                                            // 源代码文件中使用了 THREAD_N,必须先定义
  9 
 10 #define N                   1024
 11 #define DIV_UP(a, b)        (((a) + (b) - 1) / (b))
 12 #define OUTPUT_ATTR(attr)                                               
 13     printf("Shared Size:           %d
", (int)attr.sharedSizeBytes);   
 14     printf("Constant Size:         %d
", (int)attr.constSizeBytes);    
 15     printf("Local Size:            %d
", (int)attr.localSizeBytes);    
 16     printf("Max Threads Per Block: %d
", attr.maxThreadsPerBlock);     
 17     printf("Number of Registers:   %d
", attr.numRegs);                
 18     printf("PTX Version:           %d
", attr.ptxVersion);             
 19     printf("Binary Version:        %d
", attr.binaryVersion);             
 20 
 21 bool check_func1(int *hInput, int *hOutput, int a)
 22 {
 23     for (int i = 0; i < N; ++i)
 24     {
 25         int cpuRes = hInput[i] * a + i;
 26         if (hOutput[i] != cpuRes)
 27             return false;
 28     }
 29     return true;
 30 }
 31 
 32 bool check_func2(int2 *hInput, int *hOutput, int a)
 33 {
 34     for (int i = 0; i < N; i++)
 35     {
 36         int cpuRes = (hInput[i].x + hInput[i].y)*a + i;
 37         if (hOutput[i] != cpuRes)
 38             return false;
 39     }
 40     return true;
 41 }
 42 
 43 bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
 44 {
 45     for (int i = 0; i < N; i++)
 46     {
 47         if (hOutput[i] != (hInput1[i] + hInput2[i])*a + i)
 48             return false;
 49     }
 50     return true;
 51 }
 52 
 53 int main(int argc, const char *argv[])
 54 {    
 55     int deviceID = cudaSetDevice(0);
 56 
 57     int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;
 58     cudaMalloc(&dInput, sizeof(int)*N * 2);
 59     cudaMalloc(&dOutput, sizeof(int)*N);
 60     cudaMallocHost(&hInput, sizeof(int)*N * 2);
 61     cudaMallocHost(&hOutput, sizeof(int)*N);
 62     
 63     for (int i = 0; i < N * 2; i++)
 64         hInput[i] = i;
 65     cudaMemcpy(dInput, hInput, sizeof(int)*N * 2, cudaMemcpyHostToDevice);
 66   
 67     const int a = 2;
 68     void(*func1)(const int *, int *, int) = simple_kernel;
 69     void(*func2)(const int2 *, int *, int) = simple_kernel;
 70     void(*func3)(const int *, const int *, int *, int) = simple_kernel;
 71     struct cudaFuncAttributes attr;
 72 
 73     // function 1
 74     memset(&attr, 0, sizeof(attr));
 75     cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared);                      // 运行前分析资源占用
 76     cudaFuncGetAttributes(&attr, *func1);
 77     OUTPUT_ATTR(attr);
 78     (*func1) << <DIV_UP(N, THREAD_N), THREAD_N >> >(dInput, dOutput, a);
 79     cudaDeviceSynchronize();
 80     cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost);    
 81     printf("simple_kernel(const int *pIn, int *pOut, int a) %s

", check_func1(hInput, hOutput, a) ? "PASSED" : "FAILED");
 82 
 83     // function 2
 84     memset(&attr, 0, sizeof(attr));
 85     cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared);
 86     cudaFuncGetAttributes(&attr, *func2);
 87     OUTPUT_ATTR(attr);
 88     (*func2) << <DIV_UP(N, THREAD_N), THREAD_N >> >((int2 *)dInput, dOutput, a);    // 强行转换成 int2*,反正也是对其的
 89     cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost);
 90     printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s

", check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a) ? "PASSED" : "FAILED");
 91 
 92     // function 3
 93     memset(&attr, 0, sizeof(attr));
 94     cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared);
 95     cudaFuncGetAttributes(&attr, *func3);
 96     OUTPUT_ATTR(attr);
 97     (*func3) << <DIV_UP(N, THREAD_N), THREAD_N >> >(dInput, dInput + N, dOutput, a);
 98     cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost);    
 99     printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s

", check_func3(&hInput[0], &hInput[N], hOutput, a) ? "PASSED" : "FAILED");
100 
101     cudaFree(dInput);
102     cudaFree(dOutput);
103     cudaFreeHost(hOutput);
104     cudaFreeHost(hInput);    
105     getchar();
106     return 0;
107 }

● 输出结果:

Shared Size:           1024
Constant Size:         0
Local Size:            0
Max Threads Per Block: 1024
Number of Registers:   12
PTX Version:           60
Binary Version:        60
simple_kernel(const int *pIn, int *pOut, int a) PASSED

Shared Size:           2048
Constant Size:         0
Local Size:            0
Max Threads Per Block: 1024
Number of Registers:   13
PTX Version:           60
Binary Version:        60
simple_kernel(const int2 *pIn, int *pOut, int a) PASSED

Shared Size:           2048
Constant Size:         0
Local Size:            0
Max Threads Per Block: 1024
Number of Registers:   14
PTX Version:           60
Binary Version:        60
simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

▶ 涨姿势:

● cuda 使用扩展名为 .cuh 的头文件

● cuda内置结构 cudaFuncAttributes 的定义:

 1 struct __device_builtin__ cudaFuncAttributes
 2 {
 3     size_t sharedSizeBytes; // 共享内存大小
 4     size_t constSizeBytes;  // 常量内存大小
 5     size_t localSizeBytes;  // 局部内存大小
 6     int maxThreadsPerBlock; // 每线程块最大线程数量
 7     int numRegs;            // 寄存器数量
 8     int ptxVersion;         // PTX版本号
 9     int binaryVersion;      // 机器码版本号
10     int cacheModeCA;        // 是否使用编译指令 -Xptxas --dlcm=ca
11 };

● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量

1 struct cudaFuncAttributes attr;
2 memset(&attr, 0, sizeof(attr));
3 cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);
4 cudaFuncGetAttributes(&attr, *function);

■ 涉及的函数

 1 extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
 2 
 3 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
 4 {
 5     return cudaErrorUnknown;
 6 }
 7 
 8 #define OUTPUT_ATTR(attr)                                           \
 9     printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);       \
10     printf("Constant Size: %d\n", (int)attr.constSizeBytes);        \
11     printf("Local Size:    %d\n", (int)attr.localSizeBytes);        \
12     printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
13     printf("Number of Registers: %d\n", attr.numRegs);              \
14     printf("PTX Version: %d\n", attr.ptxVersion);                   \
15     printf("Binary Version: %d\n", attr.binaryVersion);
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7742811.html