0_Simple__fp16ScalarProduct

▶ 使用cuda内置无符号整数结构（__half2）及其汇编函数，计算两个向量的内积。

▶ 源代码

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <time.h>
  4 #include "cuda_runtime.h"
  5 #include "device_launch_parameters.h"
  6 #include "cuda_fp16.h"
  7 #include "helper_cuda.h"
  8 
  9 // 将数组 v 进行二分规约加法，使用 __forceinline__ 强制内联
 10 __forceinline__ __device__ void reduceInShared(half2 * const v)
 11 {
 12     if (threadIdx.x < 64)
 13         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
 14     __syncthreads();
 15     for (int i = 32; i > 0; i /= 2)
 16     {
 17         if (threadIdx.x < 32)
 18             v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + i]);
 19         __syncthreads();
 20     }
 21 }
 22 
 23 // 将数组 a 与 b 相加后进行规约加法，输入还包括指向结果的指针 h_result 及数组大小
 24 __global__ void scalarProductKernel(half2 const * const a, half2 const * const b, float * const h_result, size_t const size)
 25 {    
 26     __shared__ half2 shArray[128];
 27     const int stride = gridDim.x * blockDim.x;
 28 
 29     shArray[threadIdx.x] = __float2half2_rn(0.f);                               // 浮点数转无符号整数，这里相当于初始化为 0
 30     
 31     half2 value = __float2half2_rn(0.f);                                        
 32     for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride)  // 半精度混合乘加，value = a[i] * b[i] + value
 33         value = __hfma2(a[i], b[i], value);                                     
 34     shArray[threadIdx.x] = value;
 35     __syncthreads();
 36 
 37     reduceInShared(shArray);                                                    // 规约得 a 和 b 的内积，因为使用了内联，共享内存指针可以传入
 38 
 39     if (threadIdx.x == 0)                                                       // 0 号线程负责写入结果
 40     {
 41         half2 result = shArray[0];
 42         h_result[blockIdx.x] = (float)(__low2float(result) + __high2float(result));
 43     }
 44 }
 45 
 46 void generateInput(half2 * a, size_t size)                                      // 生成随机数组
 47 {
 48     for (size_t i = 0; i < size; ++i)
 49     {
 50         unsigned temp = rand();
 51         temp &= 0x83FF83FF;                                                     // 2214560767(10), 10000011111111111000001111111111(2)
 52         temp |= 0x3C003C00;                                                     // 1006648320(10),   111100000000000011110000000000(2)
 53         a[i] = *(half2*)&temp;
 54     }
 55 }
 56 
 57 int main(int argc, char *argv[])
 58 {
 59     srand(time(NULL));
 60     const int blocks = 128, threads = 128;
 61     size_t size = blocks * threads * 16;
 62 
 63     int devID = 0;
 64     cudaDeviceProp devProp;
 65     cudaGetDeviceProperties(&devProp, devID);
 66     if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3))
 67     {
 68         printf("required GPU with compute SM 5.3 or higher.
");
 69         return EXIT_WAIVED;
 70     }
 71 
 72     half2 *h_vec[2], *d_vec[2];
 73     float *h_result, *d_result;
 74     for (int i = 0; i < 2; ++i)
 75     {
 76         cudaMallocHost((void**)&h_vec[i], size * sizeof*h_vec[i]);
 77         cudaMalloc((void**)&d_vec[i], size * sizeof*d_vec[i]);
 78     }
 79     cudaMallocHost((void**)&h_result, blocks * sizeof*h_result);
 80     cudaMalloc((void**)&d_result, blocks * sizeof*d_result);
 81     for (int i = 0; i < 2; ++i)
 82     {
 83         generateInput(h_vec[i], size);
 84         cudaMemcpy(d_vec[i], h_vec[i], size * sizeof*h_vec[i], cudaMemcpyHostToDevice);
 85     }
 86     scalarProductKernel << <blocks, threads >> >(d_vec[0], d_vec[1], d_result, size);
 87     cudaMemcpy(h_result, d_result, blocks * sizeof * h_result, cudaMemcpyDeviceToHost);
 88     cudaDeviceSynchronize();
 89 
 90     float result = 0;
 91     for (int i = 0; i < blocks; ++i)
 92         result += h_result[i];
 93     printf("Result: %f 
", result);
 94 
 95     for (int i = 0; i < 2; ++i)
 96     {
 97         cudaFree(d_vec[i]);
 98         cudaFreeHost(h_vec[i]);
 99     }
100     cudaFree(d_result);
101     cudaFreeHost(h_result);
102     getchar();
103     return EXIT_SUCCESS;
104 }

● 输出结果

GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

Result: 853856.000000

▶ 涨姿势

● CUDA 无符号半精度整数，就是用 unsigned short 对齐到 2 Byte 来封装的

1 typedef struct __align__(2) { unsigned short x; } __half;
2 
3 typedef struct __align__(4) { unsigned int x; } __half2; 
4 
5 #ifndef CUDA_NO_HALF
6 typedef __half half;
7 typedef __half2 half2;
8 #endif

● 关于 __inline__ 和 __forceinline__

参考stackoverflow。https://stackoverflow.com/questions/19897803/forceinline-effect-at-cuda-c-device-functions

与C中__forceinline__类似，忽略编译器的建议，强制实现内联函数。如果函数只调用累次那么优化没有效果，但是如果调用了多次（如内联函数出现在循环中），则会产生明显的提升。另外，在递归中一般不用。

● 关于 __CUDACC__ 和 __CUDA_ARCH__

■ 参考 stackoverflow【https://stackoverflow.com/questions/8796369/cuda-and-nvcc-using-the-preprocessor-to-choose-between-float-or-double】

■ __CUDACC__ 使用 nvcc 进行编译时有定义。

■ __CUDA_ARCH__ 编译主机代码时无定义（无论是否使用 nvcc）；编译设备代码时有定义，且值等于编译命令指定的计算能力号。

■ 范例代码：（为了方便查看，使用了缩进）

 1 #ifdef __CUDACC__
 2     #warning using nvcc
 3 
 4     template <typename T>　　　　　　　　　　　　　　　　　　// 一般的核函数
 5     __global__ void add(T *x, T *y, T *z)
 6     {
 7         int idx = threadIdx.x + blockDim.x * blockIdx.x;
 8         z[idx] = x[idx] + y[idx];
 9     }
10 
11     #ifdef __CUDA_ARCH__
12         #warning device code trajectory
13         #if __CUDA_ARCH__ > 120
14             #warning compiling with datatype double
15             template void add<double>(double *, double *, double *);
16         #else
17             #warning compiling with datatype float
18             template void add<float>(float *, float *, float *);
19         #endif
20     #else
21         #warning nvcc host code trajectory
22     #endif
23 #else
24     #warning non - nvcc code trajectory
25 #endif

■ 编译及输出结果

$ ln -s cudaarch.cu cudaarch.cc
$ gcc -c cudaarch.cc -o cudaarch.o
cudaarch.cc:26:2: warning: #warning non-nvcc code trajectory

$ nvcc -arch=sm_11 -Xptxas="-v" -c cudaarch.cu -o cudaarch.cu.o
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:14:2: warning: #warning device code trajectory
cudaarch.cu:19:2: warning: #warning compiling with datatype float
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:23:2: warning: #warning nvcc host code trajectory
ptxas info    : Compiling entry function '_Z3addIfEvPT_S1_S1_' for 'sm_11'
ptxas info    : Used 4 registers, 12+16 bytes smem

$ nvcc -arch=sm_20 -Xptxas="-v" -c cudaarch.cu -o cudaarch.cu.o
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:14:2: warning: #warning device code trajectory
cudaarch.cu:16:2: warning: #warning compiling with datatype double
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:23:2: warning: #warning nvcc host code trajectory
ptxas info    : Compiling entry function '_Z3addIdEvPT_S1_S1_' for 'sm_20'
ptxas info    : Used 8 registers, 44 bytes cmem[0]

● 用到的汇编函数

 1 // 表明主机和设备共有代码
 2 #define __CUDA_FP16_DECL__ __host__ __device__
 3 
 4 // 浮点数转无符号整数
 5 __CUDA_FP16_DECL__ __half2 __float2half2_rn(const float f)
 6 {
 7     __half2 val;
 8     asm("{.reg .f16 low;
"
 9         "  cvt.rn.f16.f32 low, %1;
"
10         "  mov.b32 %0, {low,low};}
" : "=r"(val.x) : "f"(f));
11     return val;
12 }
13 
14 // 计算无符号整数 a + b
15 #define BINARY_OP_HALF2_MACRO(name)                                             
16     do                                                                          
17     {                                                                           
18         __half2 val;                                                            
19         asm("{"#name".f16x2 %0,%1,%2;
}" :"=r"(val.x) : "r"(a.x), "r"(b.x));   
20         return val;                                                             
21     }                                                                           
22     while(0);                                                                   
23 
24 __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
25 {
26     BINARY_OP_HALF2_MACRO(add);
27 }
28 
29 // 计算无符号整数 a * b + c
30 #define TERNARY_OP_HALF2_MACRO(name)                                                        
31     do                                                                                      
32     {                                                                                       
33         __half2 val;                                                                        
34         asm("{"#name".f16x2 %0,%1,%2,%3;
}" : "=r"(val.x) : "r"(a.x), "r"(b.x), "r"(c.x)); 
35         return val;                                                                         
36     }                                                                                       
37     while(0);                                                                               
38 
39 __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
40 {
41     TERNARY_OP_HALF2_MACRO(fma.rn);
42 }
43 
44 // 将无符号整数的低 2 字节转化为浮点数
45 __CUDA_FP16_DECL__ float __low2float(const __half2 l)
46 {
47     float val;
48     asm("{.reg .f16 low,high;
"
49         "  mov.b32 {low,high},%1;
"
50         "  cvt.f32.f16 %0, low;}
" : "=f"(val) : "r"(l.x));
51     return val;
52 }
53 
54 // 将无符号整数的高 2 字节转化为浮点数
55 __CUDA_FP16_DECL__ float __high2float(const __half2 l)
56 {
57     float val;
58     asm("{.reg .f16 low,high;
"
59         "  mov.b32 {low,high},%1;
"
60         "  cvt.f32.f16 %0, high;}
" : "=f"(val) : "r"(l.x));
61     return val;
62 }