0_Simple__simpleAtomicIntrinsics + 0_Simple__simpleAtomicIntrinsics_nvrtc

原子操作。并且在静态代码和运行时编译两种条件下使用。

▶ 源代码:静态使用

 1 #ifndef _SIMPLEATOMICS_KERNEL_H_
 2 #define _SIMPLEATOMICS_KERNEL_H_
 3 //#include "device_launch_parameters.h"
 4 
 5 __global__ void testKernel(int *g_odata)
 6 {
 7     const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
 8 
 9     // 算术运算原子指令
10     atomicAdd(&g_odata[0], 10);                 // 0号位加 10
11 
12     atomicSub(&g_odata[1], 10);                 // 1号位减 10
13 
14     atomicExch(&g_odata[2], tid);               // 2号位与 tid 号值交换(获得最后一个访问的 tid 号)
15 
16     atomicMax(&g_odata[3], tid);                // 3号位获得最大的 tid 号
17 
18     atomicMin(&g_odata[4], tid);                // 4号位获得最小的 tid 号
19 
20     atomicInc((unsigned int *)&g_odata[5], 16); // 5号位做模 17 的加法(g_odata[5] == 15 时加 1 得 16,再加 1 得 0)
21 
22     atomicDec((unsigned int *)&g_odata[6], 136);// 6号位做模 137 的减法(g_odata[5] == 0 时减 1 得 136,再减 1 得 135)
23 
24     atomicCAS(&g_odata[7], tid - 1, tid);       // 7号位迭代 (g_odata[7] == tid - 1) ? tid : (g_odata[7]); 
25                                                 // 即以 g_odata[7] 初值为起点,增量为 1 的子序列的最大长度(一旦有增量为 1 的元素插到前面去该值就再也不变)
26     // 位运算原子指令
27     atomicAnd(&g_odata[8], 2*tid+7);            // 8号位为 1,注意 (2k+7)%2 == 1 但 (2k+7)%(2^m) == 0 或 1,即仅最后一位能保证为 1
28 
29     atomicOr(&g_odata[9], 1 << tid);            // 9号位为 -1,所有的位均为 1
30 
31     atomicXor(&g_odata[10], tid);               // 10号位为 255,注意异或运算具有交换律和结合律,硬算
32 }
33 
34 #endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
 1 /*simpleAtomicIntrinsics_cpu.cpp*/
 2 #include <stdio.h>
 3 
 4 extern "C" int computeGold(int *gpuData, const int len);
 5 
 6 int computeGold(int *gpuData, const int len)
 7 {
 8     if (gpuData[0] != 10 * len)
 9     {
10         printf("atomicAdd failed
");
11         return false;
12     }
13     if (gpuData[1] != -10 * len)
14     {
15         printf("atomicSub failed
");
16         return false;
17     }
18     if (gpuData[2] < 0 || gpuData[2] >= len)// gpuData[2] ∈ [0, len)
19     {
20         printf("atomicExch failed
");
21         return false;
22     }
23     if (gpuData[3] != len - 1)
24     {
25         printf("atomicMax failed
");
26         return false;
27     }
28     if (gpuData[4]!=0)
29     {
30         printf("atomicMin failed
");
31         return false;
32     }
33     if (gpuData[5] != len % 17)
34     {
35         printf("atomicInc failed
");
36         return false;
37     }
38     if (gpuData[6] != 137 - len % 137)
39     {
40         printf("atomicDec failed
");
41         return false;
42     }
43     if (gpuData[7] < 0 || gpuData[7] >= len)// gpuData[7] ∈ [0, len)
44     {
45         printf("atomicCAS failed
");
46         return false;
47     }
48     if (gpuData[8] != 1)
49     {
50         printf("atomicAnd failed
");
51         return false;
52     }
53     if (gpuData[9] != -1) 
54     {
55         printf("atomicOr failed
");
56         return false;
57     }
58     if (gpuData[10] != 0xff)
59     {
60         printf("atomicXor failed
");
61         return false;
62     }
63     return true;
64 }
 1 #include <stdio.h>
 2 #include <windows.h>
 3 #include <cuda_runtime.h>
 4 #include <helper_functions.h>
 5 #include <helper_cuda.h>
 6 #include "simpleAtomicIntrinsics_kernel.cuh"
 7 
 8 #define WINDOWS_LEAN_AND_MEAN
 9 #define NOMINMAX
10 
11 extern "C" bool computeGold(int *gpuData, const int len);
12 
13 bool runTest()
14 {   
15     bool testResult = false;
16     unsigned int numThreads = 256;
17     unsigned int numBlocks = 64;
18     unsigned int numData = 11;
19     unsigned int memSize = sizeof(int) * numData;
20 
21     int *h_data = (int *) malloc(memSize);
22     for (unsigned int i = 0; i < numData; h_data[i] = 0, i++);  // 初始化为全零
23     h_data[8] = h_data[10] = 0xff;                              // 搞点非零值
24 
25     int *d_data;
26     cudaMalloc((void **) &d_data, memSize);
27     cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice);
28 
29     // 输出运算前的结果
30     printf("
	Before:");                                             
31     for (int i = 0; i < numData; i++)
32         printf("%8d,", h_data[i]);
33     printf("
");
34 
35     // 计算和计时
36     StopWatchInterface *timer;
37     sdkCreateTimer(&timer);
38     sdkStartTimer(&timer);
39 
40     testKernel << <numBlocks, numThreads >> > (d_data);
41     getLastCudaError("Kernel execution failed");
42 
43     sdkStopTimer(&timer);
44     printf("
Processing time: %f ms
", sdkGetTimerValue(&timer));
45     sdkDeleteTimer(&timer);
46 
47     cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost);
48 
49     // 输出运算后的结果
50     printf("
	After :");                                             
51     for (int i = 0; i < numData; i++)
52         printf("%8d,", h_data[i]);
53     printf("
");
54 
55     testResult = computeGold(h_data, numThreads * numBlocks);
56 
57     free(h_data);
58     cudaFree(d_data);
59 
60     return testResult;
61 }
62 
63 int main()
64 {
65     bool testResult;                    
66 
67     printf("
	Started!
");
68 
69     testResult = runTest();
70 
71     printf("
	Completed! main function returned %s
", testResult ? "OK!" : "ERROR!");
72     getchar();
73 
74     return 0;
75 }

▶ 源代码:即时编译

1 /*simpleAtomicIntrinsics_kernel.cuh 发生变化的地方*/
2 extern "C" __global__ void testKernel(int *g_odata)
1 /*simpleAtomicIntrinsics_cpu.cpp 完全一样*/
 1 /*simpleAtomicIntrinsics.cpp*/
 2 #include <stdio.h>
 3 #include <windows.h>
 4 #include <cuda_runtime.h>
 5 #include <nvrtc_helper.h>
 6 #include <helper_functions.h>// includes cuda.h and cuda_runtime_api.h
 7 
 8 #define WINDOWS_LEAN_AND_MEAN
 9 #define NOMINMAX
10 
11 extern "C" bool computeGold(int *gpuData, const int len);
12 
13 bool runTest()
14 {
15     bool testResult = false;
16     unsigned int numThreads = 256;
17     unsigned int numBlocks = 64;
18     unsigned int numData = 11;
19     unsigned int memSize = sizeof(int) * numData;
20 
21     //即时编译过程
22     char *kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", NULL);
23     char *ptx;
24     size_t ptxSize;
25     compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
26     CUmodule module = loadPTX(ptx, 1, NULL);
27     CUfunction kernel_addr;
28     cuModuleGetFunction(&kernel_addr, module, "testKernel");
29 
30     int *h_data = (int *) malloc(memSize);
31     for (unsigned int i = 0; i < numData; h_data[i] = 0, i++);
32     h_data[8] = h_data[10] = 0xff;
33 
34     CUdeviceptr d_data;
35     cuMemAlloc(&d_data, memSize);
36     cuMemcpyHtoD(d_data, h_data, memSize);
37 
38     dim3 cudaBlockSize(numThreads,1,1);
39     dim3 cudaGridSize(numBlocks, 1, 1);
40     void *arr[] = { (void *)&d_data };
41     cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
42         cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &arr[0], 0);
43 
44     cuCtxSynchronize();
45 
46     cuMemcpyDtoH(h_data, d_data, memSize);
47 
48     testResult = computeGold(h_data, numThreads * numBlocks);
49 
50     free(h_data);
51     cuMemFree(d_data);
52 
53     return testResult;
54 }
55 
56 int main()
57 {
58     bool testResult;
59 
60     printf("
	Started!
");
61 
62     testResult = runTest();
63 
64     printf("
	Completed! main function returned %s
", testResult ? "OK!" : "ERROR!");
65     getchar();
66 
67     return 0;
68 }

▶ 输出结果:

    Started!

    Before:       0,       0,       0,       0,       0,       0,       0,       0,     255,       0,     255,

Processing time: 0.035352 ms

    After :  163840, -163840,   16287,   16383,       0,      13,      56,       7,       1,      -1,     255,

    Completed! main function returned OK!

▶ 涨姿势

● 一个有趣的数列:命 x0 = 0,xn = xn-1 XOR n,则有 x4n == 4n,x4n+1 = 1, x4n+2 == 4n+3, x4n+3 == 0。当改变初值的时候该表达式发生变化,结果如下图。三种颜色分别代表初始选作右边三个值的时候的结果。

● 解毒 device_atomic_functions.h 与原子操作。只保留了有效部分,去掉了注释和留白。

  1 #if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
  2     #define __DEVICE_ATOMIC_FUNCTIONS_HPP__
  3 
  4     #if defined(__CUDACC_RTC__) // 主机编译
  5         #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __host__ __device__
  6     #else                       // 设备编译
  7         #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
  8     #endif
  9 
 10     #if defined(__cplusplus) && defined(__CUDACC__)
 11 
 12 #include "builtin_types.h"
 13 #include "host_defines.h"
 14 
 15 // 整数原子加法。返回 *address 旧值,*address += val;
 16 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
 17 {
 18     return __iAtomicAdd(address, val);
 19 }
 20 
 21 // 无符号整数原子加法
 22 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
 23 {
 24     return __uAtomicAdd(address, val);
 25 }
 26 
 27 // 整数原子减法,注意转换为加法来运算。返回 *address 旧值,*address -= val;。
 28 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
 29 {
 30     return __iAtomicAdd(address, (unsigned int)-(int)val);
 31 }
 32 
 33 // 无符号整数原子减法
 34 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
 35 {
 36     return __uAtomicAdd(address, (unsigned int)-(int)val);
 37 }
 38 
 39 // 整数原子替换。返回 *address 旧值,*address = val;
 40 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
 41 {
 42     return __iAtomicExch(address, val);
 43 }
 44 
 45 // 无符号整数原子替换
 46 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
 47 {
 48     return __uAtomicExch(address, val);
 49 }
 50 
 51 // 浮点原子替换
 52 __DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
 53 {
 54     return __fAtomicExch(address, val);
 55 }
 56 
 57 // 整数原子取小。返回 *address 旧值,*address = MIN(*adress, val);
 58 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
 59 {
 60     return __iAtomicMin(address, val);
 61 }
 62 
 63 // 无符号整数原子取小
 64 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
 65 {
 66     return __uAtomicMin(address, val);
 67 }
 68 
 69 // 整数原子取大。返回 *address 旧值,*address = MAX(*adress, val);
 70 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
 71 {
 72     return __iAtomicMax(address, val);
 73 }
 74 
 75 // 无符号整数原子取大
 76 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
 77 {
 78     return __uAtomicMax(address, val);
 79 }
 80 
 81 // 无符号整数原子模加法。返回 *address 旧值,*address = (*adress + 1) % (val + 1);
 82 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
 83 {
 84     return __uAtomicInc(address, val);
 85 }
 86 
 87 // 无符号整数原子模减法。返回 *address 旧值,*address = (*adress + val) % (val + 1);
 88 // 不用 (*adress - 1) 是为了把结果控制在 [0, val] 中,防止变成负数,这与 C 中的 % 运算不同                                      
 89 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
 90 {                                                                                               
 91     return __uAtomicDec(address, val);
 92 }
 93 
 94 // 整数原子按位且。返回 *address 旧值,*adress &= val;
 95 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
 96 {
 97     return __iAtomicAnd(address, val);
 98 }
 99 
100 // 无符号整数原子按位且
101 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
102 {
103     return __uAtomicAnd(address, val);
104 }
105 
106 // 整数原子按位或。返回 *address 旧值,*adress |= val;
107 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
108 {
109     return __iAtomicOr(address, val);
110 }
111 
112 // 无符号整数原子按位或
113 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
114 {
115     return __uAtomicOr(address, val);
116 }
117 
118 // 整数原子按位异或。返回 *address 旧值,*adress ^= val;
119 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
120 {
121     return __iAtomicXor(address, val);
122 }
123 
124 // 无符号整数原子按位异或
125 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
126 {
127     return __uAtomicXor(address, val);
128 }
129 
130 // 整数原子比较赋值。返回 *address 旧值,*address = (*address == compare) ? val : *address;
131 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
132 {
133     return __iAtomicCAS(address, compare, val);
134 }
135 
136 // 无符号整数原子比较赋值
137 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
138 {
139     return __uAtomicCAS(address, compare, val);
140 }
141 
142 // 无符号长整数原子加法
143 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
144 {
145     return __ullAtomicAdd(address, val);
146 }
147 
148 // 无符号长整数原子替换
149 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
150 {
151     return __ullAtomicExch(address, val);
152 }
153 
154 // 无符号长整数原子比较赋值
155 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
156 {
157     return __ullAtomicCAS(address, compare, val);
158 }
159 
160 // 原子存在量词
161 __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
162 {
163     return (bool)__any((int)cond);
164 }
165 
166 // 原子全称量词
167 __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
168 {
169     return (bool)__all((int)cond);
170 }
171 
172     #endif /* __cplusplus && __CUDACC__ */
173 
174 #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
175 
176 #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */

● 原子操作函数声明在 device_functions.h 中。

  当设备计算能力 > 320 或 > 600 时开放各原子操作对应的 block 和 system 函数,并开放对应 long long、float、double 型数据的同一个函数,例如:

 1 #if !defined(__CUDACC_RTC__) || __CUDA_ARCH__ >= 600
 2 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val);
 3 
 4 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val);
 5 
 6 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val);
 7 
 8 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val);
 9 
10 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val);
11 #endif /* !__CUDACC_RTC__ || __CUDA_ARCH__ >= 600 */

  计算能力 600 以上能用的所有原子操作:

  1 #define __DEVICE_FUNCTIONS_STATIC_DECL__ __host__ __device__ __cudart_builtin__
  2 
  3 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd(int *p, int val);
  4 
  5 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_block(int *p, int val);
  6 
  7 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_system(int *p, int val);
  8 
  9 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd(unsigned int *p, unsigned int val);
 10 
 11 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_block(unsigned int *p, unsigned int val);
 12 
 13 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_system(unsigned int *p, unsigned int val);
 14 
 15 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd(unsigned long long *p, unsigned long long val);
 16 
 17 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_block(unsigned long long *p, unsigned long long val);
 18 
 19 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_system(unsigned long long *p, unsigned long long val);
 20 
 21 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd(float *p, float val);
 22 
 23 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val);
 24 
 25 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val);
 26 
 27 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val);
 28 
 29 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val);
 30 
 31 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val);
 32 
 33 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch(int *p, int val);
 34 
 35 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_block(int *p, int val);
 36 
 37 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_system(int *p, int val);
 38 
 39 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch(unsigned int *p, unsigned int val);
 40 
 41 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_block(unsigned int *p, unsigned int val);
 42 
 43 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_system(unsigned int *p, unsigned int val);
 44 
 45 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch(unsigned long long *p, unsigned long long val);
 46 
 47 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_block(unsigned long long *p, unsigned long long val);
 48 
 49 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_system(unsigned long long *p, unsigned long long val);
 50 
 51 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch(float *p, float val);
 52 
 53 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_block(float *p, float val);
 54 
 55 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_system(float *p, float val);
 56 
 57 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin(int *p, int val);
 58 
 59 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_block(int *p, int val);
 60 
 61 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_system(int *p, int val);
 62 
 63 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin(long long *p, long long val);
 64 
 65 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_block(long long *p, long long val);
 66 
 67 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_system(long long *p, long long val);
 68 
 69 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin(unsigned int *p, unsigned int val);
 70 
 71 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_block(unsigned int *p, unsigned int val);
 72 
 73 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_system(unsigned int *p, unsigned int val);
 74 
 75 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin(unsigned long long *p, unsigned long long val);
 76 
 77 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_block(unsigned long long *p, unsigned long long val);
 78 
 79 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_system(unsigned long long *p, unsigned long long val);
 80 
 81 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax(int *p, int val);
 82 
 83 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_block(int *p, int val);
 84 
 85 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_system(int *p, int val);
 86 
 87 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax(long long *p, long long val);
 88 
 89 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_block(long long *p, long long val);
 90 
 91 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_system(long long *p, long long val);
 92 
 93 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax(unsigned int *p, unsigned int val);
 94 
 95 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_block(unsigned int *p, unsigned int val);
 96 
 97 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_system(unsigned int *p, unsigned int val);
 98 
 99 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax(unsigned long long *p, unsigned long long val);
100 
101 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_block(unsigned long long *p, unsigned long long val);
102 
103 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_system(unsigned long long *p, unsigned long long val);
104 
105 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc(unsigned int *p, unsigned int val);
106 
107 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_block(unsigned int *p, unsigned int val);
108 
109 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_system(unsigned int *p, unsigned int val);
110 
111 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec(unsigned int *p, unsigned int val);
112 
113 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_block(unsigned int *p, unsigned int val);
114 
115 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_system(unsigned int *p, unsigned int val);
116 
117 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS(int *p, int compare, int val);
118 
119 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_block(int *p, int compare, int val);
120 
121 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_system(int *p, int compare, int val);
122 
123 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS(unsigned int *p, unsigned int compare, unsigned int val);
124 
125 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_block(unsigned int *p, unsigned int compare, unsigned int val);
126 
127 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_system(unsigned int *p, unsigned int compare     unsigned int val);
128 
129 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS(unsigned long long int *p     unsigned long long int compare     unsigned long long int val);
130 
131 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_block(unsigned long long int *p     unsigned long long int compare     unsigned long long int val);
132 
133 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_system(unsigned long long int *p     unsigned long long int compare     unsigned long long int val);
134 
135 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd(int *p, int val);
136 
137 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_block(int *p, int val);
138 
139 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_system(int *p, int val);
140 
141 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicAnd(long long int *p, long long int val);
142 
143 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_block(long long *p, long long val);
144 
145 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_system(long long *p, long long val);
146 
147 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd(unsigned int *p, unsigned int val);
148 
149 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_block(unsigned int *p, unsigned int val);
150 
151 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_system(unsigned int *p, unsigned int val);
152 
153 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicAnd(unsigned long long int *p     unsigned long long int val);
154 
155 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_block(unsigned long long *p, unsigned long long val);
156 
157 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_system(unsigned long long *p, unsigned long long val);
158 
159 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr(int *p, int val);
160 
161 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_block(int *p, int val);
162 
163 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_system(int *p, int val);
164 
165 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicOr(long long int *p, long long int val);
166 
167 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_block(long long *p, long long val);
168 
169 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_system(long long *p, long long val);
170 
171 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr(unsigned int *p, unsigned int val);
172 
173 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_block(unsigned int *p, unsigned int val);
174 
175 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_system(unsigned int *p, unsigned int val);
176 
177 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicOr(unsigned long long int *p     unsigned long long int val);
178 
179 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_block(unsigned long long *p, unsigned long long val);
180 
181 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_system(unsigned long long *p, unsigned long long val);
182 
183 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor(int *p, int val);
184 
185 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_block(int *p, int val);
186 
187 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_system(int *p, int val);
188 
189 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicXor(long long int *p, long long int val);
190 
191 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_block(long long *p, long long val);
192 
193 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_system(long long *p, long long val);
194 
195 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor(unsigned int *p, unsigned int val);
196 
197 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_block(unsigned int *p, unsigned int val);
198 
199 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_system(unsigned int *p, unsigned int val);
200 
201 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicXor(unsigned long long int *p     unsigned long long int val);
202 
203 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_block(unsigned long long *p, unsigned long long val);
204 
205 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_system(unsigned long long *p, unsigned long long val);
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7777814.html