OpenCL 存储器次序的验证

▶ 《OpenCL异构并行编程实战》P224 的代码,先放上来,坐等新设备到了再执行

 1 //kernel.cl
 2 __global volatile atomic_int globalAtom = ATOMIC_VAR_INIT(0);   // 全局原子对象
 3 __kernel void memoryOrderTest01(__global int *dst)
 4 {
 5     __local volatile atomic_int localAtom;                      // 本地原子对象   
 6     atomic_init(&localAtom, 0);
 7     const int gid = get_global_id(0);
 8     work_group_barrier(CLK_LOCAL_MEM_FENCE);
 9     if (gid == 0)                                               // 0 号工作项尝试写入 1
10     {
11         atomic_store_explicit(&localAtom, 1, memory_order_seq_cst, memory_scope_work_group);
12         atomic_store_explicit(&globalAtom, 1, memory_order_seq_cst, memory_scope_device);
13     }
14     //atomic_work_item_fence(CLK_LOCAL_MEM_FENCE, memory_order_acq_rel, memory_scope_work_group);
15     if (gid == 64)
16     {
17         int a, count;
18         for (a = 0, count = 1; a == 0 && count < 10000; count++)
19             a = atomic_load_explicit(&localAtom, memory_order_seq_cst, memory_scope_work_group);
20         dst[0] = !!a;
21         dst[2] = count;
22         for (count = 1; a == 0 && count < 10000; count++)
23             a = atomic_load_explicit(&globalAtom, memory_order_seq_cst, memory_scope_device);
24         dst[1] = !!a;
25     }
26     work_group_barrier(0);// 必须添加,将 0 号工作项的副作用暴露给其他工作项
27 }
28 
/*
 * Memory-order visibility test using the implicit atomic API
 * (atomic_store / atomic_load without explicit order or scope arguments).
 *
 * Work-item 0 stores 1 into a __local atomic and into the program-scope
 * globalAtom; work-item 64 polls both and reports what it observed.
 * localAtom is __local, i.e. one instance per work-group, so work-items 0 and
 * 64 only share it when the work-group size is larger than 64.
 *
 * dst[0]: 1 if the local store became visible to work-item 64, else 0
 * dst[1]: 1 if the global store became visible to work-item 64, else 0
 * dst[2]: value of the polling counter when the local polling loop ended
 */
__kernel void memoryOrderTest02(__global int *dst)
{
    __local volatile atomic_int localAtom;                      // work-group atomic object in local memory

    // atomic_init does not avoid data races, so only one work-item per
    // work-group may initialise the shared object; the barrier below
    // publishes the initial value to the rest of the group.
    if (get_local_id(0) == 0)
        atomic_init(&localAtom, 0);
    const int gid = get_global_id(0);
    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    if (gid == 0)                                               // work-item 0 writes 1 to both atomics
    {
        atomic_store(&localAtom, 1);
        atomic_store(&globalAtom, 1);
    }
    // Fence left disabled by the original author:
    //atomic_work_item_fence(CLK_LOCAL_MEM_FENCE, memory_order_acq_rel, memory_scope_work_group);
    if (gid == 64)
    {
        int a, count;

        // Poll the local atomic until the store is observed or the retry budget runs out.
        for (a = 0, count = 1; a == 0 && count < 10000; count++)
            a = atomic_load(&localAtom);
        dst[0] = !!a;
        dst[2] = count;

        // Reset `a` so the global atomic is actually polled; without the reset a
        // successful local load would skip this loop and dst[1] would merely
        // repeat the local result.
        for (a = 0, count = 1; a == 0 && count < 10000; count++)
            a = atomic_load(&globalAtom);
        dst[1] = !!a;
    }
    work_group_barrier(0);
}
 1 //main.c
 2 #include <stdio.h>  
 3 #include <stdlib.h>  
 4 #include <cl.h>
 5 
 6 const char *sourceCode = "D:/Code/kernel.cl";
 7 
 8 int readSource(const char* kernelPath, char *source)// 读取文本文件,存储为 char *,返回代码长度
 9 {
10     FILE *fp;
11     long int size;
12     //printf("readSource, Program file: %s
", kernelPath);
13     fopen_s(&fp, kernelPath, "rb");
14     if (!fp)
15     {
16         printf("Open kernel file failed
");
17         exit(-1);
18     }
19     if (fseek(fp, 0, SEEK_END) != 0)
20     {
21         printf("Seek end of file faildd
");
22         exit(-1);
23     }
24     if ((size = ftell(fp)) < 0)
25     {
26         printf("Get file position failed
");
27         exit(-1);
28     }
29     rewind(fp);
30     if ((source = (char *)malloc(size + 1)) == NULL)
31     {
32         printf("Allocate space failed
");
33         exit(-1);
34     }
35     fread(source, 1, size, fp);
36     fclose(fp);
37     source[size] = '';
38     return size + 1;
39 }
40 
41 int main()
42 {
43     const int nElement = 2048, dataSize = nElement * sizeof(float);
44     int i, host[nElement] = { 0 };
45     char info[1024];
46 
47     // 初始化平台
48     cl_int status;
49     cl_platform_id platform;
50     clGetPlatformIDs(1, &platform, NULL);
51     cl_device_id device[2];
52     clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, device, NULL);
53     cl_context_properties contextProp[] = { CL_CONTEXT_PLATFORM,(cl_context_properties)(platform), 0 };
54     cl_context context = clCreateContext(contextProp, 1, device, NULL, contextProp, &status);
55     cl_command_queue_properties queueProp = 0;// useless
56     cl_command_queue queue = clCreateCommandQueueWithProperties(context, device[0], NULL, &status);    
57 
58     cl_mem buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status);
59 
60     char *source;
61     size_t sourceLength = readSource(sourceCode, source);
62     cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceLength, &status);
63     status = clBuildProgram(program, 1, device, "-cl-std=CL2.0", NULL, NULL);
64     if (status)
65     {
66         clGetProgramBuildInfo(program, device[0], CL_PROGRAM_BUILD_LOG, 1024, info, NULL);
67         printf("Build log:
%s
", info);
68     }
69     cl_kernel kernel = clCreateKernel(program, "memoryOrderTest", &status);            
70     clSetKernelArg(kernel, 0, sizeof(cl_mem), buffer);
71     size_t globalSize = nElement, localSize = 256;
72     clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
73     clFinish(queue);
74 
75     clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, dataSize, host, 0, NULL, NULL);
76     
77     printf("Local memory result: %d, global memory result: %d, waiting count: %d
", host[0], host[1], host[2]);
78        
79     clReleaseContext(context);
80     clReleaseCommandQueue(queue);
81     clReleaseProgram(program);    
82     clReleaseKernel(kernel);
83     clReleaseMemObject(buffer);
84     getchar();
85     return 0;
86 }
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9045657.html