OpenCL 矢量存取

▶ 使用函数 vloadn 和 vstoren 实现全局存储器和局部存储器之间的向量拷贝

● 代码

 1 #include <stdio.h>  
 2 #include <stdlib.h>
 3 #include <cl.h>
 4 
 5 const int nElement = 4096;
 6 const char *programSource = "                           
 7 __kernel void prog(__global int *A, __global int *B)    
 8 {                                                       
 9     int idx = get_global_id(0);                         
10     int4 temp = vload4(idx, A);                         
11     vstore4(temp, idx, B);                              
12     return;                                             
13 }                                                       
14 ";
15 
16 int main()
17 {
18     const size_t datasize = sizeof(int) * nElement;
19     int i, *A, *B;
20     cl_int status;
21 
22     A = (int*)malloc(datasize);
23     B = (int*)malloc(datasize);
24     for (i = 0; i < nElement; A[i] = i, B[i] = 0, i++);
25 
26     cl_platform_id platform;
27     clGetPlatformIDs(1, &platform, NULL);
28     cl_device_id device;
29     clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
30     cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
31     cl_command_queue cmdQueue = clCreateCommandQueue(context, device, 0, &status);
32     cl_mem bufferA, bufferB;
33     bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status);
34     bufferB = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status);
35     clEnqueueWriteBuffer(cmdQueue, bufferA, CL_FALSE, 0, datasize, A, 0, NULL, NULL);
36     cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &status);
37     status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);    
38     cl_kernel kernel = clCreateKernel(program, "prog", &status);
39     clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
40     clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
41     size_t globalSize[1] = { nElement }, localSize[1] = { 128 };
42     status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL);
43     clEnqueueReadBuffer(cmdQueue, bufferB, CL_TRUE, 0, datasize, B, 0, NULL, NULL);
44 
45     for (i = 0; i < nElement; i++)
46     {
47         if (B[i] != i)
48             break;
49     }
50     printf("Output is %s.
", (i == nElement) ? "correct" : "incorrect");
51 
52     free(A);
53     free(B);
54     clReleaseContext(context);
55     clReleaseMemObject(bufferA);
56     clReleaseMemObject(bufferB);
57     clReleaseCommandQueue(cmdQueue);
58     clReleaseProgram(program);
59     clReleaseKernel(kernel);
60     getchar();
61     return 0;
62 }

● 输出结果

Output is correct.

● 教训

■ 核函数代码中每个 "\" 的后面不要有任何东西,包括空格。因为 "\" 在预处理以后会消失,其后的内容会被当成下一行的内容,而空格会在IDE中使 "\" 失效,导致编译错误
■ 核函数代码中不要有 "//" 型的行注释,理由类似。会使得 "//" 以后的代码全部失效
■ 可以改用字符串连接来写核函数代码,如:

/*
 * Same kernel source written as adjacent string literals, which the compiler
 * concatenates into one string.  Every piece ends with an explicit \n escape,
 * so the kernel keeps its line structure and no backslash line continuation
 * is needed.
 */
const char *programSource =
"__kernel void prog(__global int *A, __global int *B)   \n"
"{                                                      \n"
"    int idx = get_global_id(0);                        \n"
"    int4 temp = vload4(idx, A);                        \n"
"    vstore4(temp, idx, B);                             \n"
"    return;                                            \n"
"}                                                      \n";
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/8931357.html