OpenCL入门

1,OpenCL程序包含两部分:

设备上执行:GPU 主机上运行:CPU

需要使用OpenCL语言编写Kernel函数。

2,

(1)Kernel:设备程序执行的入口点,是唯一可以从主机上调用执行的函数。

eg:

Function:

/* Element-wise vector addition on the CPU: res[i] = src_a[i] + src_b[i]
   for every index i in [0, num). Buffers must hold at least num floats. */
void vector_add_cpu (const float* src_a,
               const float* src_b,
               float*  res,
               const int num)
{
   int i = 0;
   while (i < num) {
      res[i] = src_a[i] + src_b[i];
      ++i;
   }
}

OpenCL Kernel:

/* OpenCL kernel: element-wise vector addition.
   One work-item computes exactly one output element. */
__kernel void vector_add_gpu (__global const float* src_a,
                     __global const float* src_b,
                     __global float* res,
           const int num)
{
   /* get_global_id(0) returns this work-item's unique global ID.
      Many work-items execute this kernel concurrently; each one
      receives a different ID and therefore computes a different element. */
   const int i = get_global_id(0);

   /* Guard: the global work size may be rounded up past num, so
      work-items whose ID falls outside the vector do nothing. */
   if (i < num)
      res[i] = src_a[i] + src_b[i];
}

如何编写Kernel,如何表达并行性,执行模型是什么样的?

SIMT: single instruction multi thread

work-item(工作项): 最小的执行单元。当一个Kernel开始执行,可定义数量的work-item开始运行,每个都执行相同的代码。其中,每个work-item有一个id,可以在kernel中被访问,运行在work-item上的kernel通过id找到work-item需要处理的数据。

work-group(工作组):允许work-item之间的通信和协作。也有唯一的可被kernel读取的id。

ND-range:下一个组织级别,定义work-group的组织形式。

Attention:

a. 关键字__kernel定义函数为kernel, 必须返回void。

b. 关键字__global位于参数前面,定义了参数内存的存放位置。

c. 所有Kernel必须写在".cl"文件中,".cl"文件必须只包含OpenCL代码。

(2)Host(主机)

首先建立基本的OpenCL运行环境:

Platform: Host加OpenCL框架管理下的若干设备构成Platform。通过Platform,应用程序可以与设备共享资源并在设备上执行Kernel。Platform通过cl_platform_id表现。

Platform初始化:

// return error code
cl_int oclGetPlatformID(cl_platform_id *platforms) //Pointer to the platform object

Device: 通过cl_device_id表现。

// Return the error code
cl_int clGetDeviceIDs(cl_platform_id platform,
cl_device_type device_type, //Bitfield identifying the type. For GPU we use CL_DEVICE_TYPE_GPU
cl_uint num_entries, //Number of Devices, typically 1
cl_device_id *devices, //Pointer to the device object
cl_uint *num_devices) //Puts here the number of devices matching the device_type

Context: 定义整个OpenCL环境,包括OpenCL kernel、设备、内存管理、命令队列等。Context使用cl_context表现。

// Returns the context
cl_context clCreateContext(const cl_context_properties *properties, //Bitwise with the properties (see specification)
cl_uint num_devices, //Number of devices
const cl_device_id *devices, //Pointer to devices object
void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data),
void *user_data, 
cl_int *errcode_ret) //error code result

Command_Queue: 存储需要在设备上执行的OpenCL指令的队列。Command_Queue建立在一个Context指定的设备上,多个Command_Queue允许应用程序在不需要同步的情况下执行多条无关联的指令。

cl_command_queue clCreateCommandQueue(cl_context context,
cl_device_id device,
cl_command_queue_properties properties, //Bitwise with the properties
cl_int *errcode_ret) //Error code result

环境初始化方法:

cl_int error = 0;   // Used to handle error codes
cl_platform_id platform;
cl_context context;
cl_command_queue queue;
cl_device_id device;

// Platform
error = oclGetPlatformID(&platform);
if (error != CL_SUCCESS) {
   cout << "Error getting platform id: " << errorMessage(error) << endl;
   exit(error);
}
// Device
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) {
   cout << "Error getting device ids: " << errorMessage(error) << endl;
   exit(error);
}
// Context
context = clCreateContext(0, 1, &device, NULL, NULL, &error);
if (error != CL_SUCCESS) {
   cout << "Error creating context: " << errorMessage(error) << endl;
   exit(error);
}
// Command-queue
queue = clCreateCommandQueue(context, device, 0, &error);
if (error != CL_SUCCESS) {
   cout << "Error creating command queue: " << errorMessage(error) << endl;
   exit(error);
}

接着,分配内存。

针对介绍Kernel时举的例子,需要分配三个向量的内存空间,且至少将其中的两个初始化。

const int size = 1234567;   // fixed: missing semicolon
float* src_a_h = new float[size];
float* src_b_h = new float[size];
float* res_h = new float[size];
// Initialize both input vectors element by element.
// fixed: original did `src_a_h = src_b_h = (float) i;`, which overwrote
// the pointers themselves (leaking the allocations) instead of the elements.
for (int i = 0; i < size; i++) {
   src_a_h[i] = src_b_h[i] = (float) i;
}

而在Devices上分配内存,我们需要使用cl_mem类型:

//Returns the cl_mem object referencing the memory allocated on the device
cl_mem clCreateBuffer(cl_context context, //the context where the memory will be allocated
cl_mem_flags flags,
size_t size, // Size in bytes
void *host_ptr,
cl_int *errcode_ret)

其中,flags是逐位的,选项包括:

CL_MEM_READ_WRITE

CL_MEM_WRITE_ONLY

CL_MEM_READ_ONLY

CL_MEM_USE_HOST_PTR

CL_MEM_ALLOC_HOST_PTR

CL_MEM_COPY_HOST_PTR – 从 host_ptr处拷贝数据

而clCreateBuffer的使用方法为:

// Total buffer size in bytes for one vector of `size` floats.
const int mem_size = sizeof(float)*size;

// Allocates a buffer of size mem_size and copies mem_size bytes from src_a_h
// (CL_MEM_COPY_HOST_PTR copies the host data into the device buffer at creation).

cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error);

cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error);

// Output buffer: written by the kernel, so no host data to copy (host_ptr = NULL).
cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);

(3)程序与Kernel

怎么调用Kernel?编译器怎么将代码放在Device上?怎么编译Kernel?

对比OpenCL Program与OpenCL Kernel:

Kernel:本质是一个可以在Host上调用,在Device上运行的函数,是在运行时编译的。所有运行在Device上的代码,包括Kernel以及Kernel调用的其他函数都是在运行时编译的。

Program:OpenCL  Program是由Kernel函数/其他函数/声明组成。通过cl_program表示。当创建一个Program时,必须指定其是有哪些文件组成的,然后编译它。

Create a Program:

// Returns the OpenCL program
cl_program clCreateProgramWithSource(cl_context context,
cl_uint count, //number of files
const char **strings, //array of strings, each one is a file
const size_t *lengths, //array specifying the file lengths
cl_int *errcode_ret) //error code to be returned

在创建了Program后,执行编译操作:

cl_int clBuildProgram(cl_program program,
cl_uint num_devices,
const cl_device_id *device_list,
const char *options, //Compiler options, see the specifications for more details
void (*pfn_notify)(cl_program, void *user_data),
void *user_data)

编译后,我们使用如下函数查看编译log:

cl_int clGetProgramBuildInfo(cl_program program,
cl_device_id device,
cl_program_build_info param_name, //The parameter we want to know
size_t param_value_size,
void *param_value, // The answer
size_t *param_value_size_ret)

最后提取Program的入口,使用cl_kernel:

cl_kernel clCreateKernel(cl_program program,
const char *kernel_name, //The name of the kernel, i.e.the name of the kernel function as it's declared in the code
cl_int *errcode_ret)

我们可以创建多个Program,而每个Program中可以包含多个Kernel。创建Program:

// Create a program
size_t src_size = 0;
const char* path = shrFindFilePath("vector_add_gpu.cl", NULL);
const char* source = oclLoadProgSource(path, "", &src_size);
cl_program program = clCreateProgramWithSource(context, 1 &source, &src_size, &error);
assert(error == CL_SUCCESS)

//Build the program
error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
assert(error == CL_SUCCESS)

//Show the log
char* build_log;
size_t log_size;
//First call to know the proper size
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
build_log = new char[log_size + 1];
//Second call to get the log
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
build_log[log_size] = ''";
cout << build_log << endl;
delete[] build_log;

//Extracting the kernel
cl_kernel vector_add_kernel = clCreateKernel(program, "vector_add_gpu", &error);
assert(error == CL_SUCCESS);

我们将Kernel建立好后,在运行Kernel前,需要先设置Kernel的参数。

cl_int clSetKernelArg(cl_kernel kernel, //Which Kernel
cl_uint arg_index, //Which argument
size_t arg_size, //Size of the next argument(not the value pointed by it)
const void *arg_value) //Value

对于Kernel中的每个参数,都需要调用上述的函数来为参数设置Value。所有参数设置完毕后,我们可以调用Kernel:

cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim, //Choose if using 1D, 2D or 3D work-items and work-groups
const size_t *global_work_offset,
const size_t *global_work_size, //The total number of work-items(must have work-dim dimensions)
const size_t *local_work_size, //The number of work-items per work-group(must have work-dim dimensions)
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event)

最终,设置参数及调用Kernel:

//Enqueuing parameters
//Note that we inform the size of the cl_mem object, not the size of the memory pointed by it
error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);
error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);
error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);
error |= clSetKernelArg(vector_add_k, 3, sizeof(size_t), &size);
assert(error == CL_SUCCESS)

//Luauching Kernel
const size_t local_ws = 512; //Number of work-items per work-group
//shrRoundUp returns the smallest multiple of local_ws bigger than size
const size_t global_ws = shrRoundUp(local_ws, size); //Total number of work-items
error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
assert(error == CL_SUCCESS);

(4)读取结果

cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
cl_mem buffer, //From which buffer
cl_bool blocking_read, //Blocking (CL_TRUE) or non-blocking read
size_t offset, //Offset from the beginning
size_t cb, //Size to be read(in bytes)
void *ptr, //Pointer to the host memory
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event)

使用方法如下:

//Reading back
float* check = net float[size];
clEnQueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);

(5)清除内存

使用clCreate申请的(缓冲区/Kernel/队列等)内存必须使用clRelease释放。

// Cleaning memory: every object created with clCreate* must be released
delete[] src_a_h;
delete[] src_b_h;
delete[] res_h;
delete[] check;
// fixed: original released undeclared `vector_add_k`
clReleaseKernel(vector_add_kernel);
// fixed: the program object was never released in the original
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseMemObject(src_a_d);
// fixed: original released undeclared `src_ab_d`
clReleaseMemObject(src_b_d);
clReleaseMemObject(res_d);
未经允许,请勿转载
原文地址:https://www.cnblogs.com/zhuzhudong/p/13361155.html