0_Simple__UnifiedMemoryStreams

In both an OpenMP and a pthreads environment, the sample uses Unified Memory to compute the basic matrix-vector product result = α * A * x + β * result (a GEMV), dispatching each task to the host or the device depending on its size.
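
The key Unified Memory idea the sample relies on: cudaMallocManaged returns a single pointer that both host code and device code can dereference, with the runtime migrating pages as needed. A minimal sketch of just that idea, separate from the sample (the kernel name scale is illustrative):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(float *x, int n, float a) // runs on the GPU, reads/writes the managed buffer
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= a;
}

int main()
{
    const int n = 1024;
    float *x = NULL;
    cudaMallocManaged(&x, n * sizeof(float));    // one pointer, visible to CPU and GPU
    for (int i = 0; i < n; i++) x[i] = 1.0f;     // initialize on the host
    scale<<<(n + 255) / 256, 256>>>(x, n, 2.0f); // use the same pointer on the device
    cudaDeviceSynchronize();                     // wait before touching the buffer on the host again
    printf("x[0] = %f\n", x[0]);                 // prints 2.0
    cudaFree(x);
    return 0;
}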

▶ Source code

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthreads version (otherwise OpenMP is used)
#ifdef USE_PTHREADS
    #include <pthread.h>
    #pragma comment(lib, "pthreadVC2.lib")
#else
    #include <omp.h>
#endif

// Windows has no srand48 / drand48, so provide equivalents
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T> struct Task // a struct can have constructors, a destructor and member functions, just like a class
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};
    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
    {
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();
    }

    ~Task()
    {
        cudaDeviceSynchronize();
        cudaFree(data);
        cudaFree(result);
        cudaFree(vector);
    }

    void allocate(const unsigned int s, const unsigned int unique_id) // allocate managed memory and initialize the member arrays
    {
        id = unique_id;
        size = s;
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();

        for (unsigned int i = 0; i < size*size; i++)
            data[i] = drand48();
        for (unsigned int i = 0; i < size; i++)
        {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS // argument block handed to each pthread
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};

typedef struct threadData_t threadData;
#endif

template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result) // computes result = alpha * A * x + beta * result
{
    for (int i = 0; i < m; i++) // the original CUDA sample loops to n here and omits the alpha factor below
    {
        result[i] *= *beta;
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
    threadData *dataPtr    = (threadData *) inpArgs;
    cudaStream_t *stream   = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid                = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double>  &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100) // small tasks run on the host, larger ones on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            cublasSetStream(handle[tid+1], stream[tid+1]);
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100) // small tasks run on the host, larger ones on the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        printf("\nTask [%2d], thread [%2d], size[%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif

template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size;
        size = std::max((int)(drand48()*1000.0), 64);
        TaskList[i].allocate(size, i);
    }
}

int main()
{
    printf("\n\tStart.\n");

    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited) // the device is in prohibited compute mode, no contexts can be created
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));
    const int nthreads = 4;
    cudaStream_t *streams = new cudaStream_t[nthreads+1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
    for (int i=0; i<nthreads+1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;
    std::vector<Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    int temp = TaskList.size() / nthreads;
    for (int i=0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if (temp == 0)  // fewer tasks than threads
        {
            InputToThreads[i].taskSize = 0;
            InputToThreads[i].TaskListPtr = &TaskList[0];
        }
        else            // at least as many tasks as threads: split evenly, the remainder goes to the last thread
        {
            if (i == nthreads - 1)
            {
                InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                InputToThreads[i].TaskListPtr = &TaskList[i*temp]; // start at i*temp; the original code added the remainder to this index as well, which is only correct when the remainder is zero
            }
            else
            {
                InputToThreads[i].taskSize = temp;
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i=0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
#else
    omp_set_num_threads(nthreads);
    #pragma omp parallel for schedule(dynamic)
    for (int i=0; i<(int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // cleanup
    for (int i=0; i<nthreads+1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    std::vector< Task<double> >().swap(TaskList);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

▶ Output: OpenMP

    Start.

Task [ 0], thread [ 0], size[ 721], on device
Task [ 1], thread [ 2], size[ 925], on device
Task [ 2], thread [ 3], size[ 250], on device
Task [ 3], thread [ 1], size[ 832], on device
Task [ 4], thread [ 2], size[ 798], on device
Task [ 5], thread [ 3], size[ 379], on device
Task [ 6], thread [ 2], size[ 110], on device
Task [ 7], thread [ 1], size[ 147], on device
Task [ 8], thread [ 0], size[ 941], on device
Task [ 9], thread [ 3], size[ 494], on device
Task [12], thread [ 1], size[ 619], on device
Task [10], thread [ 0], size[ 144], on device
Task [11], thread [ 2], size[ 477], on device
Task [13], thread [ 3], size[ 282], on device
Task [14], thread [ 0], size[ 593], on device
Task [15], thread [ 1], size[ 960], on device
Task [16], thread [ 0], size[ 671], on device
Task [17], thread [ 2], size[ 585], on device
Task [18], thread [ 3], size[ 883], on device
Task [19], thread [ 0], size[ 102], on device
Task [20], thread [ 2], size[ 912], on device
Task [21], thread [ 1], size[ 527], on device
Task [22], thread [ 2], size[ 613], on device
Task [23], thread [ 3], size[ 553], on device
Task [24], thread [ 0], size[ 572], on device
Task [25], thread [ 1], size[ 792], on device
Task [26], thread [ 2], size[ 406], on device
Task [27], thread [ 3], size[ 782], on device
Task [28], thread [ 0], size[ 351], on device
Task [29], thread [ 1], size[ 595], on device
Task [30], thread [ 2], size[ 301], on device
Task [31], thread [ 3], size[ 537], on device
Task [32], thread [ 0], size[ 303], on device
Task [35], thread [ 3], size[ 177], on device
Task [34], thread [ 2], size[ 869], on device
Task [33], thread [ 1], size[ 415], on device
Task [36], thread [ 0], size[ 987], on device
Task [37], thread [ 3], size[ 772], on device
Task [38], thread [ 2], size[ 129], on device
Task [39], thread [ 1], size[ 506], on device
    Finish.

▶ Output: pthreads

    Start.

Task [ 0], thread [ 0], size [ 755], on device
Task [20], thread [ 2], size [ 754], on device
Task [10], thread [ 1], size [ 746], on device
Task [30], thread [ 3], size [ 958], on device
Task [31], thread [ 3], size [ 281], on device
Task [11], thread [ 1], size [ 946], on device
Task [ 1], thread [ 0], size [  77], on host
Task [32], thread [ 3], size [ 144], on device
Task [21], thread [ 2], size [ 432], on device
Task [12], thread [ 1], size [ 105], on device
Task [22], thread [ 2], size [ 837], on device
Task [33], thread [ 3], size [ 402], on device
Task [ 2], thread [ 0], size [ 239], on device
Task [13], thread [ 1], size [ 149], on device
Task [23], thread [ 2], size [ 764], on device
Task [ 3], thread [ 0], size [ 775], on device
Task [34], thread [ 3], size [ 913], on device
Task [24], thread [ 2], size [ 182], on device
Task [35], thread [ 3], size [ 865], on device
Task [ 4], thread [ 0], size [ 729], on device
Task [14], thread [ 1], size [ 110], on device
Task [ 5], thread [ 0], size [ 604], on device
Task [36], thread [ 3], size [ 107], on device
Task [25], thread [ 2], size [ 403], on device
Task [15], thread [ 1], size [ 842], on device
Task [37], thread [ 3], size [ 858], on device
Task [26], thread [ 2], size [  64], on host
Task [ 6], thread [ 0], size [  64], on host
Task [16], thread [ 1], size [ 103], on device
Task [ 7], thread [ 0], size [ 852], on device
Task [27], thread [ 2], size [ 826], on device
Task [ 8], thread [ 0], size [ 673], on device
Task [17], thread [ 1], size [ 243], on device
Task [38], thread [ 3], size [ 178], on device
Task [39], thread [ 3], size [ 788], on device
Task [ 9], thread [ 0], size [  67], on host
Task [18], thread [ 1], size [ 340], on device
Task [28], thread [ 2], size [ 249], on device
Task [19], thread [ 1], size [ 934], on device
Task [29], thread [ 2], size [ 793], on device
    Finish.

▶ Takeaways:

● A C++ struct is used just like a class: the constructor, destructor and other member functions are defined directly inside the struct.
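
A trimmed-down sketch of that pattern (the names Buffer, ptr and n are illustrative, not part of the sample): the constructor grabs managed memory and the destructor releases it, so cleanup happens automatically when the object goes out of scope.

#include <cuda_runtime.h>

// minimal RAII-style struct: allocation in the constructor, release in the destructor
template <typename T> struct Buffer
{
    T *ptr;
    unsigned int n;

    Buffer(unsigned int n_) : ptr(NULL), n(n_) { cudaMallocManaged(&ptr, sizeof(T) * n); }
    ~Buffer() { cudaDeviceSynchronize(); cudaFree(ptr); }

    T &operator[](unsigned int i) { return ptr[i]; } // member functions work exactly as in a class
};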

● The cuBLAS library is used; note how a handle is created, bound to a stream with cublasSetStream, and then passed to every library call.
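
A minimal, self-contained sketch of that calling sequence with cublasDgemv (y = alpha * A * x + beta * y, A stored column-major as cuBLAS expects; all names here are illustrative, separate from the sample above):

#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    const int n = 4;
    double *A, *x, *y;
    cudaMallocManaged(&A, sizeof(double) * n * n); // n-by-n matrix of ones
    cudaMallocManaged(&x, sizeof(double) * n);
    cudaMallocManaged(&y, sizeof(double) * n);
    for (int i = 0; i < n * n; i++) A[i] = 1.0;
    for (int i = 0; i < n; i++) { x[i] = 1.0; y[i] = 0.0; }

    cublasHandle_t handle;
    cublasCreate(&handle);             // every cuBLAS call goes through a handle
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasSetStream(handle, stream);   // the library call will be issued into this stream

    double alpha = 1.0, beta = 0.0;
    cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1);
    cudaStreamSynchronize(stream);     // wait for the result before reading it on the host

    printf("y[0] = %f\n", y[0]);       // expect 4.0 for these inputs

    cublasDestroy(handle);
    cudaStreamDestroy(stream);
    cudaFree(A); cudaFree(x); cudaFree(y);
    return 0;
}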

● The memory-attachment flags and functions involved:

// driver_types.h
#define cudaMemAttachGlobal 0x01  // memory can be accessed by any stream on any device
#define cudaMemAttachHost   0x02  // memory cannot be accessed by any stream on any device (host access only)
#define cudaMemAttachSingle 0x04  // memory can only be accessed by a single stream on the associated device

// cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
    return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}

// cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
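
A hedged sketch of how these flags are typically used (the buffer name buf is illustrative): cudaMemAttachHost lets the host touch a managed allocation without synchronizing against kernels in other streams, while cudaMemAttachSingle hands the allocation over to one stream on the device.

#include <cuda_runtime.h>

int main()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    float *buf = NULL;
    cudaMallocManaged(&buf, 1024 * sizeof(float));

    // make the buffer host-accessible while unrelated GPU work may still be running
    cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream); // the attachment takes effect once the stream reaches this point
    buf[0] = 1.0f;                 // safe host access

    // hand the buffer over to work issued in this single stream
    cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachSingle);
    // ... launch kernels or cuBLAS calls in `stream` that use buf ...

    cudaStreamSynchronize(stream);
    cudaFree(buf);
    cudaStreamDestroy(stream);
    return 0;
}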
Original post: https://www.cnblogs.com/cuancuancuanhao/p/8012435.html