CUDA Speed Test

CPU ==> AMD X4 635

GPU ==> GeForce GT 240

Four very simple tests:

1. The dumbest possible algorithm: a pile of nested for loops.

2. Four threads (one per physical core of the 4-core CPU), each computing one strip of the result.

3. GPU: a 64*64 grid of blocks, each block running 16*16 threads, one thread per output element.

4. The cuBLAS library.
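
Every test multiplies two 1024*1024 single-precision matrices (MAX_DIM = 1024), i.e. roughly 2 * 1024^3, about 2.1 billion floating-point operations per run.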

The results (four sampled elements of C, then the elapsed time in milliseconds from GetTickCount):

6687720.500000, 6869132.500000, 6410965.000000, 6952017.500000
TIMES: 47125

6687720.500000, 6869132.500000, 6410965.000000, 6952017.500000
TIMES: 14203

6687720.500000, 6869132.500000, 6410964.500000, 6952017.000000
TIMES: 328

6687720.500000, 6869132.500000, 6410964.500000, 6952017.000000
TIMES: 250
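
(Note that the two GPU runs differ from the CPU runs in the last digit of two of the sampled elements. That is expected: the tiled kernel and cuBLAS accumulate the single-precision sums in a different order than the sequential loop, so the rounding comes out slightly differently.)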

The time ratio works out to roughly 1885 : 570 : 13 : 10. The GPU really is impressively strong.

I didn't try Intel's library, since I suspect support on an AMD CPU wouldn't be great; I'll find a machine and test it some other day.
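
The full test program is below. It builds as a single .cu file with nvcc on Windows, linked against the cuBLAS library (the exact command line depends on the toolkit version).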

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <conio.h>
#include <windows.h>
#include <process.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>


#define MAX_RUN      0
#define TILE_WIDTH   16
#define MAX_DIM      1024

float MatrixA[MAX_DIM][MAX_DIM];
float MatrixB[MAX_DIM][MAX_DIM];
float MatrixC[MAX_DIM][MAX_DIM];

/* number of worker threads that have finished (LONG for InterlockedIncrement) */
volatile LONG thr_run;
/* Fill the matrices with random values; each element falls in
   [0, RAND_MAX / 100], which keeps the row-column sums well inside
   float range. */
void FillMatrix()
{
    register int i, j;
    srand( ( unsigned int )time( NULL ) );

    for ( i = 0; i < MAX_DIM; i ++ )
    {
        for ( j = 0; j < MAX_DIM; j ++ )
        {
            MatrixA[i][j] = ( float )rand() * rand() / 100 / RAND_MAX;
            MatrixB[i][j] = ( float )rand() * rand() / 100 / RAND_MAX;
        }
    }
}


/********************************************************************/

/* Runs on the CPU: the naive triple loop */
void RunOnCPU()
{
    float sum;
    register int i, j, k;

    for ( i = 0; i < MAX_DIM; ++ i )
    {
        for ( j = 0; j < MAX_DIM; ++ j )
        {
            sum = 0;
            for ( k = 0; k < MAX_DIM; ++ k )
            {
                sum += MatrixA[i][k] * MatrixB[k][j];
            }
            MatrixC[i][j] = sum;
        }
    }
}


/********************************************************************/

/* Worker thread routine: computes one quarter of the rows of C */
void CPUThread( void* arg )
{
    register int i, j, k;
    int dy, dy1;
    float mulResult;

    /* rows [dy, dy1) assigned to this thread */
    dy = ( MAX_DIM >> 2 ) * ( int )( intptr_t )arg;
    dy1 = dy + ( MAX_DIM >> 2 );

    for ( i = dy; i < dy1; i ++ )
    {
        for ( j = 0; j < MAX_DIM; j ++ )
        {
            mulResult = 0;
            for ( k = 0; k < MAX_DIM; k ++ )
            {
                mulResult += MatrixA[i][k] * MatrixB[k][j];
            }

            MatrixC[i][j] = mulResult;
        }
    }

    InterlockedIncrement( &thr_run );

    _endthread();
}


/* Runs on the CPU with four threads -- one per core of the X4 635;
   all four cores sit at 100% */
void RunOnCPUMulThr()
{
    int i;
    uintptr_t ret;

    thr_run = 0;

    for ( i = 0; i < 4; i ++ )
    {
        ret = _beginthread( CPUThread, 0, ( void* )( intptr_t )i );
        assert( ret != ( uintptr_t )-1 );
    }

    /* spin until all four workers report completion */
    while ( thr_run != 4 )
    {
        Sleep( 1 );
    }
}

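/* --- Not part of the original test (a minimal sketch): the thread count
   above is hard-coded to 4 for the X4 635. The core count can instead be
   queried at run time via GetSystemInfo() from <windows.h>, which is
   already included. --- */
unsigned int CpuCoreCount( void )
{
    SYSTEM_INFO si;
    GetSystemInfo( &si );                   /* fills in the processor count */
    return ( unsigned int )si.dwNumberOfProcessors;
}
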
/********************************************************************/

/* Runs on the GPU: tiled matrix multiply using shared memory.
   Each block computes one TILE_WIDTH x TILE_WIDTH tile of C;
   each thread computes one element. */
__global__ void Matrix_Mul1( float* c, const float* a, const float* b )
{
    unsigned int i, j, bx, by, tx, ty;
    float mulResult;
    __shared__ float d_m[TILE_WIDTH][TILE_WIDTH];
    __shared__ float d_n[TILE_WIDTH][TILE_WIDTH];

    bx = blockIdx.x;
    by = blockIdx.y;
    tx = threadIdx.x;
    ty = threadIdx.y;

    mulResult = 0.0f;

    /* walk the tiles along the shared dimension */
    for ( i = 0; i < gridDim.x; ++ i )
    {
        /* each thread loads one element of the A tile and one of the B tile */
        d_m[ty][tx] = *( a + ( by * blockDim.y + ty ) * MAX_DIM + i * blockDim.x + tx );
        d_n[ty][tx] = *( b + ( i * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx );
        __syncthreads();    /* the whole tile must be loaded before use */

        for ( j = 0; j < blockDim.x; ++ j )
        {
            mulResult += d_m[ty][j] * d_n[j][tx];
        }
        __syncthreads();    /* finish reading before the next load overwrites */
    }
    c[( by * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx] = mulResult;
}

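/* --- Not part of the original test (a minimal sketch): GetTickCount(),
   used in main(), has roughly 10-16 ms resolution and also counts the
   PCIe copies. CUDA events time the kernel alone. Assumes the device
   buffers are already allocated and filled. --- */
float TimeKernelMs( float* dev_c, const float* dev_a, const float* dev_b )
{
    float ms = 0.0f;
    cudaEvent_t start, stop;
    dim3 grid( MAX_DIM / TILE_WIDTH, MAX_DIM / TILE_WIDTH );
    dim3 blocks( TILE_WIDTH, TILE_WIDTH );

    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );           /* block until the kernel is done */
    cudaEventElapsedTime( &ms, start, stop );

    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    return ms;
}
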
void MatrixMul1( float* c, const float* a, const float* b )
{
    int cnt;
    float* dev_a;
    float* dev_b;
    float* dev_c;
    cudaError_t cudaStatus;
    /* 64 x 64 grid of blocks, each 16 x 16 threads */
    dim3 grid( MAX_DIM / TILE_WIDTH, MAX_DIM / TILE_WIDTH );
    dim3 blocks( TILE_WIDTH, TILE_WIDTH );

    cnt = MAX_DIM * MAX_DIM;
    dev_a = NULL;
    dev_b = NULL;
    dev_c = NULL;

    /* select the device and create the context */
    cudaStatus = cudaSetDevice( 0 );
    assert( cudaStatus == cudaSuccess );

    /* allocate device memory */
    cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );


    /* copy the inputs host -> device */
    cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    /* launch the kernel */
    Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );

    /* wait for the kernel to finish (also surfaces any launch error) */
    cudaStatus = cudaDeviceSynchronize();
    assert( cudaStatus == cudaSuccess );


    /* copy the result device -> host */
    cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
    assert( cudaStatus == cudaSuccess );

    /* free device memory */
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    /* tear down the context */
    cudaDeviceReset();
}


/********************************************************************/

/* Uses the cuBLAS library */
void MatrixMul2( float* c, const float* a, const float* b )
{
    int cnt;
    float* dev_a;
    float* dev_b;
    float* dev_c;
    cublasHandle_t handle;
    cublasStatus_t cuBlasStatus;
    cudaError_t cudaStatus;
    float alpha;
    float beta;

    dev_a = NULL;
    dev_b = NULL;
    dev_c = NULL;

    cnt = MAX_DIM * MAX_DIM;

    /* C = alpha * A * B + beta * C */
    alpha = 1.0f;
    beta  = 0.0f;


    /* select the device and create the context */
    cudaStatus = cudaSetDevice( 0 );
    assert( cudaStatus == cudaSuccess );

    /* initialise the cuBLAS library */
    cuBlasStatus = cublasCreate( &handle );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* allocate device memory */
    cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    /* copy the inputs host -> device */
    cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );


    /* SGEMM. cuBLAS assumes column-major storage, so passing ( B, A ) with
       no transpose computes C^T = B^T * A^T in column-major terms, which is
       exactly C = A * B for our row-major arrays. */
    cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                MAX_DIM, MAX_DIM, MAX_DIM, &alpha,
                                dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* issue the same SGEMM a second time (both calls fall inside the timing;
       the result is identical) */
    cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                MAX_DIM, MAX_DIM, MAX_DIM, &alpha,
                                dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* copy the result device -> host */
    cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
    assert( cudaStatus == cudaSuccess );

    /* free device memory and shut down cuBLAS */
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    cuBlasStatus = cublasDestroy( handle );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* tear down the context */
    cudaDeviceReset();
}

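/* --- Not part of the original test (a minimal sketch): the GPU results
   differ from the CPU results in the last digit for some elements, because
   the single-precision sums accumulate in a different order. A comparison
   therefore needs a tolerance rather than exact equality, e.g.
   MatrixNearlyEqual( ( float* )gpuC, ( float* )cpuC, 1e-5f ). --- */
int MatrixNearlyEqual( const float* x, const float* y, float relTol )
{
    int i;
    float diff, scale;

    for ( i = 0; i < MAX_DIM * MAX_DIM; i ++ )
    {
        diff = x[i] - y[i];
        if ( diff < 0.0f )
        {
            diff = -diff;
        }
        scale = ( x[i] < 0.0f ? -x[i] : x[i] ) + 1.0f;
        if ( diff > relTol * scale )
        {
            return 0;   /* differs by more than the tolerance */
        }
    }
    return 1;
}
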
/********************************************************************/


int main()
{
    DWORD dwTime1, dwTime2;

    FillMatrix();

    /* 1. naive single-threaded CPU */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    RunOnCPU();
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    /* 2. four CPU threads */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    RunOnCPUMulThr();
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    /* 3. hand-written CUDA kernel */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    MatrixMul1( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    /* 4. cuBLAS */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    MatrixMul2( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    _getch();

    return 0;
}
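
One caveat when reading the numbers: the two GPU timings cover the whole wrapper function, so they include context creation, device allocation, both PCIe copies, and (in the cuBLAS case) library initialization and two SGEMM calls, not just the multiply itself. The event-timing sketch above is one way to isolate the kernel alone.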
Original post: https://www.cnblogs.com/javado/p/3097648.html