稀疏矩阵 part 5

▶ 目前为止能跑的所有代码及其结果(2019年2月24日),之后添加:DIA 乘法 GPU 版;其他维度的乘法(矩阵乘矩阵);其他稀疏矩阵格式之间的相互转化

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <memory.h>
  4 #include <malloc.h>
  5 #include <math.h>
  6 #include <time.h>
  7 #include "cuda_runtime.h"
  8 #include "device_launch_parameters.h"
  9 
 10 #define ROW         (8192)
 11 #define COL         (8192)
 12 #define RATIO       0.1                         // 系数矩阵非零元素占比
 13 #define EPSILON     0.0001                      // 浮点数比较
 14 #define THREAD_SIZE 256
 15 #define SEED        1                           // (unsigned int)time(NULL)
 16 #define MAX(x,y)    (((x)>(y))?(x):(y))
 17 #define CEIL(x,y)   (int)(( x - 1 ) /  y + 1)
 18 #define INT                                     // 数字格式用 int 还是 float
 19 
 20 #ifdef INT
 21 typedef int format;
 22 #else
 23 typedef float format;
 24 #endif
 25 
 26 // 矩阵存储格式
 27 typedef struct          // 顺序格式
 28 {
 29     int     row;        // 行数
 30     int     col;        // 列数
 31     int     count;      // 非零元个数(用于转换,不用于计算)
 32     format  *data;      // 元素的值
 33 }
 34 MAT;
 35 
 36 typedef struct          // Compressed Sparse Row 格式
 37 {
 38     int     row;        // 行数
 39     int     col;        // 列数
 40     format  *data;      // 元素的值
 41     int     *index;     // 元素的列号
 42     int     *ptr;       // 每行首元在 index 中的下标,最后一个元素的值等于矩阵非零元个数 
 43 }
 44 CSR;
 45 
 46 typedef struct          // Compressed Sparse Row 格式
 47 {
 48     int     row;        // 行数
 49     int     col;        // 列数
 50     format  *data;      // 元素的值
 51     int     *index;     // 元素的行号
 52     int     *ptr;       // 每列首元在 index 中的下标,最后一个元素的值等于矩阵非零元个数 
 53 }
 54 CSC;
 55 
 56 
 57 typedef struct          // ELLPACK 格式
 58 {
 59     int     row;        // 行数
 60     int     col;        // 列数,相当于MAT格式的行数
 61     int     colOrigin;  // 原列数,相当于MAT格式的列数
 62     format  *data;      // 元素的值
 63     int     *index;     // 元素在MAT格式中的列号
 64 }
 65 ELL;
 66 
 67 typedef struct          // Coordinate 格式
 68 {
 69     int     row;        // 行数
 70     int     col;        // 列数
 71     int     count;      // 非零元个数
 72     int     *rowIndex;  // 行向量
 73     int     *colIndex;  // 列向量
 74     format  *data;      // 元素的值
 75 }
 76 COO;
 77 
 78 typedef struct              // Diagonal 格式
 79 {
 80     int     row;            // 行数
 81     int     col;            // 列数    
 82     int     colOrigin;      // 原列数
 83     format  *data;          // 元素的值
 84     int     *index;         // 原矩阵各对角线是否非零
 85 }
 86 DIA;
 87 
 88 // 全局指针
 89 __managed__ MAT *aMAT, *xMAT, *yMATRef, *yMATCal;
 90 __managed__ CSR *aCSR;
 91 __managed__ ELL *aELL;
 92 __managed__ COO *aCOO;
 93 
 94 // 通用函数
 95 __host__ __device__ inline void checkNULL(const void *input)            // 有点问题,设备函数无法使用 exit(1) 来推出
 96 {
 97     if (input == NULL)
 98         printf("
	find a NULL!");
 99     return;
100 }
101 
102 __host__ inline void checkCudaError(cudaError input)
103 {
104     if (input != cudaSuccess)
105     {
106         printf("
	find a cudaError!");
107         exit(1);
108     }
109     return;
110 }
111 
112 int checkEqual(const format * in1, const format * in2, const int length)// 注意两向量相等时返 0,否则返回“值不等的元素下标加一”
113 {
114     int i;
115     for (i = 0; i < length; i++)
116     {
117 #ifdef INT
118         if (in1[i] != in2[i])
119 #else
120         if (fabs(in1[i] - in2[i]) > EPSILON)
121 #endif
122         {
123             printf("
	Error at i = %d
	in1[i] = %10f, in2[i] = %10f
", i, (float)in1[i], (float)in2[i]);
124             return i + 1;
125         }
126     }
127     printf("
	Check output, passed.
");
128     return 0;
129 }
130 
131 // 打印矩阵
132 void print(const char* info, const MAT *in)// 打印MAT格式的矩阵
133 {
134     printf("%s
	MAT,
	row = %d, col = %d, count = %d", info, in->row, in->col, in->count);
135     printf("
	data =
	");
136     for (int i = 0; i < in->row * in->col; i++)
137     {
138 #ifdef INT
139         printf("%d ", in->data[i]);
140 #else
141         printf("%.2f ", in->data[i]);
142 #endif
143         if ((i + 1) % in->col == 0)
144             printf("
	");
145     }
146     printf("
");
147     return;
148 }
149 
150 void print(const char* info, const CSR *in)// 打印CSR格式的矩阵
151 {
152     printf("%s
	CSR,
	row = %d, col = %d", info, in->row, in->col);
153     printf("
	data =
	");
154     for (int i = 0; i < in->ptr[in->row]; i++)
155 #ifdef INT
156         printf("%d ", in->data[i]);
157 #else
158         printf("%.2f ", in->data[i]);
159 #endif
160     printf("
	index =
	");
161     for (int i = 0; i < in->ptr[in->row]; i++)
162         printf("%d ", in->index[i]);
163     printf("
	ptr =
	");
164     for (int i = 0; i < in->row + 1; i++)
165         printf("%d ", in->ptr[i]);
166     printf("

");
167     return;
168 }
169 
170 void print(const char* info, const ELL *in)// 打印ELL格式的矩阵
171 {
172     int i;
173     printf("%s
	ELL,
	row = %d, col = %d, colOrigin = %d", info, in->row, in->col, in->colOrigin);
174     printf("
	data =
	");
175     for (i = 0; i < in->row * in->col; i++)
176     {
177 #ifdef INT
178         printf("%d ", in->data[i]);
179 #else
180         printf("%.2f ", in->data[i]);
181 #endif
182         if ((i + 1) % in->col == 0)
183             printf("
	");
184     }
185     printf("index =
	");
186     for (i = 0; i < in->row * in->col; i++)
187     {
188         printf("%d ", in->index[i]);
189         if ((i + 1) % in->col == 0)
190             printf("
	");
191     }
192     printf("

");
193     return;
194 }
195 
196 void print(const char* info, const COO *in)// 打印COO格式的矩阵
197 {
198     int i;
199     printf("%s
	COO,
	row = %d, col = %d, count = %d", info, in->row, in->col, in->count);
200     printf("
	(rowIndex, colIndex, data) =
	");
201     for (i = 0; i < in->count; i++)
202     {
203 #ifdef INT
204         printf("(%d,%d,%d)
	", in->rowIndex[i], in->colIndex[i], in->data[i]);
205 #else
206         printf("(%d,%d,%.2f)
	", in->rowIndex[i], in->colIndex[i], in->data[i]);
207 #endif
208     }
209     printf("

");
210     return;
211 }
212 
213 void print(const char* info, const DIA *in)// 打印DIA格式的矩阵
214 {
215     int i;
216     printf("%s
	DIA,
	row = %d, col = %d, colOrigin = %d", info, in->row, in->col, in->colOrigin,in->colOrigin);
217     printf("
	data =
	");
218     int * inverseIndex = (int *)malloc(sizeof(int) * in->col);
219     for (int i = 0, j = 0; i < in->row + in->col - 1; i++)
220     {
221         if (in->index[i] == 1)
222         {
223             inverseIndex[j] = i;
224             j++;
225         }
226     }  
227     for (i = 0; i < in->row * in->col; i++)
228     {                                                                            
229         if (i / in->col < in->row - 1 - inverseIndex[i % in->col] || i / in->col > inverseIndex[in->col - 1] - inverseIndex[i % in->col])
230             printf("* ");
231         else
232 #ifdef INT
233             printf("%d ", in->data[i]);
234 #else
235             printf("%.2f ", in->data[i]);
236 #endif
237         if ((i + 1) % in->col == 0)
238             printf("
	");
239     }
240     printf("index =
	");
241     for (i = 0; i < in->row + in->col - 1; i++)    
242         printf("%d ", in->index[i]);       
243     printf("

");
244     free(inverseIndex);
245     return;
246 }
247 
248 // 矩阵初始化与清理
249 MAT *initializeMAT(const int row, const int col, const int count = 0)// 初始化MAT,注意非零元默认为 0
250 {
251     MAT *temp;
252     checkCudaError(cudaMallocManaged((void **)&temp, sizeof(MAT)));
253     temp->row = row;
254     temp->col = col;
255     temp->count = count;
256     checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col));
257     return temp;
258 }
259 
260 // 统计MAT形式的矩阵中的非零元素个数
261 #define COUNT_MAT(in)                               
262 {                                                   
263     checkNULL(in);                                  
264     int i, zero;                                    
265     for (i = zero = 0; i < in->row * in->col; i++)  
266         zero += !in->data[i];                       
267     in->count = in->row * in->col - zero;           
268 }
269 
270 int freeMatrix(MAT *in)// 释放MAT
271 {
272     checkNULL(in);
273     cudaFree(in->data);
274     cudaFree(in);
275     return 0;
276 }
277 
278 CSR * initializeCSR(const int row, const int col, const int count)// 初始化CSR,要求给出 count
279 {
280     CSR *temp;
281     checkCudaError(cudaMallocManaged((void **)&temp, sizeof(CSR)));
282     temp->row = row;
283     temp->col = col;
284     checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * count));
285     checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(int) * count));
286     checkCudaError(cudaMallocManaged((void **)&temp->ptr, sizeof(int) * (row + 1)));
287     return temp;
288 }
289 
290 int freeMatrix(CSR *in)// 释放CSR
291 {
292     checkNULL(in);
293     cudaFree(in->data);
294     cudaFree(in->index);
295     cudaFree(in->ptr);
296     cudaFree(in);
297     return 0;
298 }
299 
300 ELL * initializeELL(const int row, const int col, const int colOrigin)// 初始化ELL,要求给出原列数
301 {
302     ELL *temp;
303     checkCudaError(cudaMallocManaged((void **)&temp, sizeof(ELL)));
304     temp->row = row;
305     temp->col = col;
306     temp->colOrigin = colOrigin;
307     checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col));
308     checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(int) * row * col));
309     return temp;
310 }
311 
312 int freeMatrix(ELL *in)// 释放ELL
313 {
314     cudaFree(in->data);
315     cudaFree(in->index);
316     cudaFree(in);
317     return 0;
318 }
319 
320 COO * initializeCOO(const int row, const int col, const int count)// 初始化COO,要求给出 count
321 {
322     COO * temp;
323     checkCudaError(cudaMallocManaged((void **)&temp, sizeof(COO)));
324     temp->row = row;
325     temp->col = col;
326     temp->count = count;
327     checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * count));
328     checkCudaError(cudaMallocManaged((void **)&temp->rowIndex, sizeof(int) * count));
329     checkCudaError(cudaMallocManaged((void **)&temp->colIndex, sizeof(int) * count));
330     return temp;
331 }
332 
333 int freeMatrix(COO *in)// 释放COO
334 {
335     checkNULL(in);
336     cudaFree(in->data);
337     cudaFree(in->rowIndex);
338     cudaFree(in->colIndex);
339     cudaFree(in);
340     return 0;
341 }
342 
343 DIA * initializeDIA(const int row, const int col, const int colOrigin)// 初始化DIA,要求给出原行数、原列数
344 {
345     DIA * temp;
346     checkCudaError(cudaMallocManaged((void **)&temp, sizeof(DIA)));
347     temp->row = row;
348     temp->col = col;
349     temp->colOrigin = colOrigin;
350     checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col));
351     checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(format) * (row + col - 1)));
352     return temp;
353 }
354 
355 int freeMatrix(DIA *in)// 释放DIA
356 {
357     checkNULL(in);
358     cudaFree(in->data);
359     cudaFree(in->index);
360     cudaFree(in);
361     return 0;
362 }
363 
364 // 矩阵格式的相互转化
365 CSR * MATToCSR(const MAT *in)                                       // MAT 转 CSR
366 {
367     checkNULL(in);
368     CSR * out = initializeCSR(in->row, in->col, in->count);
369     checkNULL(out);
370     
371     out->ptr[0] = 0;
372     for (int i = 0, j = 0, k = 1; i < in->row * in->col; i++)       // i 遍历 in->data
373     {
374         if (in->data[i] != 0)                                       // 找到非零元
375         {
376             if (j == in->count)                                     // 在 out->data 已经填满了的基础上又发现了非零元,错误
377                 return NULL;
378             out->data[j] = in->data[i];                             // 填充非零元素
379             out->index[j] = i % in->col;                            // 填充列号
380             j++;
381         }
382         if ((i + 1) % in->col == 0)                                 // 到了最后一列,写入行指针号
383             out->ptr[k++] = j;
384     }
385     return out;
386 }
387 
388 MAT * CSRToMAT(const CSR *in)                                       // CSR转MAT
389 {
390     checkNULL(in);
391     MAT *out = initializeMAT(in->row, in->col, in->ptr[in->row]);
392     checkNULL(out);
393 
394     memset(out->data, 0, sizeof(format) * in->row * in->col);
395     for (int i = 0; i < in->row; i++)                               // i 遍历行
396     {                                                
397         for (int j = in->ptr[i]; j < in->ptr[i + 1]; j++)           // j 遍历列 
398             out->data[i * in->col + in->index[j]] = in->data[j];
399     }
400     return out;
401 }
402 
403 ELL * MATToELL(const MAT *in)// MAT转ELL
404 {
405     checkNULL(in);
406 
407     int i, j, maxElement; 
408     for (i = j = maxElement = 0; i < in->row * in->col; i++)                    // i 遍历 in->data,j 记录该行非零元素数,maxElement 记录一行非零元素最大值
409     {     
410         if (in->data[i] != 0)                                                   // 找到非零元       
411             j++;                                                       
412         if ((i + 1) % in->col == 0)                                             // 行末,更新 maxElement                        
413         {
414             maxElement = MAX(j, maxElement);                    
415             j = 0;                                                              // 开始下一行之前清空 j
416         }
417     }
418     format* temp_data=(format *)malloc(sizeof(format) * in->row * maxElement);  // 临时数组,将列数压缩到 maxElement
419     checkNULL(temp_data);
420     int* temp_index = (int *)malloc(sizeof(int) * in->row * maxElement);
421     checkNULL(temp_index);
422     memset(temp_data, 0, sizeof(format) * in->row * maxElement);
423     memset(temp_index, 0, sizeof(int) * in->row * maxElement);
424     for (i = j = 0; i < in->row * in->col; i++)                                 // i 遍历 in->data,j 记录该行非零元素数,把 in 中每行的元素往左边推
425     {        
426         if (in->data[i] != 0)                                                   // 找到非零元
427         {
428             temp_data[i / in->col * maxElement + j] = in->data[i];              // 存放元素
429             temp_index[i / in->col * maxElement + j] = i % in->col;             // 记录所在的列号
430             j++;                                                    
431         }
432         if ((i + 1) % in->col == 0)                                             // 行末,将剩余位置的下标记作 -1,即无效元素
433         {            
434             for (j += i / in->col * in->col; j < maxElement * (i / in->col + 1); j++)   // 使得 j 指向本行最后一个非零元素之后的元素,再开始填充
435                 temp_index[j] = -1;                                 
436             j = 0;                                                              // 开始下一行之前清空 j
437         }
438     }    
439     ELL *out = initializeELL(maxElement, in->row, in->col);                     // 最终输出,如果不转置的话不要这部分
440     checkNULL(out);
441     for (i = 0; i < out->row * out->col; i++)                                   // 将 temp_data 和 temp_index 转置以提高缓存利用
442     {
443         out->data[i] = temp_data[i % out->col * out->row + i / out->col];
444         out->index[i] = temp_index[i % out->col * out->row + i / out->col];
445     }
446     free(temp_data);
447     free(temp_index);
448     return out;
449 }
450 
451 MAT * ELLToMAT(const ELL *in)                                                   // ELL转MAT
452 {
453     checkNULL(in);
454     MAT *out = initializeMAT(in->col, in->colOrigin);
455     checkNULL(out);
456 
457     for (int i = 0; i < in->row * in->col; i++)                                 // i 遍历 out->data 
458     {
459         if (in->index[i] < 0)                                                   // 注意跳过无效元素
460             continue;
461         out->data[i % in->col * in->colOrigin + in->index[i]] = in->data[i];
462     }
463     COUNT_MAT(out);
464     return out;
465 }
466 
467 COO * MATToCOO(const MAT *in)                               // MAT转COO
468 {
469     checkNULL(in);
470     COO *out = initializeCOO(in->row, in->col, in->count);
471 
472     for (int i=0, j = 0; i < in->row * in->col; i++)
473     {
474 #ifdef INT
475         if (in->data[i] != 0)
476 #else
477         if (fabs(in->data[i]) > EPSILON)
478 #endif
479         {
480             out->data[j] = in->data[i];
481             out->rowIndex[j] = i / in->col;
482             out->colIndex[j] = i % in->col;
483             j++;
484         }
485     }
486     return out;
487 }
488 
489 MAT * COOToMAT(const COO *in)                               // COO转MAT
490 {
491     checkNULL(in);
492     MAT *out = initializeMAT(in->row, in->col, in->count);
493     checkNULL(out);
494 
495     for (int i = 0; i < in->row * in->col; i++)
496         out->data[i] = 0;
497     for (int i = 0; i < in->count; i++)
498         out->data[in->rowIndex[i] * in->col + in->colIndex[i]] = in->data[i];
499     return out;
500 }
501 
502 DIA * MATToDIA(const MAT *in)                                       // MAT转DIA
503 {
504     checkNULL(in);
505 
506     int *index = (int *)malloc(sizeof(int)*(in->row + in->col - 1));
507     for (int diff = in->row - 1; diff > 0; diff--)                  // 左侧零对角线情况
508     {        
509         int flagNonZero = 0;
510         for (int i = 0; i < in->col && i + diff < in->row; i++)     // i 沿着对角线方向遍历 in->data,flagNonZero 记录该对角线是否全部为零元
511         {            
512 #ifdef INT
513             if (in->data[(i + diff) * in->col + i] != 0)
514 #else
515             if (fabs(in->data[(i + diff) * in->col + i]) > EPSILON)
516 #endif            
517                 flagNonZero = 1;
518         }
519         index[in->row - 1 - diff] = flagNonZero;                    // 标记该对角线上有非零元
520     }
521     for (int diff = in->col - 1; diff >= 0; diff--)                 // 右侧零对角线情况
522     {                                                                                                                 
523         int flagNonZero = 0;
524         for (int j = 0; j < in->row && j + diff < in->col; j++)
525         {
526 #ifdef INT
527             if (in->data[j * in->col + j + diff] != 0)
528 #else
529             if (fabs(in->data[j * in->col + j + diff]) > EPSILON)
530 #endif            
531                 flagNonZero = 1;
532         }
533         index[in->row - 1 + diff] = flagNonZero;                    // 标记该对角线上有非零元
534     }       
535     int *prefixSumIndex = (int *)malloc(sizeof(int)*(in->row + in->col - 1));
536     prefixSumIndex[0] = index[0];
537     for (int i = 1; i < in->row + in->col - 1; i++)                 // 闭前缀和,prefixSumIndex[i] 表示原矩阵第 0 ~ i 条对角线中共有多少条非零对角线(含)
538         prefixSumIndex[i] = prefixSumIndex[i-1] + index[i];         // index[in->row + in->col -2] 表示原矩阵非零对角线条数,等于 DIA 矩阵列数
539     DIA *out = initializeDIA(in->row, prefixSumIndex[in->row + in->col - 2], in->col);
540     checkNULL(out);
541 
542     memset(out->data, 0, sizeof(int)*out->row * out->col);
543     for (int i = 0; i < in->row + in->col - 1; i++)             
544         out->index[i] = index[i];                                   // index 搬进 out
545     for (int i = 0; i < in->row; i++)                               // i,j 遍历原矩阵,将元素搬进 out
546     {
547         for (int j = 0; j < in->col; j++)
548         {
549             int temp = j - i + in->row - 1;
550             if (index[temp] == 0)
551                 continue;            
552             out->data[i * out->col + (temp > 0 ? prefixSumIndex[temp - 1] : 0)] = in->data[i * in->col + j];    // 第 row - 1 行第 0 列元素 temp == 0,单独处理
553         }
554     }
555     free(index);
556     free(prefixSumIndex);
557     return out;
558 }
559 
560 MAT * DIAToMAT(const DIA *in)                                       // DIA转MAT
561 {
562     checkNULL(in);
563     MAT *out = initializeMAT(in->row, in->colOrigin);
564     checkNULL(out);
565 
566     int * inverseIndex = (int *)malloc(sizeof(int) * in->col);
567     for (int i = 0, j = 0; i < in->row + in->col - 1; i++)          // 求一个 index 的逆,即 DIA 中第 i 列对应原矩阵第 inverseIndex[i] 对角线
568     {                                                               // 原矩阵对角线编号 (row-1, 0) 为第 0 条,(0, 0) 为第 row - 1 条,(col-1, 0) 为第 row + col - 2 条
569         if (in->index[i] == 1)
570         {
571             inverseIndex[j] = i;
572             j++;
573         }
574     }
575     for (int i = 0; i < in->row; i++)                               // i 遍历 in->data 行,j 遍历 in->data 列
576     {
577         for (int j = 0; j < in->col; j++)
578         {
579             if (i < in->row - 1 - inverseIndex[j] || i > inverseIndex[in->col - 1] - inverseIndex[j])   // 跳过两边呈三角形的无效元素
580                 continue;
581             out->data[i * in->col + inverseIndex[j] - in->row + 1] = in->data[i * in->col + j];         // 利用 inverseIndex 来找钙元素在原距震中的位置
582         }
583     }
584     free(inverseIndex);
585     return out;
586 }
587 
588 // 各种形式的矩阵和一个向量的乘法
589 int dotCPU(const MAT *a, const MAT *x, MAT *y)      // CPU MAT 乘法
590 {
591     checkNULL(a); checkNULL(x); checkNULL(y);
592     if (a->col != x->row)
593     {
594         printf("dotMATCPU dimension mismatch!
");
595         return 1;
596     }
597     
598     y->row = a->row;
599     y->col = x->col;
600     for (int i = 0; i < a->row; i++)
601     {
602         format sum = 0;
603         for (int j = 0; j < a->col; j++)        
604             sum += a->data[i * a->col + j] * x->data[j];                
605         y->data[i] = sum;        
606     }
607     COUNT_MAT(y);
608     return 0;
609 }
610 
611 int dotCPU(const CSR *a, const MAT *x, MAT *y)      // CPU CSR 乘法
612 {
613     checkNULL(a); checkNULL(x); checkNULL(y);
614     if (a->col != x->row)
615     {
616         printf("dotCSRCPU dimension mismatch!
");
617         return 1;
618     }
619     
620     y->row = a->row;
621     y->col = x->col;
622     for (int i = 0; i < a->row; i++)                            // i 遍历 ptr,j 遍历行内数据,A 中为 0 的元素不参加乘法
623     {
624         format sum = 0;
625         for (int j = a->ptr[i]; j < a->ptr[i + 1]; j++)
626             sum += a->data[j] * x->data[a->index[j]];
627         y->data[i] = sum;
628     }
629     COUNT_MAT(y);
630     return 0;
631 }
632 
633 int dotCPU(const ELL *a, const MAT *x, MAT *y)      // CPU ELL乘法
634 {
635     checkNULL(a); checkNULL(x); checkNULL(y);
636     if (a->colOrigin != x->row)
637     {
638         printf("dotELLCPU dimension mismatch!
");
639         return 1;
640     }
641 
642     y->row = a->col;
643     y->col = x->col;
644     for (int i = 0; i<a->col; i++)
645     {
646         format sum = 0;
647         for (int j = 0; j < a->row; j++)
648         {
649             int temp = a->index[j * a->col + i];
650             if (temp < 0)                                   // 跳过无效元素
651                 continue;
652             sum += a->data[j * a->col + i] * x->data[temp];
653         }
654         y->data[i] = sum;
655     }
656     COUNT_MAT(y);
657     return 0;
658 }
659 
660 int dotCPU(const COO *a, const MAT *x, MAT *y)// CPU COO乘法
661 {
662     checkNULL(a); checkNULL(x); checkNULL(y);
663     if (a->col != x->row)
664     {
665         printf("dotCOOCPU null!
");
666         return 1;
667     }
668 
669     y->row = a->row;
670     y->col = x->col;
671     for (int i = 0; i<a->count; i++)
672         y->data[a->rowIndex[i]] += a->data[i] * x->data[a->colIndex[i]];
673     COUNT_MAT(y);
674     return 0;
675 }
676 
677 int dotCPU(const DIA *a, const MAT *x, MAT *y)// CPU DIA乘法
678 {
679     checkNULL(a); checkNULL(x); checkNULL(y);
680     if (a->colOrigin != x->row)
681     {
682         printf("dotDIACPU null!
");
683         return 1;
684     }    
685     y->row = a->row;
686     y->col = x->col;
687     int * inverseIndex = (int *)malloc(sizeof(int) * a->col);
688     for (int i = 0, j = 0; i < a->row + a->col - 1; i++)
689     {
690         if (a->index[i] == 1)
691         {
692             inverseIndex[j] = i;
693             j++;
694         }
695     }
696     for (int i = 0; i < a->row; i++)
697     {
698         format sum = 0;
699         for (int j = 0; j < a->col; j++)
700         {
701             if (i < a->row - 1 - inverseIndex[j] || i > inverseIndex[a->col - 1] - inverseIndex[j])
702                 continue;
703             sum += a->data[i * a->col + j] * x->data[i + inverseIndex[j] - a->row + 1];
704         }
705         y->data[i] = sum;
706     }
707     COUNT_MAT(y);
708     free(inverseIndex);
709     return 0;
710 }
711 
712 __global__ void dotGPU(const MAT *a, const MAT *x, MAT *y)// GPU MAT乘法
713 {
714     int id = blockIdx.x * blockDim.x + threadIdx.x;
715     if (id < a->row)
716     {
717         format sum = 0;
718         for (int i = 0; i < a->col; i++)
719             sum += a->data[id * a->col + i] * x->data[i];
720         y->data[id] = sum;
721     }
722     if (id == 0)
723     {
724         y->row = a->row;
725         y->col = x->col;
726         COUNT_MAT(y);
727     }
728     return;
729 }
730 
731 __global__ void dotGPU(const CSR *a, const MAT *x, MAT *y)// GPU CSR乘法
732 {
733     int id = blockIdx.x * blockDim.x + threadIdx.x;
734     if (id < a->row)
735     {
736         format sum = 0;
737         for (int j = a->ptr[id]; j < a->ptr[id + 1]; j++)
738             sum += a->data[j] * x->data[a->index[j]];
739         y->data[id] = sum;
740     }
741     if (id == 0)
742     {
743         y->row = a->row;
744         y->col = x->col;
745         COUNT_MAT(y);
746     }
747     return;
748 }
749 
750 __global__ void dotGPU(const ELL *a, const MAT *x, MAT *y)// GPU ELL乘法
751 {
752     int id = blockIdx.x * blockDim.x + threadIdx.x;
753     if (id < a->col)
754     {
755         format sum = 0;
756         for (int j = 0; j < a->row; j++)            
757             sum += a->data[id + j * a->col] * (a->index[id + j * a->col] < 0 ? 0 : x->data[a->index[id + j * a->col]]);
758         y->data[id] = sum;
759     }
760     if (id == 0)
761     {
762         y->row = a->col;
763         y->col = x->col;
764         COUNT_MAT(y);
765     }
766     return;
767 }
768 
769 __global__ void dotGPU(const COO *a, const MAT *x, MAT *y)// GPU COO乘法
770 {
771     int id = blockIdx.x * blockDim.x + threadIdx.x;
772     if (id < a->count)
773         atomicAdd(&(y->data[a->rowIndex[id]]), a->data[id] * x->data[a->colIndex[id]]);
774     if (id == 0)
775     {
776         y->row = a->row;
777         y->col = x->col;
778         COUNT_MAT(y);
779     }
780     return;
781 }
782 
783 int test()// 测试矩阵打印和CPU计算的函数
784 {
785     const int row = 4, col = 5;
786     
787     MAT* a = initializeMAT(row, col);
788     a->data[0] = 3, a->data[2] = 1, a->data[4] = 5, a->data[11] = 2;
789     a->data[12] = 4, a->data[13] = 1, a->data[15] = 1, a->data[18] = 1;
790     COUNT_MAT(a);
791 
792     MAT* x = initializeMAT(col, 1);
793     for (int i = 0; i < x->row; i++)
794         x->data[i] = i + 1;
795     COUNT_MAT(x);
796 
797     MAT* y = initializeMAT(row, 1);
798     COUNT_MAT(y);
799 
800     print("MAT a =", a);    
801     print("MAT x =", x);
802     dotCPU(a, x, y);    
803     print("MAT y = a * x = ", y);
804    
805     CSR* b = MATToCSR(a);        
806     print("CSR b = a = ", b);
807     memset(y->data, 0, sizeof(format) * y->row * y->col);
808     dotCPU(b, x, y);    
809     print("MAT y = b * x = ", y);
810     MAT* c = CSRToMAT(b);
811     print("MAT c = b =", c);
812     freeMatrix(b);
813     
814     ELL* d = MATToELL(a);
815     print("ELL d = a =", d);
816     memset(y->data, 0, sizeof(format) * y->row * y->col);
817     dotCPU(d, x, y);
818     print("MAT y = d * x =", y);    
819     c = ELLToMAT(d);
820     print("MAT c = d =", c);
821     freeMatrix(d);
822 
823     COO* e = MATToCOO(a);
824     print("ELL e = a = ", e);
825     memset(y->data, 0, sizeof(format) * y->row * y->col);
826     dotCPU(e, x, y);
827     print("MAT y = e * x = ", y);    
828     c = COOToMAT(e);
829     print("MAT c = e =", c);
830     freeMatrix(e);
831    
832     DIA* f = MATToDIA(a);
833     print("DIA f = a = ", f);
834     memset(y->data, 0, sizeof(format) * y->row * y->col);
835     dotCPU(f, x, y);
836     print("MAT y = f * x = ", y);
837     c = DIAToMAT(f);
838     print("MAT c = f =", c);
839     freeMatrix(f);
840 
841     freeMatrix(a);
842     freeMatrix(x);
843     freeMatrix(y);   
844     freeMatrix(c);    
845     return 0;
846 }
847 
848 int main()
849 {
850     test();
851    
852     clock_t timeCPU;
853     cudaEvent_t startGPU, stopGPU;
854     float timeGPU;
855     cudaEventCreate(&startGPU);
856     cudaEventCreate(&stopGPU);
857 
858     printf("
	Start. Matrix dimension:	%d × %d", ROW, COL);
859 
860     // 初始化
861     timeCPU = clock();
862     aMAT = initializeMAT(ROW, COL);
863     xMAT = initializeMAT(COL, 1);
864     yMATRef = initializeMAT(ROW, 1);
865     yMATCal = initializeMAT(ROW, 1);
866 
867     srand(SEED);
868 #ifdef INT
869     for (int i = 0; i < COL * ROW; i++)
870         aMAT->data[i] = ((float)rand() < RAND_MAX * RATIO) ? (rand() - RAND_MAX / 2) % 32 : 0;
871     COUNT_MAT(aMAT);
872     int count = aMAT->count;
873     for (int i = 0; i < COL; i++)
874         xMAT->data[i] = i % 32;
875     COUNT_MAT(xMAT);
876 #else
877     for (int i = 0; i < COL * ROW; i++)
878         aMAT->data[i] = ((float)rand() < RAND_MAX * RATIO) ? ((float)rand() / RAND_MAX) : 0;
879     aMAT->count = countMAT(aMAT);
880     for (int i = 0; i < COL; i++)
881         xMAT->data[i] = ((float)rand() / RAND_MAX);
882     xMAT->count = countMAT(xMAT);
883 #endif
884     printf("
	Initialized. Time:		%d ms
", clock() - timeCPU);
885 
886     //CPU计算部分
887     //MAT 格式
888     timeCPU = clock();
889     dotCPU(aMAT, xMAT, yMATRef);
890     timeCPU = clock() - timeCPU;
891     printf("
	dotMATCPU time:	%8.3f ms
", (float)timeCPU);
892 
893     // CSR格式                    
894     aCSR = MATToCSR(aMAT);
895     timeCPU = clock();
896     dotCPU(aCSR, xMAT, yMATCal);
897     timeCPU = clock() - timeCPU;
898     printf("
	dotCSRCPU time:	%8.3f ms
", (float)timeCPU);
899     checkEqual(yMATRef->data, yMATCal->data, ROW);    
900 
901     // ELL格式
902     aELL = MATToELL(aMAT);
903     timeCPU = clock();
904     dotCPU(aELL, xMAT, yMATCal);
905     timeCPU = clock() - timeCPU;
906     printf("
	dotELLCPU time:	%8.3f ms
", (float)timeCPU);
907     checkEqual(yMATRef->data, yMATCal->data, ROW);    
908 
909     // COO格式
910     aCOO = MATToCOO(aMAT);
911     timeCPU = clock();
912     dotCPU(aCOO, xMAT, yMATCal);
913     timeCPU = clock() - timeCPU;
914     printf("
	dotCOOCPU time:	%8.3f ms
", (float)timeCPU);
915     checkEqual(yMATRef->data, yMATCal->data, ROW);
916 
917     // GPU计算部分
918     // MAT格式
919     cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col);
920     yMATCal->count = 0;
921     cudaEventRecord(startGPU, 0);
922     dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aMAT, xMAT, yMATCal);
923     cudaDeviceSynchronize();                                                  
924     cudaEventRecord(stopGPU, 0);                                              
925     cudaEventSynchronize(stopGPU);
926     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
927     printf("
	dotMATGPU time:	%8.3f ms
", timeGPU);
928     checkEqual(yMATRef->data, yMATCal->data, ROW);
929     freeMatrix(aMAT);
930 
931     // CSR格式
932     cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col);
933     yMATCal->count = 0;
934     cudaEventRecord(startGPU, 0);
935     dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aCSR, xMAT, yMATCal);
936     cudaDeviceSynchronize();
937     cudaEventRecord(stopGPU, 0);
938     cudaEventSynchronize(stopGPU);
939     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
940     printf("
	dotCSRGPU time:	%8.3f ms
", timeGPU);
941     checkEqual(yMATRef->data, yMATCal->data, ROW);
942     freeMatrix(aCSR);
943 
944     // ELL格式
945     cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col);
946     yMATCal->count = 0;
947     cudaEventRecord(startGPU, 0);
948     dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aELL, xMAT, yMATCal);
949     cudaDeviceSynchronize();
950     cudaEventRecord(stopGPU, 0);
951     cudaEventSynchronize(stopGPU);
952     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
953     printf("
	dotELLGPU time:	%8.3f ms
", timeGPU);
954     checkEqual(yMATRef->data, yMATCal->data, ROW);
955     freeMatrix(aELL);
956 
957     // COO格式
958     cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col);
959     yMATCal->count = 0;
960     cudaEventRecord(startGPU, 0);
961     dotGPU << <CEIL(count, THREAD_SIZE), THREAD_SIZE >> > (aCOO, xMAT, yMATCal);
962     cudaDeviceSynchronize();
963     cudaEventRecord(stopGPU, 0);
964     cudaEventSynchronize(stopGPU);
965     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
966     printf("
	dotCOOGPU time:	%8.3f ms
", timeGPU);
967     checkEqual(yMATRef->data, yMATCal->data, ROW);
968     freeMatrix(aCOO);
969 
970     // 清理内存    
971     freeMatrix(xMAT);
972     freeMatrix(yMATRef);
973     freeMatrix(yMATCal); 
974 
975     getchar();
976     return 0;
977 }

● 输出结果

MAT a =
        MAT,
        row = 4, col = 5, count = 8
        3 0 1 0 5
        0 0 0 0 0
        0 2 4 1 0
        1 0 0 1 0

MAT x =
        MAT,
        row = 5, col = 1, count = 5
        1
        2
        3
        4
        5

MAT y = a * x =
        MAT,
        row = 4, col = 1, count = 3
        31
        0
        20
        5

CSR b = a =
        CSR,
        row = 4, col = 5
        3 1 5 2 4 1 1 1
        0 2 4 1 2 3 0 3
        0 3 3 6 8

MAT y = b * x =
        MAT,
        row = 4, col = 1, count = 3
        31
        0
        20
        5

MAT c = b =
        MAT,
        row = 4, col = 5, count = 8
        3 0 1 0 5
        0 0 0 0 0
        0 2 4 1 0
        1 0 0 1 0

ELL d = a =
        ELL,
        row = 3, col = 4, colOrigin = 5
        3 0 2 1
        1 0 4 1
        5 0 1 0

        0 0 1 0
        2 0 2 3
        4 -1 3 0


MAT y = d * x =
        MAT,
        row = 4, col = 1, count = 3
        31
        0
        20
        5

MAT c = d =
        MAT,
        row = 4, col = 5, count = 7
        3 0 1 0 5
        0 0 0 0 0
        0 2 4 1 0
        0 0 0 1 0

ELL e = a =
        COO,
        row = 4, col = 5, count = 8
        (0,0,3)
        (0,2,1)
        (0,4,5)
        (2,1,2)
        (2,2,4)
        (2,3,1)
        (3,0,1)
        (3,3,1)

MAT y = e * x
        MAT,
        row = 4, col = 1, count = 3
        31
        0
        20
        5

MAT c = e =
        MAT,
        row = 4, col = 5, count = 8
        3 0 1 0 5
        0 0 0 0 0
        0 2 4 1 0
        1 0 0 1 0


        Start. Matrix dimension:        8192 × 8192
        Initialized. Time:                 0.000 ms

        dotMATCPU time:  304.000 ms

        dotCSRCPU time:    3.000 ms

        Check output, passed.

        dotCELLPU time:  103.000 ms

        Check output, passed.

        dotCOOCPU time:   11.000 ms

        Check output, passed.

        dotMATGPU time:    5.133 ms

        Check output, passed.

        dotCSRGPU time:    2.263 ms

        Check output, passed.

        dotELLGPU time:    1.253 ms

        Check output, passed.

        dotCOOGPU time:    4.754 ms

        Check output, passed.
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/10428527.html