【CUDA并行编程之四】矩阵相乘

前面介绍了基本的Cuda编程的相关知识，那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能，我们拿矩阵相乘来作为例子。

1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码：

mat_mul.cc:

[cpp]view
 plaincopy

//D = A * B + C  (matrix product of A and B, plus matrix C)  

#include<iostream>  

#include<vector>  

#include<map>  

#include<fstream>  

#include"wtime.h"   

using namespace std;  

// Side length of the square matrices; the loops in this file touch indices 0..N-1.
const int N = 320;  

// A matrix can be represented in two ways: as a 2-D array or as a 1-D array.

// 2-D representation, used by init(), mul() and print().
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];  

// Flattened 1-D representation (element (i, j) stored at i*N + j), used by init1() and mul1().
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];  

void init()  

{  

    for(int i=0;i<N;i++)  

        for(int j=0;j<N;j++)  

        {  

            a[i][j] = 1;  

            b[i][j] = 2;  

            c[i][j] = 3;  

        }  

}  

void init1()  

{  

    for(int i=0;i<N;i++)  

        for(int j=0;j<N;j++)  

        {  

            aa[i*N+j] = 1;  

            bb[i*N+j] = 2;  

            cc[i*N+j] = 3;  

        }  

}  

void mul()  

{  

    for(int i=0;i<N;i++)   

      for(int j=0;j<N;j++)  

      {  

        for(int k=0;k<N;k++)  

        {  

            d[i][j] += a[i][k] * b[k][j];  

        }  

        d[i][j] += c[i][j];  

      }  

}  

void mul1()  

{  

    for(int i=0;i<N;i++)   

      for(int j=0;j<N;j++)  

      {  

        for(int k=0;k<N;k++)  

        {  

            dd[i*N+j] += aa[i*N+k] * bb[k*N+j];  

        }  

        dd[N*i+j] += cc[N*i+j];  

      }  

}  

void print()  

{  

    ofstream fout;  

    fout.open("result.txt");  

    if(!fout)  

    {  

        perror("can not open the file");  

    }  

    for(int i=0;i<N;i++)  

    {  

      for(int j=0;j<N;j++)  

      {  

          fout<<d[i][j]<<" ";  

      }  

      fout<<endl;  

    }  

    fout.close();  

}  

int main()  

{  

    init1();      

    double t = wtime();  

    mul1();  

    t = wtime()-t;  

    printf("computation timing = %10.10f sec
",t);  

    //print();  

    return 0;  

}

wtime.h:

[cpp]view
 plaincopy

// Include guard for the wtime() timing helper.
#ifndef _WTIME_  

#define _WTIME_  

// Returns the current time of day in seconds as a double; defined in wtime.cc.
double wtime();  

#endif

wtime.cc:

[cpp]view
 plaincopy

#include <stdio.h>  

#include <sys/time.h>  

#include <iostream>  

#include <cstdlib>  

// Returns the current time of day in seconds (seconds plus microseconds / 1e6)
// as reported by gettimeofday(). Subtract two readings to obtain an elapsed time.
// If gettimeofday() fails the error is reported with perror() and 0.0 is returned.
double wtime(void)
{
    // Zero-initialised so a failed call yields a defined value instead of
    // reading indeterminate fields.
    struct timeval etstart = {0, 0};

    // The timezone argument is obsolete and unused here, so NULL is passed.
    if (gettimeofday(&etstart, NULL) == -1)
    {
        // The line break must be the escape sequence \n; a raw newline inside
        // a string literal does not compile.
        perror("Error:calling gettimeofday() not successfully.\n");
    }

    return (double)etstart.tv_sec + (double)etstart.tv_usec / 1000000.0;
}

#if 0  

int main()  

{  

    double time;  

    time = wtime();  

    printf("time of day = %10.4f
",time);  

    return 0;  

}  

#endif

makefile:

[cpp]view
 plaincopy

# Compiles the CPU version with g++ (default output a.out) and runs it.
# NOTE: in a real makefile each recipe line must begin with a TAB character.
target:  

    g++ mat_mul.cc wtime.cc  

    ./a.out  

结果：

2.GPU上执行矩阵相乘以及性能。

代码：

cuda_mat_mul_v1.cu:

[cpp]view
 plaincopy

//matrix multiplication with global memory   

#include<iostream>  

#include<fstream>  

#include "wtime.h"  

using namespace std;  

// Threads per block along each dimension (blocks are BLOCK_SIZE x BLOCK_SIZE).
const int BLOCK_SIZE = 16;  

// Blocks per grid along each dimension; main() uses BLOCK_SIZE * GRID_SIZE as the matrix side.
const int GRID_SIZE = 20;  

//D = A * B + C;  

// Computes D = A * B + C for N x N int matrices stored row-major
// (element (r, c) at index r*N + c) in the buffers da, db, dc and dd.
//
// Launch layout: a 2-D grid of 2-D blocks. Each thread produces one output
// element at
//   row = blockIdx.y * blockDim.y + threadIdx.y
//   col = blockIdx.x * blockDim.x + threadIdx.x
// The grid must provide at least N threads in each dimension; threads that
// fall outside the matrix return without touching memory.
// No shared memory is used.
__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail so a grid larger than the matrix cannot read or
    // write out of bounds.
    if (row >= N || col >= N)
    {
        return;
    }

    int sum = 0;
    for (int k = 0; k < N; k++)
    {
        // Row `row` of A times column `col` of B: B(k, col) lives at k*N + col.
        sum += da[row*N + k] * db[k*N + col];
    }

    dd[row*N + col] = sum + dc[row*N + col];
}

// Stops the program with a diagnostic if a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        cerr << what << " failed: " << cudaGetErrorString(status) << endl;
        exit(-1);
    }
}

// Host driver for the global-memory matrix multiply:
//   1. builds N x N host matrices A = 1, B = 2, C = 3 (N = BLOCK_SIZE * GRID_SIZE),
//   2. copies them to the device,
//   3. launches mat_mul on a GRID_SIZE x GRID_SIZE grid of BLOCK_SIZE x BLOCK_SIZE blocks
//      and times it, waiting for the kernel to finish before stopping the clock,
//   4. copies D back and writes it to "result_v1.txt", one row per line.
int main()
{
    const int N = BLOCK_SIZE * GRID_SIZE;
    const size_t bytes = (size_t)N * N * sizeof(int);

    int *ha = new int[N*N];
    int *hb = new int[N*N];
    int *hc = new int[N*N];
    int *hd = new int[N*N];   // filled only by the device-to-host copy below
    int *da = NULL, *db = NULL, *dc = NULL, *dd = NULL;
    cudaError_t err;

    //initialize
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            ha[i*N + j] = 1;
            hb[i*N + j] = 2;
            hc[i*N + j] = 3;
        }
    }

    //malloc
    check_cuda(cudaMalloc(&da, bytes), "cudaMalloc(da)");
    check_cuda(cudaMalloc(&db, bytes), "cudaMalloc(db)");
    check_cuda(cudaMalloc(&dc, bytes), "cudaMalloc(dc)");
    err = cudaMalloc(&dd, bytes);
    printf("Cuda Malloc C : %s\n", cudaGetErrorString(err));
    check_cuda(err, "cudaMalloc(dd)");

    //host to device
    // Only the inputs are uploaded: the kernel writes every element of dd, so
    // copying the uninitialised hd buffer to the device would only read
    // indeterminate host memory.
    check_cuda(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)");
    check_cuda(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)");
    check_cuda(cudaMemcpy(dc, hc, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dc)");

    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(GRID_SIZE, GRID_SIZE);

    //kernel
    double start = wtime();
    mat_mul<<<grid,threadBlock>>>(da, db, dc, dd, N);
    // A launch reports configuration errors only through cudaGetLastError().
    check_cuda(cudaGetLastError(), "mat_mul launch");
    // Kernel launches are asynchronous: wait for completion before reading the
    // clock, otherwise only the launch overhead is measured.
    check_cuda(cudaDeviceSynchronize(), "mat_mul execution");
    double elapsed = wtime() - start;
    printf("Computation time is %10.10f\n", elapsed);

    //device to host
    check_cuda(cudaMemcpy(hd, dd, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hd)");

    //print result to file
    ofstream fout;
    fout.open("result_v1.txt");
    if (!fout)
    {
        cerr<<"open the file error"<<endl;
        exit(-1);
    }
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            fout << hd[i*N + j] << " ";
        }
        fout << endl;
    }
    fout.close();

    delete []ha; delete []hb; delete []hc; delete []hd;
    cudaFree(da); cudaFree(db); cudaFree(dc); cudaFree(dd);

    return 0;
}

cuda_wtime.cu:

[cpp]view
 plaincopy

#include <stdio.h>  

#include <sys/time.h>  

#include <iostream>  

#include <cstdlib>  

// Returns the current time of day in seconds (seconds plus microseconds / 1e6)
// as reported by gettimeofday(). Subtract two readings to obtain an elapsed time.
// If gettimeofday() fails the error is reported with perror() and 0.0 is returned.
double wtime(void)
{
    // Zero-initialised so a failed call yields a defined value instead of
    // reading indeterminate fields.
    struct timeval etstart = {0, 0};

    // The timezone argument is obsolete and unused here, so NULL is passed.
    if (gettimeofday(&etstart, NULL) == -1)
    {
        // The line break must be the escape sequence \n; a raw newline inside
        // a string literal does not compile.
        perror("Error:calling gettimeofday() not successfully.\n");
    }

    return (double)etstart.tv_sec + (double)etstart.tv_usec / 1000000.0;
}

#if 0  

int main()  

{  

    double time;  

    time = wtime();  

    printf("time of day = %10.4f
",time);  

    return 0;  

}  

#endif

wtime.h:

[cpp]view
 plaincopy

// Include guard for the wtime() timing helper.
#ifndef _WTIME_  

#define _WTIME_  

// Returns the current time of day in seconds as a double; defined in cuda_wtime.cu.
double wtime();  

#endif

cuda_wtime.cu:

[cpp]view
 plaincopy

#include <stdio.h>  

#include <sys/time.h>  

#include <iostream>  

#include <cstdlib>  

// Returns the current time of day in seconds (seconds plus microseconds / 1e6)
// as reported by gettimeofday(). Subtract two readings to obtain an elapsed time.
// If gettimeofday() fails the error is reported with perror() and 0.0 is returned.
double wtime(void)
{
    // Zero-initialised so a failed call yields a defined value instead of
    // reading indeterminate fields.
    struct timeval etstart = {0, 0};

    // The timezone argument is obsolete and unused here, so NULL is passed.
    if (gettimeofday(&etstart, NULL) == -1)
    {
        // The line break must be the escape sequence \n; a raw newline inside
        // a string literal does not compile.
        perror("Error:calling gettimeofday() not successfully.\n");
    }

    return (double)etstart.tv_sec + (double)etstart.tv_usec / 1000000.0;
}

#if 0  

int main()  

{  

    double time;  

    time = wtime();  

    printf("time of day = %10.4f
",time);  

    return 0;  

}  

#endif

makefile:

[cpp]view
 plaincopy

# Compiles the CUDA version with nvcc (default output a.out) and runs it.
# NOTE: in a real makefile each recipe line must begin with a TAB character.
cu:  

    nvcc cuda_mat_mul_v1.cu cuda_wtime.cu  

    ./a.out  

结果：

3.计算性能对比：

矩阵大小	1600*1600	1200*1200	800*800	320*320
串行时间/s	30.9	11.49865	2.597987	0.162311
并行时间	grid=100/block=16	grid=75/block=16	grid=50/block=16	grid=20/block=16
kernel执行时间/s	0.0000319	0.0000309944	0.0000309944	0.0000231266
并行计算总时间（分配内存加+数据拷贝+计算）/s	0.70796	0.439213	0.310214	0.237676

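可见，在矩阵规模较大的时候，GPU强大的计算能力体现得非常明显。需要注意的是，kernel的启动是异步的：上表中约0.00003秒的“kernel执行时间”几乎不随矩阵规模变化，说明它主要反映的是kernel启动的开销（要测得真实的执行时间，需要在停止计时之前调用cudaDeviceSynchronize()等待kernel执行完毕）。因此与串行时间对比时，应以“并行计算总时间”一行为准。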

注明出处：http://blog.csdn.net/lavorange/article/details/41896591