CUDA 矩阵相乘完整代码


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cublas_v2.h"

#define BLOCK_SIZE 16

cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

__global__ void multiKernel(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
{
  //saved in register
  int xBlock = blockIdx.x;
  int yBlock = blockIdx.y;

  int xThread = threadIdx.x;
  int yThread = threadIdx.y;

  unsigned int aWidth = aW;
  unsigned int bWidth = bW;

  float Cvalue= 0;

  for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
  {
    __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];

    aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
    bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];

    __syncthreads();

    for(int e=0; e<BLOCK_SIZE; ++e)
    {
      Cvalue += aSub[yThread][e]*bSub[e][xThread];
    }
    __syncthreads();
  }

  int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
  c[cIndex] = Cvalue;
}

__global__ void multiKernel_NoLoop(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
{
  int xBlock = blockIdx.x;
  int yBlock = blockIdx.y;

  int xThread = threadIdx.x;
  int yThread = threadIdx.y;

  unsigned int aWidth = aW;
  unsigned int bWidth = bW;

  float Cvalue= 0;

  for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
  {
    __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];

    aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
    bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];

    __syncthreads();

    Cvalue += aSub[yThread][0]*bSub[0][xThread] + aSub[yThread][1]*bSub[1][xThread] +
          aSub[yThread][2]*bSub[2][xThread] + aSub[yThread][3]*bSub[3][xThread] +
          aSub[yThread][4]*bSub[4][xThread] + aSub[yThread][5]*bSub[5][xThread] +
          aSub[yThread][6]*bSub[6][xThread] + aSub[yThread][7]*bSub[7][xThread] +
          aSub[yThread][8]*bSub[8][xThread] + aSub[yThread][9]*bSub[9][xThread] +
          aSub[yThread][10]*bSub[10][xThread] + aSub[yThread][11]*bSub[11][xThread] +
          aSub[yThread][12]*bSub[12][xThread] + aSub[yThread][13]*bSub[13][xThread] +
          aSub[yThread][14]*bSub[14][xThread] + aSub[yThread][15]*bSub[15][xThread] ;

    __syncthreads();
  }

  int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
  c[cIndex] = Cvalue;
}

cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

int main()
{
  const unsigned int aH = 320;
  const unsigned int aW = 320;
  const unsigned int bW = 320;
  const unsigned int bH = aW;
  const unsigned int cH = aH;
  const unsigned int cW = bW;

  float *cpu_a, *cpu_b,*cpu_c;

  cpu_a = (float*)malloc(aH*aW*sizeof(float));
  cpu_b = (float*)malloc(bH*bW*sizeof(float));
  cpu_c = (float*)malloc(cH*cW*sizeof(float));

  for(int y=0; y<aH; ++y)
  {
    for(int x =0; x<aW; ++x)
    {
      int index = y*aW + x;
      cpu_a[index] = (float)(x<y?x:y);
    }
  }

  for(int y=0; y<bH; ++y)
  {
    for(int x =0; x<bW; ++x)
    {
      int index = y*bW + x;
      cpu_b[index] = (float)(x<y?x:y);
    }
  }

  cudaError_t cudaStatus = multiCuda(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "multiCuda failed!");
  return 1;
  }
  /*
  for(int y=0; y<cH; ++y)
  {
    for(int x =0; x<cW; ++x)
    {
      if(x==1&&y==1)
      {
        int index = y*cW + x;
        printf("c(1,1)=%.1f ",cpu_c[index]);
      }

    }
  }
*/
  cudaStatus = multiWithcuBlase(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "multiCuda failed!");
  return 1;
  }
  /*
  for(int y=0; y<cH; ++y)
  {
    for(int x =0; x<cW; ++x)
    {
      if(x==1&&y==1)
      {
        int index = y*cW + x;
        printf("c(1,1)=%.1f ",cpu_c[index]);
      }

    }
  }
*/

  // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  cudaStatus = cudaDeviceReset();
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceReset failed!");
  return 1;
  }

  float start,end;
  start = clock();
  multiCPU(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
  end = clock();
  printf("CPU runtime is %f msec ",end - start);

  free(cpu_a);
  free(cpu_b);
  free(cpu_c);

  getchar();
  return 0;
}

cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
  float *gpu_a = 0;
  float *gpu_b = 0;
  float *gpu_c = 0;
  cudaError_t cudaStatus;

  cudaStatus = cudaSetDevice(0);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  goto Error;
  }

  size_t size_a = aH*aW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_b = bH*bW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_c = aH*bW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);

  cudaEvent_t start;
  cudaStatus = cudaEventCreate(&start);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create start event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaEvent_t stop;
  cudaStatus = cudaEventCreate(&stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  cudaStatus = cudaEventRecord(start, NULL);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record start event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  multiKernel<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);

  cudaStatus = cudaEventRecord(stop, NULL);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaStatus = cudaEventSynchronize(stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to synchronize on the stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  float msecTotal = 0.0f;
  cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to get time elapsed between events (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  printf("HS__ GPU runtime is %f msec ",msecTotal);

/*******************************************************/
  cudaEvent_t start1;
  cudaStatus = cudaEventCreate(&start1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create start event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaEvent_t stop1;
  cudaStatus = cudaEventCreate(&stop1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  cudaStatus = cudaEventRecord(start1, NULL);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record start event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  multiKernel_NoLoop<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);

  cudaStatus = cudaEventRecord(stop1, NULL);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaStatus = cudaEventSynchronize(stop1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to synchronize on the stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  float msecTotal1 = 0.0f;
  cudaStatus = cudaEventElapsedTime(&msecTotal1, start1, stop1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to get time elapsed between events (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  printf("HS__NoLoop GPU runtime is %f msec ",msecTotal1);

/***********************************************************/

  cudaStatus = cudaGetLastError();
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "addKernel launch failed: %s ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  // cudaDeviceSynchronize waits for the kernel to finish, and returns
  // any errors encountered during the launch.
  cudaStatus = cudaDeviceSynchronize();
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel! ", cudaStatus);
  goto Error;
  }

  cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

Error:
  cudaFree(gpu_a);
  cudaFree(gpu_b);
  cudaFree(gpu_c);

  return cudaStatus;
}

void inline checkError(cublasStatus_t status, const char *msg)
{
  if (status != CUBLAS_STATUS_SUCCESS)
  {
  printf("%s", msg);
  exit(EXIT_FAILURE);
  }
}

cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
  float *gpu_a = 0;
  float *gpu_b = 0;
  float *gpu_c = 0;
  cudaError_t cudaStatus;

  cudaStatus = cudaSetDevice(0);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  goto Error;
  }

  size_t size_a = aH*aW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_b = bH*bW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_c = aH*bW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);

  //printf("Computing result using CUBLAS... ");

  cublasHandle_t handle;
  cublasStatus_t ret;
  ret = cublasCreate(&handle);
  if (ret != CUBLAS_STATUS_SUCCESS)
  {
  printf("cublasCreate returned error code %d, line(%d) ", ret, __LINE__);
  goto Error;
  }

  const float alpha = 1.0f;
  const float beta = 0.0f;

  cudaEvent_t start;
  cudaStatus = cudaEventCreate(&start);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create start event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaEvent_t stop;
  cudaStatus = cudaEventCreate(&stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  cudaStatus = cudaEventRecord(start, NULL);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record start event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, aH, bW, aW, &alpha, gpu_a, aH, gpu_b, bH, &beta, gpu_c, aH);

  cudaStatus = cudaEventRecord(stop, NULL);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaStatus = cudaEventSynchronize(stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to synchronize on the stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  float msecTotal = 0.0f;
  cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to get time elapsed between events (error code %s)! ", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  printf("cuBlas__ GPU runtime is %f msec ",msecTotal);

  cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }
  checkError(cublasDestroy(handle), "cublasDestroy() error! ");

Error:
  cudaFree(gpu_a);
  cudaFree(gpu_b);
  cudaFree(gpu_c);

  return cudaStatus;

}


void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
  for(int y=0; y<aH; ++y)
  {
    for(int x =0; x<bW; ++x)
    {
      int index = y*bW + x;
      c[index] = 0.0f;
      for(int i=0; i<aW; ++i)
      {
        c[index] += a[y*aW+i]*b[i*bW + x];
      }
    }
  }
}

原文地址:https://www.cnblogs.com/huangshan/p/3916918.html