cuda编程-矩阵乘法（1）

本方法采用简单的单线程计算每组行和列乘加运算
代码如下：
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>

__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if(tx >= width || ty >= height)
    return;

  float sum = 0;
  for(int i=0; i<width; ++i){
    sum += A[ty * width + i] * B[i * width + tx];
  }

  C[ty * width + tx] = sum;
}

void constantInit(float *data, int size, float val){
    for (int i = 0; i < size; ++i){
        data[i] = val;
    }
}


void matrixMul(){
  unsigned int width = 128;
  unsigned int height = 128;
  unsigned int size = width * height * sizeof(float);
  float *h_A = (float*)malloc(size);
  float *h_B = (float*)malloc(size);
  float *h_C = (float*)malloc(size);
  // Initialize host memory
  const float valB = 0.01f;
  constantInit(h_A, width*height, 1.0f);
  constantInit(h_B, width*height, valB);

  float *d_A, *d_B, *d_C;
  cudaMalloc((void**)&d_A, size);
  cudaMalloc((void**)&d_B, size);
  cudaMalloc((void**)&d_C, size);

  //copy host memory to device
  cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

  //config dims
  dim3 block(16, 16);
  dim3 grid(width / block.x, height / block.y);

  // Excute the kernel
  matrixMulKernel<<<grid, block>>>(d_C, d_A, d_B, width, height);

  // Copy the memory from device to host
  cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

  printf("Checking computed result for correctness: ");
  bool correct = true;
  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  double eps = 1.e-6 ; // machine zero

  for (int i = 0; i < width*height; i++){
      double abs_err = fabs(h_C[i] - (width * valB));
      double dot_length = width;
      double abs_val = fabs(h_C[i]);
      double rel_err = abs_err/abs_val/dot_length ;
      if (rel_err > eps)
      {
          printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E
", i, h_C[i], (float)(width*height), eps);
          correct = false;
      }
  }
  printf("%s
", correct ? "Result = PASS" : "Result = FAIL");

  // Free
  free(h_A);
  free(h_B);
  free(h_C);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

}

int main(){
  matrixMul();
}