// CUDA_矢量相加 (CUDA vector addition)

#include <cstdio>
#include <cstdlib>
#include <iostream>

#define N 10

// Element-wise vector addition kernel: c[i] = a[i] + b[i] for 0 <= i < N.
//
// Launch layout: any 1D grid/block configuration that supplies at least N
// threads in total. Each thread handles at most one element; threads whose
// global index is >= N do nothing. No shared memory is used.
//
// Parameters:
//   a, b - device pointers to the input vectors (at least N ints each)
//   c    - device pointer to the output vector (at least N ints)
__global__ void add(const int *a, const int *b, int *c)
{
  // Flat global index. With the <<<N,1>>> launch (blockDim.x == 1,
  // threadIdx.x == 0) this reduces to blockIdx.x.
  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Bounds guard: the launch may provide more threads than elements.
  if (tid < N)
  {
    c[tid] = b[tid] + a[tid];
  }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    std::fprintf(stderr, "CUDA error during %s: %s\n", what,
                 cudaGetErrorString(status));
    // Device allocations are reclaimed when the process exits.
    std::exit(EXIT_FAILURE);
  }
}

// Fills two N-element host vectors (a[i] = i, b[i] = i*i), adds them on the
// GPU with the `add` kernel, copies the result back, and prints each sum.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main (void)
{
  int a[N], b[N], c[N];
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
  const size_t bytes = N * sizeof(int);

  // Allocate memory on the GPU: one buffer per vector.
  checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
  checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
  checkCuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");

  // Initialize the inputs on the host.
  for (int i = 0; i < N; i++)
  {
    a[i] = i;
    b[i] = i * i;
  }

  // Only the inputs need to be uploaded; c is written entirely by the kernel.
  checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy(a -> dev_a)");
  checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy(b -> dev_b)");

  // N blocks of one thread each: one thread per element.
  add<<<N,1>>>(dev_a, dev_b, dev_c);
  // A kernel launch returns no status; launch errors surface here.
  checkCuda(cudaGetLastError(), "add kernel launch");

  // Blocking copy: also waits for the kernel to finish and reports any
  // error raised during its execution.
  checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy(dev_c -> c)");

  for (int i = 0; i < N; i++)
  {
    std::printf("%d+%d=%d ", a[i], b[i], c[i]);
  }

  checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
  checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
  checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

  return 0;
}

// 原文地址 (original source): https://www.cnblogs.com/uestcsummer/p/4988753.html