数组的并行求和-cuda实现

简介

参考：https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf

NVIDIA 官方有一个PPT是介绍reduce sum，就是对数组进行求和。这个在串行程序里面非常简单的程序，在并行里面实现却有很多的技巧。PPT的最后给出了最终优化的版本，非常值得学习，但是网上几乎没有运行这个程序的demo。这里给出一个简单的demo程序。

完整代码

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <device_functions.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <time.h>

#define CUDA_ERROR_CHECK
#define WARP_SIZE 32
#define MAX_THREAD 512
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() failed at %s:%i : %s
",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }

    // More careful checking. However, this will affect performance.
    // Comment away if needed.
    err = cudaDeviceSynchronize();
    if( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s
",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif

    return;
}

__device__ void warpReduce(volatile int *sdata, unsigned int tid, int blockSize)
{
    if (blockSize >= 64) {sdata[tid] += sdata[tid + 32];}
    if (blockSize >= 32) {sdata[tid] += sdata[tid + 16];}
    if (blockSize >= 16) {sdata[tid] += sdata[tid + 8];}
    if (blockSize >= 8) {sdata[tid] += sdata[tid + 4];}
    if (blockSize >= 4) {sdata[tid] += sdata[tid + 2];}
    if (blockSize >= 2) {sdata[tid] += sdata[tid + 1];}
}

__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n, int blockSize) 
{
    extern __shared__ int sdata[];
    
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid; 
    unsigned int gridSize = blockSize*2*gridDim.x; //suit for the grid_dim.y > 1

    sdata[tid] = 0;//set the shared memory to zero is needed
    while (i < n) 
    { 
        sdata[tid] += g_idata[i];
        if ((i+blockSize) < n)
            sdata[tid] += g_idata[i+blockSize];
        
        i += gridSize; 
    }
     __syncthreads();
    if (blockSize >= 512) 
    { 
        if (tid < 256)
        { 
            sdata[tid] += sdata[tid + 256]; 
        }
        __syncthreads(); 
    } 
    if (blockSize >= 256) 
    { 
        if (tid < 128) 
        { 
            sdata[tid] += sdata[tid + 128]; 
        }
        __syncthreads(); 
    } 
    if (blockSize >= 128)
    { 
        if (tid < 64)
        { 
            sdata[tid] += sdata[tid + 64]; 
        } 
        __syncthreads(); 
    }
    if (tid < 32) warpReduce(sdata, tid, blockSize);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0]; 
}

unsigned int calc_next_32_times(unsigned int x)
{
    unsigned int next_pow_32_times = WARP_SIZE;
    for(int i=2; next_pow_32_times < x; i++)
        next_pow_32_times = WARP_SIZE*i;
    
    return next_pow_32_times;
}

int configure_reduce_sum(dim3 &grid_dim, dim3 &block_dim, unsigned int next_pow_32_times)
{
    if(next_pow_32_times < MAX_THREAD)
    {
        block_dim = dim3(next_pow_32_times, 1, 1);
        grid_dim = dim3(1,1,1);
    }
    else
    {
        int grid_x = ceil((double) next_pow_32_times / MAX_THREAD);
        block_dim = dim3(MAX_THREAD, 1, 1);
        grid_dim = dim3(grid_x,1,1);
    }

    return 0;
}

extern "C" 
long API_cuda_reduce_sum(int *arr, unsigned int total_num)
{
    int *d_arr, *d_out_arr, *h_out_arr;
    unsigned int next_pow_32_times = calc_next_32_times(total_num);

    cudaMalloc(&d_arr, next_pow_32_times * sizeof(int));
    cudaMemset(d_arr, 0, next_pow_32_times * sizeof(int));
    
    dim3 block_dim, grid_dim;
    configure_reduce_sum(grid_dim, block_dim, next_pow_32_times);
    
    int block_num = grid_dim.x * grid_dim.y * grid_dim.z;
   
    cudaMalloc(&d_out_arr, block_num * sizeof(int));
    cudaMemset(d_out_arr, 0, block_num * sizeof(int));
    CudaCheckError();

    cudaMemcpy(d_arr, arr, sizeof(int) * total_num, cudaMemcpyHostToDevice);
    
    reduce6<<<grid_dim, block_dim, block_dim.x*sizeof(int)>>>(d_arr, d_out_arr, total_num, block_dim.x);
    CudaCheckError();
    cudaDeviceSynchronize();

    h_out_arr = new int[block_num];
    memset(h_out_arr, 0, sizeof(int)*block_num);
    cudaMemcpy(h_out_arr, d_out_arr, sizeof(int) * block_num, cudaMemcpyDeviceToHost);

    long ret=0;
    for(int i=0; i<block_num; i++)
    {
        ret += h_out_arr[i];
    }

    cudaFree(d_arr);
    cudaFree(d_out_arr);
    delete []h_out_arr;

    return ret;

}

int main(int argc, char **argv)
{
    int arr[] = {1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,1,1,4,1,1,1,4}; //80
    int arr_len = sizeof(arr)/sizeof(arr[0]);
    int times = 5000000;
    int *big_arr = new int[times*arr_len];
    long long sum = 0;
    clock_t start, finish;

    for(int i=0; i<times; i++)
    {
        memcpy((void*)(big_arr + (i*arr_len)), (void*)arr, sizeof(int)*arr_len);
    }

    start = clock();
    for(int i=0; i<arr_len*times; i++)
    {  
        sum += big_arr[i];
    }
    finish = clock();
    printf("arr len %ld
", arr_len*times);
    printf("serial sum of arr: %ld, cost time: %.4f s
", sum,  (double) (finish - start) / CLOCKS_PER_SEC);

    start = clock();
    int cuda_sum = API_cuda_reduce_sum(big_arr, arr_len*times);
    finish = clock();

    printf("cuda sum of arr: %ld, cost time: %.4f s
", cuda_sum, (double) (finish - start) / CLOCKS_PER_SEC);
    return 0;

}

要实现完整的求和需要递归的调用kernel函数，这里只实现了第一层的reduce，也就是每个block得到一个sum，最后对每个block的sum串行的求和得到最终的sum。

编译运行

nvcc cuda_reduce_sum.c -o cuda_reduce

./cuda_reduce

测试

程序输出：

arr len 400000000

serial sum of arr: 1150000000, cost time: 0.8800

cuda sum of arr: 1150000000, cost time: 0.5500

测试过程中发现，如果数组较小，串行的求和比cuda的方式更快。因为，使用cuda求和，需要进行内存搬运和kernel函数的launch等费时操作。所以在数据量较少时，使用串行求和的效率更高。