CUDA 实例练习（二）

题目：多项式f(x)=x + x^2 + x^3 + ···＋x^n求和。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
#include <gputimer.h>

__global__ void polynomial_items(float * array, float x, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    //float sum = 0.0f;

    if (idx < n) {
        array[idx] = pow(x, idx + 1);
    }
    __syncthreads();

    //for (int i = 0; i <= idx; i++){
    //sum += array[idx];
    //}
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockIdx.x * blockDim.x;
    int tid = threadIdx.x;
    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();// make sure all adds at one stage are done!
    }
    
    if (tid == 0)// only thread 0 writes result for this block back to global mem
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in,int size)
{
    // assumes that size is not greater than maxThreadsPerBlock^2 
    // and that size is a multiple of maxThreadsPerBlock 
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock + 1;
    shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >(
        d_intermediate, d_in);

    // now we're down to one block left, so reduce it
    threads = blocks;// launch one thread for each block in prev step
    blocks = 1;
    shmem_reduce_kernel << <blocks, threads, threads*sizeof(float) >> >(
        d_out, d_intermediate);
}
int main()
{
    GpuTimer timer;
    int n;
    float x;
    float sum = 0.0f;
    printf("n is ");
    scanf("%d", &n);
    printf("x is ");
    scanf("%f", &x);
    float * h_in;
    h_in = (float *)malloc(sizeof(float)*n);

    float * d_array;
    const int array_bytes = n* sizeof(float);
    cudaMalloc((void **)&d_array, array_bytes);
    float *d_in, *d_intermediate, *d_out;
    cudaMalloc((void **)&d_in, array_bytes);
    cudaMalloc((void **)&d_intermediate, array_bytes);
    cudaMalloc((void **)&d_out, sizeof(float));
    float h_out;

    timer.Start();
    polynomial_items << <n / 1024 + 1, 1024 >> >(d_array, x, n);
    cudaMemcpy(h_in, d_array, array_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(d_in, h_in, array_bytes, cudaMemcpyHostToDevice);
    reduce(d_out, d_intermediate, d_in, n);
    //for (int i = 0; i <n; i++){
        //sum += h_in[i];
    //}
    timer.Stop();
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++){
        printf("%f ", h_in[i]);
    }
    printf("
Time elapsed = %g ms
", timer.Elapsed());

    //printf("sum is %lf
", sum);
    printf("h_out is %lf
", h_out);

    cudaFree(d_array);
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);

    return 0;

}