basic_double_stream

basic_double_stream_incorrect

不合理的代码
  1 /*
  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 *
  4 * NVIDIA Corporation and its licensors retain all intellectual property and
  5 * proprietary rights in and to this software and related documentation.
  6 * Any use, reproduction, disclosure, or distribution of this software
  7 * and related documentation without an express license agreement from
  8 * NVIDIA Corporation is strictly prohibited.
  9 *
 10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 * associated with this source code for terms and conditions that govern
 12 * your use of this NVIDIA software.
 13 *
 14 */
 15 
 16 
 17 #include "../common/book.h"
 18 #include "cuda.h"
 19 #include "cuda_runtime.h"
 20 #include "device_launch_parameters.h"
 21 #define N   (1024*1024)
 22 #define FULL_DATA_SIZE   (N*20)
 23 
 24 
 25 __global__ void kernel(int *a, int *b, int *c) {
 26     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 27     if (idx < N) {
 28         int idx1 = (idx + 1) % 256;
 29         int idx2 = (idx + 2) % 256;
 30         float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
 31         float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
 32         c[idx] = (as + bs) / 2;
 33     }
 34 }
 35 
 36 
 37 int main(void) {
 38     cudaDeviceProp  prop;
 39     int whichDevice;
 40     HANDLE_ERROR(cudaGetDevice(&whichDevice));
 41     HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
 42     if (!prop.deviceOverlap) {
 43         printf("Device will not handle overlaps, so no speed up from streams
");
 44         return 0;
 45     }
 46 
 47     cudaEvent_t     start, stop;
 48     float           elapsedTime;
 49 
 50     cudaStream_t    stream0, stream1;
 51     int *host_a, *host_b, *host_c;
 52     int *dev_a0, *dev_b0, *dev_c0;
 53     int *dev_a1, *dev_b1, *dev_c1;
 54 
 55     // start the timers
 56     HANDLE_ERROR(cudaEventCreate(&start));
 57     HANDLE_ERROR(cudaEventCreate(&stop));
 58 
 59     // initialize the streams
 60     HANDLE_ERROR(cudaStreamCreate(&stream0));
 61     HANDLE_ERROR(cudaStreamCreate(&stream1));
 62 
 63     // allocate the memory on the GPU
 64     HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
 65         N * sizeof(int)));
 66     HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
 67         N * sizeof(int)));
 68     HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
 69         N * sizeof(int)));
 70     HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
 71         N * sizeof(int)));
 72     HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
 73         N * sizeof(int)));
 74     HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
 75         N * sizeof(int)));
 76 
 77     // allocate host locked memory, used to stream
 78     HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
 79         FULL_DATA_SIZE * sizeof(int),
 80         cudaHostAllocDefault));
 81     HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
 82         FULL_DATA_SIZE * sizeof(int),
 83         cudaHostAllocDefault));
 84     HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
 85         FULL_DATA_SIZE * sizeof(int),
 86         cudaHostAllocDefault));
 87 
 88     for (int i = 0; i<FULL_DATA_SIZE; i++) {
 89         host_a[i] = rand();
 90         host_b[i] = rand();
 91     }
 92 
 93     HANDLE_ERROR(cudaEventRecord(start, 0));
 94     // now loop over full data, in bite-sized chunks
 95     for (int i = 0; i<FULL_DATA_SIZE; i += N * 2) {
 96         // copy the locked memory to the device, async
 97         HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
 98             N * sizeof(int),
 99             cudaMemcpyHostToDevice,
100             stream0));
101         HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
102             N * sizeof(int),
103             cudaMemcpyHostToDevice,
104             stream0));
105 
106         kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0);
107 
108         // copy the data from device to locked memory
109         HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
110             N * sizeof(int),
111             cudaMemcpyDeviceToHost,
112             stream0));
113 
114 
115         // copy the locked memory to the device, async
116         HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
117             N * sizeof(int),
118             cudaMemcpyHostToDevice,
119             stream1));
120         HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
121             N * sizeof(int),
122             cudaMemcpyHostToDevice,
123             stream1));
124 
125         kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1);
126 
127         // copy the data from device to locked memory
128         HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
129             N * sizeof(int),
130             cudaMemcpyDeviceToHost,
131             stream1));
132     }
133     HANDLE_ERROR(cudaStreamSynchronize(stream0));
134     HANDLE_ERROR(cudaStreamSynchronize(stream1));
135 
136     HANDLE_ERROR(cudaEventRecord(stop, 0));
137 
138     HANDLE_ERROR(cudaEventSynchronize(stop));
139     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
140         start, stop));
141     printf("Time taken:  %3.1f ms
", elapsedTime);
142 
143     // cleanup the streams and memory
144     HANDLE_ERROR(cudaFreeHost(host_a));
145     HANDLE_ERROR(cudaFreeHost(host_b));
146     HANDLE_ERROR(cudaFreeHost(host_c));
147     HANDLE_ERROR(cudaFree(dev_a0));
148     HANDLE_ERROR(cudaFree(dev_b0));
149     HANDLE_ERROR(cudaFree(dev_c0));
150     HANDLE_ERROR(cudaFree(dev_a1));
151     HANDLE_ERROR(cudaFree(dev_b1));
152     HANDLE_ERROR(cudaFree(dev_c1));
153     HANDLE_ERROR(cudaStreamDestroy(stream0));
154     HANDLE_ERROR(cudaStreamDestroy(stream1));
155 
156     return 0;
157 }
代码下载