CUDA实例练习（四）：矩阵转置

 1 #include <stdio.h>
 2 #include "gputimer.h"
 3 #include "cuda_runtime.h"
 4 #include "device_launch_parameters.h"
 5 #include <stdlib.h>
 6 
 7 const int N = 1024;  // matrix side length: buffers hold N*N floats, addressed as mat[i + j*N]
 8 const int K = 32;    // tile edge: main launches K x K threads per block on an (N/K) x (N/K) grid
 9 
// Initialises the N*N-element buffer `mat` with a running counter:
// element k receives the value k converted to float (0, 1, 2, ...).
void fill_matrix(float * mat){
    const int total = N * N;
    int idx = 0;
    while (idx < total) {
        mat[idx] = static_cast<float>(idx);
        ++idx;
    }
}
14 
// Prints the N x N matrix stored in `mat` to stdout, one matrix row per
// output line. The element in row j, column i is read from mat[i + j*N] and
// formatted with "%4.4g ".
void print_matrix(float *mat)
{
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); }
        // End the row. The line break has to be spelled as the \n escape:
        // a raw newline inside a string literal does not compile.
        printf("\n");
    }
}
23 
// Baseline transpose kernel with no work splitting: each thread that executes
// it walks the complete N x N matrix and stores in[col + row*N] into
// out[row + col*N]. main launches it with one block of one thread.
__global__ void transpose_serial(float in[], float out[]){
    for (int row = 0; row < N; ++row) {
        const float *src = in + row * N;   // start of the current input row
        for (int col = 0; col < N; ++col) {
            out[row + col * N] = src[col];
        }
    }
}
29 
// Transpose kernel in which each thread produces one full row of `out`.
//
// Thread with global x-index i copies in[i + j*N] into out[j + i*N] for every
// j in [0, N), i.e. it reads column i of `in` and writes row i of `out`.
//
// Launch layout: 1-D grid of 1-D blocks covering at least N threads in total.
// The index is built from blockIdx/blockDim so the kernel is not tied to a
// single-block launch; for the <<<1, N>>> launch used in main it reduces to
// threadIdx.x, exactly as before.
__global__ void transpose_parallel_per_row(float in[], float out[]){
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads (when the launch covers more than N rows) must not
    // touch memory outside the matrix.
    if (i >= N)
        return;

    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N];
}
36 
// Transpose kernel in which each thread moves exactly one matrix element.
//
// Thread with global 2-D index (i, j) copies in[i + j*N] to out[j + i*N].
//
// Launch layout: 2-D grid of 2-D blocks whose threads together cover the
// N x N index space. The global index is derived from blockDim rather than a
// fixed tile constant, so any block shape works; with the K x K blocks used
// in main the result is identical to indexing with K.
__global__ void transpose_parallel_per_element(float in[], float out[]){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: threads that fall outside the matrix do nothing.
    if (i < N && j < N)
        out[j + i*N] = in[i + j*N];
}
// Prints a diagnostic to stderr and terminates the process when a CUDA
// runtime call (or a kernel launch, via cudaGetLastError) reported an error.
static void check_cuda_call(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: fills an N x N host matrix, copies it to the device, then
// runs the three transpose kernels in turn (serial, one thread per row, one
// thread per element), copying the result back and printing the time reported
// by GpuTimer after each one.
//
// GpuTimer is declared in gputimer.h, which is not visible here; the "ms"
// label in the messages presumes Elapsed() returns milliseconds - confirm.
//
// Returns 0 on success, 1 if the host buffers cannot be allocated; any CUDA
// failure terminates the process through check_cuda_call.
int main(void){
    // size_t avoids overflowing a signed int if N is ever made larger.
    const size_t numbytes = (size_t)N * N * sizeof(float);

    float *in = (float *)malloc(numbytes);
    float *out = (float *)malloc(numbytes);
    if (in == NULL || out == NULL){
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)numbytes);
        free(in);   // free(NULL) is a no-op, so this is safe for either failure
        free(out);
        return 1;
    }
    fill_matrix(in);

    float *d_in = NULL, *d_out = NULL;

    check_cuda_call(cudaMalloc((void **)&d_in, numbytes), "cudaMalloc(d_in)");
    check_cuda_call(cudaMalloc((void **)&d_out, numbytes), "cudaMalloc(d_out)");
    check_cuda_call(cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice),
                    "host-to-device copy of input");

    GpuTimer timer;

    // 1) Single thread does the whole transpose.
    timer.Start();
    transpose_serial<<<1, 1>>>(d_in, d_out);
    timer.Stop();
    // Launch errors are not returned by the launch itself; query them here,
    // after the timer has stopped so the check is not part of the measurement.
    check_cuda_call(cudaGetLastError(), "transpose_serial launch");
    check_cuda_call(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                    "device-to-host copy after transpose_serial");
    printf("transpose_serial:%g ms.\n", timer.Elapsed());

    // 2) One thread per output row, all in a single block of N threads.
    timer.Start();
    transpose_parallel_per_row<<<1, N>>>(d_in, d_out);
    timer.Stop();
    check_cuda_call(cudaGetLastError(), "transpose_parallel_per_row launch");
    check_cuda_call(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                    "device-to-host copy after transpose_parallel_per_row");
    printf("transpose_parallel_per_row:%g ms.\n", timer.Elapsed());

    // 3) One thread per element: K x K threads per block, (N/K) x (N/K) blocks.
    //    N / K truncates, so this covers the matrix only when K divides N.
    dim3 blocks(N / K, N / K);
    dim3 threads(K, K);
    timer.Start();
    transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);
    timer.Stop();
    check_cuda_call(cudaGetLastError(), "transpose_parallel_per_element launch");
    check_cuda_call(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                    "device-to-host copy after transpose_parallel_per_element");
    printf("transpose_parallel_per_element: %g ms.\n", timer.Elapsed());

    check_cuda_call(cudaFree(d_in), "cudaFree(d_in)");
    check_cuda_call(cudaFree(d_out), "cudaFree(d_out)");

    // Release the host buffers as well; the original leaked them.
    free(in);
    free(out);

    return 0;
}