▶ 按书上的步骤使用不同的导语优化矩阵乘法
● 已经优化的代码
// Benchmark: SIZE x SIZE double-precision matrix multiplication, timed under
// six OpenACC loop schedules on the accelerator and once with OpenMP on the
// host. The last OpenACC result (c) is then compared with the OpenMP result (d).
#include <iostream>
#include <cstdio>   // printf
#include <cstdlib>
#include <chrono>

// Kept as a macro (not a constexpr variable) so that it needs no data-sharing
// attribute under the `default(none)` OpenMP clause used below.
#define SIZE 1024

using namespace std;
using namespace std::chrono;

// File-scope storage: per the author's note, declaring these four arrays
// inside main crashed with "Segmentation fault (core dumped)"
// (presumably stack exhaustion: 4 arrays x 8 MiB each).
double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE];

int main()
{
    // int i, j, k; // author's note: declaring ijk and tmp only at the point of
    //              // use inside the loops made the run time longer
    double tmp;

    // Allocate a, b, c on the device once; every compute region below uses
    // present(...) so no host<->device copies happen inside the timed sections.
    #pragma acc enter data create(a, b, c)
    #pragma acc kernels present(a, b, c)
    {
        for (int i = 0; i < SIZE; i++) // initialise a
        {
            for (int j = 0; j < SIZE; j++)
                a[i][j] = (double)(i + j);
        }
        for (int i = 0; i < SIZE; i++) // initialise b
        {
            for (int j = 0; j < SIZE; j++)
                b[i][j] = (double)(i - j);
        }
        for (int i = 0; i < SIZE; i++) // c must be cleared before every method
        {
            for (int j = 0; j < SIZE; j++)
                c[i][j] = 0.0;
        }
    }

    high_resolution_clock::time_point t1 = high_resolution_clock::now();

    #pragma acc kernels present(a, b, c) // Method 1: `auto` on every loop level
    {
        #pragma acc loop auto
        for (int i = 0; i < SIZE; i++)
        {
            #pragma acc loop auto
            for (int j = 0; j < SIZE; j++)
            {
                #pragma acc loop auto
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }

    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Auto: %.6lf s.\n", elapsed.count());

    #pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

    #pragma acc kernels present(a, b, c) // Method 2: outer two loops independent, innermost sequential
    {
        #pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
            #pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                // Must be `seq`, not `independent`: every k iteration accumulates
                // into the same c[i][j], so the iterations are not independent
                // and running them in parallel is a data race.
                // NOTE: the compiler/runtime logs recorded in this document were
                // produced with `independent` on this loop (it was scheduled as
                // `loop gang`), so their figures for this method predate this fix.
                #pragma acc loop seq
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }

    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Seq: %.6lf s.\n", elapsed.count());

    #pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

    #pragma acc kernels present(a, b, c) // Method 3: outer two loops independent, innermost is a reduction
    {
        #pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
            #pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
                #pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }

    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Reduction: %.6lf s.\n", elapsed.count());

    #pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

    #pragma acc kernels present(a, b, c) // Method 4: gang and vector sizes given by hand
    {
        #pragma acc loop gang(32)
        for (int i = 0; i < SIZE; i++)
        {
            #pragma acc loop vector(16)
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
                #pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }

    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Gang Vector: %.6lf s.\n", elapsed.count());

    #pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

    #pragma acc kernels present(a, b, c) // Method 5: tiling (blocked iteration order) over i and j
    {
        #pragma acc loop tile(32, 32)
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
                #pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; ++k)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }

    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - tile: %.6lf s.\n", elapsed.count());

    #pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

    #pragma acc kernels present(a, b, c) // Method 6: collapse the i and j loops into one iteration space
    {
        #pragma acc loop collapse(2) independent
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
                #pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }

    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Collapse: %.6lf s.\n", elapsed.count());

    // Bring the device copies back to the host; the host-side check below
    // reads c (left by method 6) and the OpenMP loop reads a and b.
    #pragma acc exit data copyout(a, b, c)

    #pragma omp parallel for shared(d)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            d[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

    #pragma omp parallel for default(none) shared(a, b, d) // host reference using OpenMP
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            for (int k = 0; k < SIZE; k++)
                d[i][j] += a[i][k] * b[k][j];
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenMP: %.6lf s.\n", elapsed.count());

    // Verify the result. Exact comparison is intentional: a and b hold small
    // integer values, so every product and partial sum is an integer far below
    // 2^53 and is represented exactly regardless of summation order or FMA use.
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            if (c[i][j] != d[i][j])
                printf(" Error at [%d, %d],c = %f d = %f\n", i, j, c[i][j], d[i][j]);
        }
    }
    return 0;
}
● 输出结果(数据管理优化前)
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe Time OpenACC - Auto: 4.589736 s. Time OpenACC - Independent Seq: 4.823721 s. Time OpenACC - Independent Reduction: 3.669336 s. Time OpenACC - Gang Vector: 3.611391 s. Time OpenACC - tile: 3.609573 s. Time OpenACC - Collapse: 3.605792 s. Time OpenMP: 4.345018 s.
● 输出结果(数据管理优化后)
cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++11 -acc -mp -Minfo -o main.exe main: 3, include "chrono" 31, include "chrono" 208, Parallel region activated 212, Parallel region terminated 217, Parallel region activated 224, Parallel region terminated 19, Generating enter data create(b[:][:],c[:][:],a[:][:]) Generating present(a[:][:],b[:][:],c[:][:]) 20, Loop is parallelizable 22, Loop is parallelizable Generating Tesla code 20, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 22, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 25, Loop is parallelizable 27, Loop is parallelizable Generating Tesla code 25, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 30, Loop is parallelizable 32, Loop is parallelizable Generating Tesla code 30, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 32, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 32, Memory zero idiom, loop replaced by call to __c_mzero8 40, Generating present(a[:][:],c[:][:],b[:][:]) 42, Loop is parallelizable 45, Loop is parallelizable 48, Complex loop carried dependence of c prevents parallelization Loop carried dependence of c prevents parallelization Loop carried backward dependence of c prevents vectorization Inner sequential loop scheduled on accelerator Generating Tesla code 42, #pragma acc loop gang /* blockIdx.y */ 45, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 48, #pragma acc loop seq 48, Complex loop carried dependence of c prevents parallelization Loop carried backward dependence of c prevents vectorization 56, Generating present(c[:][:]) 59, Loop is parallelizable 61, Loop is parallelizable Generating Tesla code 59, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 61, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 61, Memory zero idiom, loop replaced by call to __c_mzero8 68, Generating present(a[:][:],c[:][:],b[:][:]) 70, 
Loop is parallelizable 73, Loop is parallelizable 76, Loop is parallelizable Generating Tesla code 70, #pragma acc loop gang /* blockIdx.z */ 73, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 76, #pragma acc loop gang /* blockIdx.y */ 84, Generating present(c[:][:]) 87, Loop is parallelizable 89, Loop is parallelizable Generating Tesla code 87, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 89, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 89, Memory zero idiom, loop replaced by call to __c_mzero8 96, Generating present(a[:][:],c[:][:],b[:][:]) 98, Loop is parallelizable 101, Loop is parallelizable Generating Tesla code 98, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 101, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 105, #pragma acc loop seq 101, FMA (fused multiply-add) instruction(s) generated 105, Loop is parallelizable 114, Generating present(c[:][:]) 117, Loop is parallelizable 119, Loop is parallelizable Generating Tesla code 117, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 119, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 119, Memory zero idiom, loop replaced by call to __c_mzero8 126, Generating present(a[:][:],c[:][:],b[:][:]) 128, Loop is parallelizable 131, Loop is parallelizable Generating Tesla code 128, #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */ 131, #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */ 135, #pragma acc loop seq 135, Loop is parallelizable 144, Generating present(c[:][:]) 147, Loop is parallelizable 149, Loop is parallelizable Generating Tesla code 147, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 149, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 149, Memory zero idiom, loop replaced by call to __c_mzero8 156, Generating present(a[:][:],c[:][:],b[:][:]) 158, Loop is parallelizable 160, Loop is parallelizable Generating Tesla code 158, #pragma acc 
loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */ 160, /* blockIdx.x threadIdx.x tiled */ 165, #pragma acc loop seq 165, Loop is parallelizable 174, Generating present(c[:][:]) 177, Loop is parallelizable 179, Loop is parallelizable Generating Tesla code 177, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 179, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 179, Memory zero idiom, loop replaced by call to __c_mzero8 186, Generating present(a[:][:],c[:][:],b[:][:]) 188, Loop is parallelizable 190, Loop is parallelizable Generating Tesla code 188, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 190, /* blockIdx.x threadIdx.x collapsed */ 194, #pragma acc loop seq 194, Loop is parallelizable 208, Generating exit data copyout(c[:][:],b[:][:],a[:][:]) Parallel loop activated with static block schedule 210, Memory zero idiom, loop replaced by call to __c_mzero8 212, Barrier 217, Parallel loop activated with static block schedule FMA (fused multiply-add) instruction(s) generated 224, Barrier cuan@CUAN:~/acc$ ./main.exe launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=22 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=27 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=32 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=48 device=0 threadid=1 num_gangs=8192 num_workers=1 vector_length=128 grid=8x1024 block=128 Time OpenACC - Auto: 0.018726 s. 
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=61 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=76 device=0 threadid=1 num_gangs=32768 num_workers=1 vector_length=128 grid=8x1024x4 block=128 Time OpenACC - Independent Seq: 0.040719 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=89 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=101 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 Time OpenACC - Independent Reduction: 0.012491 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=119 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=131 device=0 threadid=1 num_gangs=2048 num_workers=8 vector_length=16 grid=64x32 block=16x8 Time OpenACC - Gang Vector: 0.012314 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=149 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=160 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=1024 grid=1024 block=1024 Time OpenACC - tile: 0.013609 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=179 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4 launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=190 device=0 threadid=1 num_gangs=8192 num_workers=1 vector_length=128 grid=8192 block=128 Time OpenACC - Collapse: 0.012676 s. Time OpenMP: 0.504436 s. 
Accelerator Kernel Timing data /home/cuan/acc/main.cpp main NVIDIA devicenum=0 time(us): 112,420 19: compute region reached 1 time 22: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=38 max=38 min=38 avg=38 elapsed time(us): total=317 max=317 min=317 avg=317 27: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=39 max=39 min=39 avg=39 elapsed time(us): total=50 max=50 min=50 avg=50 32: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=39 max=39 min=39 avg=39 elapsed time(us): total=50 max=50 min=50 avg=50 19: data region reached 3 times 40: compute region reached 1 time 48: kernel launched 1 time grid: [8x1024] block: [128] device time(us): total=18,705 max=18,705 min=18,705 avg=18,705 elapsed time(us): total=18,717 max=18,717 min=18,717 avg=18,717 40: data region reached 2 times 56: compute region reached 1 time 61: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=43 max=43 min=43 avg=43 elapsed time(us): total=176 max=176 min=176 avg=176 56: data region reached 2 times 68: compute region reached 1 time 76: kernel launched 1 time grid: [8x1024x4] block: [128] device time(us): total=40,585 max=40,585 min=40,585 avg=40,585 elapsed time(us): total=40,709 max=40,709 min=40,709 avg=40,709 68: data region reached 2 times 84: compute region reached 1 time 89: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=39 max=39 min=39 avg=39 elapsed time(us): total=71 max=71 min=71 avg=71 84: data region reached 2 times 96: compute region reached 1 time 101: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=12,456 max=12,456 min=12,456 avg=12,456 elapsed time(us): total=12,467 max=12,467 min=12,467 avg=12,467 96: data region reached 2 times 114: compute region reached 1 time 119: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=37 max=37 min=37 avg=37 elapsed time(us): total=63 max=63 min=63 avg=63 114: 
data region reached 2 times 126: compute region reached 1 time 131: kernel launched 1 time grid: [64x32] block: [16x8] device time(us): total=12,295 max=12,295 min=12,295 avg=12,295 elapsed time(us): total=12,306 max=12,306 min=12,306 avg=12,306 126: data region reached 2 times 144: compute region reached 1 time 149: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=42 max=42 min=42 avg=42 elapsed time(us): total=187 max=187 min=187 avg=187 144: data region reached 2 times 156: compute region reached 1 time 160: kernel launched 1 time grid: [1024] block: [1024] device time(us): total=13,447 max=13,447 min=13,447 avg=13,447 elapsed time(us): total=13,599 max=13,599 min=13,599 avg=13,599 156: data region reached 2 times 174: compute region reached 1 time 179: kernel launched 1 time grid: [32x256] block: [32x4] device time(us): total=41 max=41 min=41 avg=41 elapsed time(us): total=173 max=173 min=173 avg=173 174: data region reached 2 times 186: compute region reached 1 time 190: kernel launched 1 time grid: [8192] block: [128] device time(us): total=12,651 max=12,651 min=12,651 avg=12,651 elapsed time(us): total=12,669 max=12,669 min=12,669 avg=12,669 186: data region reached 2 times 208: data region reached 1 time 208: data copyout transfers: 3 device time(us): total=1,963 max=674 min=644 avg=654