Custom Functions Inside OpenACC Compute Constructs

▶ Custom functions created with the routine directive, and how their behavior differs with the parallelism level used at the call site

● Code: a custom sqab function that uses the built-in functions fabsf and sqrtf to compute the square root of the absolute value of every element of a matrix

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <math.h>
 4 #include <openacc.h>
 5 
 6 #define ROW 8
 7 #define COL 64
 8 
 9 #pragma acc routine vector
10 void sqab(float *a, const int m)
11 {    
12 #pragma acc loop
13     for (int idx = 0; idx < m; idx++)
14         a[idx] = sqrtf(fabsf(a[idx]));    
15 }
16 
17 int main()
18 {
19     float x[ROW][COL];
20     int row, col;
21     for (row = 0; row < ROW; row++)
22     {
23         for (col = 0; col < COL; col++)
24             x[row][col] = row * 10 + col;
25     }
26     printf("\nx[1][1] = %f\n", x[1][1]);
27 
28 #pragma acc parallel loop pcopy(x[0:ROW][0:COL]) // the gang, worker, and vector clauses will each be added here in turn below
29     for (row = 0; row < ROW; row++)
30         sqab(&x[row][0], COL);
31     printf("\nx[1][1] = %f\n", x[1][1]);
32 
33     //getchar();
34     return 0;
35 }
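
● For reference, x[1][1] is initialized to 1 * 10 + 1 = 11, so the kernel should turn it into sqrtf(fabsf(11.0f)) ≈ 3.316625, which is exactly what the runs below print. A small host-side check (a sketch added here for illustration, not part of the original program) could be appended after the parallel loop:

    /* Sketch: recompute one element on the CPU and compare with the device result */
    float expected = sqrtf(fabsf(1.0f * 10 + 1));               /* ≈ 3.316625 */
    if (fabsf(x[1][1] - expected) > 1e-6f)
        printf("mismatch: got %f, expected %f\n", x[1][1], expected);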

● Output with no parallelism-level clause on line 28 (gang is used by default)

 1 D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
 2 sqab:
 3      11, Generating Tesla code
 4          13, #pragma acc loop vector /* threadIdx.x */
 5      13, Loop is parallelizable
 6 main:
 7      28, Generating copy(x[:][:])
 8          Accelerator kernel generated
 9          Generating Tesla code
10          29, #pragma acc loop gang /* blockIdx.x */
11 
12 D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
13 
14 x[1][1] = 11.000000
15 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
16 line=28 device=0 threadid=1 num_gangs=8 num_workers=1 vector_length=32 grid=8 block=32      // 8 gangs at the blockIdx.x level, 1 worker, vector at the threadIdx.x level
17 
18 x[1][1] = 3.316625
19 PGI: "acc_shutdown" not detected, performance results might be incomplete.
20  Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.
21 
22 Accelerator Kernel Timing data
23 D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
24   main  NVIDIA  devicenum=0
25     time(us): 9
26     28: compute region reached 1 time
27         28: kernel launched 1 time
28             grid: [8]  block: [32]
29             elapsed time(us): total=1000 max=1000 min=1000 avg=1000
30     28: data region reached 2 times
31         28: data copyin transfers: 1
32              device time(us): total=4 max=4 min=4 avg=4
33         31: data copyout transfers: 1
34              device time(us): total=5 max=5 min=5 avg=5
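
● The routine was declared vector, so its inner loop keeps the vector level (threadIdx.x) and the caller's row loop is mapped to gang: 8 rows become 8 gangs (thread blocks) of 32 threads each. Spelling the compiler's default choice out explicitly would look roughly like this (a sketch; num_gangs(8) and vector_length(32) merely restate the launch reported above):

    /* Sketch: making the default gang mapping explicit */
    #pragma acc parallel loop gang num_gangs(8) vector_length(32) pcopy(x[0:ROW][0:COL])
        for (row = 0; row < ROW; row++)
            sqab(&x[row][0], COL);      /* routine vector: inner loop uses the 32-wide vector */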

● Output with the worker clause added on line 28

 1 D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
 2 sqab:
 3      11, Generating Tesla code
 4          13, #pragma acc loop vector /* threadIdx.x */
 5      13, Loop is parallelizable
 6 main:
 7      28, Generating copy(x[:][:])
 8          Accelerator kernel generated
 9          Generating Tesla code
10          29, #pragma acc loop worker(4) /* threadIdx.y */
11      29, Loop is parallelizable
12 
13 D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
14 
15 x[1][1] = 11.000000
16 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
17 line=28 device=0 threadid=1 num_gangs=1 num_workers=4 vector_length=32 grid=1 block=32x4    // 1 gang, 4 workers at the threadIdx.y level, using a 2-D thread block
18 
19 x[1][1] = 3.316625
20 PGI: "acc_shutdown" not detected, performance results might be incomplete.
21  Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.
22 
23 Accelerator Kernel Timing data
24 D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
25   main  NVIDIA  devicenum=0
26     time(us): 10
27     28: compute region reached 1 time
28         28: kernel launched 1 time
29             grid: [1]  block: [32x4]
30              device time(us): total=0 max=0 min=0 avg=0
31     28: data region reached 2 times
32         28: data copyin transfers: 1
33              device time(us): total=5 max=5 min=5 avg=5
34         31: data copyout transfers: 1
35              device time(us): total=5 max=5 min=5 avg=5
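
● With worker on line 28 the compiler keeps a single gang and spreads the 8 rows over 4 workers (threadIdx.y), two rows per worker, in one 32x4 thread block. The worker count can be requested explicitly with num_workers; a sketch (the value 8 is only an example, and the resulting block shape depends on the compiler):

    /* Sketch: requesting one worker per row; with PGI defaults this should give grid=1, block=32x8 */
    #pragma acc parallel loop worker num_workers(8) vector_length(32) pcopy(x[0:ROW][0:COL])
        for (row = 0; row < ROW; row++)
            sqab(&x[row][0], COL);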

● Output with the vector clause added on line 28

 1 D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
 2 sqab:
 3      11, Generating Tesla code
 4          13, #pragma acc loop vector /* threadIdx.x */
 5      13, Loop is parallelizable
 6 main:
 7      28, Generating copy(x[:][:])
 8          Accelerator kernel generated
 9          Generating Tesla code
10          29, #pragma acc loop seq
11      29, Loop is parallelizable
12 
13 D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
14 
15 x[1][1] = 11.000000
16 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
17 line=28 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=32 grid=1 block=32      // 1 gang, 1 worker, all the parallelism is piled onto the threadIdx.x level
18 
19 x[1][1] = 3.316625
20 PGI: "acc_shutdown" not detected, performance results might be incomplete.
21  Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.
22 
23 Accelerator Kernel Timing data
24 D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
25   main  NVIDIA  devicenum=0
26     time(us): 10
27     28: compute region reached 1 time
28         28: kernel launched 1 time
29             grid: [1]  block: [32]
30             elapsed time(us): total=1000 max=1000 min=1000 avg=1000
31     28: data region reached 2 times
32         28: data copyin transfers: 1
33              device time(us): total=5 max=5 min=5 avg=5
34         31: data copyout transfers: 1
35              device time(us): total=5 max=5 min=5 avg=5
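
● Because the routine already occupies the vector level, adding vector to the caller's loop leaves no level for the row loop, so it is serialized (loop seq) and a single gang of 32 threads works through the rows one after another. If gang and vector parallelism are both wanted at the call site, one alternative (not in the original post) is an element-wise seq routine, with both levels placed on the calling loops:

    /* Sketch: a hypothetical element-wise helper declared as a seq routine,
       so the caller owns both the gang and the vector level */
    #pragma acc routine seq
    float sqab1(float v) { return sqrtf(fabsf(v)); }

    #pragma acc parallel loop gang vector collapse(2) pcopy(x[0:ROW][0:COL])
        for (row = 0; row < ROW; row++)
            for (col = 0; col < COL; col++)
                x[row][col] = sqab1(x[row][col]);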

● If the custom function's routine clause declares a parallelism level at or above the one requested on the calling loop, the calling loop's clause is demoted to seq (as in the vector case above); if the routine clause's level is lower than the level of a loop clause inside the function, the compiler issues a warning and ignores the inner clause:

1 #pragma acc routine vector
2 void sqab(float *a, const int m)
3 {    
4 #pragma acc loop worker
5     for (int idx = 0; idx < m; idx++)
6         a[idx] = sqrtf(fabsf(a[idx]));    
7 }

● Compilation output (the run output is the same as the worker case above and is omitted)

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
PGC-W-0155-acc loop worker clause ignored in acc routine vector procedure  (main.c: 13)
sqab:
     11, Generating Tesla code
         13, #pragma acc loop vector /* threadIdx.x */
     13, Loop is parallelizable
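
● The warning disappears if the routine itself is declared at (or above) the level it uses internally; a minimal sketch that keeps the inner worker loop legal (the caller may then use at most gang parallelism around the call):

    /* Sketch: a worker-level routine may legally contain a worker loop */
    #pragma acc routine worker
    void sqab(float *a, const int m)
    {
    #pragma acc loop worker
        for (int idx = 0; idx < m; idx++)
            a[idx] = sqrtf(fabsf(a[idx]));
    }
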
Original post: https://www.cnblogs.com/cuancuancuanhao/p/9419372.html