OpenACC 与 CUDA 的相互调用

▶ 按照书上的代码完成了 OpenACC 与CUDA 的相互调用,以及 OpenACC 调用 cuBLAS。遇到了很多问题,如 CUDA 版本,代码版本,计算能力指定等,以后填坑。

● 代码,OpenACC 调用 CUDA

 1 // kernel.cu
 2 __global__ void saxpy_kernel(const int n, const float a, float *x, float *y)
 3 {
 4     int id = blockIdx.x * blockDim.x + threadIdx.x;
 5     if (id < n)
 6         y[id] += a * x[id];
 7 }
 8 
 9 extern "C" void saxpy(const int n, const float a, float *x, float *y)
10 {
11     saxpy_kernel << < (n + 128 - 1) / 128, 128 >> > (n, a, x, y);
12 }
13 
14 // main.c
15 #include <stdio.h>
16 #include <stdlib.h>
17 
18 #define N   1024
19 
/* Host wrapper defined in kernel.cu; it launches the CUDA kernel itself.
 * It is an ordinary host function called from host code (inside host_data),
 * so it must not be marked `acc routine`: that directive declares a
 * device-callable version of the function, and kernel.cu provides none. */
extern void saxpy(int n, float a, float *x, float *y);
22 
23 int main()
24 {
25     float *x = (float *)malloc(sizeof(float)*N);
26     float *y = (float *)malloc(sizeof(float)*N);
27 
28 #pragma acc data create(x[0:N]) copyout(y[0:N])
29     {
30 #pragma acc kernels 
31 #pragma acc loop independent
32         for (int i = 0; i < N; i++)
33         {
34             x[i] = 1.0f;
35             y[i] = 4.0f;
36         }
37 #pragma acc host_data use_device(x, y)
38         saxpy(N, 2.0f, x, y);
39     }
40 #pragma wait
41 
42     printf("
y[0] = %f
", y[0]);
43     free(x);
44     free(y);
45     //getchar();
46     return 0;
47 }

● 输出结果:代码在 win10 上不能链接,报错:LINK : fatal error LNK1104: 无法打开文件“libcudapgi.lib”;在 WSL 上输出结果不正确;在 Ubuntu 中报链接错误。参考了 https://blog.csdn.net/wcj0626/article/details/12611689?locationNum=12&fps=1 和 https://stackoverflow.com/questions/31737024/openacc-calling-cuda-device-kernel-from-openacc-parallel-loop ,还是没有解决问题。

WSL:
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c kernel.cu -rdc=true
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c main.c
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -ta=tesla:rdc,cuda9.1 -Mcuda -o acc.exe main.o kernel.o
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

y[0] = 4.000000

Ubuntu:
@E@nvlink fatal   : elfLink fatbinary error
pgacclnk: child process exit status 2: /usr/local/pgi/linux86-64/18.4/bin/pgnvd

● 代码,CUDA 调用 OpenACC

 1 // fun.c
 2 void set(const int n, const float c, float *x)
 3 {
 4 #pragma acc kernels deviceptr(x)
 5     for (int i = 0; i < n; i++)
 6         x[i] = c;
 7 }
 8 
 9 void saxpy(const int n, const float a, float *restrict x, float *restrict y)
10 {
11 #pragma acc kernels deviceptr(x, y)
12     for (int i = 0; i < n; i++)
13         y[i] += a * x[i];
14 }
15 
16 // main.cu
17 #include <stdio.h>
18 #include <cuda.h>
19 #include "cuda_runtime.h"
20 #include "device_launch_parameters.h"
21 
22 #define N   1024
23 
24 extern "C" void set(int, float, float *);
25 extern "C" void saxpy(int, float, float *, float *);
26 
// Allocates x and y with cudaMalloc, fills and combines them through the
// external C routines set() and saxpy(), then copies y[0] back to the host
// and prints it.
// Returns 0 on success, 1 if any CUDA runtime call fails.
int main()
{
    // Start from NULL so a failed allocation never leaves a garbage pointer
    // that would later be dereferenced or freed.
    float *x = NULL, *y = NULL, y0 = 0.0f;
    int status = 1;
    cudaError_t err;

    err = cudaMalloc((void **)&x, sizeof(float) * N);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&y, sizeof(float) * N);

    if (err == cudaSuccess)
    {
        set(N, 1.0f, x);
        set(N, 0.0f, y);
        saxpy(N, 2.0f, x, y);
        err = cudaMemcpy(&y0, y, sizeof(float), cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        printf("\ny[0] = %f\n", y0);
        status = 0;
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(x);  // cudaFree(NULL) performs no operation
    cudaFree(y);
    //getchar();
    return status;
}

● 输出结果:代码在 win10 上不能链接,在 WSL 中运行时出现段错误(Segmentation fault),在 Ubuntu 中报链接错误

WSL:
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c main.cu -rdc=true
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c fun.c
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -ta=tesla:rdc,cuda9.1 -Mcuda -o acc.exe main.o fun.o
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe
Segmentation fault (core dumped)

Ubuntu:
cuan@CUAN:~/Temp$ nvcc -c main.cu -rdc=true
cuan@CUAN:~/Temp$ pgcc -acc -c fun.c
cuan@CUAN:~/Temp$ pgcc -ta=tesla:rdc,cuda9.1 -Mcuda -o acc.exe main.o fun.o
@E@nvlink fatal   : elfLink fatbinary error
pgacclnk: child process exit status 2: /usr/local/pgi/linux86-64/18.4/bin/pgnvd

● 代码,CUDA 调用 OpenACC,捆绑变量地址

 1 // fun.c
 2 #include <openacc.h>
 3 
 4 void map(float *restrict pHost, float *restrict pDevice, int sizeByte)
 5 {
 6     acc_map_data(pHost, pDevice, sizeByte);
 7 }
 8 
 9 void set(int n, float c, float *x)
10 {
11 #pragma acc kernels present(x)
12     for (int i = 0; i < n; i++)
13         x[i] = c;
14 }
15 
/* SAXPY: y[i] += a * x[i] for every i in [0, n).
 * x and y are host addresses that must already be present on the device
 * (main.cu registers them through map()), so the clause is present, not
 * deviceptr: deviceptr would treat the host addresses as device pointers.
 * The pointers are restrict-qualified, so the two vectors must not overlap. */
void saxpy(int n, float a, float *restrict x, float *restrict y)
{
#pragma acc kernels present(x, y)
    for (int i = 0; i < n; i++)
        y[i] += a * x[i];
}
22 
23 // main.cu
24 #include <stdio.h>
25 #include <stdlib.h>
26 
27 #define N   1024
28 
29 extern "C" void map(float *, float *, int);
30 extern "C" void set(int, float, float *);
31 extern "C" void saxpy(int, float, float *, float *);
32 
// Allocates host buffers x/y and device buffers dx/dy, associates each host
// buffer with its device buffer through map(), runs set() and saxpy() using
// the host addresses, then copies the first result element back from the
// device and prints it.
// Returns 0 on success, 1 on an allocation or CUDA runtime failure.
int main()
{
    float *x = (float *)malloc(sizeof(float) * N);
    float *y = (float *)malloc(sizeof(float) * N);
    float *dx = NULL, *dy = NULL, y0 = 0.0f;
    int status = 1;
    cudaError_t err = cudaSuccess;

    if (x == NULL || y == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(x);
        free(y);
        return 1;
    }

    err = cudaMalloc((void **)&dx, sizeof(float) * N);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&dy, sizeof(float) * N);

    if (err == cudaSuccess)
    {
        map(x, dx, (int)(sizeof(float) * N));
        map(y, dy, (int)(sizeof(float) * N));
        set(N, 1.0f, x);
        set(N, 4.0f, y);
        saxpy(N, 2.0f, x, y);
        // y is a host address; the computed values live in the device buffer dy.
        err = cudaMemcpy(&y0, dy, sizeof(float), cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        printf("\ny[0] = %f\n", y0);
        status = 0;
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    free(x);
    free(y);
    // Release the device buffers, not the host pointers that were just freed.
    cudaFree(dx);  // cudaFree(NULL) performs no operation
    cudaFree(dy);
    //getchar();
    return status;
}

● 输出结果,代码在 win10上不能链接,在 WSL 上结果正确,在 Ubuntu 中未尝试

WSL:
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c main.cu -rdc=true
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -c fun.c -acc
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -Mcuda -o acc.exe main.o fun.o -ta=tesla:rdc,cuda9.1
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

y[0] = 6.000000

● 代码,OpenACC 调用 cuBLAS

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 
 4 #define N   1024
 5 
 6 extern void cublasSaxpy(int, float, float *, int, float *, int);
 7 
 8 int main()
 9 {
10     float *x = (float *)malloc(sizeof(float)*N);
11     float *y = (float *)malloc(sizeof(float)*N);
12 
13 #pragma acc data create(x[0:N]) copyout(y[0:N])
14     {
15 #pragma acc kernels
16         for (int i = 0; i < N; i++)
17         {
18             x[i] = 1.0f;
19             y[i] = 4.0f;
20         }
21 #pragma acc host_data use_device(x,y)
22         {
23             cublasSaxpy(N, 2.0f, x, 1, y, 1);
24         }
25     }
26 
27     printf("
y[0] = %f
", y[0]);
28     free(x);
29     free(y);
30     //getchar();
31     return 0;
32 }

● 输出结果,代码在 win10上不能链接,在 WSL 中结果错误,在 Ubuntu 中结果正确

WSL:
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c fun.c -rdc=true
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c main.c
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -Mcuda -lcublas -o acc.exe main.o
cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

y[0] = 4.000000

Ubuntu:
cuan@CUAN:~/Temp$ nvcc -c fun.c -rdc=true
cuan@CUAN:~/Temp$ pgcc -acc -c main.c
cuan@CUAN:~/Temp$ pgcc -acc -Mcuda -lcublas -o acc.exe main.o
cuan@CUAN:~/Temp$ ./acc.exe

y[0] = 6.000000
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9446401.html