OpenACC 数据管理语句

▶ 书中第4章，数据管理部分的代码和说明

● 代码，关于 copy，copyin，copyout，create

 1 #include <stdio.h>
 2 #include <openacc.h>
 3 
 4 int main()
 5 {
 6     const int length = 1024;
 7     int a[length], b[length], c[length],d[length];
 8 
 9     for (int i = 0; i < length; a[i] = b[i] = c[i] = 1);
10     {
11 #pragma acc kernels create(d)
12         for (int i = 0; i < length; i++)
13         {
14             a[i] ++;
15             c[i] = a[i] + b[i];
16             d[i] = 0;
17         }
18     }
19     for (int i = 0; i < 10; i++)
20         printf("a[%d] = %d, c[%d] = %d
", i, a[i], i, c[i]);
21     getchar();
22     return 0;
23 }

● 输出结果，显式创建了中间变量 d，隐式创建了 a，b，c，并具有不同的拷贝属性

D:CodeOpenACCOpenACCProjectOpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
main:
     11, Generating create(d[:])
         Generating implicit copyout(c[:1024])
         Generating implicit copyin(b[:1024])
         Generating implicit copy(a[:1024])
     12, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         12, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

● 在 kernels 里单独使用 copyout 时报警告：PGC-W-0996-The directive #pragma acc copyout is deprecated; use #pragma acc declare copyout instead (main.c: XX)

● enter data 和 exit data 用于 C++。

■ 首先，windows 中 pgi 不支持 C++ 编译，只有 pgcc.exe 而没有 pgc++*.exe，只能乖乖到 Linux 下去写！

■ 书上的代码有点问题，大意是 OpenACC 的 copy 是浅拷贝，对于内含指针的数据结构（如 vector，class）不会连着指针指向的对象一起拷。这里有两种解决办法，一种是去结构化，将 class 中的数据集中成简单数组来进行拷贝；另一种是使用 Managed 内存，也就不存在显式拷贝的问题了。【https://stackoverflow.com/questions/53860467/how-to-copy-on-gpu-a-vector-of-vector-pointer-memory-allocated-in-openacc】

■ 书上的代码没有采用这两种解决方案，会报错 “call to cuStreamSynchronize returned error 700: Illegal address during kernel execution” 以及 “call to cuStreamSynchronize returned error 700: Illegal address during kernel execution”，这个问题还蛮常见的【https://stackoverflow.com/search?q=call+to+returned+error+700%3A+Illegal+address+during+kernel+execution】

● 使用去结构化来使用数组

 1 #include <iostream>
 2 #include <vector>
 3 #include <cstdint>
 4 
 5 using namespace std;
 6 
 7 int main()
 8 {
 9     const int vectorCount = 1024, vectorLength = 20;
10     long sum = 0;
11 
12     vector<int32_t> *vectorTable = new vector<int32_t>[vectorCount]; // 1024 个向量，每个向量放入 20 个元素
13     for (int i = 0; i < vectorCount; i++)
14     {
15         for (int j = 0; j < vectorLength; j++)
16             vectorTable[i].push_back(i);
17     }
18     int32_t **arrayTable = new int32_t *[vectorCount]; // 仅包含向量数据的数组，与 vectorTable 对应
19     int *vectorSize = new int[vectorCount];            // 每个向量的尺寸
20 
21 #pragma acc enter data create(arrayTable [0:vectorCount] [0:0]) // 设备中创建 arryTable，注意维度
22     for (int i = 0; i < vectorCount; i++)
23     {
24         int sze = vectorTable[i].size();
25         vectorSize[i] = sze;
26         arrayTable[i] = vectorTable[i].data();        // 把每个向量数据的指针赋给 arrayTable
27 #pragma acc enter data copyin(arrayTable [i:1][:sze]) // 把每个向量的数据拷贝进设备
28     }
29 #pragma acc enter data copyin(vectorSize[:vectorCount]) // 向量尺寸也放进设备
30 
31 #pragma acc parallel loop gang vector reduction(+: sum) present(arrayTable, vectorSize) // 规约计算
32     for (int i = 0; i < vectorCount; i++)
33     {
34         for (int j = 0; j < vectorSize[i]; ++j)
35             sum += arrayTable[i][j];
36     }
37     cout << "Sum: " << sum << endl;
38 
39 #pragma acc exit data delete (vectorSize)
40 #pragma acc exit data delete (arrayTable)
41     delete[] vectorSize;
42     delete[] vectorTable;
43     return 0;
44 }

● 输出结果

cuan@CUAN:~$ pgc++ main.cpp -o main.exe --c++11 -ta=tesla -Minfo -acc
main:
     95, Generating enter data create(arrayTable[:1024][:0])
     32, Generating enter data copyin(arrayTable[i][:sze+1],vectorSize[:1024])
         Generating implicit copy(sum)
         Generating present(vectorSize[:])
         Generating Tesla code
         32, Generating reduction(+:sum)
         32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         34, #pragma acc loop seq
     32, Generating present(arrayTable[:][:])
     34, Loop is parallelizable
     41, Generating exit data delete(vectorSize[:1],arrayTable[:1][:1])
cuan@CUAN:~$ ./main.exe
Sum: 10475520

● 使用 Managed 内存

 1 #include <iostream>
 2 
 3 using namespace std;
 4 
 5 class ivector
 6 {
 7 public:
 8     int len;
 9     int *arr;
10     ivector(int length)
11     {
12         len = length;
13         arr = new int[len];
14 #pragma acc enter data copyin(this)
15 #pragma acc enter data create(arr [0:len])
16 #pragma acc parallel loop present(arr [0:len])
17         for (int iend = len, i = 0; i < iend; i++)              // 使用临时变量 iend，防止编译器认为 len 值在循环中会改变，从而拒绝并行化
18             arr[i] = i;
19     }
20 
21     ivector(const ivector &s)
22     {
23         len = s.len;
24         arr = new int[len];
25 #pragma acc enter data copyin(this)
26 #pragma acc enter data create(arr [0:len])
27 #pragma acc parallel loop present(arr [0:len], s.arr [0:len])   // s 也已经在设备上了
28         for (int iend = len, i = 0; i < iend; i++)
29             arr[i] = s.arr[i];
30     }
31 
32     ~ivector()
33     {
34 #pragma acc exit data delete (arr)                              // 销毁对象时依次销毁设备上的 arr 和 this
35 #pragma acc exit data delete (this)
36         cout << "deconstruction!" << endl;
37         delete[] arr;
38         len = 0;
39     }
40 
41     int &operator[](int i)
42     {
43         if (i < 0 || i >= this->len)
44             return arr[0];
45         return arr[i];
46     }
47 
48     void add(int c)
49     {
50 #pragma acc kernels loop present(arr [0:len])                   // 每次涉及修改 arr 的操作都要注明 present
51         for (int iend = len, i = 0; i < iend; i++)
52             arr[i] += c;
53     }
54 
55     void updateHost()                                           // 手动更新主机端数据
56     {
57 #pragma acc update host(arr [0:len])
58     }
59 };
60 
61 int main()
62 {
63     ivector s1(20);
64     s1.add(10);
65     s1.updateHost();
66     cout << "s1[1] = " << s1[1] << endl;
67 
68     ivector s2(s1);
69     s2.updateHost();
70     cout << "s2[1] = " << s2[1] << endl;
71 
72     return 0;
73 }

● 输出结果，不加 -ta=tesla:managed 会报错【填坑】

cuan@CUAN:~$ pgc++ main.cpp -o main.exe --c++11 -ta=tesla:managed -Minfo -acc
ivector::ivector(int):
     13, Generating enter data copyin(this[:1])
         Generating enter data create(arr[:len])
         Generating Tesla code
         17, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     13, Generating implicit copy(this[:])
         Generating present(arr[:len])
ivector::ivector(const ivector&):
     24, Generating enter data create(arr[:len])
         Generating enter data copyin(this[:1])
         Generating present(arr[:len])
         Generating Tesla code
         28, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     24, Generating implicit copyin(s[:])
         Generating implicit copy(this[:])
         Generating present(s->arr[:len])
ivector::~ivector():
     36, Generating exit data delete(this[:1],arr[:1])
ivector::add(int):
      0, Generating Tesla code
     49, Accelerator serial kernel generated
         Generating implicit copy(this[:])
         Generating present(arr[:len])
     51, Loop is parallelizable
         Generating Tesla code
         51, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
ivector::updateHost():
     58, Generating update self(arr[:len])
cuan@CUAN:~$ ./main.exe
launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivectorC1Ei line=13 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivector3addEi line=49 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivector3addEi line=51 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
s1[1] = 11
launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivectorC1ERKS_ line=24 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
s2[1] = 11
deconstruction!
deconstruction!

● 在这本书上找到了 C++ 中使用 OpenACC 的办法【https://www.elsevier.com/books/parallel-programming-with-openacc/farber/978-0-12-410397-9】，代码是【https://github.com/rmfarber/ParallelProgrammingWithOpenACC/tree/master/Chapter05】下的 accList.double.cpp

  1 // accList.h
  2 #ifndef ACC_LIST_H
  3 #define ACC_LIST_H
  4 
  5 #include <cstdlib>
  6 #include <cassert>
  7 #ifdef _OPENACC
  8 #include <openacc.h>
  9 #endif
 10 
 11 template<typename T>
 12 class accList
 13 {
 14 public:
 15     explicit accList() {}
 16     explicit accList(size_t size)           // 构造函数把 this 指针拷进设备，然后创建内存
 17     {
 18 #pragma acc enter data copyin(this)         
 19         allocate(size);
 20     }
 21 
 22     ~accList()                              // 析构时释放内存，再删除 this 指针
 23     {
 24         release();
 25 #pragma acc exit data delete(this)          
 26     }
 27 
 28 #pragma acc routine seq
 29     T& operator[](size_t idx)
 30     {
 31         return _A[idx];
 32     }
 33 
 34 #pragma acc routine seq
 35     const T& operator[](size_t idx) const
 36     {
 37         return _A[idx];
 38     }
 39 
 40     size_t size() const
 41     {
 42         return _size;
 43     }
 44 
 45     accList& operator=(const accList& B)
 46     {
 47         allocate(B.size());
 48         for (size_t j = 0; j < _size; ++j)
 49         {
 50             _A[j] = B[j];
 51         }
 52         accUpdateDevice();
 53         return *this;
 54     }
 55 
 56     void insert(size_t idx, const T& val)
 57     {
 58         _A[idx] = val;
 59     }
 60     void insert(size_t idx, const T* val)
 61     {
 62         _A[idx] = *val;
 63     }
 64 
 65     void accUpdateSelf()
 66     {
 67         accUpdateSelfT(_A, 0);
 68     }
 69     void accUpdateDevice()
 70     {
 71         accUpdateDeviceT(_A, 0);
 72     }
 73 
 74 private:
 75     T * _A{ nullptr };                      // 数据成员只有指针和长度
 76     size_t _size{ 0 };
 77 
 78     void release()
 79     {
 80         if (_size > 0)
 81         {
 82 #pragma acc exit data delete(_A[0:_size])   // 释放内存时删除设备内存
 83             delete[] _A;
 84             _A = nullptr;
 85             _size = 0;
 86         }
 87     }
 88 
 89     void allocate(size_t size)
 90     {
 91         if (_size != size)                  // 申请内存尺寸与当前尺寸不一致时重新开辟一块
 92         {
 93             release();
 94             _size = size;
 95 #pragma acc update device(_size)
 96             if (_size > 0)
 97             {
 98                 _A = new T[_size];
 99 #ifdef _OPENACC                             // 有 OpenACC 的话检查 _A 是否已经在设备上了
100                 assert(!acc_is_present(&_A[0], sizeof(T)));
101 #endif
102 #pragma acc enter data create(_A[0:_size])  // 在设备上申请新内存
103             }
104         }
105     }
106 
107     template<typename U>
108     void accUpdateSelfT(U *p, long)
109     {
110 #pragma acc update self(p[0:_size])
111     }
112 
113     template<typename U>
114     auto accUpdateSelfT(U *p, int) -> decltype(p->accUpdateSelf())
115     {
116         for (size_t j = 0; j < _size; ++j)
117         {
118             p[j].accUpdateSelf();
119         }
120     }
121 
122     template<typename U>
123     void accUpdateDeviceT(U *p, long)
124     {
125 #pragma acc update device(p[0:_size])
126     }
127 
128     template<typename U>
129     auto accUpdateDeviceT(U *p, int) -> decltype(p->accUpdateDevice())
130     {
131         for (size_t j = 0; j < _size; ++j)
132         {
133             p[j].accUpdateDevice();
134         }
135     }
136 };
137 #endif
138 
139 // main.cpp
140 #include <iostream>
141 #include <cstdlib>
142 #include <cstdint>
143 #include "accList.h" 
144 using namespace std;
145 #ifndef N
146 #define N 1024
147 #endif
148 
149 int main()
150 {
151     accList<double> A(N), B(N);
152     for (int i = 0; i < B.size(); ++i)
153         B[i] = 2.5;
154     B.accUpdateDevice();                                // 手动更新设备内存
155 #pragma acc parallel loop gang vector present(A,B)
156     for (int i = 0; i < A.size(); ++i)
157         A[i] = B[i] + i;
158     A.accUpdateSelf();                                  // 手动更新主机内存
159     for (int i = 0; i<10; ++i)
160         cout << "A[" << i << "]: " << A[i] << endl;
161     cout << "......" << endl;
162     for (int i = N - 10; i<N; ++i)
163         cout << "A[" << i << "]: " << A[i] << endl;
164     return 0;
165 }

● 运行结果

cuan@CUAN:~/acc$ pgc++ main.cpp -o main.exe -Minfo -acc
main:
     16, Generating present(B,A)
         Generating Tesla code
         18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
accList<double>::accList(unsigned long):
      4, include "accList.h"
          18, Generating enter data copyin(this[:1])
accList<double>::~accList():
      4, include "accList.h"
          25, Generating exit data delete(this[:1])
accList<double>::operator [](unsigned long):
      4, include "accList.h"
          29, Generating acc routine seq
              Generating Tesla code
accList<double>::size() const:
      4, include "accList.h"
          40, Generating implicit acc routine seq
              Generating acc routine seq
              Generating Tesla code
accList<double>::release():
      4, include "accList.h"
          82, Generating exit data delete(_A[:_size])
accList<double>::allocate(unsigned long):
      4, include "accList.h"
          95, Generating update device(_size)
         102, Generating enter data create(_A[:_size])
void accList<double>::accUpdateSelfT<double>(T1 *, long):
      4, include "accList.h"
         110, Generating update self(p[:_size])
void accList<double>::accUpdateDeviceT<double>(T1 *, long):
      4, include "accList.h"
         125, Generating update device(p[:_size])
cuan@CUAN:~/acc$ ./main.exe 
launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=16 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
A[0]: 2.5
A[1]: 3.5
A[2]: 4.5
A[3]: 5.5
A[4]: 6.5
A[5]: 7.5
A[6]: 8.5
A[7]: 9.5
A[8]: 10.5
A[9]: 11.5
......
A[1014]: 1016.5
A[1015]: 1017.5
A[1016]: 1018.5
A[1017]: 1019.5
A[1018]: 1020.5
A[1019]: 1021.5
A[1020]: 1022.5
A[1021]: 1023.5
A[1022]: 1024.5
A[1023]: 1025.5

Accelerator Kernel Timing data
/home/cuan/acc/main.cpp
  main  NVIDIA  devicenum=0
    time(us): 13
    16: compute region reached 1 time
        16: kernel launched 1 time
            grid: [1024]  block: [128]
             device time(us): total=13 max=13 min=13 avg=13
            elapsed time(us): total=329 max=329 min=329 avg=329
    16: data region reached 2 times
/home/cuan/acc/main.cpp
  _ZN7accListIdEC1Em  NVIDIA  devicenum=0
    time(us): 12
    18: data region reached 2 times
        18: data copyin transfers: 2
             device time(us): total=12 max=9 min=3 avg=6
/home/cuan/acc/main.cpp
  _ZN7accListIdED1Ev  NVIDIA  devicenum=0
    time(us): 0
    25: data region reached 2 times
/home/cuan/acc/main.cpp
  _ZN7accListIdE7releaseEv  NVIDIA  devicenum=0
    time(us): 7
    82: data region reached 2 times
        82: data copyin transfers: 2
             device time(us): total=7 max=4 min=3 avg=3
/home/cuan/acc/main.cpp
  _ZN7accListIdE8allocateEm  NVIDIA  devicenum=0
    time(us): 11
    95: update directive reached 2 times
        95: data copyin transfers: 2
             device time(us): total=5 max=3 min=2 avg=2
    102: data region reached 2 times
        102: data copyin transfers: 2
             device time(us): total=6 max=3 min=3 avg=3
/home/cuan/acc/main.cpp
  _ZN7accListIdE14accUpdateSelfTIdEEvPT_l  NVIDIA  devicenum=0
    time(us): 10
    110: update directive reached 1 time
        110: data copyout transfers: 1
             device time(us): total=10 max=10 min=10 avg=10
/home/cuan/acc/main.cpp
  _ZN7accListIdE16accUpdateDeviceTIdEEvPT_l  NVIDIA  devicenum=0
    time(us): 3
    125: update directive reached 1 time
        125: data copyin transfers: 1
             device time(us): total=3 max=3 min=3 avg=3