OpenACC 简单的原子操作

▶ OpenACC 的原子操作,用到了 C++ 的一个高精度计时器

● 代码,直接的原子操作

 1 #include <iostream>
 2 #include <cstdlib>
 3 #include <chrono>
 4 
 5 #define ATOMIC
 6 
 7 using namespace std;
 8 using namespace std::chrono;
 9 
10 int main()
11 {
12     high_resolution_clock::time_point t1 = high_resolution_clock::now();// 高精度计时器
13 
14     const int count = 1073741824;
15     int sum = 0;
16 
17 #pragma acc parallel loop copyout(sum)
18     for (int i = 0; i < count; i++)
19     {
20 #ifdef ATOMIC
21     #pragma acc atomic update
22 #endif
23         sum++;
24     }
25 
26     high_resolution_clock::time_point t2 = high_resolution_clock::now();
27     duration<double> time = duration_cast<duration<double>>(t2 - t1);
28     
29     cout << "
Count = " << count << ", duraion = " << time.count() << " s" << endl;        
30     return 0;
31 }

● 输出结果:Win10 下的 pgCC 报告不支持 C++ 编译(原因不明),无法使用

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgCC -acc -o acc_win10.exe main.cpp -Minfo
pgCC-Warning-C++ compilation is not supported: main.cpp

● 输出结果,WSL 中

// 不使用 OpenACC
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -std=c++11 -o acc.exe main.cpp -Minfo
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Count = 1073741824, duraion = 0.483907 s

// 使用宏 ATOMIC,即使用原子操作
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -acc -std=c++11 -o acc.exe main.cpp -Minfo
main:
     15, Generating copyout(sum)
         Accelerator kernel generated
         Generating Tesla code
         18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Count = 1073741824, duraion = 0.248377 s

// 不用宏 ATOMIC,即不用原子操作
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -acc -std=c++11 -o acc.exe main.cpp -Minfo
main:
     15, Generating copyout(sum)
         Accelerator kernel generated
         Generating Tesla code
         18, #pragma acc loop seq
     23, Accelerator restriction: induction variable live-out from loop: sum    // 编译器因 sum 存在循环间依赖而无法并行化,循环退化为串行执行(见上一行的 loop seq)
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Count = 1073741824, duraion = 0.247399 s

● 优化一下,使用分段计数

 1 #include <iostream>
 2 #include <cstdlib>
 3 #include <chrono>
 4 
 5 using namespace std;
 6 using namespace std::chrono;
 7 
 8 int main()
 9 {
10     high_resolution_clock::time_point t1 = high_resolution_clock::now();
11 
12     const int count = 1073741824, length = count / 32;// 每一段的长度
13     int sum = 0;
14 
15 #pragma acc parallel loop copyout(sum)
16     for (int start = 0; start < count; start+=length)                       // start 取每段的起点,共 count / length 段
17     {
18         const int end = (start + length < count) ? start + length : count;  // 每段的终点
19         int subSum = 0;
20 #pragma acc loop worker reduction(+:subSum)
21         for (int j = start; j < end; j++)                                   // 每段从 start 加到 end
22             subSum ++;
23 
24 #pragma acc atomic update                                                   
25         sum += subSum;                                                      // 规约结果加到 sum 中来
26     }
27 
28     high_resolution_clock::time_point t2 = high_resolution_clock::now();
29     duration<double> time = duration_cast<duration<double>>(t2 - t1);
30     
31     cout << "
Count = " << sum << ", duraion = " << time.count() << " s" << endl;                
32     return 0;
33 }

● 输出结果,耗时仅略有减少(0.2465 s 对 0.2484 s),差别很小

cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -acc -std=c++11 -o acc.exe main.cpp -Minfo
main:
     15, Generating copyout(sum)
         Accelerator kernel generated
         Generating Tesla code
         18, #pragma acc loop gang /* blockIdx.x */
         23, #pragma acc loop seq /* threadIdx.y */
             Generating reduction(+:subSum)
     23, Loop is parallelizable
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Count = 1073741824, duraion = 0.246488 s
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9458900.html