▶ 第五章,几个优化
● 代码
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 5 #define S 1024*32 6 #define T float 7 #define ITER 1000 8 #define VLEN 16 9 10 __declspec(noinline) void scalar(T *A, T *B, T *C, T k) // 标量型 11 { 12 __assume_aligned(A, 64); // 声明数组对齐到 512 bit 13 __assume_aligned(B, 64); 14 __assume_aligned(C, 64); 15 for (int i = 0; i < S; i++) 16 { 17 T tmp = A[i] * k + B[i]; 18 if (tmp > 0.5f) 19 tmp *= sin(B[i]); 20 C[i] = tmp; 21 } 22 } 23 24 __declspec(noinline) void shortVector(T *A, T *B, T *C, T k)// 短向量型 25 { 26 __assume_aligned(A, 64); 27 __assume_aligned(B, 64); 28 __assume_aligned(C, 64); 29 for (int i = 0; i < S; i+=VLEN) 30 { 31 T tmp[VLEN]; 32 tmp[:]= A[i:VLEN] * k + B[i:VLEN]; // Cilk 的 tmp[起点:长度] 作引用 33 if (tmp[:] > 0.5f) 34 tmp[:] *= sin(B[i:VLEN]); 35 C[i:VLEN] = tmp[:]; 36 } 37 } 38 39 int main() 40 { 41 volatile __int64 start, time1, time2; 42 T ref_result, short_result; 43 const float k = 0.5; 44 45 T *A, *B, *C; // 声明并定义对齐的数组或数组指针 46 posix_memalign((void **)&A, 64, sizeof(T)*S); 47 posix_memalign((void **)&B, 64, sizeof(T)*S); 48 posix_memalign((void **)&C, 64, sizeof(T)*S); 49 //__declspec(align(64)) T A[S], B[S], C[S]; // 等价的声明方式 50 51 A[0:S] = __sec_implicit_index(0); // 数组赋值为 {0, 1, 2, ...} 52 B[0:S] = __sec_implicit_index(0); 53 C[0:S] = 0; 54 //for(int i=0;i<S;i++) // 等价的复制方法 55 // A[i] = B[i] = i, C[i] = 0; 56 57 //for(int i=0;i<10;i++) 58 // printf("%f, ", A[i]); 59 //printf(" "); 60 //for(int i=0;i<10;i++) 61 // printf("%f, ", B[i]); 62 //printf(" "); 63 64 start = __rdtsc(); // 计时器,计算 CPU 的tick 数 65 for (int i = 0; i < ITER; i++) 66 scalar(A, B, C, k); 67 time1 = __rdtsc() - start; 68 //for(int i=0;i<10;i++) 69 // printf("%f, ", C[i]); 70 //printf(" "); 71 ref_result = __sec_reduce_add(C[0:S]); // 规约加,用于比较结果是否正确 72 73 start = __rdtsc(); 74 for (int i = 0; i < ITER; i++) 75 shortVector(A, B, C, k); 76 time2 = __rdtsc() - start; 77 //for(int i=0;i<10;i++) 78 // printf("%f, ", C[i]); 79 //printf(" "); 80 short_result = 
__sec_reduce_add(C[0:S]); 81 82 printf("ref: %f, %Id CPU tick short: %f, %Id CPU tick ", ref_result, time1, short_result, time2); 83 return 0; 84 }
● 输出结果。-O0 优化时,Xeon Phi 上的 CPU tick 数比主机少,其中 shortVector 的优化效果尤其明显;但 -O3 优化时,Xeon Phi 反而比主机慢,shortVector 甚至比它自己在 -O0 下的结果还差(scalar 仍比自身 -O0 快)。
[cuan@server3145 XeonPhi]$ icc kk.c -O0
[cuan@server3145 XeonPhi]$ ./a.out
ref: -39578.562500, 1514650782 CPU tick short: -39578.562500, 1802072334 CPU tick
[cuan@server3145 XeonPhi]$ icc -mmic kk.c -O0
[cuan@server3145 XeonPhi]$ scp a.out cuan@mic0:/home/cuan
cuan@mic0's password:
a.out 100% 33KB 33.5KB/s 00:00
[cuan@server3145 XeonPhi]$ ssh cuan@mic0
cuan@mic0's password:
[cuan@server3145-mic0 ~]$ ./a.out
ref: -39578.562500, 1467023211 CPU tick short: -39578.562500, 414319544 CPU tick
[cuan@server3145 XeonPhi]$ icc kk.c -O3
[cuan@server3145 XeonPhi]$ ./a.out
ref: -39578.617188, 387034332 CPU tick short: -39578.617188, 395228084 CPU tick
[cuan@server3145 XeonPhi]$ icc -mmic kk.c -O3
[cuan@server3145 XeonPhi]$ scp a.out cuan@mic0:/home/cuan
cuan@mic0's password:
a.out 100% 90KB 90.2KB/s 00:00
[cuan@server3145 XeonPhi]$ ssh cuan@mic0
cuan@mic0's password:
[cuan@server3145-mic0 ~]$ ./a.out
ref: -39578.429688, 638000619 CPU tick short: -39578.429688, 633708861 CPU tick