访存加速-Speed-up of Memory Access Intensive Program

【参考网址】
关于采用PLD指令进行内存预取减少内存操作指令的等待时间:
https://www.jianshu.com/p/7b3bfc3aed12
关于L1,L2,L3的缓存介绍:
https://www.cnblogs.com/arnoldlu/p/7883663.html
关于乱序执行和顺序执行的解释:(乱序执行并不会改变程序的语义,处理器只是在满足指令之间依赖关系的前提下调整执行顺序)
https://www.sohu.com/a/127028459_505803
实际测试中,将预取指令与基于NEON的v_load(OpenCV universal intrinsics)配合使用,似乎并没有起到任何作用,不仅如此反而有些变慢了。U_U

代码如下:

/**
 * Issue read-prefetch hints covering the byte range [addr, addr + len).
 *
 * A prefetch hint fetches a whole cache line, so one hint per line is enough;
 * hinting every single byte only spends issue slots on redundant instructions.
 * The hints never modify memory and do not fault on invalid addresses.
 *
 * @param addr first byte of the range to prefetch
 * @param len  length of the range in bytes (0 issues no hint)
 */
static inline void prefetch_range(uint8_t *addr, size_t len) {
    // Assumed line size: 64 bytes is typical for current ARM/x86 cores.
    // A different real line size only makes the hints denser or sparser.
    constexpr uintptr_t kCacheLineBytes = 64;
    if (len == 0) return;

    const uintptr_t begin = reinterpret_cast<uintptr_t>(addr);
    const uintptr_t end = begin + len;
    // Start at the line containing the first byte so that the line holding
    // the last byte is also reached when addr is not line-aligned.
    for (uintptr_t line = begin & ~(kCacheLineBytes - 1); line < end; line += kCacheLineBytes) {
        // rw = 0: prepare for a read; locality = 3: keep in all cache levels.
        __builtin_prefetch(reinterpret_cast<const void *>(line), 0, 3);
    }
}

/**
 * Issue write-prefetch hints covering the byte range [addr, addr + len).
 *
 * A prefetch hint fetches a whole cache line, so one hint per line is enough;
 * hinting every single byte only spends issue slots on redundant instructions.
 * The hints never modify memory and do not fault on invalid addresses.
 *
 * @param addr first byte of the range that is about to be written
 * @param len  length of the range in bytes (0 issues no hint)
 */
static inline void prestore_range(uint8_t *addr, size_t len){
    // Assumed line size: 64 bytes is typical for current ARM/x86 cores.
    // A different real line size only makes the hints denser or sparser.
    constexpr uintptr_t kCacheLineBytes = 64;
    if (len == 0) return;

    const uintptr_t begin = reinterpret_cast<uintptr_t>(addr);
    const uintptr_t end = begin + len;
    // Start at the line containing the first byte so that the line holding
    // the last byte is also reached when addr is not line-aligned.
    for (uintptr_t line = begin & ~(kCacheLineBytes - 1); line < end; line += kCacheLineBytes) {
        // rw = 1: prepare for a write; locality = 3: keep in all cache levels.
        __builtin_prefetch(reinterpret_cast<const void *>(line), 1, 3);
    }
}

/**
 * Scalar reference implementation of the pixel-order transform.
 *
 * For every pixel (x, y) with 0 <= y < h - 1 and 0 <= x < w - 3, a window of
 * 4 (width) x 2 (height) pixels is anchored with its top-left corner at
 * (x, y). The output at (x, y) is the number of the other seven window pixels
 * that are strictly greater than grey(x, y), shifted left by 5 bits (so the
 * value is a multiple of 32 in [0, 224]). Output pixels for which the window
 * does not fit inside the image are left untouched.
 *
 * The result is stored at the anchor pixel's own position, as
 * GetPixelOrder_NEON does.
 *
 * @param order destination image, CV_8UC1, same size as grey
 * @param grey  source image, CV_8UC1
 *
 * On a type or size mismatch the error is logged and the process exits
 * with status -1.
 */
void GetPixelOrder_CPU(cv::Mat & order, const cv::Mat & grey){
    if(order.type() != CV_8UC1 ||
       grey.type() != CV_8UC1 ||
       order.size() != grey.size()){
        CAP_LOGE("[GetPixelOrder] invalid inputs");
        exit(-1);
    }
    const int h = grey.rows;
    const int w = grey.cols;
    // The window is 4 pixels wide; on a narrower image there is nothing to
    // compute, and pre-loading three columns would read past the row.
    if(w < 4) return;

    // Ring buffer holding the 4 x 2 window column by column:
    // buf[2c] is the top pixel and buf[2c + 1] the bottom pixel of a column.
    uint8_t buf[8];
    for(int i = 0; i < h - 1; ++i){
        // Row pointers honour the Mat's row stride (non-continuous Mats too).
        const uint8_t *top = grey.ptr<uint8_t>(i);
        const uint8_t *bottom = grey.ptr<uint8_t>(i + 1);
        uint8_t *out = order.ptr<uint8_t>(i);

        // Pre-load the first three columns; the fourth enters in the loop.
        for(int c = 0; c < 3; ++c){
            buf[2 * c] = top[c];
            buf[2 * c + 1] = bottom[c];
        }
        int idx = 6; // slot pair that receives the next incoming column

        for(int j = 0; j < w - 3; ++j){
            // replace the outdated column with the one entering on the right
            buf[idx] = top[j + 3];
            buf[idx + 1] = bottom[j + 3];
            // advance the ring: idx now points at the window's top-left pixel
            idx = (idx + 2) % 8;
            const uint8_t center = buf[idx];
            // count the other seven window pixels that exceed the anchor
            uint8_t counter = 0;
            for(int k = 1; k < 8; ++k) counter += (buf[(idx + k) % 8] > center);
            // store at the anchor's own column (column j)
            out[j] = static_cast<uint8_t>(counter << 5);
        }
    }
}

/**
 * SIMD implementation of the pixel-order transform using OpenCV universal
 * intrinsics (16 pixels per iteration).
 *
 * For every pixel (x, y) with 0 <= y < h - 1 and 0 <= x < w - 3, a window of
 * 4 (width) x 2 (height) pixels is anchored with its top-left corner at
 * (x, y). The output at (x, y) is the number of the other seven window pixels
 * that are strictly greater than grey(x, y), shifted left by 5 bits (so the
 * value is a multiple of 32 in [0, 224]). Output pixels for which the window
 * does not fit inside the image are left untouched.
 *
 * A 16-lane block is only processed when all of its loads stay inside the
 * current rows; the remaining columns are handled by a scalar loop, so no
 * load reads past the end of a row.
 *
 * @param order destination image, CV_8UC1, same size as grey
 * @param grey  source image, CV_8UC1
 *
 * On a type or size mismatch the error is logged and the process exits
 * with status -1.
 */
void GetPixelOrder_NEON(cv::Mat & order, const cv::Mat & grey){
    if(order.type() != CV_8UC1 ||
       grey.type() != CV_8UC1 ||
       order.size() != grey.size()){
        CAP_LOGE("[GetPixelOrder] invalid inputs");
        exit(-1);
    }
    const int h = grey.rows;
    const int w = grey.cols;
    constexpr int kLanes = 16;       // pixels handled per vector iteration
    constexpr int kWindowWidth = 4;  // window is 4 (w) x 2 (h)

    for(int i = 0; i < h - 1; ++i){
        // Row pointers honour the Mat's row stride (non-continuous Mats too).
        const uint8_t *top = grey.ptr<uint8_t>(i);
        const uint8_t *bottom = grey.ptr<uint8_t>(i + 1);
        uint8_t *out = order.ptr<uint8_t>(i);

        int j = 0;
        // Vector part: the widest load starts at column j + 3 and spans 16
        // bytes, so the block needs columns j .. j + 18 to exist.
        for(; j + kLanes + kWindowWidth - 1 <= w; j += kLanes){
            // Software prefetch hints (prefetch_range on top + j / bottom + j,
            // prestore_range on out + j) were tried at this point and
            // measured slower than running without them, so none are issued.

            const cv::v_uint8x16 v_center = cv::v_load(top + j);
            cv::v_uint8x16 v_counter = cv::v_setall_u8(0);
            cv::v_uint8x16 v_flag;

            // A lane-wise compare yields 0xFF where true; >> 7 turns that
            // into 1 so it can be accumulated as a count.
            for(int k = 1; k < kWindowWidth; ++k){
                v_flag = v_center < cv::v_load(top + j + k);
                v_counter = v_counter + (v_flag >> 7);
            }
            for(int k = 0; k < kWindowWidth; ++k){
                v_flag = v_center < cv::v_load(bottom + j + k);
                v_counter = v_counter + (v_flag >> 7);
            }

            v_counter = v_counter << 5;
            cv::v_store(out + j, v_counter);
        }

        // Scalar tail: remaining columns whose window still fits in the row.
        for(; j + kWindowWidth <= w; ++j){
            const uint8_t center = top[j];
            uint8_t counter = 0;
            for(int k = 1; k < kWindowWidth; ++k) counter += (top[j + k] > center);
            for(int k = 0; k < kWindowWidth; ++k) counter += (bottom[j + k] > center);
            out[j] = static_cast<uint8_t>(counter << 5);
        }
    }
}

CPU版本:17-26ms(手机运行不稳定)
NEON版本:1-2ms(绝大部分时间是1ms)
NEON+PLD/PST(预取/存):3-7ms(不稳定)
将预取prefetch_range以及prestore_range加上后稍微稳定些了,但是速度变慢了。
因此还需要认真思考预取的使用策略,使其生效。
一个可能的原因是:这里是顺序访问,硬件预取器本身已经能很好地处理,额外的软件预取指令只会占用指令发射带宽;软件预取应当以缓存行为粒度,并且提前足够的距离发出,才有可能带来收益。

原文地址:https://www.cnblogs.com/thisisajoke/p/12017657.html