对比使用C# unsafe代码和OpenCV进行图像处理的效率（中）

方便起见，下面再贴一下上次的对比结果：

1、

2、

注意：由于上面两次比较不在同一系统上，Stopwatch.Frequency的值可能不同，故这两次的结果之间不具有可比性！！

——————————————————————————————————————————————————————

先分析下Add/Sub两个方法，在Image类里使用的是指针+TPL（.NET 并行任务库），使用Parallel.For方法对每一行进行循环，这个应该很容易看懂，就不多讲了。下面来看下OpenCV是如何实现的：

//文件：opencv\modules\core\src\arithm.cpp

// Public entry point for element-wise addition.
// Does no work itself: forwards src1, src2, dst, mask and dtype unchanged to
// arithm_op together with addTab (presumably the per-data-type table of
// addition kernels; its definition is not shown in this excerpt).
void cv::add( InputArray src1, InputArray src2, OutputArray dst,
          InputArray mask, int dtype )
{
    arithm_op(src1, src2, dst, mask, dtype, addTab );
}

// Public entry point for element-wise subtraction.
// Does no work itself: forwards src1, src2, dst, mask and dtype unchanged to
// arithm_op together with subTab (presumably the per-data-type table of
// subtraction kernels; its definition is not shown in this excerpt).
void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
               InputArray mask, int dtype )
{
    arithm_op(src1, src2, dst, mask, dtype, subTab );
}

原来这两个方法只是个代理，真正干活的是arithm_op，这个方法太长，就不在这贴了，不然有占版面的嫌疑，有兴趣的可以自己去看。沿着这个方法一路追踪，我们可以看到类似于这样的代码：

    // SSE2 fast path, compiled only when CV_SSE2 is set and taken at run time
    // only when USE_SSE2 is true. x, sz, src1, src2, dst and op8 are declared
    // outside this excerpt; x carries over from one loop to the next.
    // NOTE(review): the "+ 16" / "+= 32" / "+= 8" offsets match 128-bit and
    // 64-bit accesses only if the pointers are byte-sized element pointers
    // (e.g. uchar*) -- confirm against the enclosing function.
    #if CV_SSE2
        if( USE_SSE2 )
        {
            // Main loop: 32 elements per iteration, handled as two
            // independent 128-bit halves (offsets x and x + 16).
            for( ; x <= sz.width - 32; x += 32 )
            {
                // Unaligned 128-bit loads from the first source.
                __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
                __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 16));
                // Combine with the matching 128 bits of the second source via op8.
                r0 = op8(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
                r1 = op8(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 16)));
                // Unaligned 128-bit stores of both results.
                _mm_storeu_si128((__m128i*)(dst + x), r0);
                _mm_storeu_si128((__m128i*)(dst + x + 16), r1);
            }
            // Secondary loop: 8 elements per iteration using the low 64 bits
            // of an XMM register (loadl/storel touch only the lower half).
            for( ; x <= sz.width - 8; x += 8 )
            {
                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
                r0 = op8(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
                _mm_storel_epi64((__m128i*)(dst + x), r0);
            }
        }
    #endif

一切真相大白了，原来OpenCV使用了SSE2指令集。这里简单说下SSE，SSE是由Intel引入的一种单指令多数据（SIMD）指令集，它包括若干128位的寄存器，可以在一个时钟周期内同时执行128位乘法、128位加法、128位数据加载与128位数据回存，或者是4个32位单精度浮点乘法与4个32位单精度浮点加法运算。OpenCV的效率正来源于此！

那么为什么在第一台电脑上，Image和OpenCV比分约为2:1，第二台就变成5:1了呢？因为OpenCV是单线程运算，而Image使用了最新的TPL，TPL是在.NET 4.0中引入的一个并行任务库，Parallel.For类似于使用多线程来进行For循环，并且能够根据CPU核心数来决定创建的线程数，也就是说，Image使用了多线程架构，这使得它在多核CPU上有优势。第一台电脑4核，第二台电脑双核，原因就在于此！

再看Mul方法，在第一台电脑上，Image类居然以1:2的成绩奇迹般的胜过了OpenCV，有点不可思议！我们看看OpenCV的代码：

// Public entry point for element-wise multiplication with an optional scale.
// Forwards to arithm_op with:
//   - noArray() in the mask position (multiply exposes no mask parameter),
//   - mulTab as the kernel table,
//   - a literal `true` flag (its meaning is defined by arithm_op, which is
//     not shown in this excerpt),
//   - the address of the local `scale`, passed through as user data.
void cv::multiply(InputArray src1, InputArray src2,
                  OutputArray dst, double scale, int dtype)
{
    arithm_op(src1, src2, dst, noArray(), dtype, mulTab, true, &scale);
}

mulTab是一个针对不同数据类型的方法列表，随便挑一个看看，就看mul8u吧：

// uchar multiplication kernel: adapts the generic kernel signature to mul_.
// `scale` arrives as an untyped pointer; it is reinterpreted as const double*,
// dereferenced, and narrowed to float, so mul_ is instantiated with
// T = uchar and WT = float.
static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* scale)
{
    mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
}

// Generic element-wise multiplication over a 2-D block:
//   dst[i] = saturate_cast<T>(src1[i] * src2[i])            when scale == 1
//   dst[i] = saturate_cast<T>(scale * src1[i] * src2[i])    otherwise
//
// T  - element type of the sources and the destination.
// WT - type of `scale`; in the scaled branch src1[i] is cast to WT before
//      multiplying.
// step1/step2/step - row strides, converted below to element units.
// size - block dimensions; taken by value, and size.height is consumed as the
//        row counter.
//
// Each row is processed four elements per iteration (manual unrolling), and a
// scalar tail loop handles the remaining width % 4 elements. No SIMD
// intrinsics are used here.
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, WT scale )
{
    // Convert the strides to element counts so they can be added directly to
    // the typed pointers (presumably callers pass strides in bytes).
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);


    // Unscaled branch: avoids the extra multiplication by `scale`.
    if( scale == (WT)1. )
    {
        // One iteration per row; the pointers advance by one row stride each time.
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i;
            // Unrolled body: four products per iteration, computed in pairs.
            for( i = 0; i <= size.width - 4; i += 4 )
            {
                T t0;
                T t1;
                t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
                t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                dst[i  ] = t0;
                dst[i+1] = t1;


                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                dst[i+2] = t0;
                dst[i+3] = t1;
            }


            // Tail: the last (size.width % 4) elements of the row.
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
        }
    }
    else
    {
        // Scaled branch: same row/unroll structure, with src1[i] cast to WT
        // and multiplied by `scale` before saturating back to T.
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i;
            for( i = 0; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;


                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }


            // Tail: the last (size.width % 4) elements of the row.
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}

也是直接相乘，并未使用SSE（不知道为什么，有知道的告诉下），但是它是以4个为间隔进行循环，减少了循环次数，因此效率会得到提升；但受限于单线程运算，无法充分利用多核优势，因此在4核电脑上，其成绩落后Image一大截；在双核电脑上，OpenCV反超Image，看来我需要对Image也使用这种优化方法（--!!）。

再看Threshold，在第一台电脑上，两者半斤八两，OpenCV险胜，第二台电脑，Image 2.5:1败北。来看看OpenCV的代码：

//文件：opencv\modules\imgproc\src\thresh.cpp

// Applies a fixed-level (or Otsu-selected) threshold to _src and writes the
// result to _dst, which is (re)created with the same size and type as the
// source.
//
// _src   - input array; 8-bit unsigned and 32-bit float depths are handled.
// _dst   - output array.
// thresh - threshold value; replaced by the Otsu estimate when THRESH_OTSU is
//          set in `type`.
// maxval - value used by the binary threshold types.
// type   - threshold type, optionally combined with the THRESH_OTSU flag.
//
// Returns the threshold actually used (the Otsu value if requested; for 8-bit
// input, the floored integer threshold).
// Raises CV_StsUnsupportedFormat for any other depth, and asserts that the
// input is CV_8UC1 when THRESH_OTSU is requested.
double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
{
    Mat src = _src.getMat();
    // Split the Otsu flag from the base threshold type.
    bool use_otsu = (type & THRESH_OTSU) != 0;
    type &= THRESH_MASK;

    if( use_otsu )
    {
        // Otsu selection is only implemented for single-channel 8-bit input.
        CV_Assert( src.type() == CV_8UC1 );
        thresh = getThreshVal_Otsu_8u(src);
    }
  
    _dst.create( src.size(), src.type() );
    Mat dst = _dst.getMat();
    
    if( src.depth() == CV_8U )
    {
        // Work with integer threshold/max values; the floored threshold is
        // also what gets returned to the caller.
        int ithresh = cvFloor(thresh);
        thresh = ithresh;
        int imaxval = cvRound(maxval);
        // For THRESH_TRUNC the "max" value is the threshold itself.
        if( type == THRESH_TRUNC )
            imaxval = ithresh;
        imaxval = saturate_cast<uchar>(imaxval);

        // Threshold outside [0, 254]: no per-pixel comparison is performed;
        // the output is either a constant fill or a plain copy of the source.
        if( ithresh < 0 || ithresh >= 255 )
        {
            if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
                ((type == THRESH_TRUNC || type == THRESH_TOZERO_INV) && ithresh < 0) ||
                (type == THRESH_TOZERO && ithresh >= 255) )
            {
                // Constant fill value selected by type and by which side of
                // the range the threshold fell on.
                int v = type == THRESH_BINARY ? (ithresh >= 255 ? 0 : imaxval) :
                        type == THRESH_BINARY_INV ? (ithresh >= 255 ? imaxval : 0) :
                        type == THRESH_TRUNC ? imaxval : 0;
                dst.setTo(v);
            }
            else
                src.copyTo(dst);
        }
        else
            // In-range threshold: delegate to the 8-bit worker.
            thresh_8u( src, dst, (uchar)ithresh, (uchar)imaxval, type );
    }
    else if( src.depth() == CV_32F )
        // Float input: delegate to the 32-bit float worker.
        thresh_32f( src, dst, (float)thresh, (float)maxval, type );
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    return thresh;
}

这仍然只是一个代理，下面看thresh_8u的代码（由于太长，所以只贴其中关键部分片段）：

//下面是针对TEGRA处理器的优化方法，跳过
#ifdef HAVE_TEGRA_OPTIMIZATION
    switch( type )
    {
    case THRESH_BINARY:
        if(tegra::thresh_8u_binary(_src, _dst, roi.width, roi.height, thresh, maxval)) return;
。。。。。。

//从下面可以看出，OpenCV采用的是查表法
    // Build the 256-entry lookup table `tab`, indexed by source pixel value.
    // Every case fills entries 0..thresh in the first loop and the remaining
    // entries up to 255 in the second; `i` carries over between the two loops.
    // (tab, i, thresh and maxval are declared outside this excerpt.)
    switch( type )
    {
    // value <= thresh -> 0, value > thresh -> maxval
    case THRESH_BINARY:
        for( i = 0; i <= thresh; i++ )
            tab[i] = 0;
        for( ; i < 256; i++ )
            tab[i] = maxval;
        break;
    // value <= thresh -> maxval, value > thresh -> 0
    case THRESH_BINARY_INV:
        for( i = 0; i <= thresh; i++ )
            tab[i] = maxval;
        for( ; i < 256; i++ )
            tab[i] = 0;
        break;
    // value <= thresh -> unchanged, value > thresh -> thresh
    case THRESH_TRUNC:
        for( i = 0; i <= thresh; i++ )
            tab[i] = (uchar)i;
        for( ; i < 256; i++ )
            tab[i] = thresh;
        break;
    // value <= thresh -> 0, value > thresh -> unchanged
    case THRESH_TOZERO:
        for( i = 0; i <= thresh; i++ )
            tab[i] = 0;
        for( ; i < 256; i++ )
            tab[i] = (uchar)i;
        break;
    // value <= thresh -> unchanged, value > thresh -> 0
    case THRESH_TOZERO_INV:
        for( i = 0; i <= thresh; i++ )
            tab[i] = (uchar)i;
        for( ; i < 256; i++ )
            tab[i] = 0;
        break;
    // Any other type value is rejected.
    default:
        CV_Error( CV_StsBadArg, "Unknown threshold type" );
    }


//如果CPU支持SSE的话，会使用SSE进行运算
#if CV_SSE2
    if( checkHardwareSupport(CV_CPU_SSE2) )
    {
        __m128i _x80 = _mm_set1_epi8('\x80');
        __m128i thresh_u = _mm_set1_epi8(thresh);
。。。。。。

//否则会使用普通方法，以4为单位进行循环运算
    if( j_scalar < roi.width )
    {
        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = (const uchar*)(_src.data + _src.step*i);
            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
            
            for( j = j_scalar; j <= roi.width - 4; j += 4 )
            {
。。。。。。

下面是Smooth，所谓Smooth，就是遍历图像中的每个像素，将其值设为以它为中心的一个方格内所有像素的平均值。直接按照定义写算法会非常慢，Image类的算法主要是利用图像像素排列上的连续性来排除重复计算，其效率比直接按定义算提高了200多倍！！！OpenCV里的算法没看明白，谁看明白了告诉我下，万分感谢！！文件在opencv\modules\imgproc\src\smooth.cpp

至于AddWeighted，代码也很简单，无非就是SSE+for循环优化（以4为步长），就不细说了。

通过上边的分析，我们知道了OpenCV算法优化的原理，那么能不能对Image类进行同样的优化呢？且听下回分解~~~