快速求熵程序

信息熵很有用,就拿我的老本行反病毒来说,它是静态病毒特征的常见组成部分。信息熵的计算公式很简单:

$ Entropy=-\sum_{i=1}^{n}P(X_{i})\log_{2}P(X_{i}) $

其中,$ P(X_{i}) $是随机变量$X_{i}$出现的概率,这里熵的单位是bit。通常,我们用$C(X_{i})$表示$X_{i}$出现的次数,$T=\sum_{i=1}^{n}C(X_{i})$表示观察次数的总和,那么:

$ Entropy=-\sum_{i=1}^{n}\frac{C(X_{i})}{T}\log_{2}\frac{C(X_{i})}{T} $

根据这个公式,最直接也是最常见的求熵程序类似于:

double entropy(int *counts, int n, int total)
{
    double sum = 0;
    double p;
    for(int i = 0; i < n; i++){
        if(counts[i] > 0){
            p = (double) counts[i] / total;
            sum -= p * log(p);
        }
    }
    return sum / log(2.0);
}

比如,要计算某段内存数据的熵,一般是这样调用entropy函数的:

/*
 * Entropy, in bits per byte, of a memory buffer.
 *
 * Builds a 256-bucket histogram of the byte values in data[0..size-1]
 * and hands it to entropy() with size as the total.
 */
double entropy_data(unsigned char *data, int size)
{
    int histogram[256] = {0};
    int pos = 0;

    while (pos < size) {
        histogram[data[pos]] += 1;
        pos++;
    }

    return entropy(histogram, 256, size);
}

如果内存数据里的字节是均匀分布的(比如随机字节序列),那么熵值接近于8。反之,某几个字节大量重复出现时,熵值接近于0。在病毒特征中,特定文件数据的熵值是判断文件是否经过压缩加密的重要指标。

在实际应用中,我们对熵的精度要求并不高,15个有效位的double乃至7个有效位的float都显得有些“浪费”,而浮点运算又使得熵的计算速度不敢恭维。那么,是否可以在牺牲部分精度的前提下,快速计算近似熵值呢?答案是肯定的。经过几天的琢磨,终于写了一个出来。

在贴出代码之前,先看一下计算100万次熵值的用时对比数据(单位:ms)。测试环境:i7-3520M @2.9GHz, 8GB RAM, Win7 SP1 64-bit。

| | x86 编译 | x64 编译 |
|---|---|---|
| entropy | 4575.6 | 9851.5 |
| fast_entropy | 677.1 | 405.6 |

x86版本的fast_entropy速度大约是entropy的6.8倍,x64版本的fast_entropy速度大约是entropy的24倍。结果精度方面,fast_entropy一般可以精确至小数点后3到4位,严格的误差分析还没有做,有时间会补上。

fast_entropy用了整数运算来替代浮点运算,并应用了IEEE浮点数存储格式的技巧,得以用整数运算近似Log运算。之所以在x64下表现更好,是因为64-bit整数操作在x64上更快。

好了,上代码!

/*
 * Correction table used by _lxlogx().
 *
 * _lxlogx() indexes this table with bits 15..22 of a float's bit pattern
 * (the top 8 bits of an IEEE 754 single-precision mantissa) and adds the
 * selected entry to that bit pattern.
 *
 * Every entry has the form 0xc0800000 + offset, with offsets between
 * 0x011ce and 0xb045e that rise and then fall across the table.  As a
 * 32-bit two's-complement int, 0xc0800000 equals -(127 << 23), i.e. it
 * cancels the single-precision exponent bias.  The offset is presumably
 * (log2(1 + m) - m) * 2^23 for the mantissa fraction m of each bucket;
 * the generator is not shown here -- TODO confirm.
 *
 * NOTE(review): the literals exceed INT_MAX, so storing them in an int
 * relies on implementation-defined conversion to a negative value; this
 * assumes a 32-bit two's-complement int.
 */
static int _u[256] = {
    0xc0801c36, 0xc0805429, 0xc0808b65, 0xc080c1eb, 0xc080f7bf, 0xc0812cdf, 0xc081614f, 0xc081950f,
    0xc081c821, 0xc081fa86, 0xc0822c3f, 0xc0825d4d, 0xc0828db3, 0xc082bd71, 0xc082ec88, 0xc0831af9,
    0xc08348c7, 0xc08375f1, 0xc083a27a, 0xc083ce62, 0xc083f9ab, 0xc0842455, 0xc0844e63, 0xc08477d4,
    0xc084a0aa, 0xc084c8e6, 0xc084f08a, 0xc0851796, 0xc0853e0c, 0xc08563ec, 0xc0858937, 0xc085adef,
    0xc085d215, 0xc085f5a9, 0xc08618ad, 0xc0863b21, 0xc0865d07, 0xc0867e60, 0xc0869f2c, 0xc086bf6c,
    0xc086df22, 0xc086fe4e, 0xc0871cf2, 0xc0873b0e, 0xc08758a2, 0xc08775b1, 0xc087923b, 0xc087ae40,
    0xc087c9c2, 0xc087e4c1, 0xc087ff3f, 0xc088193c, 0xc08832b9, 0xc0884bb7, 0xc0886436, 0xc0887c38,
    0xc08893bd, 0xc088aac6, 0xc088c155, 0xc088d768, 0xc088ed03, 0xc0890224, 0xc08916cd, 0xc0892aff,
    0xc0893ebb, 0xc0895200, 0xc08964d1, 0xc089772d, 0xc0898916, 0xc0899a8b, 0xc089ab8e, 0xc089bc20,
    0xc089cc41, 0xc089dbf2, 0xc089eb34, 0xc089fa06, 0xc08a086b, 0xc08a1662, 0xc08a23ec, 0xc08a310a,
    0xc08a3dbc, 0xc08a4a04, 0xc08a55e1, 0xc08a6155, 0xc08a6c60, 0xc08a7702, 0xc08a813d, 0xc08a8b10,
    0xc08a947d, 0xc08a9d84, 0xc08aa626, 0xc08aae62, 0xc08ab63b, 0xc08abdb0, 0xc08ac4c2, 0xc08acb71,
    0xc08ad1be, 0xc08ad7aa, 0xc08add35, 0xc08ae260, 0xc08ae72b, 0xc08aeb97, 0xc08aefa4, 0xc08af353,
    0xc08af6a4, 0xc08af998, 0xc08afc30, 0xc08afe6b, 0xc08b004b, 0xc08b01d0, 0xc08b02fa, 0xc08b03ca,
    0xc08b0440, 0xc08b045e, 0xc08b0422, 0xc08b038f, 0xc08b02a4, 0xc08b0161, 0xc08affc8, 0xc08afdd9,
    0xc08afb94, 0xc08af8f9, 0xc08af609, 0xc08af2c5, 0xc08aef2d, 0xc08aeb42, 0xc08ae703, 0xc08ae271,
    0xc08add8e, 0xc08ad858, 0xc08ad2d1, 0xc08accf9, 0xc08ac6d0, 0xc08ac057, 0xc08ab98e, 0xc08ab276,
    0xc08aab0f, 0xc08aa35a, 0xc08a9b56, 0xc08a9305, 0xc08a8a66, 0xc08a817a, 0xc08a7841, 0xc08a6ebd,
    0xc08a64ec, 0xc08a5ad0, 0xc08a5069, 0xc08a45b8, 0xc08a3abc, 0xc08a2f76, 0xc08a23e6, 0xc08a180d,
    0xc08a0bec, 0xc089ff81, 0xc089f2cf, 0xc089e5d5, 0xc089d893, 0xc089cb0a, 0xc089bd3b, 0xc089af25,
    0xc089a0c8, 0xc0899227, 0xc089833f, 0xc0897413, 0xc08964a2, 0xc08954ec, 0xc08944f2, 0xc08934b5,
    0xc0892434, 0xc089136f, 0xc0890268, 0xc088f11f, 0xc088df93, 0xc088cdc5, 0xc088bbb6, 0xc088a965,
    0xc08896d4, 0xc0888401, 0xc08870ef, 0xc0885d9c, 0xc0884a09, 0xc0883637, 0xc0882226, 0xc0880dd5,
    0xc087f946, 0xc087e479, 0xc087cf6e, 0xc087ba24, 0xc087a49e, 0xc0878eda, 0xc08778d9, 0xc087629b,
    0xc0874c21, 0xc087356b, 0xc0871e78, 0xc087074b, 0xc086efe1, 0xc086d83d, 0xc086c05e, 0xc086a844,
    0xc0868ff0, 0xc0867762, 0xc0865e9a, 0xc0864598, 0xc0862c5e, 0xc08612ea, 0xc085f93d, 0xc085df58,
    0xc085c53a, 0xc085aae4, 0xc0859057, 0xc0857592, 0xc0855a95, 0xc0853f61, 0xc08523f7, 0xc0850855,
    0xc084ec7e, 0xc084d070, 0xc084b42c, 0xc08497b2, 0xc0847b03, 0xc0845e1e, 0xc0844105, 0xc08423b6,
    0xc0840633, 0xc083e87c, 0xc083ca90, 0xc083ac71, 0xc0838e1d, 0xc0836f96, 0xc08350dc, 0xc08331ee,
    0xc08312ce, 0xc082f37b, 0xc082d3f5, 0xc082b43d, 0xc0829453, 0xc0827437, 0xc08253ea, 0xc082336b,
    0xc08212ba, 0xc081f1d9, 0xc081d0c7, 0xc081af83, 0xc0818e10, 0xc0816c6c, 0xc0814a98, 0xc0812894,
    0xc0810660, 0xc080e3fd, 0xc080c16b, 0xc0809ea9, 0xc0807bb8, 0xc0805899, 0xc080354a, 0xc08011ce,
};

/*
 * Integer approximation of x * log2(x), scaled by 2^23.
 *
 * x is converted to float and its bit pattern is treated as an integer.
 * Adding the _u[] entry selected by the top 8 mantissa bits (bits 15..22)
 * turns that pattern into an approximate fixed-point log2(x); the caller
 * rescales the result by about 2^-23.
 *
 * Intended for x >= 0.  For x == 0 the product is 0 regardless of the
 * table entry.  Assumes float and int are both 32 bits wide.
 */
static inline long long _lxlogx(int x)
{
    /*
     * Reinterpret the float through a union.  Reading a float object via
     * a cast int pointer breaks the aliasing rules and is undefined
     * behaviour; union member access is the sanctioned way in C.
     */
    union {
        float f;
        int i;
    } bits;
    int approx_log;

    bits.f = (float)x;
    approx_log = bits.i;
    approx_log += _u[(approx_log & 0x007F8000) >> 15];
    return (long long)approx_log * x;
}

/*
 * Fast approximate Shannon entropy, in bits, of a histogram.
 *
 * Uses the identity
 *     H = (T * log2(T) - sum(c_i * log2(c_i))) / T
 * with each x * log2(x) term supplied by the integer routine _lxlogx().
 *
 * counts: array of n occurrence counts; entries <= 0 are skipped,
 *         matching entropy().
 * n:      number of entries in counts.
 * total:  sum of all counts.
 *
 * Returns 0.0 when total <= 0 (no observations), so empty input does
 * not divide by zero.
 */
double fast_entropy(int *counts, int n, int total)
{
    long long s = 0;
    int i;

    if (total <= 0)
        return 0.0;

    for (i = 0; i < n; i++) {
        if (counts[i] > 0)
            s -= _lxlogx(counts[i]);
    }
    s += _lxlogx(total);
    s /= total;

    /* 0.00000011920929 is approximately 2^-23: undo the fixed-point scale. */
    return 0.00000011920929 * s;
}
原文地址:https://www.cnblogs.com/daishuo/p/3954711.html