C++优化和计算速度(碎碎念)

实现一个文件编码（冗余）方式，各种找资料，最后使用伽罗瓦域+线性方程组。
注：代码中的类暂时不想公开，以后我觉得拿得出手的时候，会贴出来。
以上是背景。

以下是代码思路及其变化：

基础：
一个使用高斯消元法解线性方程组的Matrix类。一个有限域（伽罗瓦域）运算的类galois_field。
最初都是网上找来的代码，或存在于开源项目，或是好心人共享的代码。
拿来后阉割掉用不到的部分，全部抽象画为模板类。
最初使用一个BaseComputer类，virtual了add,min,mul,div四个纯虚函数。
galois_field继承自BaseComputer，实现了基于有限域的基本运算法则。
然后Matrix类构造函数包含一个BaseComputer指针，之后内部所有运算使用BaseComputer完成。
最后给出value_t，可以自定义内部运算使用的数据类型。如果你愿意的话，使用BigInteger，甚至Rational（有理数）也是可以的。（请自己负责数据溢出）

 1 template<typename value_t>
 2 class base_computer
 3 {
 4 public:
 5     typedef value_t value_t;
 6 
 7     virtual value_t add(const value_t &left, const value_t &right) = 0;
 8     virtual value_t min(const value_t &left, const value_t &right) = 0;
 9     virtual value_t mul(const value_t &left, const value_t &right) = 0;
10     virtual value_t div(const value_t &left, const value_t &right) = 0;
11 };

 1 template<typename value_t, int bits, int polynomial>
 2 class galois_field : public base_computer<value_t>
 3 {
 4     int field_size;
 5     value_t *pwr_of;
 6     value_t *val_of;
 7 
 8     void build_gf();
 9 
10 public:
11     typedef value_t value_t;
12     galois_field();
13     galois_field(const galois_field & g);
14     ~galois_field();
15 
16     value_t power_of(const value_t value) const;
17     value_t value_of(const value_t power) const;
18     int size() const;
19     int power() const;
20 
21     galois_field & operator=(const galois_field & g);
22 
23     virtual value_t add(const value_t &left, const value_t &right);
24     virtual value_t min(const value_t &left, const value_t &right);
25     virtual value_t mul(const value_t &left, const value_t &right);
26     virtual value_t div(const value_t &left, const value_t &right);
27 };

 1 template<typename value_t>
 2 class Matrix
 3 {
 4 public:
 5     typedef value_t value_t;
 6     Matrix(base_computer<value_t> *c, value_t *matrix, int rowCount, int colCount);
 7     ~Matrix();
 8 private:
 9     int rowCount;
10     int colCount;
11     value_t* matrix;
12     value_t** matrixJump;
13     base_computer<value_t> *C;
14     int FindMaxRow(int colIndex);                       //选主元
15     void MulAndAdd(int srcRow, value_t k, int dstRow);  //第srcRow行乘以k加到dstRow行
16     void Mul(int rowIndex, value_t k);                  //rowIndex行乘以k
17     void SwapRow(int srcRow, int dstRow);               //交换矩阵两行
18 
19 public:
20     value_t At(int row, int col) const;
21     value_t &At(int row, int col);
22     int Gasuss();                                       //返回0成功，其它数字表示有问题的方程个数
23     int RowCount() const;
24     int ColCount() const;
25 };

阶段性成果：
数据类型使用unsigned short, 运算器使用galois_field<unsigned short, 16, 0x1100B>(被typedef为galois_field_16）
测试64元方程组，预定解，随机64组系数，当场构建一个64x65的矩阵，交给Matrix类解出验证结果是否正确。
重复10000次，耗时42秒。

改进：
42秒真心慢，思考应该是纯虚函数的vptr跳转，以及使用方式，使得C++优于C语言的特性完全没有发挥出来。
继续改进，BaseComputer类不要了。galois_field的所有成员改成静态，Matrix的运算器也由模板管理。所有的模板，可能频繁调用的都标记inline。
最后实现一个template<typename value_t> class default_compute，用作Matrix类的默认参数。

 1 template<typename value_t>
 2 class default_computer
 3 {
 4 public:
 5     typedef value_t value_t;
 6 
 7     inline static value_t add(const value_t &left, const value_t &right){ return left + right; }
 8     inline static value_t min(const value_t &left, const value_t &right){ return left - right; }
 9     inline static value_t mul(const value_t &left, const value_t &right){ return left * right; }
10     inline static value_t div(const value_t &left, const value_t &right){ return left / right; }
11 };

 1 template<typename value_t, int bits, int polynomial>
 2 class galois_field
 3 {
 4 private:
 5     galois_field();
 6     ~galois_field();
 7     
 8     static galois_field<value_t, bits, polynomial> gf;
 9     static int field_size;
10     static value_t *pwr_of;
11     static value_t *val_of;
12 
13     void build_gf();
14 
15 public:
16     typedef value_t value_t;
17 
18     static value_t power_of(const value_t value);
19     static value_t value_of(const value_t power);
20     static int size();
21     static int power();
22 
23     static value_t add(const value_t &left, const value_t &right);
24     static value_t min(const value_t &left, const value_t &right);
25     static value_t mul(const value_t &left, const value_t &right);
26     static value_t div(const value_t &left, const value_t &right);
27 };

 1 template<typename value_t, class computer_t = default_computer<value_t> >
 2 class Matrix
 3 {
 4 public:
 5     typedef value_t value_t;
 6     Matrix(value_t *matrix, int rowCount, int colCount);
 7     ~Matrix();
 8 private:
 9     int rowCount;
10     int colCount;
11     value_t* matrix;
12     value_t** matrixJump;
13     int FindMaxRow(int colIndex);                       //选主元
14     void MulAndAdd(int srcRow, value_t k, int dstRow);  //第srcRow行乘以k加到dstRow行
15     void Mul(int rowIndex, value_t k);                  //rowIndex行乘以k
16     void SwapRow(int srcRow, int dstRow);               //交换矩阵两行
17 
18 public:
19     value_t At(int row, int col) const;
20     value_t &At(int row, int col);
21     int Gasuss();                                       //返回0成功，其它数字表示有问题的方程个数
22     int RowCount() const;
23     int ColCount() const;
24 };

阶段性成果：
重复上面的测试，时间变成了22秒。

此时手贱，把数据类型改成double，运算器使用default_computer。

重复上面的测试，时间是9秒。

上面的9秒是失误，不小心引入特殊数据了。

对矩阵内部计算又调整优化了一下。

以下是测试代码。

 1 int main(){
 2 #if 1
 3     typedef galois_field_16 computer_t;
 4 #else
 5     typedef default_computer<double> computer_t;
 6 #endif
 7 
 8     typedef computer_t::value_t value_t;
 9 
10     const int n = 128;
11     const int l = 1000;
12     Matrix<value_t, computer_t> matrix(NULL, n, n + 1);
13     value_t resault[n];
14 
15     int begin = clock();
16     for(int loop = 0; loop < l; ++loop)
17     {
18         for(int i = 0; i < n; ++ i) resault[i] = value_t(rand() % 65536);
19         for(int y = 0; y < n; ++ y)
20             for(int x = 0; x < n; ++ x)
21                 matrix.At(y, x) = value_t(rand() % 65536);
22         for(int y = 0; y < n; ++ y)
23         {
24             matrix.At(y, n) = 0;
25             for(int x = 0; x < n; ++ x)
26             {
27                 matrix.At(y, n) = computer_t::add(matrix.At(y, n), computer_t::mul(matrix.At(y, x), resault[x]));
28             }
29         }
30         matrix.Gasuss();
31     }
32     cout << clock() - begin << endl;
33     system("pause");
34     return 0;
35 }
36

就算排除掉特殊数据，结果依然是令人沮丧的。double作为运算类型，使用原本的加减乘除，要比伽罗瓦域构的计算快一倍以上。

思考：

从虚函数表改变为内联展开的模板，缩短了近一半的时间，可见当调用次数累计到可观的程度也是非常可怕的，当然，这也是C++比C高效的一部分原因。

伽罗瓦域的加减法是一样的，都是异或。那么速度一定比double快，但是乘除法就不同了。

伽罗瓦域的乘除需要3次查表，1次布尔运算，1或2次加减运算，最少需要访问5次内存。

double的话，就算是乘除法，所有操作也都是在寄存器完成的。

至此，想说的也说完了。

很想了解别人是怎么实现在有限域解线性方程组的，是更加高效呢？还是说有硬件辅助呢？希望有人指正。