文件压缩小项目haffman压缩

文件压缩的原理：

　　文件压缩总体可以分为有损压缩和无损压缩两类，有损压缩是指对mp3等格式的文件，忽略一些无关紧要的信息，只保留一些关键的信息，但并不因此影响用户对于这些mp3格式文件的体验度，无损压缩是基于比特位的压缩，即都是通过某种特殊的编码方式将数据信息中存在的重复度、冗余度有效地降低，从而达到数据压缩的目的。比如，“中国”可以代替中华人民共和国。压缩的文件必须可以解压，这样才算达到了文件压缩的目的。

　　haffman压缩：

1.定义一个字符数组vector<_char_info>(_char_info为一个结构体，存放字符的信息)，大小为256，遍历整个文档，统计文档中字符出现的次数（以字符的ascll码为下标，每当字符出现一次，就将存储在该位置的数字加一）；

2.根据统计得到的次数建立haffman树，haffman树采用孩子双亲表示法

　　priority_queue实际是一个堆，默认情况下按照按照小于的方式建堆，即默认情况下为大堆，我们建立haffman需要先找到vector<_char_info>中的子符出现次数（权值）最小的，因此需要建立小堆，这样堆顶元素即为vector<_char_info>中的最小值。建立小堆，需要自定义priority_queue，将其设置成大于的比较方式，因此需要对（）运算符进行重载，设置成大于的比较方式

3.haffman树建立好之后，根据 haffman获取文档中出现字符的haffman编码

　　记录根节点的权值，存储的文件中所有字符的个数filesize

　　获取方式：从haffman树的叶子节点开始，如果当前节点的父节点不为空，则一直往上遍历。得到的编码我们需要将其reverse一下，获得正确的编码，每reverse一下，需要将filzesize--，将获取的编码存储在_char_info结构体中的_char_Code字段，当filesize为0时，获取了所有字符的编码，进行下一步

　　往压缩文件中写源文件的信息，我们将自己压缩的文件后缀命名为.hzp，将源文件的后缀，字符信息存储到压缩的文件中，注意，解压时必须要能区分文本信息与我们写入的辅助信息，因此需要将每个字符的信息以如下的方式存入.hzp文件中

　　　将辅助信息写入压缩文件之后，开始以将源文档中字符编码按照比特位的方式一个字节一个字节写入压缩文件，当源文件的文件在此到达文件末尾的时候写入完毕，此时还要注意是否将所有的比特位都写入压缩文件，如果没有，则需要特殊处理

　　　以上的步骤做完之后，需要解压文件

　4.解压文件：

　　打开后缀为.hzp的文件，由于我们往.hzp文件中写信息的时候是以上图中的方式，因此，我们需要先读取解压后文件的后缀，存入一个字符串中str_suffix中，读取一行后接着读取需读取的辅助信息的行数，读到行数之后，将字符形式的行数转化为size_t类型，可以调用库函数atoi。将读取的信息存入_char_info结构体中（字符以及字符出现的次数），读完辅助信息之后根据辅助信息重建haffman树。

　　haffman重建之后，根据压缩文件中的字符遍历haffman树，如果读到的字节中的比特位为1，则访问haffman树的右子树，为0则访问haffman树的左子树，当走到叶子节点的时候需要将该编码对应的字符写入到解压后的文件中（每次写一个字符一个字符），这时需要将访问haffman树的指针复位，重新指向根节点，让filesize--，当filesize为0时说明解压完毕。

　5.一下为编程代码：

　　(1)头文件：

 1 #pragma once 
 2 
 3 #include<iostream>
 4 #include<vector>
 5 #include<string>
 6 #include<assert.h>
 7 #include"HaffmanTree.hpp"
 8 using namespace std;
 9 
10 typedef unsigned char UCH;
11 
12 struct Char_info{
13 
14     //构造函数
15     Char_info(size_t char_count = 0)
16         :_char_count(char_count)
17     {}
18 
19     UCH _ch;            //记录当前字符
20     long long _char_count;            //记录当前字符在文件中的位置
21     string _str_code;                //记录当前字符的haffman编码
22 
23     Char_info operator+(const Char_info& temp)const{
24         return Char_info(temp._char_count + _char_count);
25     }
26 
27     bool operator>(const Char_info& temp)const{
28         return _char_count > temp._char_count;
29     }
30 
31     bool operator<(const Char_info& temp)const{
32         return _char_count < temp._char_count;
33     }
34 
35     bool operator!=(const Char_info& temp)const{
36         return _char_count != temp._char_count;
37     }
38 
39     bool operator==(const Char_info& temp)const{
40         return _char_count == temp._char_count;
41     }
42 
43     bool operator!()const{
44         return !_char_count;
45     }
46 
47 };
48 
49 class FileCompressHaffMan{
50 public:
51     //构造函数
52     FileCompressHaffMan();
53 
54     void CompressFile(const string& strPath);            //文件压缩
55 
56     void GetHaffmanCode(HaffmanTreeNode<Char_info>* pRoot);    //获取haffman编码
57 
58     void WriteHead(FILE* DestFile, const string& strPath);            //写后缀，编码等信息
59 
60     void UNCompressFile(const string& strPath);            //解压文件
61 
62     void GetLine(FILE* fIn,string& str);//读取一行数据
63 
64     vector<Char_info> _char_info;
65     
66 };

　(2)构造函数：

1 FileCompressHaffMan::FileCompressHaffMan(){
2     _char_info.resize(256);
3     for (size_t i = 0; i < _char_info.size(); ++i){
4         _char_info[i]._ch = i;
5     }
6 }

　　　(3)压缩文件的函数：

 1 void FileCompressHaffMan::CompressFile(const string& strPath){
 2 
 3     //打开文件，记录当前字符的出现字数
 4     if (strPath.empty()){
 5         cout << "文件目录错误" << endl;
 6         return;
 7     }
 8 
 9     FILE* Source_File = fopen(strPath.c_str(), "rb");        //以二进制的形式打开文件
10 
11     if (nullptr == Source_File){
12         cout << "打开文件失败" << endl;
13         return;
14     }
15 
16     //定义读取缓冲区,接收一次读取的数据
17 
18     UCH* ReadBuff = new UCH[1024];            
19 
20     //定义记录的数组，用来统计文档中每个字符出现的次数
21 
22     
23 
24     while (true){
25         size_t ReadSize = fread(ReadBuff, 1, 1024, Source_File);
26         if (0 == ReadSize){
27             //读取文件完成
28             break;
29         }
30 
31         //统计文档中字符出现的次数
32         for (size_t i = 0; i < ReadSize; ++i){
33             _char_info[ReadBuff[i]]._char_count++;
34         }
35     }
36 
37     //将char_info的数据进行排序，取权值最小的，建立haffman树
38     HaffmanTree<Char_info> ht;
39     ht.CreateHaffmanTree(_char_info,_char_info[0]);
40 
41     //获取haffman编码
42     GetHaffmanCode(ht.GetRoot());
43     //fseek(Source_File, 0, SEEK_SET);            //将文件指针偏移到文件的开始位置
44 
45     FILE* DestFile = fopen("finshFileCompress.hzp", "wb");        //打开压缩文件
46     assert(DestFile);
47 
48     WriteHead(DestFile, strPath);                //往目标文件中写入后缀，字符编码等信息
49 
50     //写压缩文件
51     //由于当前打开额源文件的文件指针已经到达文件末尾，所以需要将文件指针重置到开头
52     fseek(Source_File, 0, SEEK_SET);
53     char ch = 0;
54     int bitcount = 0;
55     while (true){
56         size_t Read_Size = fread(ReadBuff, 1, 1024, Source_File);
57         if (0 == Read_Size){
58             break;
59             //读取文件完毕，结束
60         }
61         
62         //以比特位的方式将字符编码写入到文件中
63         for (size_t i = 0; i < Read_Size; ++i){
64             string& char_Code = _char_info[ReadBuff[i]]._str_code;
65             for (size_t j = 0; j < char_Code.size(); ++j){
66                 ch <<= 1;
67                 if ('1' == char_Code[j]){
68                     ch |= 1;
69                 }
70                 bitcount++;
71                 if (8 == bitcount){
72                     fputc(ch, DestFile);
73                     //一个字节一个字节写入文件
74                     bitcount = 0;
75                 }
76             }
77         }
78     }
79     //写完之后还有比特位剩余，则将剩余的比特位写入文件
80     if (bitcount > 0 && bitcount < 8){
81         ch <<= (8 - bitcount);                //等于号啊啊啊啊啊
82         fputc(ch, DestFile);
83     }
84     
85     
86     delete[] ReadBuff;
87     fclose(DestFile);            //关闭压缩文件
88     fclose(Source_File);        //关闭源文件
89 
90     
91 }

 1 void FileCompressHaffMan::GetHaffmanCode(HaffmanTreeNode<Char_info>* pRoot)    //获取haffman编码
 2 {
 3     if (nullptr == pRoot){
 4         return;//没写这个判断条件就会造成死循环
 5     }
 6     GetHaffmanCode(pRoot->_pLeft);
 7     GetHaffmanCode(pRoot->_pRight);
 8 
 9     if ((pRoot->_pLeft == nullptr)&&(pRoot->_pRight == nullptr)){
10         HaffmanTreeNode<Char_info>* pCur = pRoot;
11         HaffmanTreeNode<Char_info>* pParent = pCur->_pParent;
12 
13         string& strCode = _char_info[pCur->_weight._ch]._str_code;
14         while (nullptr!=pParent){
15             if (pCur == pParent->_pLeft){
16                 strCode += '0';
17             }
18             else
19             {
20                 strCode += '1';
21             }
22             //pParent要往下走，不然会形成死循环
23             pCur = pParent;
24             pParent = pCur->_pParent;
25         }
26         reverse(strCode.begin(),strCode.end());        //翻转字符串，将haffman编码写入
27     }
28 }
29 
30 void FileCompressHaffMan::WriteHead(FILE* DestFile, const string& strPath){
31     string str_suffix = strPath.substr(strPath.rfind('.'));            //截取文件的后缀写入目标文件中
32     str_suffix += '
';
33 
34     string str_charinfo;
35     char szCount[32];
36     size_t linecount = 0;
37 
38     for (size_t i = 0; i < 256; ++i){
39         if (_char_info[i]._char_count){
40             str_charinfo += _char_info[i]._ch;
41             str_charinfo += '/';
42             _itoa(_char_info[i]._char_count, szCount, 10);
43             str_charinfo += szCount;
44             str_charinfo += '
';
45             linecount++;
46         }
47     }
48     _itoa(linecount, szCount, 10);
49     str_suffix += szCount;
50     str_suffix += '
';
51 
52     str_suffix += str_charinfo;
53     fwrite(str_suffix.c_str(), 1, str_suffix.size(), DestFile);
54 
55 }

　　　（4）创建haffman树（采用模板的方式）

  1 #pragma once
  2 #include<vector>
  3 #include<string>
  4 #include<queue>
  5 using namespace std;
  6 
  7 //以模板的形式写haffman树
  8 
  9 //haffman树的结点信息
 10 template<class T>
 11 struct HaffmanTreeNode{
 12 
 13     typedef HaffmanTreeNode<T> HFNode;
 14 
 15     T _weight;            //haffman树节点的权值，存放文档中字符出现的次数
 16     HFNode* _pLeft;        //haffman树的左孩子
 17     HFNode* _pRight;        //haffman树的右孩子
 18     HFNode* _pParent;    //haffman树的结点的双亲
 19 
 20     //构造函数，初始化haffman树节点的权值以及指针
 21     HaffmanTreeNode(const T& weight)
 22         :_weight(weight)
 23         , _pLeft(nullptr)
 24         , _pRight(nullptr)
 25         , _pParent(nullptr)
 26     {}
 27 };
 28 
 29 //自定义priority_queue的排序方式，建立小堆（默认比较方式是>，即默认情况下建立大堆）
 30 template<class T>
 31 struct compare{
 32     typedef HaffmanTreeNode<T>* pHFNode;
 33     bool operator()(const pHFNode& pLeft,const pHFNode& pRight){
 34         return pLeft->_weight > pRight->_weight;
 35     }
 36 };
 37 
 38 //haffman树
 39 template<class T>
 40 class HaffmanTree{
 41 public:
 42     typedef HaffmanTreeNode<T> HFNode;
 43     typedef HFNode* pHFNode;
 44 
 45     HaffmanTree()
 46         :_pRoot(nullptr)
 47     {}
 48     ~HaffmanTree(){
 49         _delete(_pRoot);
 50     }
 51 
 52     //v用来存放Char_info结构体
 53     void CreateHaffmanTree(const vector<T>& v,const T& invalid){
 54         if (v.empty()){
 55             return;
 56             //v中为空，则说明信息不存在
 57         }
 58 
 59         //通过priority_queue建立小堆，对v中的数据进行排序
 60         priority_queue<pHFNode, vector<pHFNode>, compare<T>> q;
 61         //入队
 62         for (size_t i = 0; i < v.size(); ++i){
 63             if (v[i] != invalid){
 64                 q.push(new HFNode(v[i]));
 65             }
 66             
 67         }
 68 
 69         //如果q中的个数大于1.则说明haffman树环没有创建成功
 70         while (q.size()>1){
 71             pHFNode pLeft = q.top();
 72             q.pop();
 73             pHFNode pRight = q.top();
 74             q.pop();
 75 
 76             pHFNode pParent = new HFNode(pLeft->_weight + pRight->_weight);
 77             pParent->_pLeft = pLeft;
 78             pParent->_pRight = pRight;
 79             pLeft->_pParent = pParent;
 80             pRight->_pParent = pParent;
 81             //将新的子树插入优先级队列
 82             q.push(pParent);
 83         }
 84         _pRoot = q.top();            //根节点赋值给_pRoot
 85     }
 86 
 87     //获取haffman树的根节点
 88     pHFNode GetRoot(){
 89         return _pRoot;
 90     }
 91 
 92     //销毁haffman树
 93     void _delete(HaffmanTreeNode<T>* pRoot){
 94         if (pRoot == nullptr){
 95             return;
 96         }
 97         _delete(pRoot->_pLeft);
 98         _delete(pRoot->_pRight);
 99 
100         delete pRoot;
101     }
102 
103 private:
104     pHFNode _pRoot;            //haffman树的根节点
105 };

　　　　(6)解压文件

 1 void FileCompressHaffMan::GetLine(FILE* fIn, string& str)//读取一行数据
 2 {
 3     //feof用来检测文件指针是否到达文件末尾
 4     while (!feof(fIn)){
 5         UCH ch = fgetc(fIn);
 6         if (ch == '
'){
 7             return;
 8         }
 9         str += ch;
10     }
11 }
12 
13 void FileCompressHaffMan::UNCompressFile(const string& strPath){
14     string& str_suffix = strPath.substr(strPath.rfind('.'));
15     if (".hzp" != str_suffix){
16         cout << "文件格式不匹配" << endl;
17         return;
18     }
19     FILE* fIn = fopen(strPath.c_str(), "rb");
20     if (nullptr == fIn){
21         return;
22         //打开文件失败
23     }
24     
25     string strfix = "";        //获取文件的后缀
26     GetLine(fIn, strfix);
27 
28     string strChar = "";
29     GetLine(fIn, strChar);
30     size_t linecount = atoi(strChar.c_str());
31     for (size_t i = 0; i < linecount; ++i){
32         strChar = "";
33         GetLine(fIn, strChar);
34         if (strChar.empty()){
35             strChar += '
';
36             GetLine(fIn, strChar);
37         }
38         _char_info[(UCH)strChar[0]]._char_count = atoi(strChar.c_str() + 2);
39         //加2的原因是要把字符以及分隔符跳过去
40     }
41 
42     //创建haffman树
43     HaffmanTree<Char_info> ht;
44     ht.CreateHaffmanTree(_char_info, _char_info[0]);
45     
46     string UNCompress = "finshFileCompress";
47     UNCompress += strfix;        //解压后文件的后缀
48 
49     FILE* fOut = fopen(UNCompress.c_str(), "wb");
50     assert(fOut);
51 
52     char* pReadBuff = new char[1024];
53     
54 
55     HaffmanTreeNode<Char_info>* pCur = ht.GetRoot();
56 
57     char pos = 7;
58     long long filesize = pCur->_weight._char_count;
59 
60     while (true){
61         size_t ReadSize = fread(pReadBuff, 1, 1024, fIn);
62         if (ReadSize == 0){
63             break;//文件读取完毕
64         }
65 
66         for (size_t i = 0; i < ReadSize; ++i){
67             pos = 7;
68             char ch = pReadBuff[i];
69             for (size_t j = 0; j < 8; ++j){
70                 //测试代码
71                 //cout << pReadBuff[j] << endl;
72                 //cout << (1 << pos) << endl;
73                 //cout << (pReadBuff[j] & (1 << pos)) << endl;
74                 if (ch&(1<<pos)){
75                     pCur = pCur->_pRight;
76                 }
77                 else{
78                     pCur = pCur->_pLeft;
79                 }
80 
81                 if (nullptr == pCur->_pLeft&&nullptr == pCur->_pRight){
82                     fputc(pCur->_weight._ch, fOut);
83                     pCur = ht.GetRoot();
84                     filesize--;
85                     if (0 == filesize){
86                         break;//解压缩完成
87                     }
88                 }
89                 pos--;
90             }
91         }
92     }
93     delete[] pReadBuff;
94     fclose(fIn);
95     fclose(fOut);
96 }

注意事项：

　　1.在写代码时注意语法错误，ch|1没“=”的这种错误编译器是无法检测出来的

　　2.创建haffman树的时候需要将那些权值不为0的字符信息存入haffman树，所以需要键入判断条件

　　3.在读写文件时需要以二进制的方式打开文件，否则如果以文本的方式打开文件，当文档中遇到-1时，会认为文档结束，因此会造成文档不能完全压缩，只压缩一部分。如果以二进制的方式打开文件，则会避免这种问题

　　4.解压文件的时候需要处理换行，如果读到一行，读到的字符串为空串，说明读到了换行，此时需要我们判断，如果字符串为空，则需要将换行加入字符串写入文件，并且再读一次，否则当文档中遇到换行的时候会解析出错

　　5.重建haffman树之后，读完叶子节点之后需要将pCur重置，归位

　　6.由于中文所占字节数比普通的英文字符多，因此不能用char，char类型存不下中文字符，因此需要用unsigned char存放