基于huffman编码的文件压缩与解压程序的实现

1、算法名称

基于huffman编码的文件压缩与解压程序的实现。

3、运行环境说明

改程序在microsoft visual c++6.0 环境下开发完成

可执行文件可以脱离改开发环境在任何windows系统中运行

4、运行过程说明

改程序是基于huffman编码的文件压缩与解压程序，可以讲任何格式的文件压缩为.huff格式的文件，也只能解压有改程序压缩完成的文件，可以将文件解压问指定的格式。

文件运行过程如下：

（1）打开可执行程序后，选择操作的类型，1压缩文件，2解压文件，0退出程序

（2）选择后可根据提示，压缩文件时输入带格式的文件名称（绝对或相对路径名称，带文件格式），然后输入压缩后的文件名称即可。解压文件输入压缩文件名称（该文件是由该程序压缩完成的，不能输入文件格式，只要名称），然后输入解压后的文件名称（带格式即可）。

5、算法设计

该算法使用了单向有序链表和二叉树的数据结构，单向有序链表用于存储字符及其出现频率，其第一个和第二个节点即为第一小和第二小的节点，用两个节点构造新节点插入链表中，依次类推即可得到huffman二叉树。

Huffman树构造算法的设计如下：

Huffman（C）

1 n<—|C|

2 L<—C

3 for i <—1 to n-1

4 do allocate a new node z

5 left[z] <— x <— EXITRACT-MIN(L)

6 right[z] <— y <— EXITRACT-MIN(L)

7 f[z]=f[x]+f[y]

8 INSERT(L,z)

9 return EXTRACT(L); //返回根节点

huffman.h

//#include <vector>
#include <iostream>
#include <map>
#include <bitset>
#include <string>
#include <cstring>
#include <queue>
#include <fstream>

using namespace std;
/*
本程序采用单向链表的数据结构来求解huffman树，使用到了带头结点的链表的插入、删除、排序等操作
*/
typedef struct linkNode        //单向链表的结构体
{
    struct treeNode *node;  //huffman树上的节点
    linkNode *next;//指向下一个节点的指针
};


struct treeNode               //huffman树上节点的结构体
{
    unsigned char name; //节点名称，叶节点为对应的字符，内部节点为‘*’
    treeNode *left;//指向它的左子树的指针
    treeNode *right;//指向它的右子树的指针

    double value;//节点的权值，也就是字符出现的频率值，本算法中使用器出现的次数，不受影响
    string code;//节点的编码，以01字符串的形式出现
};

linkNode * new_linknode(char name,double value);//该函数用于新建一个链表上的节点，链表上的节点的结构如上所示

treeNode * new_treenode(char name,double value);//该函数用于新建一个huffman树上的节点

unsigned long char_account(ifstream &input,map<char,double> &vec_char);//统计文件中的字符 以及字符的个数，最终形成一个字符对应其出现次数的映射表

treeNode* get_huffmantree(map<char,double> &vec_char);//从一个字符与其出现次数的映射表得到哈夫曼树 ，返回的是树的节点

linkNode *get_tree_from_link(linkNode *head);//从一个权值升序的单向链表中得到一颗huffman树，最终返回的是包含huffman树根节点的链表节点

int get_encode_dictionary(treeNode *root,map<char,string> &vec_encode);//从一颗huffman树得到其叶节点对应的huffman编码

void linklist_insert(linkNode *head,linkNode *element);//在一个有序链表上插入一个新节点，所得到的链表依然有序

void string_to_binary(string str,bool array[128]);//把一个01字符串转换为对应的01布尔值存储在一个bool数组中

int compress(string input_filename,string encode_filename);//文件的压缩过程

int uncompress(string input_filename,string encode_filename);//文件的解压过程

huffman.cpp

#pragma warning(disable:4786)

#include"huffman.h"
#include <cstring>

using namespace std;

treeNode * new_treenode(char name,double value)
{
    
    treeNode *newNode=new treeNode;
    newNode->name=name;
    newNode->left=NULL;
    newNode->right=NULL;
    newNode->value=value;
    newNode->code="";

    return newNode;
}

linkNode * new_linknode(char name,double value)
{
    linkNode *newnode=new linkNode;
    newnode->node=new_treenode(name,value);
    newnode->next=NULL;
    return newnode;
}

unsigned long char_account(ifstream &input,map<char,double> &vec_char)
{
    unsigned long sizes=0;
    map<char,double>::iterator it;    //定义map的迭代器
    
    if(input==NULL)//输入文件流为空直接返回
        return 0;
    
    unsigned char buf=0;

    input.read((char*)&buf,1);// 二进制读入一个字符
    sizes++;
    while (!input.eof())//连续读入字符知道文件结束
    {
        it=vec_char.find((char)buf);
        if (it==vec_char.end())//若字符尚未出现过则插入映射表，否则出现次数加一
            vec_char.insert(map<char,double>::value_type((char)buf,1));
        else
            it->second++;
        input.read((char*)&buf,1);
        sizes++;
        
    }

    input.close();
    return sizes-1;
}

treeNode *get_huffmantree(map<char,double> &vec_char)
{
    linkNode *temp=NULL;
    linkNode *head=new linkNode;

    head->node= NULL;
    head->next=NULL;

    map<char,double>::iterator it;//容器的迭代器

    for(it=vec_char.begin();it!=vec_char.end();it++)
    {        
        temp=new_linknode(it->first,it->second);
        linklist_insert(head,temp);
    }
    // 此时的head为什么是空
    treeNode *root;
    root=get_tree_from_link(head)->next->node;
    
    return root;

}

linkNode *get_tree_from_link(linkNode *head)
{
    if (head->next==NULL)    //链表中没有节点
    {
        return NULL;
    }

    if (head->next->next==NULL)    //链表中只有一个节点
    {
        return head;
    }

    linkNode *first_min=head->next;         //获得最小节点
    linkNode *second_min=head->next->next;//获得次小节点

    head->next=second_min->next;//删除前两个节点

    linkNode *newnode=new_linknode('*',first_min->node->value+second_min->node->value);//由前两个各节点构造新节点

    newnode->node->left=first_min->node;//添加新节点的左子树
    newnode->node->right=second_min->node;//添加新节点的右子树

    linklist_insert(head,newnode);//链表中插入新节点

    get_tree_from_link(head);//递归调用，直到只剩一个根节点然后退出

    return head;
}

void linklist_insert(linkNode *head,linkNode *element)//使用带头结点的单向链表，将一个节点插入有序链表
{
    if (element==NULL)
    {
        return;
    }

    if (head->next==NULL)
    {
        head->next=element;
        element->next=NULL;
        return;
    }

    linkNode *p=head->next; //工作指针，指向第一个节点

    linkNode *q=head;//记录插入点

    while (p)
    {
        if (p->node->value < element->node->value)
        {
            q=p;
            p=p->next;
        }else{
            element->next=p;
            q->next=element;
            break;
        }
    }

    if(p==NULL)
        q->next=element;
}

void string_to_binary(string str,bool array[256])
{
    
    int len=str.length();
    int i;
    
    for (i=0;i<len;i++)//将01字符串中的字符分别转换成bool值
    {
        if (str.at(i)=='0')
        {
            array[i]=0;
        }else{
            array[i]=1;
        }
    }
}

int get_encode_dictionary(treeNode *root,map<char,string> &vec_encode)
{
    static bool flag=false;

    if(root==NULL)
    {
        return 0;
    }
    if(root->left==NULL&&root->right==NULL){       //该节点是叶节点
        vec_encode.insert(map<char,string>::value_type(root->name,root->code));
    }
    else{//节点是内部节点
        flag=true;//进入该循环说明不是只有一个节点huffman树
        root->left->code=root->code+"0";
        root->right->code=root->code+"1";
        get_encode_dictionary(root->left,vec_encode);//左子树递归调用
        get_encode_dictionary(root->right,vec_encode);//右子树递归调用
    }

    if(!flag)                       //文件中只有一种类型的字符串；
        vec_encode.begin()->second='0';

    return 1;
    
}

int compress(string input_filename,string encode_filename)//文件的压缩过程
{
    map<char,double> ch_count;
    map<char,string> encode_dictionary;
    unsigned long sizes=0;

    ifstream input(input_filename.c_str(),ios::in|ios::binary);//二进制打开输入文件
    if (!input)
    {
        cout<<"can not open the input file"<<endl;
        return 0;
    }

    ofstream output(encode_filename.c_str(),ios::out|ios::binary);//二进制打开输出文件
    if (!output)
    {
        cout<<"can not open the output file"<<endl;
        return 0;
    }

    sizes=char_account(input,ch_count);//统计字符

    input.close();
    input.clear();

    get_encode_dictionary(get_huffmantree(ch_count),encode_dictionary);//获得用于压缩的huffman编码

    //////////////////////////////////////////////////////////////////////////////////////////////////////////
    //在压缩文件中写入解码信息

    output.write((char*)&sizes,4);//写入原始文件的大小
    unsigned char num=encode_dictionary.size();
    output.write((char*)&num,1);               //写入编码的个数,也就是huffman树的长度

    int i=0;
    int bitIndex=0;

    string str;
    int len;
    char temp;
    bool array[256];
    unsigned char buf=0;

    map<char,string>::iterator it;

    for (it=encode_dictionary.begin();it!=encode_dictionary.end();it++)
    {
        str=it->second;
        len=str.length();

        output.write((char*)&(it->first),1);// 写入字符的键值

        temp=(char)len;
        output.write((char*)&temp,1);        //写入编码的长度
        string_to_binary(str,array);
        for (i=0;i<len;i++)  //write in the code of each char
        {
            buf>>=1;
            if (array[i]==1)//从array[0]到array[len]一位一位的写入buffer
            {
                buf|=0x80;
            }
            bitIndex++;

            if (bitIndex==8)            //够八位就写入
            {
                bitIndex=0;
                output.write((char*)&buf,1);//buf的内容写入ouput文件流
                buf=0;                       //buf清空
            }

        }
        if (bitIndex)//最后一个位写完，但是不够一个整八位
        {
            buf>>=8-bitIndex;
            output.write((char*)&buf,1);
            bitIndex=0;
            buf=0;
        }    
    }


    ////////////////////////////////////////////////////////////////////////////////////////////////////////

    input.open(input_filename.c_str(),ios::in|ios::binary);//重新打开输入文件
    unsigned char ch;

    bitIndex=0;
    input.read((char*)&ch,1);//读取第一个字符

    while (!input.eof())
    {
        it=encode_dictionary.find((char)ch);//获取字符的huffman编码
        str=it->second;
        string_to_binary(str,array);

        for (i=0;i<str.length();i++)//将huffman编码按位一位一位的写入压缩文件
        {
            buf>>=1;
            if(array[i])
                buf|=0x80;
            bitIndex++;
            if (bitIndex==8)//huffman编码凑够八位就按照一个字符写入压缩文件
            {
                bitIndex=0;
                output.write((char*)&buf,1);
                buf=0;
            }
        }
        input.read((char*)&ch,1);
    }

    if (bitIndex)//同理 写完不够八位则右移到最低位
    {
        buf >>= (8 - bitIndex);
        output.write((char*)&buf, 1);
    }

    input.close();
    output.close();

    return 1;
    
}

int uncompress(string input_filename,string encode_filename)//文件的解压过程
{
    ifstream input(input_filename.c_str(),ios::in|ios::binary);
    if (!input)
    {
        cout<<"can not open the input file"<<endl;
        return 0;
    }
    
    ofstream output(encode_filename.c_str(),ios::out|ios::binary);
    if (!output)
    {
        cout<<"can not open the output file"<<endl;
        return 0;
    }
//首先从压缩文件的其实字符串得到文件的解码词典 之后便可根据解码词典进行解码

    map<string,char> decode_dictionary;//定义解码字典
    unsigned long sizes=0;//用于写入文件的长度

    int num=0;    //解码字典的长度
    unsigned char num1=0;
    unsigned char len=0;
    char ch;
    unsigned char buf=0;
    string str="";
    int bitIndex=0;
    bool array[256];

    int i=0;
    int j=0;
    input.read((char*)&sizes,4);

    input.read((char*)&num1,1);//
    if (num1==0)//文件中的字符最多256个超出八个bit的表示能力 会表示问八个零
    {
        num=256;
    }else{
        num=num1;
    }

    for (i=0;i<num;i++)
    {
        input.read((char*)&ch,1);         //读入解码字典的字符
        input.read((char*)&len,1);        //读入当前字符编码的编码长度
        input.read((char*)&buf,1);
        for (j=0;j<len;j++)                //根据长度读出后续的编码01串
        {
            array[j]=buf&0x01;
            buf>>=1;
            bitIndex++;
            if(bitIndex==8)//读入的八个字符已经全部处理完
            {
                bitIndex=0;
                if(j!=(len-1))//敲好是该编码的最后一个位，则处理完毕
                    input.read((char*)&buf,1);
            }

        }

        for (j=0;j<len;j++)//由01问构造字符串
        {
            if (array[j]==1)
                str+='1';
            else 
                str+='0';
        }

        decode_dictionary.insert(map<string,char>::value_type(str,ch));//将获得的字符对应的编码插入解码器
        str="";
        bitIndex=0;
            
    }
//根据获得的解码器对文件进行解码

    buf=0;

    i=0;
    bitIndex=0;
    str="";
    bool flag=false;
    buf=0;
    char ch_test=0;

    input.read((char*)&buf,1);//二进制读入压缩文件中的一个字符
    
    map<string,char>::iterator it;
    
    while(!input.eof()){ // 若文件没有结束则按照二进制一个字符一个字符的读入所有内容
        
        for (i=0;i<256;i++)
        {
            array[i]=buf&0x01;
            if (array[i]==1)
            {
                str=str+'1';
            }else{
                str=str+'0';
            }
            if ((it=decode_dictionary.find(str)) != decode_dictionary.end())//前缀编码中，根据编码找到对应的字符，将字符写入解压文件
            {
                ch_test=it->second;
                output.write((char*)&(it->second),1);
                sizes--;
                if(sizes==0)
                    return 1;
                str="";
            }
            buf>>=1;
            bitIndex++;
            if(bitIndex==8)//读入的字符处理完毕，继续读入一个字符
            {
                bitIndex=0;            
                input.read((char*)&buf, 1);                
            }
        }
    }
    return 1;
}