C++回顾统计词频问题 -- vector、map、hash_map（三种方式时间比较）

本博文我们通过三个程序比较统计词频问题的时间复杂度问题（末尾有用时及其分析）；

问题描述;

1）、找一篇文章，将所有单词输入至程序；（The Bible Holy为例）

2）、统计出每个单词的数量，即词频问题；

3）、增加停用词功能；（遇到此类词，直接略过）（网上搜）

4）、分别统计出读取文件并计算词频时间、排序所用时间；

5）、用类实现各函数（处统计时间的函数除外）。

vector、map、hash_map 都要处理字符串的去除标点符号、将大写字母转换成小写字母、不对数字进行统计问题。因此，我们可以将处理这些函数进行封装。

StringUtil.h

 1 #ifndef STRING_UTIL_H_
 2 #define STRING_UTIL_H_
 3 
 4 #include <string>
 5 
 6 namespace stringutil
 7 {
 8 void erasePunct(std::string &s);
 9 void stringToLower(std::string &s);
10 bool isAllDigit(const std::string &s);
11 }
12 
13 #endif  /* STRING_UTIL_H_ */

StringUtil.cpp

 1 #include "StringUtil.h"
 2 #include <ctype.h>
 3 
 4 using namespace std;
 5 
 6 namespace stringutil
 7 {
 8 
 9 void erasePunct(string &s)
10 {
11     string::iterator it = s.begin();
12     while(it != s.end())
13     {
14         if(ispunct(*it))
15             it = s.erase(it);
16         else
17             ++it;
18     }
19 }
20 void stringToLower(string &s)
21 {
22     for(string::iterator it = s.begin();
23         it != s.end();
24         ++it)
25     {
26         if(isupper(*it))
27             *it = tolower(*it);
28     }
29 }
30 
31 bool isAllDigit(const std::string &s)
32 {
33     for(string::const_iterator it = s.begin();
34         it != s.end();
35         ++it)
36     {
37         if(!isdigit(*it))
38             return false;
39     }
40 
41     return true;
42 }
43 
44 }

一、采用vector、pair；

采用的数据结构：

1）、vector<string> stopList用于存放停用词；

2）、vector<pair<string, int> > words 用于存放除禁用词之外的其他The Bible Holy 的单词；

思路：

1）、用ReadStopFile函数读取停用词到 stopList中；

2）、用ReadWordFile函数读取The Bible Holy到 words中；在我们读取的过程中首先要去除标点符号、将word转化为小写、跳过数字；

3）、然后判断该单词是否在 stoplist 中，若不在，最后添加单词至 words中。

4）、对词频排序，这里我们采用algorithm库中的sort函数；

5）、对words vector进行遍历，打印之；

这里注意一点：除了在 main.cpp中显式调用的函数，我们将其他函数声明为类的私有函数。

函数声明如下：

 1 #ifndef WORDFREQUENCY_H_
 2 #define WORDFREQUENCY_H_
 3 
 4 #include <vector>
 5 #include <string>
 6 #include <utility>
 7 
 8 class WordFrequency
 9 {
10     public:
11         WordFrequency(const std::string &filename, const std::string &stopFile);//初始化
12         void ReadStopFile();
13         void ReadWordFile();
14         void sortWordByFrequency();
15         void printWordFrequency()const;
16     private:
17         void addWordToDict(const std::string &word);
18         bool isStopWord(const std::string &word)const;
19 
20         typedef std::vector<std::pair<std::string, int> >::iterator Wordit;//为了方便
21         typedef std::vector<std::pair<std::string, int> >::const_iterator Wordkit;
22 
23         std::string filename_;
24         std::string stopFile_;
25         
26         std::vector<std::string> stopList_;
27         std::vector<std::pair<std::string, int> > words_;
28 
29 };
30 
31 #endif

函数实现如下：

 1 //WordFrequency.cpp
 2 #include "WordFrequency.h"
 3 #include "StringUtil.h"
 4 #include <algorithm>
 5 #include <stdexcept>
 6 #include <stdlib.h>
 7 #include <fstream>
 8 
 9 using namespace std;
10 using namespace stringutil;
11 
12 WordFrequency::WordFrequency(const std::string &filename, const std::string &stopFile)
13     :filename_(filename),stopFile_(stopFile)
14 { }
15 
16 void WordFrequency::ReadStopFile()
17 {
18     ifstream in(stopFile_.c_str());
19     if( !in )
20         throw runtime_error("open file failure");
21     string word;
22     while(in >> word)
23     {
24         stopList_.push_back(word);
25     }
26     in.close();
27 }
28 
29 void WordFrequency::ReadWordFile()
30 {
31     ifstream infile(filename_.c_str());
32     if( !infile )
33         throw runtime_error("open file failure!");
34     string word;
35     while(infile >> word)
36     {
37         erasePunct(word);
38         if( isAllDigit(word))
39             continue;
40         stringToLower(word);
41         if( !isStopWord(word))
42             addWordToDict(word);
43     }
44     infile.close();
45 }
46 
47 bool WordFrequency::isStopWord(const string &word)const
48 {
49     vector<string>::const_iterator it = stopList_.begin();
50     while( it  != stopList_.end())
51     {
52         if( *it == word) 
53             break;
54         it ++;
55     }
56     return (it != stopList_.end());
57 }
58 
59 void WordFrequency::addWordToDict(const string &word)
60 {
61     Wordit it = words_.begin();
62     while( it != words_.end())
63     {
64         if( it->first == word)
65         {
66             ++ it->second ;
67             break;
68         }
69         it ++;
70     }
71     if(it == words_.end())
72         words_.push_back(make_pair(word, 1)) ;
73 }
74 
75 bool cmp(const pair<string, int> &a, const pair<string, int>&b)
76 {
77     return a.second > b.second;
78 }
79 
80 void WordFrequency::sortWordByFrequency()
81 {
82     sort(words_.begin(), words_.end(), cmp);
83 }
84 
85 void WordFrequency::printWordFrequency()const
86 {
87     Wordkit it = words_.begin();
88     while(it != words_.end())
89     {
90         printf("words: %s, frequency: %d
",it->first.c_str(),it->second);
91         it ++;
92     }
93 }

main.cpp

 1 //main.cpp
 2 #ifndef _STDC_FORMAT_MACROS
 3 #define _STDC_FORMAT_MACROS
 4 #endif /* _STDC_FORMAT_MACROS */
 5 #include <inttypes.h>
 6 
 7 #include "WordFrequency.h"
 8 #include <iostream>
 9 #include <vector>
10 #include <stdexcept>
11 #include <sys/time.h>
12 #include <stdio.h>
13 #include <string.h>
14 using namespace std;
15 
16 int64_t gettime();
17 
18 int main(int argc, const char *argv[])
19 {
20     if(argc < 3)
21         throw runtime_error("exe,filename, stopfile");
22     int64_t starttime =gettime();
23     WordFrequency wf(argv[1], argv[2]);
24     wf.ReadStopFile();
25     wf.ReadWordFile();
26     
27     int64_t readtime = gettime();
28     wf.sortWordByFrequency();
29 
30     int64_t sorttime = gettime();
31 
32     printf("readfile time: %lld ms
",(readtime-starttime)/1000); 
33     printf("sortfile time: %lld ms
",(sorttime-readtime)/1000);  
34     
35     wf.printWordFrequency();
36     return 0;
37 }
38 
39 int64_t gettime()
40 {
41     struct timeval tv;
42     ::memset(&tv, 0, sizeof tv);
43     if(::gettimeofday(&tv, NULL) == -1)
44     {
45         throw runtime_error("gettimeofday");
46     }
47     int64_t t = tv.tv_usec;
48     t += tv.tv_sec * 1000 * 1000;
49     return t;
50 }

二、map、set；

相较于第一种，这种方式用到的数据结构有

1、std::set<std::string> StopList_ 用于存放停用词；
2、std::map<std::string, int> words_ 用于存放The Bible Holy 中除了停用词之外的词；

3、std::vector< std::pair<std::string,int> > copyWords_ 由于map不具备排序功能，故需要将其转存均vector中。

WordFrequency.h

 1 #ifndef WORDFREQUENCY_H_
 2 #define WORDFREQUENCY_H_
 3 
 4 #include <vector>
 5 #include <map>
 6 #include <set>
 7 #include <string>
 8 class WordFrequency
 9 {
10     public:
11         WordFrequency(const std::string &filename, const std::string &stopfile);
12         void ReadStopFile();
13         void ReadWordFile();
14 
15         void copyWordToVector();
16         void sortWordByFrequency();
17         void printFrequency()const;
18         
19     private:
20         std::string filename_;
21         std::string stopfile_;
22 
23         std::set<std::string> StopList_;
24         std::map<std::string, int> words_;
25             
26         std::vector< std::pair<std::string,int> > copyWords_;
27 };
28 
29 #endif

WordFrequency.cpp

 1 //WordFrequency.cpp
 2 #include "WordFrequency.h"
 3 #include "StringUtil.h"
 4 #include <fstream>
 5 #include <algorithm>
 6 #include <stdio.h>
 7 #include <stdexcept>
 8 using namespace std;
 9 using namespace stringutil;
10 
11 WordFrequency::WordFrequency(const string &filename, const string &stopfile)
12     :filename_(filename),stopfile_(stopfile)
13 { }
14 
15 void WordFrequency::ReadStopFile()
16 {
17     ifstream infile(stopfile_.c_str());
18     if( !infile )
19         throw runtime_error("open file failure");
20     string word;
21     while(infile >> word)
22         StopList_.insert(word);
23 
24     infile.close();
25 }
26 
27 void WordFrequency::ReadWordFile()
28 {
29     ifstream infile(filename_.c_str());
30     if( !infile )
31         throw runtime_error("open file failure");
32     words_.clear();
33     string word;
34     while(infile >> word) //读取单词
35     {
36         erasePunct(word); //去除标点
37         stringToLower(word);//转为小写
38         if(isAllDigit(word))//去除数字
39             continue;
40         if( StopList_.count(word) == 0) //set中 count 标识 word这个单词是否存在
41             words_[word]++; //如果存在，int++， 如果不存在，增添至 map中 
42             //words_[word]= cnt（词频数）
43     }
44     
45     infile.close();
46 }
47 
48 void WordFrequency::copyWordToVector()
49 {
50     copyWords_.clear();
51     //back_inserter(copyWords_)
52     //front_inserter
53     copy(words_.begin(),words_.end(),back_inserter(copyWords_));
54 }
55 
56 bool cmp(const pair<string, int> &a, const pair<string, int> &b)
57 {
58     return a.second > b.second;
59 }
60     
61 void WordFrequency::sortWordByFrequency()
62 {
63     sort(copyWords_.begin(), copyWords_.end(),cmp); //排序
64 }
65 
66 void WordFrequency::printFrequency() const
67 {
68     for(vector<pair<string, int> >::const_iterator it = copyWords_.begin(); 
69          it != copyWords_.end(); 
70          ++it)
71     {
72         printf("word :%s, frequency: %d
", it->first.c_str(), it->second); //打印
73     }
74 }

main.cpp：

比上一程序的main函数多了一行

1 //多出的一行copyWordToVector
2 int64_t readtime = gettime();
3 wf.copyWordToVector();
4 wf.sortWordByFrequency();

三、哈希表（c11特性）：

程序三与程序二非常相似，只是其头文件WordFrequency.h不同，因此我们只给出 .h文件；

注意编译时，g++ *.cpp -std=c++0x

WordFrequency.h

 1 //WordFrequency.h
 2 #ifndef WORDFREQUENCY_H_
 3 #define WORDFREQUENCY_H_
 4 
 5 #include <vector>
 6 #include <unordered_map>
 7 #include <unordered_set>
 8 #include <string>
 9 class WordFrequency
10 {
11     public:
12         WordFrequency(const std::string &filename, const std::string &stopfile);
13         void ReadStopFile();
14         void ReadWordFile();
15 
16         void copyWordToVector();
17         void sortWordByFrequency();
18         void printFrequency()const;
19         
20     private:
21         std::string filename_;
22         std::string stopfile_;
23 
24         std::unordered_set<std::string> StopList_;
25         std::unordered_map<std::string, int> words_;
26             
27         std::vector< std::pair<std::string,int> > copyWords_;
28 };
29 
30 #endif

以上三个程序（所采用的数据结构不同）性能比较：

程序一：读取文件时间：42997ms、排序时间 7ms；

程序二：读取文件时间：1294ms、排序时间 10ms；

程序三：读取文件时间： 616ms、排序时间 9ms；

分析：

读取时间：

程序一采用的是vector，所以每一次从 txt 读取一个单词都要先遍历一遍停用词的vector，然后再顺序遍历一次 vector，以查找该单词是否已经存在，故其平均查找时间复杂度为 O（N*N）；

程序二采用的是 map与set。map的实现是根据平衡二叉树（具体是红黑树），其查找平均速度是n*lgN；而且如果不存在时，自动添加该元素至map中，词频置为1。

程序三采用的是哈希表， unordered_map，与unordered_set ，其查找时间复杂度为常量O(1)；

排序时间：

程序一所用时间比程序二、三多了几毫秒，这是因为map不具备排序功能，因为要转存至vector中。而多出来的这几毫秒就是转存时间。

C++回顾 统计词频问题 -- vector、map、hash_map（三种方式时间比较）

C++回顾统计词频问题 -- vector、map、hash_map（三种方式时间比较）