第二次作业——词频统计2.0

作业要求：

　　参见博客：http://www.cnblogs.com/jiel/p/3311400.html

设计思路：

　　定义一个词典类，实现查找，插入，排序，输出等操作。

　　对于扩展要求-e，在全部单词统计完成后，对词典执行一次合并操作即可。

已经实现功能：

统计词频，按照出现次数由高到低输出到文件；
普通操作中，仅大小写不同的单词算作同一个单词，输出时取其中ASCII码序最小的写法；
-e 操作中，仅末尾数字后缀不同的单词算作同一个单词，输出时取其中ASCII码序最小的写法。

代码：

　　GitHub: https://github.com/chynphh/Word-frequency-program

#include <iostream>
#include <string>
#include <stdio.h>
#include <fstream>
#include <algorithm>
using namespace std;

// Capacity of the dictionary: the fixed length of Dictionary's word/freq arrays.
const int LIST_MAX = 10000;
// Number of dictionary slots currently in use. Read and updated by the
// Dictionary methods, and adjusted by main() after a merge.
static int num = 0;

// Used by the -e feature: returns `w` with its trailing run of decimal digits
// removed (e.g. "file123" -> "file"). A string made only of digits yields "".
// Only the characters '0'..'9' are stripped; any other trailing character
// (punctuation, whitespace, non-ASCII bytes) ends the scan and is kept.
std::string extract(std::string w)
{
    std::string::size_type keep = w.size();
    while (keep > 0 && w[keep - 1] >= '0' && w[keep - 1] <= '9')
    {
        --keep;
    }
    return w.substr(0, keep);
}

// Dictionary of words and their occurrence counts, stored in two parallel
// fixed-size arrays (word[i] occurred freq[i] times). The number of slots in
// use is tracked by the file-level counter `num`, not by the class itself.
class Dictionary
{
public:
    int find(string w); // case-insensitive lookup: index of the word, or -1 if absent
    void put(string w, int loc); // count one occurrence of w in slot loc (loc == num appends a new entry)
    void sort(); // order entries by frequency (high to low), then by word (ascending)
    void out(); // write the first `num` entries to output.txt
    int merge(); // -e feature: fold together words that differ only in trailing digits; returns how many entries were folded away

private:
    string word[LIST_MAX];
    int freq[LIST_MAX] = {0};
};

// Looks up `w` among the first `num` stored words, ignoring letter case.
// Returns the index of the first matching entry, or -1 when there is none.
int Dictionary::find(std::string w)
{
    // Two characters match when their lower-case forms are the same.
    const auto sameLetter = [](char a, char b) {
        return static_cast<char>(::tolower(a)) == static_cast<char>(::tolower(b));
    };

    for (int idx = 0; idx < num; ++idx)
    {
        const std::string &candidate = word[idx];
        if (candidate.size() != w.size())
            continue; // different lengths can never match
        if (std::equal(candidate.begin(), candidate.end(), w.begin(), sameLetter))
            return idx;
    }
    return -1;
}

void Dictionary::put(string w, int loc)
{
    freq[loc]++;
    if(loc == num)
    {
        word[loc] = w;
        num++;
    }
    if(w.compare(word[loc]) < 0)
        word[loc].assign(w);
    return ;

}


// Bubble-sorts the first `num` entries: higher frequency first; for equal
// frequency, ascending word order, where an entry with an empty word is always
// pushed behind its neighbour.
void Dictionary::sort()
{
    const int last = num - 1;
    for (int pass = 0; pass < last; ++pass) // num-1 passes over the array
    {
        const int limit = last - pass; // the tail beyond this index is already in place
        for (int k = 0; k < limit; ++k)
        {
            bool outOfOrder;
            if (freq[k] != freq[k + 1])
                outOfOrder = freq[k] < freq[k + 1];
            else
                outOfOrder = word[k].empty() || word[k].compare(word[k + 1]) > 0;

            if (outOfOrder)
            {
                // Keep the two parallel arrays in step.
                std::swap(freq[k], freq[k + 1]);
                std::swap(word[k], word[k + 1]);
            }
        }
    }
}

// Writes the first `num` entries to "output.txt", one "word : count" pair per
// line. Prints "file can not open" to standard output if the file cannot be
// created. Lines end with '\n' rather than endl so the file is not flushed
// after every entry; the stream is flushed and closed when `fout` is destroyed.
void Dictionary::out()
{
    std::ofstream fout("output.txt");
    if (!fout.is_open())
    {
        std::cout << "file can not open" << std::endl;
        return;
    }

    for (int i = 0; i < num; i++)
        fout << word[i] << " : " << freq[i] << '\n';
}

int Dictionary::merge()
{
    int sum = 0;
    string s1, s2;
    for(int i = 0; i < num - 1; i++)
    {
        s1 = extract(word[i]);
        transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
        for(int j = i + 1; j < num; j++)
        {
            s2 = extract(word[j]);
            transform(s2.begin(), s2.end(), s2.begin(), ::tolower);
            if(s1.compare(s2) == 0)
            {
                freq[j] = freq[j] + freq[i];
                freq[i] = 0;
                if(word[i].compare(word[j]) < 0)
                {
                    word[j] = word[i];
                    word[i] = "";
                }
                sum++;
                break;
            }

        }
    }
    return sum;
}

// Extracts the first word from *s and removes the consumed prefix from *s.
//
// A word starts at an ASCII letter and continues through letters and digits;
// any other character is a separator. Digits that precede the first letter are
// skipped. Words shorter than 4 characters are discarded and the scan goes on.
//
// Returns the word, or "" when no further word is available:
//  - if fewer than 4 characters remain, "" is returned and *s keeps exactly
//    those remaining characters;
//  - if no letter remains, "" is returned and *s is cleared.
//
// The scan is a single left-to-right pass, so a line holding many short words
// costs linear time and constant stack, instead of one recursive call and one
// copy of the remaining text per skipped word.
std::string check(std::string *s)
{
    const auto isLetter = [](char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    };
    const auto isWordChar = [&isLetter](char c) {
        return isLetter(c) || (c >= '0' && c <= '9');
    };

    std::string &text = *s;
    const std::string::size_type len = text.size();
    std::string::size_type pos = 0; // everything before pos has been consumed

    for (;;)
    {
        if (len - pos < 4)
        {
            // Too short to hold a word: leave the remainder in *s untouched.
            text.erase(0, pos);
            return "";
        }

        std::string::size_type start = pos;
        while (start < len && !isLetter(text[start]))
            ++start;
        if (start == len)
        {
            text.clear();
            return "";
        }

        std::string::size_type end = start + 1;
        while (end < len && isWordChar(text[end]))
            ++end;

        if (end - start >= 4)
        {
            std::string found = text.substr(start, end - start);
            text.erase(0, end);
            return found;
        }
        pos = end; // word too short: skip it and keep scanning
    }
}

// Entry point. Usage: <program> [-e] <input file>
//
// Counts the words of the input file (last command-line argument) and writes
// the result to output.txt via Dictionary::out(). With "-e" as the first
// argument, words that differ only in trailing digits are merged first.
//
// Returns 0 on success, 1 when no input file was given or it cannot be opened.
int main(int argc, char const *argv[])
{
    if (argc < 2)
    {
        // Without this check argv[argc-1] would be the program's own path.
        std::cout << "usage: " << (argc > 0 ? argv[0] : "program")
                  << " [-e] filename" << std::endl;
        return 1;
    }

    std::ifstream fin(argv[argc - 1]);
    if (!fin.is_open())
    {
        std::cout << "file can not open" << std::endl;
        return 1;
    }

    Dictionary dictionary;
    std::string line;
    // Loop on the result of getline rather than on eof(): a stream that fails
    // without reaching end-of-file would otherwise spin forever.
    while (std::getline(fin, line))
    {
        for (std::string w = check(&line); w != ""; w = check(&line))
        {
            const int loc = dictionary.find(w);
            // A word not seen before goes into the next free slot.
            dictionary.put(w, loc == -1 ? num : loc);
        }
    }

    // Merging is enabled only by an explicit "-e" option.
    int cut = 0;
    if (argc == 3 && std::string(argv[1]) == "-e")
    {
        cut = dictionary.merge();
    }

    // Merged-away entries have count 0, so sorting moves them to the end;
    // shrinking num afterwards drops them from the output.
    dictionary.sort();
    num = num - cut;
    dictionary.out();
    return 0;
}

View Code

数据：

　　GitHub上有10组测试数据，input1-4是china daily中的新闻，input5和input6是普通英语作文，input7-10是为了测试功能自编数据(包括分隔符、大小写、尾缀等测试)。output-e*.txt是-e功能下的输出，output*.txt是普通输出，均与输入数据对应。

使用说明：

　　命令行输入：词频统计.exe -e filename.txt 或者 词频统计.exe filename.txt

　　结果将输出至：output.txt

输出截图：

用时：

　　原计划全部用时3小时，实际用时5小时。

分工：

　　程昊：代码编写与博客编写

　　陈金满：代码测试