简单的中文分词加上 k-means 聚类（C++）

程序代码参考了 CSDN 上的某篇博客，具体出处已经忘记了。

变量命名的头文件

//common.h
//
// Shared typedefs and standard-library includes used by every translation
// unit of the segmentation + k-means demo.
#ifndef COMM_H
#define COMM_H

#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <iterator>
#include <cctype>   // tolower -- used by the tokeniser and stop-word filter
#include <cmath>    // log, sqrt -- used by TF-IDF and TermVector
#include <limits>   // numeric_limits -- used by TF-IDF and KMeans
// NOTE(review): 'using namespace std;' in a header pollutes every includer,
// but the whole file depends on the unqualified names, so it is kept.
using namespace std;

typedef vector<string> StrVec;               // document / token list
typedef vector<int> IntVec;                  // integer vector
typedef vector<vector<int> > Int2DVec;       // 2-D integer matrix
typedef vector<vector<double> > Double2DVec; // 2-D double matrix
typedef vector<double> DoubleVec;            // double vector
#endif

去除停用词语

#pragma once
#include "common.h"

// Filters tokens against a fixed stop-word table (StopWordList in the
// accompanying .cpp).  Matching is case-insensitive: IsStopWord lowercases
// its argument in place before looking it up.
class StopWordsHandler
{
public:
    StopWordsHandler(void);
    ~StopWordsHandler(void);
    bool IsStopWord(string& str);
private:
    StrVec stopwords;  // private copy of the global stop-word table
};
#include "StopWordHandler.h"

// Stop-word table.  NOTE(review): the original post's Chinese stop words were
// lost to an encoding problem, leaving dozens of duplicate "" entries; the
// duplicates are collapsed here.  The single "" entry is kept on purpose: it
// makes the tokeniser drop empty tokens produced by consecutive spaces.
string StopWordList[] = {"", "我们"};
int strwordlen = sizeof(StopWordList) / sizeof(StopWordList[0]);
// Load the global stop-word table into this handler's private copy.
StopWordsHandler::StopWordsHandler()
{
    stopwords.assign(StopWordList, StopWordList + strwordlen);
}
// Nothing to release: 'stopwords' cleans itself up.
StopWordsHandler::~StopWordsHandler()
{

}

// Case-insensitively test whether 'str' is a stop word.
// Side effect relied on by callers: 'str' is lowercased in place.
//
// Fix vs. the original: passing raw char to tolower is undefined behaviour
// for values outside unsigned-char range -- exactly what the multi-byte
// Chinese text in this program produces -- so each byte is cast to
// unsigned char first.
bool StopWordsHandler::IsStopWord(string& str)
{
    transform(str.begin(), str.end(), str.begin(),
              [](char c) { return (char)tolower((unsigned char)c); });
    return find(stopwords.begin(), stopwords.end(), str) != stopwords.end();
}

分词选用了最简单的分词方法,预先用空格做好了分词

#pragma once
#include "Common.h"

// Tokeniser interface: split 'input' into words and append them to retWords.
class ITokeniser
{
public:
    // TFIDFMeasure deletes its tokeniser through an ITokeniser*, which is
    // undefined behaviour without a virtual destructor -- so provide one.
    virtual ~ITokeniser() {}
    virtual void Partition(string input,StrVec& retWords)=0;//tokenise
};
#pragma once
#include "Itokenisher.h"

// Concrete whitespace tokeniser: the corpus is pre-segmented with single
// spaces, so Partition only has to split on ' ' and drop stop words.
class Tokeniser :public  ITokeniser
{
public:
    Tokeniser();
    ~Tokeniser();
    void Partition(string input , StrVec& retWords);
};
#include "Tokeniser.h"
#include "StopWordHandler.h"
#include <iterator>
// Tokeniser is stateless; nothing to initialise.
Tokeniser::Tokeniser()
{
}
// Stateless; nothing to release.
Tokeniser::~Tokeniser()
{

}
// Split 'input' on single spaces, lowercase it, and append every
// non-stop-word token to retWord.  Consecutive spaces produce empty tokens,
// which the stop-word list's "" entry filters out.
//
// Fixes vs. the original:
//  * tolower is applied through an unsigned-char cast -- passing raw
//    (possibly negative) chars to tolower is undefined behaviour for
//    multi-byte text;
//  * tokens are sliced by position instead of repeatedly erasing the front
//    of 'input', turning the O(n^2) scan into O(n).
void Tokeniser::Partition(string input ,StrVec& retWord)
{
    transform(input.begin(), input.end(), input.begin(),
              [](char c) { return (char)tolower((unsigned char)c); });
    StopWordsHandler stopHandler;
    string::size_type start = 0;
    for (;;)
    {
        string::size_type space = input.find(' ', start);
        string temp = (space == string::npos)
                          ? input.substr(start)
                          : input.substr(start, space - start);
        if (!stopHandler.IsStopWord(temp))
            retWord.push_back(temp);
        if (space == string::npos)
            break;
        start = space + 1;  // resume after the delimiter
    }
}

TF-IDF 的计算

#pragma once

#include "Itokenisher.h"
#include <map>

// Computes TF-IDF weights for a corpus.  Each element of the input StrVec is
// one document; after construction, GetTermVector yields a document's weight
// vector over the global term list.
class TFIDFMeasure
{
private:
    StrVec _docs; // corpus: one string per document
    int _numDocs; // number of documents
    int _numTerms;// number of distinct terms over the whole corpus
    StrVec _terms;// the distinct terms, in first-seen order
    Int2DVec _termFreq ;// _termFreq[term][doc] = raw count of term in doc
    Double2DVec _termWeight;// _termWeight[term][doc] = tf * idf
    IntVec _maxTermFreq ;// per-document maximum raw count (tf normaliser)
    IntVec _docFreq;// number of documents containing each term
    ITokeniser* _tokeniser;// tokeniser; OWNED -- deleted in the destructor
    map<string , int > _wordIndex;// term -> index into _terms
public :
    TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser);
    ~TFIDFMeasure();
    inline int NumTerm( ) const
    {
        return this->_numTerms;
    }
    void GetTermVector(int doc , DoubleVec& vec);// copy doc's weight column into vec

protected:
    void init();// build all TF-IDF tables
    void GenerateTerms(const StrVec& ,StrVec& terms);// collect distinct terms
    void GenerateTermFrequency();// fill _termFreq / _docFreq / _maxTermFreq
    void GenerateTermWeight();// fill _termWeight
    void GetWordFrequency( string & input ,map<string,int> &freq);
    int CountWords(string& word ,const StrVec& words);
    int GetTermIndex(const string& term);// term -> index, -1 if unknown
    double ComputeTermWeight(int term ,int doc);// tf * idf for one cell
    double GetTermFrequency(int term , int doc);// normalised term frequency
    double GetInverseDoucumentFrequency(int term); // log(N / df)

    

};
#include "TF_IDF.h"

// Release the owned tokeniser and drop all cached tables.
TFIDFMeasure::~TFIDFMeasure()
{
    delete _tokeniser;   // delete on NULL is a no-op
    _tokeniser = NULL;
    _docs.clear();
    _terms.clear();
    _wordIndex.clear();
}
// Copy the corpus, take ownership of 'tokeniser', and build every table.
TFIDFMeasure::TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser )
    : _docs(document)
    , _numDocs((int)document.size())
    , _tokeniser(tokeniser)
{
    init();
}
void TFIDFMeasure::init()
{
    //初始化
    this->GenerateTerms(_docs,_terms); //分词
    this->_numTerms = _terms.size(); //所有文档中的词项数目

    //申请空间
    _maxTermFreq.resize(_numDocs);
    _docFreq.resize(_numTerms);
    _termFreq.resize(_numTerms);
    _termWeight.resize(_numTerms);

    for (int i = 0 ; i < _terms.size() ; i++)
    {
        _termWeight[i].resize(_numDocs);
        _termFreq[i].resize(_numDocs);
        _wordIndex[_terms[i]] = i; //将单词放入单词映射表中

    }
    this->GenerateTermFrequency();
    this->GenerateTermWeight();

}
void TFIDFMeasure::GenerateTerms(const StrVec& docs ,StrVec &terms)
{
    for (int i = 0 ; i < docs.size() ;  i++)
    {
        StrVec words;
        _tokeniser->Partition(docs[i] , words); //分词部分

        for ( int j = 0 ; j < words.size() ; j++)
        {
            if ( find(terms.begin() , terms.end(),words[j] ) == terms.end())
                terms.push_back(words[j]);
        }

    }
}
void TFIDFMeasure::GenerateTermFrequency()
{
    //计算每个单词在每份文档中出现的概率
    for ( int i = 0 ; i < _numDocs ; i++)
    {
        string curDoc = _docs[i]; //当前待处理的文档
        map<string,int> freq;
        this->GetWordFrequency(curDoc ,freq);
        map<string,int>::iterator iter;
        _maxTermFreq[i] = numeric_limits<int>::min();
        for ( iter = freq.begin() ; iter != freq.end() ; iter++)
        {
            string word = iter->first;
            int wordFreq = iter->second;
            int termIndex = GetTermIndex(word); //单词下标
            if ( termIndex == -1)
                continue;
            _termFreq[termIndex][i] = wordFreq;
            _docFreq[termIndex]++;

            if ( wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq;
        }
    }
}
// Look up a term's row index; -1 when the term is not in the table.
int TFIDFMeasure::GetTermIndex(const string & term)
{
    map<string, int>::iterator hit = _wordIndex.find(term);
    return hit == _wordIndex.end() ? -1 : hit->second;
}
// Unary predicate: does a candidate string equal the word captured at
// construction?  Used with count_if in CountWords.
//
// Fix vs. the original: the constructor now takes const string& (it only
// copies the word) and operator() is const, so the functor also works with
// temporaries and const algorithm contexts; existing callers are unaffected.
class WordComp 
{
public:
    WordComp(const string& sWord) : word(sWord)
      {

      }
      bool operator() (const string& lhs) const
      {
          return lhs.compare(word)==0;
      }       
private:
    string word;        
};
void TFIDFMeasure::GetWordFrequency( string & input , map<string,int>& freq)
{
    //计算单词频率
    transform(input.begin(),input.end(),input.begin(),tolower);
    StrVec temp;
    this->_tokeniser->Partition(input , temp);
    unique(temp.begin() , temp.end());
    StrVec::iterator iter;
    for ( iter = temp.begin() ; iter != temp.end() ; iter++)
    {
        int count = CountWords(*iter , temp); //计算单词在文档中出现的次数
        freq[*iter] = count;
    }


}
// Number of occurrences of 'word' inside the token list 'temp'.
int TFIDFMeasure::CountWords(string & word ,const StrVec& temp)
{
    return (int)count_if(temp.begin(), temp.end(), WordComp(word));
}
// Fill the full term x document weight matrix, one cell at a time.
void TFIDFMeasure::GenerateTermWeight()
{
    for (int doc = 0; doc < _numDocs; ++doc)
        for (int term = 0; term < _numTerms; ++term)
            _termWeight[term][doc] = ComputeTermWeight(term, doc);
}
// Classic TF-IDF weight for one (term, doc) cell.
// Fix vs. the original: the intermediate results were stored in float
// locals, silently discarding precision from the double-returning helpers;
// they are kept as double now.
double TFIDFMeasure::ComputeTermWeight(int term , int doc)
{
    double tf = GetTermFrequency(term, doc);
    double idf = GetInverseDoucumentFrequency(term);
    return tf * idf;
}
// Raw count of 'term' in 'doc', normalised by the document's largest raw
// count (float division, as in the original).
double TFIDFMeasure::GetTermFrequency(int term , int doc)
{
    return (float)_termFreq[term][doc] / (float)_maxTermFreq[doc];
}
// Inverse document frequency: log(totalDocs / docsContainingTerm).
// Every term in the table occurs in at least one document, so df >= 1.
// (The "Doucument" typo is kept: it is part of the public interface.)
double TFIDFMeasure::GetInverseDoucumentFrequency(int term)
{
    return log((float)_numDocs / (float)_docFreq[term]);
}
void TFIDFMeasure::GetTermVector(int doc ,DoubleVec& vec)
{
    vec.resize(this->_numTerms);
    for ( int i = 0 ; i < this->_numTerms ; i++)
        vec[i] = _termWeight[i][doc];
}

计算余弦相似性距离

#pragma once
#include "common.h"

// Static helpers for cosine similarity between dense weight vectors.
// NOTE(review): "Computer" in the method name is a typo for "Compute",
// kept because callers use this spelling.
class TermVector
{
public:
    static double ComputerCosineSimilarity(const DoubleVec& vector1 , const DoubleVec& vector2 );
    static double innerProduct(const DoubleVec& v1 ,const DoubleVec& v2);
    static double VectorLength(const DoubleVec & v);
};
#include "TermVector.h"
#include <cmath>

// cos(v1, v2) = <v1, v2> / (|v1| * |v2|); 0 when either vector is zero.
// Throws string("different length") on mismatched sizes.
double TermVector::ComputerCosineSimilarity(const DoubleVec & v1 , const DoubleVec& v2)
{
    if (v1.size() != v2.size())
        throw string("different length");

    double denom = VectorLength(v1) * VectorLength(v2);
    if (denom == 0)
        return 0;
    return innerProduct(v1, v2) / denom;
}

double TermVector::innerProduct(const DoubleVec & v1 , const DoubleVec& v2)
{
    if ( v1.size() != v2.size())
        throw string ("different length");

    double result = 0.0f;
    for ( int i = 0 ; i < v1.size() ; i++)
        result+=v1[i]*v2[i];
    return result;

}
double TermVector::VectorLength(const DoubleVec & v)
{
    double sum = 0.0f;
    for ( int i = 0 ; i < v.size() ; i++)
        sum= sum+(v[i] * v[i]);
    return (double)sqrt(sum);
}

定义cluster的类

#pragma once
#include "common.h"
// One k-means cluster: the indices of its member points plus its centroid.
class Cluster
{
public:
    IntVec CurrentMembership; // indices (into the data matrix) of members
    DoubleVec Mean ; // centroid of this cluster
    Cluster();
    ~Cluster();
    Cluster(int dataindex , DoubleVec & data);
    void UpdateMean(Double2DVec & coordinates);
};
#include "cluster.h"

// Empty cluster: no members, zero-dimensional centroid.
Cluster::Cluster()
{
    
}
// Seed a cluster from a single point: it becomes the only member and the
// initial centroid.
Cluster::Cluster(int dataindex , DoubleVec& data)
{
    CurrentMembership.push_back(dataindex);
    Mean = data;
}

// Recompute the centroid as the per-dimension average of the member points
// in 'coordinates' (an m x n data matrix).
//
// Fixes vs. the original, which produced a wrong centroid:
//  * it accumulated on top of the OLD Mean instead of resetting it first;
//  * it divided by coord.size() (the number of DIMENSIONS) instead of the
//    number of member points, and did so once per member inside the loop.
void Cluster::UpdateMean(Double2DVec & coordinates)
{
    size_t members = CurrentMembership.size();
    if (members == 0)
        return;                  // keep the old centroid for empty clusters

    fill(Mean.begin(), Mean.end(), 0.0);
    for (size_t m = 0; m < members; ++m)
    {
        DoubleVec& coord = coordinates[CurrentMembership[m]];
        for (size_t d = 0; d < coord.size(); ++d)
            Mean[d] += coord[d];
    }
    for (size_t d = 0; d < Mean.size(); ++d)
        Mean[d] /= (double)members;
}
// Members are self-cleaning vectors; nothing to do.
Cluster::~Cluster()
{

}
#pragma once
#include "common.h"

class Cluster;

// Simple k-means over row vectors using cosine distance
// (1 - cosine similarity).  The Cluster objects are heap-allocated and
// owned by this object (freed in the destructor).
class KMeans
{
public:
    vector<Cluster*> _clusters;
    KMeans(Double2DVec& data, int K);
    void Start();
    ~KMeans();
private:
    int _coordCount; // number of data points
    Double2DVec _coordinates;// the data matrix, one row per point
    int _k;  // number of clusters
    IntVec _clusterAssignments; // current cluster index of each point

    IntVec _nearestCluster; // nearest cluster of each point, this iteration

    Double2DVec _distanceCache; // _distanceCache[point][cluster]
    void InitRandom();
    static double getDistance(const DoubleVec & coord ,const DoubleVec& center);
    int NearestCluster(int ndx);
    
    
};
#include "kmean.h"
#include <time.h>
#include "cluster.h"
#include "TermVector.h"
#include <limits>
// Copy the data matrix, size the bookkeeping tables, and seed the clusters.
//
// Fixes vs. the original:
//  * the element-by-element deep copy of 'data' is replaced by plain vector
//    assignment (identical result);
//  * a loop variable 'i' was declared and then shadowed by a second 'int i';
//  * the distance cache was sized coordCount x coordCount although only
//    coordCount x k entries are ever read or written.
KMeans::KMeans(Double2DVec &data , int k )
{
    _coordinates = data;
    _coordCount = (int)data.size();
    _k = k;
    _clusters.resize(k);
    _clusterAssignments.resize(_coordCount);
    _nearestCluster.resize(_coordCount);
    _distanceCache.resize(_coordCount);
    for (int i = 0; i < _coordCount; ++i)
        _distanceCache[i].resize(_k);
    InitRandom();
}
// Pick k DISTINCT random data points as cluster seeds.
//
// Fix vs. the original: rand() could return the same index twice, seeding
// two clusters from the same point (leaving one cluster effectively
// redundant); duplicate draws are now rejected and redrawn.
// Assumes _coordCount >= _k.
void KMeans::InitRandom()
{
    srand(unsigned(time(NULL)));
    for (int i = 0; i < _k; i++)
    {
        int temp;
        bool taken;
        do
        {
            temp = rand() % _coordCount;       // candidate seed index
            taken = false;
            for (int j = 0; j < i; j++)        // seed of cluster j is its first member
                if (_clusters[j]->CurrentMembership[0] == temp)
                    taken = true;
        } while (taken);
        _clusterAssignments[temp] = i;
        _clusters[i] = new Cluster(temp, _coordinates[temp]);
    }
}

// Run Lloyd's k-means iteration until assignments stop changing:
//   1. recompute each cluster's centroid from its current members,
//   2. compute every point-to-centroid cosine distance,
//   3. find each point's nearest centroid,
//   4. stop when no point changed cluster; otherwise rebuild memberships.
// NOTE(review): there is no iteration cap, so a non-converging run would
// loop forever -- standard k-means converges, but worth confirming here.
void KMeans::Start()
{
    int iter = 0 , i , j ;
    while ( true)
    {
        cout <<"Iteration " << iter++ << " ...." <<endl;

        // 1. refresh each centroid from its current membership
        for ( int i = 0 ; i <_k ; i++)
        {
            _clusters[i]->UpdateMean(_coordinates);
        }
        // 2. distance from every point to every centroid
        for ( i = 0 ; i <_coordCount ; i++)
        {
            for ( j = 0 ; j <_k ; j++)
            {
                double dist = getDistance(_coordinates[i],_clusters[j]->Mean);
                _distanceCache[i][j] = dist;
            }
        }
        // 3. nearest centroid of every point

        for ( i = 0 ; i <_coordCount ; i++)
            _nearestCluster[i] = this->NearestCluster(i);
        int k = 0 ; 
        for ( i = 0 ; i <_coordCount ; i++)
        {
            if (_nearestCluster[i] == _clusterAssignments[i])
                k++;
        }
        // 4. converged when every point kept its previous cluster
        if ( k == _coordCount)
            break;

        // otherwise rebuild the membership lists for the next pass
        for ( j = 0 ; j < _k ; j++)
        {
            _clusters[j]->CurrentMembership.clear();
        }
        for ( i = 0 ; i <_coordCount ; i++)
        {
            _clusters[_nearestCluster[i]]->CurrentMembership.push_back(i);
            _clusterAssignments[i] = _nearestCluster[i];
        }

    }
}

// Cosine distance: 1 - cosine similarity (0 means identical direction).
double KMeans::getDistance(const DoubleVec& coord , const DoubleVec& center)
{
    double sim = TermVector::ComputerCosineSimilarity(coord, center);
    return 1 - sim;
}
// Index of the cluster with the smallest cached distance to point 'ndx';
// -1 if there are no clusters.
int KMeans::NearestCluster(int ndx)
{
    int best = -1;
    double bestDist = numeric_limits<double>::max();
    for (int c = 0; c < _k; ++c)
    {
        if (_distanceCache[ndx][c] < bestDist)
        {
            bestDist = _distanceCache[ndx][c];
            best = c;
        }
    }
    return best;
}
// Free the heap-allocated clusters this object owns.
KMeans::~KMeans()
{
    for (size_t c = 0; c < _clusters.size(); ++c)
        delete _clusters[c];
    _clusters.clear();
}
#include "TF_IDF.h"
#include "Tokeniser.h"
#include <fstream>
#include "kmean.h"
#include "cluster.h"
// Pipeline: read documents (one per line) -> TF-IDF vectors -> k-means ->
// print each cluster's documents.
int main()
{
    StrVec strVec;
    ifstream inFile("c:\\input.txt");
    // Fix vs. the original: a missing/unopenable input file silently
    // produced an empty corpus, after which KMeans executed rand() % 0
    // (undefined behaviour).  Fail fast instead.
    if (!inFile)
    {
        cerr << "cannot open c:\\input.txt" << endl;
        return 1;
    }
    string tempstr;
    while (getline(inFile, tempstr))
        strVec.push_back(tempstr);
    if (strVec.empty())
    {
        cerr << "no documents in c:\\input.txt" << endl;
        return 1;
    }

    TFIDFMeasure tf(strVec, new Tokeniser());

    int K = 3; // number of clusters
    int docCount = (int)strVec.size();

    // Build the k-means input: one TF-IDF weight vector per document.
    Double2DVec data;
    data.resize(docCount);
    for (int i = 0; i < docCount; i++)
        tf.GetTermVector(i, data[i]);

    KMeans kmeans(data, K);
    kmeans.Start();

    // Print the documents grouped by cluster.
    vector<Cluster*> clusters = kmeans._clusters;
    vector<Cluster*>::iterator iter;
    IntVec::iterator it2;
    for (iter = clusters.begin(); iter != clusters.end(); iter++)
    {
        cout << "------------------------------------" << endl;
        IntVec& vec = (*iter)->CurrentMembership;
        for (it2 = vec.begin(); it2 != vec.end(); it2++)
            cout << strVec[*it2] << endl;
    }
    system("pause");
    return 0;
}
原文地址:https://www.cnblogs.com/lzhenf/p/2442526.html