Kmeans文本聚类：获取weka计算的聚类中心，完成文本聚类

作者：finallyliuyu 转载使用等请注明出处

上一讲：Kmeans文本聚类之VSM模型中，给出了如何建立文档向量模型，以及写weka软件所要求的数据格式arff的代码。这里我们将介绍从weka中获取聚类中心，完成聚类的代码。

至于如何用weka聚类，该软件使用说明等之类问题，本系列博客不做介绍，请大家自行google之。

（原文此处为一张截图，图片已丢失）

我们找到我们已经写好的arff文件：

点击start,出现结果后，单击鼠标右键出现“Save result buffer”选项，就可以将右侧客户区的信息保存下来，按照上面控制台程序的提示，我们将这份信息保存为F:\cluster\InfoFromWeka.dat

以下代码为从F:\cluster\InfoFromWeka.dat中取出聚类中心，实现文本聚类的模块

************************************************************************/
/* 获得Weka提供的聚类信息                                                                     */
/************************************************************************/
map<string,vector<double> > Preprocess::GetClusters()
{

	map<string,vector<double> >clusters;
	ifstream ifile(infoFromWekaAddress);
	string temp;
	while(getline(ifile,temp))
	{   boost::smatch matchcluster;
	boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase);
	if(boost::regex_search(temp,matchcluster,regcluster))	
	{   
		string clustertmp=matchcluster[0].str();
		string ordinates="";
		getline(ifile,ordinates);
		boost::regex regordinates("\\d+(\\.\\d{1,4})?");
		boost::smatch matchordinates;
		std::string::const_iterator it=ordinates.begin();  
		std::string::const_iterator end=ordinates.end();
		while (boost::regex_search(it,end,matchordinates,regordinates)) 
		{       
			string digitstemp=matchordinates[0].str();
			double digitval=0.0;
			std::stringstream ss;
			ss<<digitstemp;
			ss>>digitval;
			clusters[clustertmp].push_back(digitval);
			it=matchordinates[0].second; 
		}





	}
	}
	return clusters;
}

建立文档向量模型的代码：注意此处只对整个文档集合建立文档向量模型，而不字符串化，这是该函数与Kmeans文本聚类之VSM模型中 VSMFormation函数的主要区别。此处建立的文档向量模型用于和聚类中心计算余弦相似度，然后将该篇文档划分给与它最相似的聚类中心。

/**
 * Builds a numeric weight vector for every document ID in
 * [beginIndex, endIndex].
 *
 * For each keyword returned by GetFinalKeyWords() the weight is
 *     (0.5 + count / maxTF) * log(N / DF)
 * where count is the number of entries of mymap[keyword] accepted by
 * PredTFclass(docId), N is endIndex-beginIndex+1, and maxTF / DF are the
 * first / second members of the matching element returned by
 * GetfinalKeysMaxTFDF(mymap) (meaning inferred from the helper's name --
 * TODO confirm). The per-document weights are then passed through
 * NormalizationVSM() and only the weight values are stored.
 *
 * Note: keywords are looked up with operator[], so a keyword missing from
 * mymap is inserted there with an empty vector.
 *
 * @param mymap keyword -> list of (int,int) pairs; presumably one pair per
 *              occurrence, keyed by document ID -- verify against the caller.
 * @return map from document ID to its weight vector. A document gets no entry
 *         when the keyword list is empty.
 */
map<int,vector<double> > Preprocess::VSMConstruction(map<string,vector<pair<int,int>>> &mymap)
{
	typedef vector<pair<int,double> > WeightedTerms;

	const int docCount=endIndex-beginIndex+1;
	map<int,vector<double> > docVectors;
	const vector<string> keywords=GetFinalKeyWords();
	const vector<pair<int,int> > maxTfAndDf=GetfinalKeysMaxTFDF(mymap);

	for(int docId=beginIndex;docId<=endIndex;++docId)
	{
		WeightedTerms weights;
		for(vector<string>::size_type j=0;j<keywords.size();++j)
		{
			vector<pair<int,int> >&postings=mymap[keywords[j]];
			const double rawTF=(double)count_if(postings.begin(),postings.end(),PredTFclass(docId));
			const double scaledTF=0.5+rawTF/(maxTfAndDf[j].first);
			const double idf=log((double)docCount/maxTfAndDf[j].second);
			weights.push_back(make_pair(j,scaledTF*idf));
		}

		// Without keywords there is nothing to normalise or store.
		if(weights.empty())
		{
			continue;
		}

		weights=NormalizationVSM(weights);
		for(WeightedTerms::size_type k=0;k<weights.size();++k)
		{
			docVectors[docId].push_back(weights[k].second);
		}
	}
	return docVectors;
}

/**计算向量内积*/
double Preprocess::CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2)
{
	double result = 0.0f;
	for (int i = 0; i < vector1.size(); i++)
		result += vector1[i] * vector2[i];
	return result;
}

/**
 * Computes the cosine similarity of two vectors:
 *     dot(v1,v2) / sqrt(dot(v1,v1) * dot(v2,v2))
 *
 * @param vector1 first operand
 * @param vector2 second operand
 * @return the cosine of the angle between the vectors, or 0.0 when the
 *         similarity is undefined (at least one vector has zero length, or
 *         the norm product is not a positive number).
 */
double Preprocess::CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2)
{
	const double numerator=CalDotProductOfVectors(vector1,vector2);
	const double normProduct=
		CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2);

	// A zero (or NaN) norm product would make the division yield NaN, and a
	// NaN similarity cannot be ordered against the other similarities.
	if(!(normProduct>0.0))
		return 0.0;

	return numerator/sqrt(normProduct);
}

聚类，给每一篇文章打上类别标签

/**
 * Assigns a cluster label to every document.
 *
 * For each document vector the cosine similarity to every cluster centroid is
 * computed, the (label, similarity) pairs are sorted with myCmp, and the label
 * that ends up first is taken as the document's cluster. myCmp is defined
 * elsewhere; presumably it orders by descending similarity -- TODO confirm.
 *
 * @param vsmMatrix document ID -> document vector
 * @param clusters  cluster label -> centroid vector
 * @return one (document ID, cluster label) pair per document, in document-ID
 *         order; empty when there are no clusters to assign to.
 */
vector<pair<int,string> > Preprocess::GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters)
{
	vector<pair<int,string> >resultInfo;

	// Without any centroid there is no "best" cluster; taking the first
	// element of an empty ranking would be undefined behaviour.
	if(clusters.empty())
	{
		return resultInfo;
	}

	for(map<int,vector<double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();++it)
	{
		vector<pair<string,double> >clusterSimilarities;
		clusterSimilarities.reserve(clusters.size());
		for(map<string,vector<double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();++clusterit)
		{
			const double similarity=CalCosineofVectors(it->second,clusterit->second);
			clusterSimilarities.push_back(make_pair(clusterit->first,similarity));
		}

		sort(clusterSimilarities.begin(),clusterSimilarities.end(),myCmp);

		// The first entry after sorting is the chosen cluster.
		resultInfo.push_back(make_pair(it->first,clusterSimilarities.front().first));
	}
	return resultInfo;
}

/************************************************************************/
/* Collect the article IDs that belong to each cluster                  */
/************************************************************************/
/**
 * Groups the labelled documents by cluster.
 *
 * @param clusters   cluster label -> centroid vector; only the labels are used
 * @param resultInfo (document ID, cluster label) pairs
 * @return map from cluster label to the IDs of its documents, in the order
 *         they appear in resultInfo. Pairs whose label is not a key of
 *         clusters are skipped, and clusters without documents get no entry.
 */
map<string,vector<int> > Preprocess::FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo)
{
	map<string,vector<int> > articlesInfo;

	for(vector<pair<int,string> >::iterator retit=resultInfo.begin();retit!=resultInfo.end();++retit)
	{
		// Look the label up directly instead of scanning every cluster key.
		if(clusters.find(retit->second)!=clusters.end())
		{
			articlesInfo[retit->second].push_back(retit->first);
		}
	}

	return articlesInfo;
}