借助weka实现的分类器进行针对文本分类问题的特征词选择实验(实验代码备份)

主函数头文件

View Code
1 #include "stdafx.h"
2 #include"Preprocess.h"
3 #include"common.h"
4 #include "CorpusProcess.h"
5 #include "LibSvmClassifier.h"

第一部分:

建立词典和关联表

 // Part 1 of the experiment driver: builds the dictionary from the training
 // corpus table, then builds the contingency table from that dictionary and the
 // label list. Each structure is saved to disk and immediately loaded back.
 // NOTE(review): the enclosing function header is not part of this listing.
 // NOTE(review): dictaddress/contigencyaddress bind string literals to non-const
 // char*, which standard C++11 and later reject — confirm the compiler in use.
 1 Preprocess::FUNCSEG seg=&Preprocess::goodWordsinPieceArticle; // member-function pointer handed to ConstructDictionary below
 2     int beginIndex=1;
 3     int endIndex=6950;
 4     Preprocess p(beginIndex,endIndex); // presumably a document index range — TODO confirm against Preprocess
 5     DICTIONARY mymap;
 6     CONTINGENCY contigencyTable;
 7     FeatureWeight mymapweight; // not used in this fragment; filled by the feature-selection step in part 2
 8     DOCMATRIX_1 trainingSet;   // not used in this fragment; filled in part 2
 9     DOCMATRIX_1 testingSet;    // not used in this fragment; filled in part 2
10     vector<string>labels;
11     string testCorpusTable="ReteursTestingCorpus";
12     string trainCorpusTable="ReteursTrainingCorpus";
13     char*dictaddress="D:\\ReteursForWeka\\dict.dat";
14     char*contigencyaddress="D:\\ReteursForWeka\\contigency.dat";
       // The label list is read from the *testing* corpus table, while the
       // dictionary and contingency table below are built from the training one.
15     labels=p.GetLabels(testCorpusTable);
16     p.ConstructDictionary(mymap,seg,trainCorpusTable);
17     cout<<"finish construct dictionary"<<endl;
18     p.SaveDictionary(mymap,dictaddress);
19     cout<<"finish save dictionary"<<endl;
       // Reload of the file just written; presumably a round-trip check or a
       // resume point for later runs — TODO confirm.
20     p.LoadDictionary(mymap,dictaddress);
21     cout<<"finish load dictionary"<<endl;
22     p.GetContingencyTable(mymap,labels,contigencyTable,trainCorpusTable);
23     cout<<"finish construct contigencytable"<<endl;
24     p.SaveContingencyTable(contigencyTable,contigencyaddress);
25     cout<<"finish save contigencytable"<<endl;
26     p.LoadContingencyTable(contigencyTable,contigencyaddress);
27     cout<<"finish loadcontigencytable"<<endl;

第二部分:

遴选特征词,形成VSM模型,形成arff数据格式

 // Part 2 of the experiment driver: computes feature weights once, then for
 // each of ten feature dimensions selects keywords and writes one test ARFF
 // file and one training ARFF file under `dest`.
 // Uses p, mymap, contigencyTable, mymapweight, labels, trainingSet, testingSet,
 // dictaddress, contigencyaddress and the corpus table names declared in part 1.
 // NOTE(review): the output directory is named "chi" but the routine called is
 // InformationGainFeatureSelection — confirm which selection method is intended.
 // NOTE(review): dest/weightaddress bind string literals to non-const char*,
 // which standard C++11 and later reject — confirm the compiler in use.
 1 char* dest="D:\\ReteursForWeka\\chi\\";
 2     int featuredimension[10]={50,100,200,300,400,500,1000,3000,5000,8000}; // feature counts tried, one loop pass each
 3     char *weightaddress="D:\\ReteursForWeka\\chi\\wordsweight.dat";
       // Three 1000-byte heap buffers for the per-dimension file paths; released
       // with delete[] after the loop.
 4     char *keywordaddress=new char[1000];
 5     char *trainvsmaddress=new char[1000]; 
 6     char *testvsmaddress=new char[1000];
 7     p.LoadDictionary(mymap,dictaddress);
 8     p.LoadContingencyTable(contigencyTable,contigencyaddress);
 9     p.InformationGainFeatureSelection(labels,mymap,mymapweight,contigencyTable,weightaddress);
10 
11 
12     for (int i=0;i<10;i++)
13     {
14 
           // Zero the path buffers before formatting this dimension's file names.
15         memset(keywordaddress,0,1000);
16         memset(trainvsmaddress,0,1000);
17         memset(testvsmaddress,0,1000);
18 
           // Paths are dest + do_fraction(dimension) + a fixed suffix. What
           // do_fraction produces is not shown here; presumably a textual form
           // of the number — TODO confirm.
19         sprintf_s(keywordaddress,1000,"%s%skeywords.dat",dest,p.do_fraction(featuredimension[i]).c_str());
20         sprintf_s(trainvsmaddress,1000,"%s%strainCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
21         sprintf_s(testvsmaddress,1000,"%s%stestCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
           // NOTE(review): the meaning of the `true` argument is not visible in this listing.
22         p.FeatureSelectionFactory(labels,mymapweight,weightaddress,keywordaddress,featuredimension[i],true,trainCorpusTable);
23 
24         cout<<keywordaddress<<"finish"<<endl;
           // Test corpus: ARFF header, then the document matrix, then the data body.
25         p.WriteHeadArff(testvsmaddress,keywordaddress,labels);
26         p.GetManyVSM(1,2676,testCorpusTable,mymap,testingSet,keywordaddress); // 1..2676 presumably a test document index range — TODO confirm
27         p.WriteDataBodyArff(testingSet,testCorpusTable,testvsmaddress,featuredimension[i]);
28         testingSet.clear(); // emptied so the next dimension starts from a clean matrix
29         cout<<testvsmaddress<<"finish"<<endl;
           // Training corpus: same three steps, with the matrix built by VSMConstruction.
30         p.WriteHeadArff(trainvsmaddress,keywordaddress,labels);
31         p.VSMConstruction(mymap,trainingSet,keywordaddress);
32         p.WriteDataBodyArff(trainingSet,trainCorpusTable,trainvsmaddress,featuredimension[i]);
33         trainingSet.clear();
34         cout<<trainvsmaddress<<"finish"<<endl;
35 
36     }
37 
38     delete []keywordaddress;
39     delete []trainvsmaddress;
40     delete []testvsmaddress;
41     
42     cout<<"finish"<<endl;
       // Reads an int from stdin before returning; presumably to keep the
       // console window open — TODO confirm.
43     int end;
44     cin>>end;
45     return 0;
原文地址:https://www.cnblogs.com/finallyliuyu/p/1996252.html