主函数头文件
View Code
1 include "stdafx.h"
2 #include"Preprocess.h"
3 #include"common.h"
4 #include "CorpusProcess.h"
5 #include "LibSvmClassifier.h"
2 #include"Preprocess.h"
3 #include"common.h"
4 #include "CorpusProcess.h"
5 #include "LibSvmClassifier.h"
第一部分:
建立词典和关联表
1 Preprocess::FUNCSEG seg=&Preprocess::goodWordsinPieceArticle;
2 int beginIndex=1;
3 int endIndex=6950;
4 Preprocess p(beginIndex,endIndex);
5 DICTIONARY mymap;
6 CONTINGENCY contigencyTable;
7 FeatureWeight mymapweight;
8 DOCMATRIX_1 trainingSet;
9 DOCMATRIX_1 testingSet;
10 vector<string>labels;
11 string testCorpusTable="ReteursTestingCorpus";
12 string trainCorpusTable="ReteursTrainingCorpus";
13 char*dictaddress="D:\\ReteursForWeka\\dict.dat";
14 char*contigencyaddress="D:\\ReteursForWeka\\contigency.dat";
15 labels=p.GetLabels(testCorpusTable);
16 p.ConstructDictionary(mymap,seg,trainCorpusTable);
17 cout<<"finish construct dictionary"<<endl;
18 p.SaveDictionary(mymap,dictaddress);
19 cout<<"finish save dictionary"<<endl;
20 p.LoadDictionary(mymap,dictaddress);
21 cout<<"finish load dictionary"<<endl;
22 p.GetContingencyTable(mymap,labels,contigencyTable,trainCorpusTable);
23 cout<<"finish construct contigencytable"<<endl;
24 p.SaveContingencyTable(contigencyTable,contigencyaddress);
25 cout<<"finish save contigencytable"<<endl;
26 p.LoadContingencyTable(contigencyTable,contigencyaddress);
27 cout<<"finish loadcontigencytable"<<endl;
2 int beginIndex=1;
3 int endIndex=6950;
4 Preprocess p(beginIndex,endIndex);
5 DICTIONARY mymap;
6 CONTINGENCY contigencyTable;
7 FeatureWeight mymapweight;
8 DOCMATRIX_1 trainingSet;
9 DOCMATRIX_1 testingSet;
10 vector<string>labels;
11 string testCorpusTable="ReteursTestingCorpus";
12 string trainCorpusTable="ReteursTrainingCorpus";
13 char*dictaddress="D:\\ReteursForWeka\\dict.dat";
14 char*contigencyaddress="D:\\ReteursForWeka\\contigency.dat";
15 labels=p.GetLabels(testCorpusTable);
16 p.ConstructDictionary(mymap,seg,trainCorpusTable);
17 cout<<"finish construct dictionary"<<endl;
18 p.SaveDictionary(mymap,dictaddress);
19 cout<<"finish save dictionary"<<endl;
20 p.LoadDictionary(mymap,dictaddress);
21 cout<<"finish load dictionary"<<endl;
22 p.GetContingencyTable(mymap,labels,contigencyTable,trainCorpusTable);
23 cout<<"finish construct contigencytable"<<endl;
24 p.SaveContingencyTable(contigencyTable,contigencyaddress);
25 cout<<"finish save contigencytable"<<endl;
26 p.LoadContingencyTable(contigencyTable,contigencyaddress);
27 cout<<"finish loadcontigencytable"<<endl;
第二部分:
遴选特征词,形成VSM模型,形成arff数据格式
1 char* dest="D:\\ReteursForWeka\\chi\\";
2 int featuredimension[10]={50,100,200,300,400,500,1000,3000,5000,8000};
3 char *weightaddress="D:\\ReteursForWeka\\chi\\wordsweight.dat";
4 char *keywordaddress=new char[1000];
5 char *trainvsmaddress=new char[1000];
6 char *testvsmaddress=new char[1000];
7 p.LoadDictionary(mymap,dictaddress);
8 p.LoadContingencyTable(contigencyTable,contigencyaddress);
9 p.InformationGainFeatureSelection(labels,mymap,mymapweight,contigencyTable,weightaddress);
10
11
12 for (int i=0;i<10;i++)
13 {
14
15 memset(keywordaddress,0,1000);
16 memset(trainvsmaddress,0,1000);
17 memset(testvsmaddress,0,1000);
18
19 sprintf_s(keywordaddress,1000,"%s%skeywords.dat",dest,p.do_fraction(featuredimension[i]).c_str());
20 sprintf_s(trainvsmaddress,1000,"%s%strainCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
21 sprintf_s(testvsmaddress,1000,"%s%stestCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
22 p.FeatureSelectionFactory(labels,mymapweight,weightaddress,keywordaddress,featuredimension[i],true,trainCorpusTable);
23
24 cout<<keywordaddress<<"finish"<<endl;
25 p.WriteHeadArff(testvsmaddress,keywordaddress,labels);
26 p.GetManyVSM(1,2676,testCorpusTable,mymap,testingSet,keywordaddress);
27 p.WriteDataBodyArff(testingSet,testCorpusTable,testvsmaddress,featuredimension[i]);
28 testingSet.clear();
29 cout<<testvsmaddress<<"finish"<<endl;
30 p.WriteHeadArff(trainvsmaddress,keywordaddress,labels);
31 p.VSMConstruction(mymap,trainingSet,keywordaddress);
32 p.WriteDataBodyArff(trainingSet,trainCorpusTable,trainvsmaddress,featuredimension[i]);
33 trainingSet.clear();
34 cout<<trainvsmaddress<<"finish"<<endl;
35
36 }
37
38 delete []keywordaddress;
39 delete []trainvsmaddress;
40 delete []testvsmaddress;
41
42 cout<<"finish"<<endl;
43 int end;
44 cin>>end;
45 return 0;
2 int featuredimension[10]={50,100,200,300,400,500,1000,3000,5000,8000};
3 char *weightaddress="D:\\ReteursForWeka\\chi\\wordsweight.dat";
4 char *keywordaddress=new char[1000];
5 char *trainvsmaddress=new char[1000];
6 char *testvsmaddress=new char[1000];
7 p.LoadDictionary(mymap,dictaddress);
8 p.LoadContingencyTable(contigencyTable,contigencyaddress);
9 p.InformationGainFeatureSelection(labels,mymap,mymapweight,contigencyTable,weightaddress);
10
11
12 for (int i=0;i<10;i++)
13 {
14
15 memset(keywordaddress,0,1000);
16 memset(trainvsmaddress,0,1000);
17 memset(testvsmaddress,0,1000);
18
19 sprintf_s(keywordaddress,1000,"%s%skeywords.dat",dest,p.do_fraction(featuredimension[i]).c_str());
20 sprintf_s(trainvsmaddress,1000,"%s%strainCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
21 sprintf_s(testvsmaddress,1000,"%s%stestCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
22 p.FeatureSelectionFactory(labels,mymapweight,weightaddress,keywordaddress,featuredimension[i],true,trainCorpusTable);
23
24 cout<<keywordaddress<<"finish"<<endl;
25 p.WriteHeadArff(testvsmaddress,keywordaddress,labels);
26 p.GetManyVSM(1,2676,testCorpusTable,mymap,testingSet,keywordaddress);
27 p.WriteDataBodyArff(testingSet,testCorpusTable,testvsmaddress,featuredimension[i]);
28 testingSet.clear();
29 cout<<testvsmaddress<<"finish"<<endl;
30 p.WriteHeadArff(trainvsmaddress,keywordaddress,labels);
31 p.VSMConstruction(mymap,trainingSet,keywordaddress);
32 p.WriteDataBodyArff(trainingSet,trainCorpusTable,trainvsmaddress,featuredimension[i]);
33 trainingSet.clear();
34 cout<<trainvsmaddress<<"finish"<<endl;
35
36 }
37
38 delete []keywordaddress;
39 delete []trainvsmaddress;
40 delete []testvsmaddress;
41
42 cout<<"finish"<<endl;
43 int end;
44 cin>>end;
45 return 0;