声明:
按类别特征词选择算法声明
// Local DF method: for the given class label, pairs every word of the dictionary with its
// per-class count taken from the contingency table and returns the (word, count) list sorted.
vector<pair<string,double> >LocalDFFeatureSelectionForPerclass(DICTIONARY& mymap,CONTINGENCY& contingencyTable,string classLabel);// local DF method: ranks every dictionary word for one class
// Driver for DF feature selection: runs the per-class local DF ranking for every class label,
// keeps the top-ranked words of each class and writes the merged keyword set to `address`.
// NOTE(review): the same declaration appears twice in a row here; this looks like a
// copy/paste or extraction duplicate — confirm and keep only one.
void DFFeatureSelection(vector<string> classLabels,DICTIONARY &mymap,CONTINGENCY& contingencyTable,int N,char *address);// calls the local DF feature-word selection function
void DFFeatureSelection(vector<string> classLabels,DICTIONARY &mymap,CONTINGENCY& contingencyTable,int N,char *address);// calls the local DF feature-word selection function
函数实现:
对词典中的每个词,统计其在某一个类别中出现的次数,并按词频从大到小排序
/************************************************************************/
/* Per-class DF (document frequency) feature-word ranking               */
/************************************************************************/
/**
 * Ranks every dictionary word by the count stored for it under one class.
 *
 * For each key of `mymap` the entry (word, classLabel) is looked up in
 * `contingencyTable`; the `.first` member of that entry becomes the word's
 * score. A word with no entry for the class scores 0. The (word, score) list
 * is then ordered with stable_sort using the comparator `isLarger` (defined
 * elsewhere; per the accompanying text it orders from largest to smallest).
 *
 * @param mymap            dictionary; only its keys (the words) are read.
 * @param contingencyTable table keyed by (word, class label) pairs. Assumed to
 *                         be a std::map-like container providing find()/end().
 * @param classLabel       class for which the words are ranked.
 * @return one (word, score) pair per dictionary word, sorted by `isLarger`.
 *
 * Side effect: prints the elapsed CPU time for this class to stdout.
 */
vector<pair<string,double> > Preprocess::LocalDFFeatureSelectionForPerclass(DICTIONARY& mymap,CONTINGENCY& contingencyTable ,string classLabel)
{
    const clock_t start=clock();

    vector<pair<string,double> > DFinfo;
    DFinfo.reserve(mymap.size()); // exactly one entry per dictionary word

    for(DICTIONARY::iterator it=mymap.begin();it!=mymap.end();++it)
    {
        const pair<string,string> compoundKey(it->first,classLabel);

        // Use find() rather than operator[]: operator[] would insert a
        // default (zero) entry for every missing (word, class) pair and so
        // silently grow the caller's table during what is a read-only pass.
        CONTINGENCY::const_iterator entry=contingencyTable.find(compoundKey);
        const double classCount=(entry!=contingencyTable.end())
            ? static_cast<double>(entry->second.first)
            : 0.0;

        DFinfo.push_back(make_pair(it->first,classCount));
    }

    // stable_sort keeps words with equal scores in dictionary iteration order.
    stable_sort(DFinfo.begin(),DFinfo.end(),isLarger);

    const clock_t finish=clock();
    const double totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
    cout<<"为类别"<<classLabel<<"遴选特征词共用了"<<totaltime<<endl;
    return DFinfo;
}
/************************************************************************/
/* Per-class DF (document frequency) feature-word ranking               */
/************************************************************************/
/**
 * Ranks every dictionary word by the count stored for it under one class.
 *
 * For each key of `mymap` the entry (word, classLabel) is looked up in
 * `contingencyTable`; the `.first` member of that entry becomes the word's
 * score. A word with no entry for the class scores 0. The (word, score) list
 * is then ordered with stable_sort using the comparator `isLarger` (defined
 * elsewhere; per the accompanying text it orders from largest to smallest).
 *
 * @param mymap            dictionary; only its keys (the words) are read.
 * @param contingencyTable table keyed by (word, class label) pairs. Assumed to
 *                         be a std::map-like container providing find()/end().
 * @param classLabel       class for which the words are ranked.
 * @return one (word, score) pair per dictionary word, sorted by `isLarger`.
 *
 * Side effect: prints the elapsed CPU time for this class to stdout.
 */
vector<pair<string,double> > Preprocess::LocalDFFeatureSelectionForPerclass(DICTIONARY& mymap,CONTINGENCY& contingencyTable ,string classLabel)
{
    const clock_t start=clock();

    vector<pair<string,double> > DFinfo;
    DFinfo.reserve(mymap.size()); // exactly one entry per dictionary word

    for(DICTIONARY::iterator it=mymap.begin();it!=mymap.end();++it)
    {
        const pair<string,string> compoundKey(it->first,classLabel);

        // Use find() rather than operator[]: operator[] would insert a
        // default (zero) entry for every missing (word, class) pair and so
        // silently grow the caller's table during what is a read-only pass.
        CONTINGENCY::const_iterator entry=contingencyTable.find(compoundKey);
        const double classCount=(entry!=contingencyTable.end())
            ? static_cast<double>(entry->second.first)
            : 0.0;

        DFinfo.push_back(make_pair(it->first,classCount));
    }

    // stable_sort keeps words with equal scores in dictionary iteration order.
    stable_sort(DFinfo.begin(),DFinfo.end(),isLarger);

    const clock_t finish=clock();
    const double totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
    cout<<"为类别"<<classLabel<<"遴选特征词共用了"<<totaltime<<endl;
    return DFinfo;
}
DF特征词选择法:
代码
/************************************************************************/
/* DF (document frequency) feature-word selection                       */
/************************************************************************/
/**
 * Selects feature words for all classes and writes them to a file.
 *
 * Each class receives a quota of N * (articles of that class) / (articles in
 * the training corpus), using integer division. The top `quota` words of the
 * class's ranking from LocalDFFeatureSelectionForPerclass are merged into one
 * set, so a word chosen for several classes is stored once and the final
 * count can be smaller than N. The set is written one word per line.
 *
 * @param classLabels      labels of the classes to select words for.
 * @param mymap            dictionary passed through to the per-class ranking.
 * @param contingencyTable contingency table passed through to the ranking.
 * @param N                overall target number of feature words.
 * @param address          path of the output keyword file.
 *
 * Side effects: creates/overwrites the file at `address`; prints the number
 * of selected words and the elapsed CPU time to stdout.
 */
void Preprocess:: DFFeatureSelection(vector<string >classLabels,DICTIONARY &mymap,CONTINGENCY& contingencyTable,int N,char *address)
{
    typedef vector<pair<string,double> > RankedWords;

    // Total number of articles in the training corpus.
    const int totalTraingingCorpus=endIndex-beginIndex+1;
    set<string> finalKeywords; // final selected feature words (deduplicated)

    const clock_t start=clock();
    for(vector<string>::iterator it=classLabels.begin();it!=classLabels.end();++it)
    {
        // Number of training-corpus articles belonging to this class.
        const int N_subClassCnt=getCategorizationNum(*it,"TrainingCorpus");

        // Quota of feature words for this class. Guarded so that an empty
        // corpus cannot divide by zero and a non-positive operand cannot give
        // a negative quota (which would wrap to a huge unsigned loop bound).
        // The product is formed in 64 bits to avoid int overflow.
        long long quota=0;
        if(totalTraingingCorpus>0 && N>0 && N_subClassCnt>0)
        {
            quota=static_cast<long long>(N_subClassCnt)*N/totalTraingingCorpus;
        }

        const RankedWords ranked=LocalDFFeatureSelectionForPerclass(mymap,contingencyTable,*it);

        // Never read past the end of the ranking: the quota may exceed the
        // number of words in the dictionary.
        RankedWords::size_type take=ranked.size();
        if(static_cast<unsigned long long>(quota)<take)
        {
            take=static_cast<RankedWords::size_type>(quota);
        }

        for(RankedWords::size_type j=0;j<take;++j)
        {
            finalKeywords.insert(ranked[j].first);
        }
    }

    ofstream outfile(address);
    if(!outfile)
    {
        // The function returns void, so report the failure instead of
        // silently dropping every keyword.
        cerr<<"DFFeatureSelection: cannot open output file "<<address<<endl;
    }
    const int finalKeyWordsCount=finalKeywords.size();
    for(set<string>::const_iterator word=finalKeywords.begin();word!=finalKeywords.end();++word)
    {
        outfile<<*word<<'\n'; // '\n' instead of endl: no flush per line
    }
    outfile.close();

    cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
    const clock_t finish=clock();
    const double totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
    cout<<"遴选特征词共有了"<<totaltime<<endl;
}
/************************************************************************/
/* DF (document frequency) feature-word selection                       */
/************************************************************************/
/**
 * Selects feature words for all classes and writes them to a file.
 *
 * Each class receives a quota of N * (articles of that class) / (articles in
 * the training corpus), using integer division. The top `quota` words of the
 * class's ranking from LocalDFFeatureSelectionForPerclass are merged into one
 * set, so a word chosen for several classes is stored once and the final
 * count can be smaller than N. The set is written one word per line.
 *
 * @param classLabels      labels of the classes to select words for.
 * @param mymap            dictionary passed through to the per-class ranking.
 * @param contingencyTable contingency table passed through to the ranking.
 * @param N                overall target number of feature words.
 * @param address          path of the output keyword file.
 *
 * Side effects: creates/overwrites the file at `address`; prints the number
 * of selected words and the elapsed CPU time to stdout.
 */
void Preprocess:: DFFeatureSelection(vector<string >classLabels,DICTIONARY &mymap,CONTINGENCY& contingencyTable,int N,char *address)
{
    typedef vector<pair<string,double> > RankedWords;

    // Total number of articles in the training corpus.
    const int totalTraingingCorpus=endIndex-beginIndex+1;
    set<string> finalKeywords; // final selected feature words (deduplicated)

    const clock_t start=clock();
    for(vector<string>::iterator it=classLabels.begin();it!=classLabels.end();++it)
    {
        // Number of training-corpus articles belonging to this class.
        const int N_subClassCnt=getCategorizationNum(*it,"TrainingCorpus");

        // Quota of feature words for this class. Guarded so that an empty
        // corpus cannot divide by zero and a non-positive operand cannot give
        // a negative quota (which would wrap to a huge unsigned loop bound).
        // The product is formed in 64 bits to avoid int overflow.
        long long quota=0;
        if(totalTraingingCorpus>0 && N>0 && N_subClassCnt>0)
        {
            quota=static_cast<long long>(N_subClassCnt)*N/totalTraingingCorpus;
        }

        const RankedWords ranked=LocalDFFeatureSelectionForPerclass(mymap,contingencyTable,*it);

        // Never read past the end of the ranking: the quota may exceed the
        // number of words in the dictionary.
        RankedWords::size_type take=ranked.size();
        if(static_cast<unsigned long long>(quota)<take)
        {
            take=static_cast<RankedWords::size_type>(quota);
        }

        for(RankedWords::size_type j=0;j<take;++j)
        {
            finalKeywords.insert(ranked[j].first);
        }
    }

    ofstream outfile(address);
    if(!outfile)
    {
        // The function returns void, so report the failure instead of
        // silently dropping every keyword.
        cerr<<"DFFeatureSelection: cannot open output file "<<address<<endl;
    }
    const int finalKeyWordsCount=finalKeywords.size();
    for(set<string>::const_iterator word=finalKeywords.begin();word!=finalKeywords.end();++word)
    {
        outfile<<*word<<'\n'; // '\n' instead of endl: no flush per line
    }
    outfile.close();

    cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
    const clock_t finish=clock();
    const double totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
    cout<<"遴选特征词共有了"<<totaltime<<endl;
}
主函数调用:
代码
// Load the dictionary and the contingency table from disk, then run DF feature
// selection with a target of 2000 feature words, writing the result to keywords.dat.
p.LoadDictionary(mymap,"F:\\finallyliuyu\\dict.dat");
p.LoadContingencyTable(contingenyTable,"F:\\finallyliuyu\\contingency.dat");
p.DFFeatureSelection(labels,mymap,contingenyTable,2000,"F:\\finallyliuyu\\keywords.dat");
// NOTE(review): the two calls below repeat the two calls above with identical
// arguments; this looks like a copy/paste or extraction duplicate — confirm and drop.
// NOTE(review): DFFeatureSelection takes `char *address`; passing a string literal to a
// non-const char* is rejected by C++11 and later compilers.
p.LoadContingencyTable(contingenyTable,"F:\\finallyliuyu\\contingency.dat");
p.DFFeatureSelection(labels,mymap,contingenyTable,2000,"F:\\finallyliuyu\\keywords.dat");