头文件:
#ifndef _Preprocess_H #define _Preprocess_H #include<iostream> #include<map> #include<set> #include<vector> #include<string> #include<iomanip> #include<fstream> #include<algorithm> #include<cmath> #include<sstream> #include<limits> #include <xstring> #include"ictclas30.h" #include"boost\tr1\regex.hpp" #include"boost/algorithm/string.hpp" #include"windows.h" /************************************************************************/ /* WkaPreprocess类完成如下功能 将文本集合分词-》去停用词-》建立词袋子模型=》特征词选择=》对文章建立VSM模型= 》写成weka数据格式(arff)-》输出聚类信息 */ /************************************************************************/ //一些谓词函数 using namespace std; class Preprocess { //typedef vector<string>(Preprocess::*FUNCSEG)(string,set<string>); private: char *bagofwordsAddress;//存放词袋子模型的位置 char * featurewordsAddress;//存放特征词文件的位置; char *arffFileAddress;//存放ARFF文件的位置 char *infoFromWekaAddress;//存放调用weka后的实验结果 char *articleIdsAddress;//存放被聚类的文章的ID号 char *dbconnection;//数据库的链接字符串 char *dbselect;//数据库select语句 char *dbfield;//数据库字段 int beginIndex;//开始聚类的文章id int endIndex;//结束聚类的文章id public: typedef vector<string>(Preprocess::*FUNCSEG)(string,set<string>); Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex) { bagofwordsAddress=new char[c_style_stringsize]; featurewordsAddress=new char[c_style_stringsize]; arffFileAddress=new char[c_style_stringsize]; infoFromWekaAddress=new char[c_style_stringsize]; articleIdsAddress=new char[c_style_stringsize]; dbconnection=new char[c_style_stringsize]; dbselect=new char[c_style_stringsize]; this->beginIndex=beginIndex; this->endIndex=endIndex; sprintf_s(bagofwordsAddress,c_style_stringsize,mydict); sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo); sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster); sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka); 
sprintf_s(articleIdsAddress,c_style_stringsize,artileIds); sprintf_s(dbconnection,c_style_stringsize,conn); sprintf_s(dbselect,c_style_stringsize,selectsql); } /*Preprocess() { }*/ ~Preprocess() { delete []bagofwordsAddress; delete []featurewordsAddress; delete []arffFileAddress; delete [] infoFromWekaAddress; delete []articleIdsAddress; delete []dbconnection; delete []dbselect; } void trim(string &str,const string val);//去除字符串首尾空白 //构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf int ConstructMap(map<string,vector<pair<int,int>>>&mymap,char *dbfield,FUNCSEG seg); inline void TruncateArff() { ofstream ofile; ofile.open(arffFileAddress,ios::trunc); ofile.close(); } //保存词袋子到硬盘 void save(map<string,vector<pair<int,int> > >&mymap); //从内存中加载词袋子模型 void load(map<string,vector<pair<int,int> > >&mymap); //打印词袋子模型 void print(map<string,vector<pair<int,int> > >&mymap); //窄字符串转化成宽字符串 wstring myMultibyteToWideChar(string sResult); //宽字符串转化成窄字符串 string myWideCharToMultibyte(wstring wsResult); //调用ICTclass分词 string ICTsplit(const char *sInput); //构造停用词表 set<string>MakeStopSet(); //去除停用词,噪声词 vector<string>goodWordsinPieceArticle(string rawtext,set<string> stopwords); //整数转化成字符串 string do_fraction(int val); //浮点数转化成字符串 string do_fraction(double val, int decplaces=5); //特征词选择算法 void DFcharicteristicWordSelection(map<string,vector<pair<int,int>>> &mymap,int DFthreshold); //获取最后的特征词 vector<string> GetFinalKeyWords(); //获取特征词的maxTF,DF vector<pair<int,int> >GetfinalKeysMaxTFDF(map<string,vector<pair<int,int>>> &mymap); //文档向量模型规范化 vector<pair<int,double> > NormalizationVSM(vector<pair<int,double> > tempVSM); //建立文档向量模型并且写到arff文件里 void VSMFormation(map<string,vector<pair<int,int>>> &mymap); /***单个文档向量模型字符串化***/ string FormatVSMtoString(vector<pair<int,double> > tempVSM); //写Arff文件头部 void WriteHeadArff(); void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg); 
/******************************************************以下函数完成聚类功能**********************************/ /***************建立文档向量模型,但是不形成字符串***********/ map<int,vector<double> >VSMConstruction(map<string,vector<pair<int,int>>> &mymap); /************从weka给出的结果中获取聚类中心******/ map<string,vector<double> > GetClusters(); /**计算向量的内积*****************8*/ double CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2); /************计算余弦相似度*******/ double CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2); /* 获取聚类信息,即给每篇文章附上一个类别label */ vector<pair<int,string> >GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters); /****返回聚类中每个类别的文章ID******************/ map<string,vector<int> >FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo); void RetreiveArticleInfoFromDataBase(); vector<string> mySplit(string s,set<string> stopwords);//分割关键词 }; #endif
Preprocess类的函数功能实现文件:
#include"stdafx.h"
#include "Preprocess.h"
#pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
//---------------------- comparison / predicate helpers ----------------------
// Orders (word, documentFrequency) pairs by descending frequency.
bool isLonger(const pair<string,int> &pair1, const pair<string,int> &pair2)
{
	return pair1.second>pair2.second;
}
// True when the word occurs in no more than 100 documents.
bool cntAssist(const pair<string,int> &pair1)
{
	return pair1.second<=100;
}
// True when the (articleId, tf) entry belongs to the given article.
bool PredTF(const pair<int,int>& pair1,int articleId)
{
	return pair1.first==articleId;
}
// Functor form of PredTF so an article id can be bound for count_if/find_if.
class PredTFclass
{
private:
	const int m;
public:
	PredTFclass(int id):m(id){};
	bool operator()(const pair<int,int>& pair1){return PredTF(pair1,m);};
};
// Orders (clusterLabel, similarity) pairs by descending similarity.
// BUG FIX: the original returned "pair1.second>=pair2.second"; ">=" is not a
// strict weak ordering, which makes std::sort undefined behaviour. ">" gives
// the same descending order and is well-defined.
bool myCmp(const pair<string,double>&pair1,const pair<string,double>&pair2 )
{
	return pair1.second>pair2.second;
}
/************************************************************************/
/* Strip leading and trailing val characters from str, in place.        */
/************************************************************************/
void Preprocess:: trim(string &str,const string val)
{
	str.erase(0,str.find_first_not_of(val));
	str.erase(str.find_last_not_of(val)+val.size());
}
/************************************************************************/
/* Build the bag-of-words (inverted index) model:                       */
/*   mymap[word] = list of (articleId, termFrequency) pairs.            */
/* Reads rows via ADO using dbconnection/dbselect; dbfield names the    */
/* text column, seg is the segmentation member function to apply.       */
/************************************************************************/
int Preprocess::ConstructMap(map<string,vector<pair<int,int>>>&mymap,char *dbfield,FUNCSEG seg)
{
	CoInitialize(NULL);
	_ConnectionPtr pConn(__uuidof(Connection));
	_RecordsetPtr pRst(__uuidof(Recordset));
	pConn->ConnectionString=dbconnection;
	pConn->Open("","","",adConnectUnspecified);
	pRst=pConn->Execute(dbselect,NULL,adCmdText);
	set<string>stopwords=MakeStopSet();
	while(!pRst->rsEOF)
	{
		vector<string>wordcollection;
		string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
		if(rawtext!="")
		{
			wordcollection=(this->*seg)(rawtext,stopwords);
			string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
			int articleid=atoi(tempid.c_str());
			for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
			{
				vector<pair<int,int>>::iterator it;
				if(mymap[*strit].empty())
				{
					// first occurrence of this word anywhere
					mymap[*strit].push_back(make_pair(articleid,1));
				}
				else
				{
					// bump tf if this article already has an entry ...
					for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
					{
						if(it->first==articleid)
						{
							// cleaned up: was "it->second=++(it->second)"
							++(it->second);
							break;
						}
					}
					// ... otherwise start a new (articleId, 1) entry
					if(it==mymap[*strit].end())
					{
						mymap[*strit].push_back(make_pair(articleid,1));
					}
				}
			}
		}
		pRst->MoveNext();
		wordcollection.clear();
	}
	pRst->Close();
	pConn->Close();
	pRst.Release();
	pConn.Release();
	CoUninitialize();
	return 0;
}
/************************************************************************/
/* Persist the bag-of-words model to disk.  Format:                     */
/*   <mapSize>\n then per word: <word>\n<listSize>\n                    */
/*   <articleId> <tf> ; <articleId> <tf> ; ...\n                        */
/************************************************************************/
void Preprocess::save(map<string,vector<pair<int,int> > >&mymap)
{
	ofstream outfile(bagofwordsAddress,ios::binary);
	outfile<<mymap.size()<<endl;
	map<string,vector<pair<int,int> > >::iterator it;
	for (it=mymap.begin();it!=mymap.end();it++)
	{
		outfile<<it->first<<endl;
		vector<pair<int,int>>::iterator subit;
		outfile<<it->second.size()<<endl;
		for(subit=(it->second).begin();subit!=(it->second).end();++subit)
		{
			outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
		}
		outfile<<endl;
	}
	outfile.close();
}
/************************************************************************/
/* Load the bag-of-words model (written by save()) back into memory.    */
/************************************************************************/
void Preprocess::load(map<string,vector<pair<int,int> > >&mymap)
{
	// switch to code page 936 (GBK) while reading, restore afterwards
	std::locale loc1 = std::locale::global(std::locale(".936"));
	{
		ifstream infile(bagofwordsAddress,ios::binary);
		int lenMyMap;     // number of words in the dictionary (header)
		int lenVector;    // number of articles the current word occurs in
		string key;       // current word
		int articleId;    // article number
		int count;        // tf of the word in that article
		string semicolon; // eats the " ; " separators
		infile>>lenMyMap;
		// BUG FIX: loop on successful extraction; the original
		// "while(!infile.eof())" processed one bogus trailing record after
		// the last real one.
		while(infile>>key)
		{
			infile>>lenVector;
			vector<pair<int,int> >temp;
			for (int i=0;i<lenVector;i++)
			{
				infile>>articleId>>count>>semicolon;
				temp.push_back(make_pair(articleId,count));
			}
			mymap[key]=temp;
		}
		infile.close();
	}
	std::locale::global(loc1);
}
/************************************************************************/
/* Dump the bag-of-words model to stdout (same layout as save()).       */
/* BUG FIX: this was defined as a free function "void print(...)",      */
/* leaving the declared member Preprocess::print undefined (a linker    */
/* error for any caller); it is now the member it was declared to be.   */
/************************************************************************/
void Preprocess::print(map<string,vector<pair<int,int> > >&mymap)
{
	cout<<mymap.size()<<endl;
	map<string,vector<pair<int,int> > >::iterator it;
	for (it=mymap.begin();it!=mymap.end();it++)
	{
		cout<<it->first<<endl;
		vector<pair<int,int>>::iterator subit;
		cout<<it->second.size()<<endl;
		for(subit=(it->second).begin();subit!=(it->second).end();++subit)
		{
			cout<<subit->first<<','<<subit->second<<";";
		}
		cout<<endl;
	}
}
/************************************************************************/
/* Build the stop-word set from stopwords.txt (one word per token).     */
/************************************************************************/
set<string> Preprocess::MakeStopSet()
{
	set<string> stopwordsSet;
	ifstream ifile("stopwords.txt");
	string temp;
	// BUG FIX: the original tested eof() before reading (inserting the last
	// word twice) and called trim() on temp *before* it was filled.
	while(ifile>>temp)
	{
		trim(temp," ");
		stopwordsSet.insert(temp);
	}
	return stopwordsSet;
}
/************************************************************************/
/* Integer -> string.                                                   */
/************************************************************************/
string Preprocess::do_fraction(int val)
{
	// FIX: the original post-processed the result with
	// str.swap(string(str.c_str())), which binds a temporary to a non-const
	// reference (non-standard, MSVC extension) and is unnecessary here: an
	// int never streams embedded NULs.
	ostringstream out;
	out<<val;
	return out.str();
}
/************************************************************************/
/* Double -> string, truncated decplaces characters after the point.    */
/************************************************************************/
string Preprocess::do_fraction(double val,int decplaces)
{
	const char DECIMAL_POINT='.';
	ostringstream out;
	out<<val;
	string str=out.str();
	size_t n=str.find(DECIMAL_POINT);
	if((n!=string::npos)&&n+decplaces<str.size())
	{
		// FIX: truncate with erase(); the original wrote '\0' into the middle
		// of the string and relied on the non-standard swap-with-temporary
		// trick to cut it there.  erase(n+decplaces) keeps exactly the same
		// characters.
		str.erase(n+decplaces);
	}
	return str;
}
/************************************************************************/
/* Narrow (multibyte) string -> wide string                             */
/************************************************************************/ wstring Preprocess::myMultibyteToWideChar(string sResult) { int iWLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0 );// 计算转换后宽字符串的长度。(不包含字符串结束符) wchar_t *lpwsz= new wchar_t [iWLen+1]; MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。 lpwsz[iWLen] = L'\0'; wstring wsResult(lpwsz); delete []lpwsz; return wsResult; } /************************************************************************/ /* 宽字符串转窄字符串 */ /************************************************************************/ string Preprocess::myWideCharToMultibyte(wstring wsResult) { string sResult; int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。(包含字符串结束符) char *lpsz= new char[iLen]; WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。 sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值。 delete []lpsz; return sResult; } /************************************************************************/ /* 调用ICTclas进行中文分词 */ /************************************************************************/ string Preprocess::ICTsplit(const char *sInput) { if(!ICTCLAS_Init()) { printf("ICTCLAS INIT FAILED!\n"); string strerr(sInput); return strerr; } ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND); //导入用户词典后 /*printf("\n导入用户词典后:\n"); int nCount = ICTCLAS_ImportUserDict("userdic.txt");//覆盖以前的用户词典 //保存用户词典 ICTCLAS_SaveTheUsrDic(); printf("导入%d个用户词。\n", nCount);*/ const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0); string strresult(sResult); //printf("%s\n", sResult); //把字符串转化成宽字符串 wstring wsResult=myMultibyteToWideChar(strresult); boost::wregex wreg(L"\\s+"); wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|")); strresult=myWideCharToMultibyte(wsResult); //ofile<<str1; //ofile.close(); //cout<<str1<<endl; //ICTCLAS_FileProcess("text.txt","test_result.txt",1); ICTCLAS_Exit(); return strresult; } 
/************************************************************************/ /* 对每一篇文章去掉噪声词,剩下好词 */ /************************************************************************/ vector<string>Preprocess::goodWordsinPieceArticle(string rawtext,set<string> stopwords) { vector<wstring> goodWordstemp; vector<string> goodWords; const char* sInput=rawtext.c_str(); string sResult=ICTsplit(sInput); wstring wsResult=myMultibyteToWideChar(sResult); boost::wregex wreg(L"\\d+");//去掉中文空格 wsResult=boost::regex_replace(wsResult,wreg,wstring(L"")); //boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg); boost::split(goodWordstemp,wsResult,boost::is_any_of("|")); for(vector<wstring>::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++) { string temp=myWideCharToMultibyte(*it); trim(temp," "); if(!stopwords.count(temp)&&!temp.empty()) { goodWords.push_back(temp); } } return goodWords; } /************************************************************************/ /* DF特征词选择法 */ /************************************************************************/ void Preprocess::DFcharicteristicWordSelection(map<string,vector<pair<int,int>>> &mymap,int DFthreshold) { int finalKeyWordsCount=0;//计算共取了多少个关键词 vector<pair<string,int> >tempvector; for(map<string,vector<pair<int,int>>>::iterator it=mymap.begin();it!=mymap.end();++it) { tempvector.push_back(make_pair(it->first,(it->second).size())); } stable_sort(tempvector.begin(),tempvector.end(),isLonger); ofstream outfile(featurewordsAddress); for(vector<pair<string,int> >::iterator it=tempvector.begin();it!=tempvector.end();it++) { if(it->second>=DFthreshold) { //outfile<<it->first<<" "<<it->second<<endl; outfile<<it->first<<endl; finalKeyWordsCount++; } } outfile.close(); cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl; cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl; } /************************************************************************/ /* 获得最终选定的构造文档向量模型的特征词 */ 
/************************************************************************/ vector<string>Preprocess::GetFinalKeyWords() { vector<string>myKeys; ifstream infile(featurewordsAddress); while(!infile.eof()) { string temp; infile>>temp; if(temp!="") { myKeys.push_back(temp); } } return myKeys; } /************************************************************************/ /* 获得特征词的maxTF,DF */ /************************************************************************/ vector<pair<int,int> >Preprocess::GetfinalKeysMaxTFDF(map<string,vector<pair<int,int>>> &mymap) { vector<pair<int,int> >maxTFandDF; vector<string>myKeys=GetFinalKeyWords(); for(vector<string>::iterator it=myKeys.begin();it!=myKeys.end();it++) { int DF=mymap[*it].size(); int maxTF=0; for(vector<pair<int,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++) { if(subit->second>maxTF) { maxTF=subit->second; } } maxTFandDF.push_back(make_pair(maxTF,DF)); //find_if(mymap[*it].begin(),mymap[*it].end(), } return maxTFandDF; } /************************************************************************/ /* 文档向量模型归一化 */ /************************************************************************/ vector<pair<int,double> >Preprocess::NormalizationVSM(vector<pair<int,double> > tempVSM) { double sum=0; for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit) { sum+=pow(vsmit->second,2); } for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit) { vsmit->second/=sqrt(sum); } return tempVSM; } /************************************************************************/ /* 单个文档向量模型字符串化 */ /************************************************************************/ string Preprocess::FormatVSMtoString(vector<pair<int,double> > tempVSM) { string ret="{"; int commaindication=0; for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit) { ret+=do_fraction(vsmit->first)+" "+do_fraction(vsmit->second,8); 
if(commaindication<tempVSM.size()-1) { ret+=","; } commaindication++; } ret+="}"; return ret; } /************************************************************************/ /* 写Arff头文件 */ /************************************************************************/ void Preprocess::WriteHeadArff() { ofstream ofile(arffFileAddress,ios::binary); ofile<<"@relation aticle"<<endl; ofile<<"\n"; vector<string> myKeys=GetFinalKeyWords(); for(vector<string>::iterator it=myKeys.begin();it!=myKeys.end();it++) { //string temp="@attribute "+"'"+(*it)+"'"+" real"; string temp=""; temp+="@attribute "; temp+="'"; temp+=*(it); temp+="'"; temp+=" real"; /*strcpy(temp,"@attribute "); strcpy(temp,"'"); strcpy(temp,*(it)); strcpy(temp,"'"); strcpy(temp," real");*/ ofile<<temp<<endl; } ofile<<"\n"<<endl; ofile<<"@data"<<endl; ofile.close(); } /************************************************************************/ /* 将实验数据写成arff @data格式 */ /************************************************************************/ void Preprocess::VSMFormation(map<string,vector<pair<int,int>>> &mymap) { int corpus_N=endIndex-beginIndex+1; ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件 ofstream ofile2(arffFileAddress,ios::binary|ios::app); vector<string> myKeys=GetFinalKeyWords(); vector<pair<int,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap); for(int i=beginIndex;i<=endIndex;i++) { vector<pair<int,double> >tempVSM; for(vector<string>::size_type j=0;j<myKeys.size();j++) { //vector<pair<int,int> >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); TF=0.5+0.5*(double)TF/(maxTFandDF[j].first); TF*=log((double)corpus_N/maxTFandDF[j].second); if(TF!=0) { tempVSM.push_back(make_pair(j,TF)); } } if(!tempVSM.empty()) { tempVSM=NormalizationVSM(tempVSM); string vsmStr=FormatVSMtoString(tempVSM); ofile1<<i<<endl; ofile2<<vsmStr<<endl; } tempVSM.clear(); } ofile1.close(); 
ofile2.close(); } void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg) { map<string,vector<pair<int,int>>> mymap; if(!isbagOfWordsExist) { ConstructMap(mymap,dbfield,seg); save(mymap); cout<<"词袋子信息已经保存到硬盘"<<endl; } else { load(mymap); } DFcharicteristicWordSelection(mymap,DFthreshold); WriteHeadArff(); VSMFormation(mymap); cout<<"arff文件已经形成"<<endl; string temp(infoFromWekaAddress); cout<<"请您将使用weka聚类,并保存为"<<temp<<endl; } /*****************以下函数辅助完成聚类功能*********************************************************************8**********************/ /************************************************************************/ /* 建立文档向量模型 */ /************************************************************************/ map<int,vector<double> > Preprocess::VSMConstruction(map<string,vector<pair<int,int>>> &mymap) { int corpus_N=endIndex-beginIndex+1; map<int,vector<double>> vsmMatrix; vector<string> myKeys=GetFinalKeyWords(); vector<pair<int,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap); for(int i=beginIndex;i<=endIndex;i++) { vector<pair<int,double> >tempVSM; for(vector<string>::size_type j=0;j<myKeys.size();j++) { //vector<pair<int,int> >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); TF=0.5+(double)TF/(maxTFandDF[j].first); TF*=log((double)corpus_N/maxTFandDF[j].second); tempVSM.push_back(make_pair(j,TF)); } if(!tempVSM.empty()) { tempVSM=NormalizationVSM(tempVSM); for(vector<pair<int,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();it++) { vsmMatrix[i].push_back(it->second); } } tempVSM.clear(); } return vsmMatrix; } /************************************************************************/ /* 获得Weka提供的聚类信息 */ /************************************************************************/ map<string,vector<double> > Preprocess::GetClusters() { map<string,vector<double> >clusters; ifstream 
ifile(infoFromWekaAddress); string temp; while(getline(ifile,temp)) { boost::smatch matchcluster; boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase); if(boost::regex_search(temp,matchcluster,regcluster)) { string clustertmp=matchcluster[0].str(); string ordinates=""; getline(ifile,ordinates); boost::regex regordinates("\\d+(\\.\\d{1,4})?"); boost::smatch matchordinates; std::string::const_iterator it=ordinates.begin(); std::string::const_iterator end=ordinates.end(); while (boost::regex_search(it,end,matchordinates,regordinates)) { string digitstemp=matchordinates[0].str(); double digitval=0.0; std::stringstream ss; ss<<digitstemp; ss>>digitval; clusters[clustertmp].push_back(digitval); it=matchordinates[0].second; } } } return clusters; } /**计算向量内积*/ double Preprocess::CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2) { double result = 0.0f; for (int i = 0; i < vector1.size(); i++) result += vector1[i] * vector2[i]; return result; } /**计算向量余弦相似度*/ double Preprocess::CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2) { double numerator=CalDotProductOfVectors(vector1,vector2); double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2); denominator=sqrt(denominator); return numerator/denominator; } /**为每篇文章打上个类别标签*/ vector<pair<int,string> > Preprocess::GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters) { vector<pair<int,string> >resultInfo; for(map<int,vector<double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++) { vector<pair<string,double> >clusterDistanceAist; for(map<string,vector<double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++) { double temp=CalCosineofVectors(it->second,clusterit->second); clusterDistanceAist.push_back(make_pair(clusterit->first,temp)); } sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp); vector<pair<string,double> >::iterator 
cDAit=clusterDistanceAist.begin(); resultInfo.push_back(make_pair(it->first,cDAit->first)); clusterDistanceAist.clear(); } return resultInfo; } /************************************************************************/ /* 获取每个类别所包含的文章ID */ /************************************************************************/ map<string,vector<int> > Preprocess::FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo) { map<string,vector<int>> articlesInfo; for(vector<pair<int,string>>::iterator retit=resultInfo.begin();retit!=resultInfo.end();retit++) { for(map<string,vector<double> >::iterator it=clusters.begin();it!=clusters.end();it++) { if(retit->second==it->first) { articlesInfo[it->first].push_back(retit->first); } } } return articlesInfo; } void Preprocess::RetreiveArticleInfoFromDataBase() { map<string,vector<pair<int,int>>> mymap; vector<pair<int,string>>resultInfo; map<string,vector<double> >clusters; map<int,vector<double> >vsmMatrix; map<string,vector<int>> articlesInfo; ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt"); //boost::regex_replace(strresult) //ConstructMap(mymap,1,500); //save(mymap); load(mymap); vsmMatrix=VSMConstruction(mymap); clusters=GetClusters(); resultInfo=GenerateClusterInfo(vsmMatrix,clusters); articlesInfo=FetchArticlesOFClusters(clusters,resultInfo); /*for(map<string,vector<int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++) { ofile<<it->first<<endl; int count=0; ofile<<"("; for(int i=0;i<it->second.size();i++) { ofile<<(it->second)[i]; if(count<it->second.size()-1) { ofile<<","; } count++; } ofile<<")"; ofile<<endl; }*/ for(map<string,vector<int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++) { ostringstream out; string selectassist; char *selectsql=new char[5000]; int count=0; CoInitialize(NULL); _ConnectionPtr pConn(__uuidof(Connection)); _RecordsetPtr pRst(__uuidof(Recordset)); pConn->ConnectionString=dbconnection; 
pConn->Open("","","",adConnectUnspecified); cout <<it->first<<endl; ofile<<it->first<<endl; out<<"("; count=0; for(int i=0;i<it->second.size();i++) { out<<(it->second)[i]; if(count<it->second.size()-1) { out<<","; } count++; } out<<")"; selectassist=out.str(); sprintf_s(selectsql,5000,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str()); pRst=pConn->Execute(selectsql,NULL,adCmdText); while(!pRst->rsEOF) { //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord"); string title=(_bstr_t)pRst->GetCollect("ArticleTitle"); //string rawtext=(_bstr_t)pRst->GetCollect("ArticleText"); string categorization=(_bstr_t)pRst->GetCollect("class"); cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; pRst->MoveNext(); } pRst->Close(); pConn->Close(); pRst.Release(); pConn.Release(); CoUninitialize(); } ofile.close(); } /********按空白把关键词分割开*****************/ vector<string>Preprocess:: mySplit(string s,set<string> stopwords) { vector<string> wordCollection; trim(s," "); int nPosBegin=0; int nPosEnd=s.find(' ',nPosBegin); while(nPosEnd!=string::npos) { string temp=s.substr(nPosBegin,nPosEnd-nPosBegin); trim(temp," "); wordCollection.push_back(temp); nPosBegin=s.find_first_not_of(' ',nPosEnd); nPosEnd=s.find(' ',nPosBegin); } string temp=s.substr(nPosBegin,s.size()-nPosBegin); trim(temp," "); wordCollection.push_back(temp); return wordCollection; }