K-means文本聚类算法C++实现
FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html
1 头文件: 2 3 4 #ifndef _Preprocess_H 5 #define _Preprocess_H 6 #include 7 #include 8 #include 9 #include 10 #include 11 #include 12 #include 13 #include 14 #include 15 #include 16 #include 17 #include 18 #include"ictclas30.h" 19 #include"boost\tr1\regex.hpp" 20 #include"boost/algorithm/string.hpp" 21 #include"windows.h" 22 23 //一些谓词函数 24 using namespace std; 25 26 class Preprocess 27 { 28 //typedef vector(Preprocess::*FUNCSEG)(string,set); 29 private: 30 char *bagofwordsAddress;//存放词袋子模型的位置 31 char * featurewordsAddress;//存放特征词文件的位置; 32 char *arffFileAddress;//存放ARFF文件的位置 33 char *infoFromWekaAddress;//存放调用weka后的实验结果 34 char *articleIdsAddress;//存放被聚类的文章的ID号 35 char *dbconnection;//数据库的链接字符串 36 char *dbselect;//数据库select语句 37 char *dbfield;//数据库字段 38 int beginIndex;//开始聚类的文章id 39 int endIndex;//结束聚类的文章id 40 public: 41 typedef vector(Preprocess::*FUNCSEG)(string,set); 42 Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex) 43 { 44 bagofwordsAddress=new char[c_style_stringsize]; 45 featurewordsAddress=new char[c_style_stringsize]; 46 arffFileAddress=new char[c_style_stringsize]; 47 infoFromWekaAddress=new char[c_style_stringsize]; 48 articleIdsAddress=new char[c_style_stringsize]; 49 dbconnection=new char[c_style_stringsize]; 50 dbselect=new char[c_style_stringsize]; 51 this->beginIndex=beginIndex; 52 this->endIndex=endIndex; 53 sprintf_s(bagofwordsAddress,c_style_stringsize,mydict); 54 sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo); 55 sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster); 56 sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka); 57 sprintf_s(articleIdsAddress,c_style_stringsize,artileIds); 58 sprintf_s(dbconnection,c_style_stringsize,conn); 59 sprintf_s(dbselect,c_style_stringsize,selectsql); 60 61 62 63 } 64 65 66 ~Preprocess() 67 { 68 delete 
[]bagofwordsAddress; 69 delete []featurewordsAddress; 70 delete []arffFileAddress; 71 delete [] infoFromWekaAddress; 72 delete []articleIdsAddress; 73 delete []dbconnection; 74 delete []dbselect; 75 76 77 } 78 void trim(string &str,const string val);//去除字符串首尾空白 79 //构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf 80 int ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg); 81 inline void TruncateArff() 82 { 83 ofstream ofile; 84 ofile.open(arffFileAddress,ios::trunc); 85 ofile.close(); 86 } 87 //保存词袋子到硬盘 88 void save(mapint,int> > >&mymap); 89 //从内存中加载词袋子模型 90 void load(mapint,int> > >&mymap); 91 //打印词袋子模型 92 void print(mapint,int> > >&mymap); 93 //窄字符串转化成宽字符串 94 wstring myMultibyteToWideChar(string sResult); 95 //宽字符串转化成窄字符串 96 string myWideCharToMultibyte(wstring wsResult); 97 //调用ICTclass分词 98 string ICTsplit(const char *sInput); 99 //构造停用词表 100 setMakeStopSet(); 101 //去除停用词,噪声词 102 vectorgoodWordsinPieceArticle(string rawtext,set stopwords); 103 //整数转化成字符串 104 string do_fraction(int val); 105 //浮点数转化成字符串 106 string do_fraction(double val, int decplaces=5); 107 //特征词选择算法 108 void DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold); 109 //获取最后的特征词 110 vector GetFinalKeyWords(); 111 //获取特征词的maxTF,DF 112 vectorint,int> >GetfinalKeysMaxTFDF(mapint,int>>> &mymap); 113 //文档向量模型规范化 114 vectorint,double> > NormalizationVSM(vectorint,double> > tempVSM); 115 //建立文档向量模型并且写到arff文件里 116 void VSMFormation(mapint,int>>> &mymap); 117 118 string FormatVSMtoString(vectorint,double> > tempVSM); 119 //写Arff文件头部 120 void WriteHeadArff(); 121 void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg); 122 123 124 map<</code>int,vector<</code>double> >VSMConstruction(mapint,int>>> &mymap); 125 126 map<</code>double> > GetClusters(); 127 128 double CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); 129 130 double CalCosineofVectors(const 
vector<</code>double>&vector1,const vector<</code>double>&vector2); 131 132 vectorint,string> >GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters); 133 134 map<</code>int> >FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo); 135 void RetreiveArticleInfoFromDataBase(); 136 vector mySplit(string s,set stopwords);//分割关键词 137 138 139 140 141 142 143 144 145 }; 146 147 148 149 #endif 150 151 152 Preprocess类的函数功能实现文件: 153 154 155 156 157 #include"stdafx.h" 158 #include "Preprocess.h" 159 160 #pragma comment(lib, "ICTCLAS30.lib") 161 using namespace std; 162 bool isLonger(const pairint> &pair1, const pairint> &pair2) 163 { 164 return pair1.second>pair2.second; 165 } 166 bool cntAssist(const pairint> &pair1) 167 { 168 return pair1.second<=100; 169 } 170 bool PredTF(const pair<</code>int,int>& pair1,int articleId) 171 { 172 return pair1.first==articleId; 173 174 } 175 class PredTFclass 176 { 177 private: const int m; 178 public: 179 PredTFclass(int id):m(id){}; 180 bool operator()(const pair<</code>int,int>& pair1){return PredTF(pair1,m);}; 181 }; 182 bool myCmp(const pairdouble>&pair1,const pairdouble>&pair2 ) 183 { 184 return pair1.second>=pair2.second; 185 } 186 187 void Preprocess:: trim(string &str,const string val) 188 { 189 str.erase(0,str.find_first_not_of(val)); 190 str.erase(str.find_last_not_of(val)+val.size()); 191 } 192 int Preprocess::ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg) 193 { 194 //setMakeStopSet(); 195 CoInitialize(NULL); 196 _ConnectionPtr pConn(__uuidof(Connection)); 197 _RecordsetPtr pRst(__uuidof(Recordset)); 198 pConn->ConnectionString=dbconnection; 199 pConn->Open("","","",adConnectUnspecified); 200 pRst=pConn->Execute(dbselect,NULL,adCmdText); 201 setstopwords=MakeStopSet(); 202 203 while(!pRst->rsEOF) 204 { vectorwordcollection; 205 //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord"); 206 string rawtext=(_bstr_t)pRst->GetCollect(dbfield); 
207 if(rawtext!="") 208 { 209 wordcollection=(this->*seg)(rawtext,stopwords); 210 string tempid=(_bstr_t)pRst->GetCollect("ArticleId"); 211 int articleid=atoi(tempid.c_str()); 212 for(vector::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++) 213 { 214 vectorint,int>>::iterator it; 215 if(mymap[*strit].empty()) 216 { 217 pair<</code>int,int>mytemppair=make_pair(articleid,1); 218 mymap[*strit].push_back(mytemppair); 219 220 } 221 else 222 { 223 for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++) 224 { 225 if(it->first==articleid) 226 { 227 it->second=++(it->second); 228 break; 229 } 230 231 } 232 if(it==mymap[*strit].end()) 233 { 234 pair<</code>int,int>mytemppair=make_pair(articleid,1); 235 mymap[*strit].push_back(mytemppair); 236 } 237 238 } 239 240 } 241 242 243 } 244 245 246 pRst->MoveNext(); 247 wordcollection.clear(); 248 } 249 pRst->Close(); 250 pConn->Close(); 251 pRst.Release(); 252 pConn.Release(); 253 CoUninitialize(); 254 255 return 0; 256 257 } 258 void Preprocess::save(mapint,int> > >&mymap) 259 { 260 ofstream outfile(bagofwordsAddress,ios::binary); 261 outfile<<mymap.size()<<endl; 262 mapint,int> > >::iterator it; 263 for (it=mymap.begin();it!=mymap.end();it++) 264 { outfile<<it->first<<endl; 265 vectorint,int>>::iterator subit; 266 outfile<<it->second.size()<<endl; 267 for(subit=(it->second).begin();subit!=(it->second).end();++subit) 268 { 269 outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" "; 270 } 271 outfile<<endl; 272 } 273 //outfile.write((char *)&mymap,sizeof(mymap)); 274 275 outfile.close(); 276 277 } 278 void Preprocess::load(mapint,int> > >&mymap) 279 { 280 std::locale loc1 = std::locale::global(std::locale(".936")); 281 { 282 // 在这里使用std::ifstream 或者 std::fstream 283 ifstream infile(bagofwordsAddress,ios::binary); 284 int lenMyMap;//保存词典长度 285 int lenVector;//保存每个词出现的文章数目 286 string key;//保存读出的map的键值 287 int articleId;//文章标号 288 int count;//在该文章中刚出现的数目 289 string comma; 290 string semicolon; 291 
infile>>lenMyMap; 292 while(!infile.eof()) 293 { 294 infile>>key; 295 infile>>lenVector; 296 vectorint,int> >temp; 297 for (int i=0;i 298 { 299 infile>>articleId>>count>>semicolon; 300 temp.push_back(make_pair(articleId,count)); 301 } 302 mymap[key]=temp; 303 304 305 } 306 307 308 infile.close(); 309 } 310 std::locale::global(std::locale(loc1)); 311 312 } 313 void print(mapint,int> > >&mymap) 314 { 315 cout<<mymap.size()<<endl; 316 mapint,int> > >::iterator it; 317 for (it=mymap.begin();it!=mymap.end();it++) 318 { cout<<it->first<<endl; 319 vectorint,int>>::iterator subit; 320 cout<<it->second.size()<<endl; 321 for(subit=(it->second).begin();subit!=(it->second).end();++subit) 322 { 323 cout<<subit->first<<','<<subit->second<<";"; 324 } 325 cout<<endl; 326 } 327 328 } 329 set Preprocess::MakeStopSet() 330 { 331 set stopwordsSet; 332 ifstream ifile("stopwords.txt"); 333 while(!ifile.eof()) 334 { 335 string temp; 336 trim(temp," "); 337 ifile>>temp; 338 stopwordsSet.insert(temp); 339 } 340 return stopwordsSet; 341 } 342 343 string Preprocess::do_fraction(int val) 344 { 345 ostringstream out; 346 out<<val; 347 string str= out.str(); //从流中取出字符串 348 str.swap(string(str.c_str()));//删除nul之后的多余字符 349 return str; 350 351 } 352 string Preprocess::do_fraction(double val,int decplaces) 353 { 354 355 //int prec=numeric_limits::digits10; 356 char DECIMAL_POINT='.'; 357 ostringstream out; 358 //out.precision(prec); 359 out<<val; 360 string str=out.str(); 361 size_t n=str.find(DECIMAL_POINT); 362 if((n!=string::npos)&&n+decplaces 363 { 364 str[n+decplaces]='\0'; 365 } 366 str.swap(string(str.c_str())); 367 368 return str; 369 } 370 wstring Preprocess::myMultibyteToWideChar(string sResult) 371 { 372 int iWLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0 );// 计算转换后宽字符串的长度。(不包含字符串结束符) 373 wchar_t *lpwsz= new wchar_t [iWLen+1]; 374 MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。 375 lpwsz[iWLen] = L'\0'; 376 wstring 
wsResult(lpwsz); 377 delete []lpwsz; 378 return wsResult; 379 } 380 string Preprocess::myWideCharToMultibyte(wstring wsResult) 381 { 382 string sResult; 383 int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。(包含字符串结束符) 384 char *lpsz= new char[iLen]; 385 WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。 386 sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值。 387 delete []lpsz; 388 return sResult; 389 390 } 391 string Preprocess::ICTsplit(const char *sInput) 392 { 393 if(!ICTCLAS_Init()) 394 { 395 printf("ICTCLAS INIT FAILED!\n"); 396 string strerr(sInput); 397 return strerr; 398 } 399 ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND); 400 //导入用户词典后 401 402 403 const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0); 404 string strresult(sResult); 405 //printf("%s\n", sResult); 406 //把字符串转化成宽字符串 407 wstring wsResult=myMultibyteToWideChar(strresult); 408 boost::wregex wreg(L"\\s+"); 409 wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|")); 410 strresult=myWideCharToMultibyte(wsResult); 411 412 413 414 //ofile<<str1; 415 //ofile.close(); 416 //cout<<str1<<endl; 417 //ICTCLAS_FileProcess("text.txt","test_result.txt",1); 418 ICTCLAS_Exit(); 419 420 return strresult; 421 } 422 vectorPreprocess::goodWordsinPieceArticle(string rawtext,set stopwords) 423 { 424 vector goodWordstemp; 425 vector goodWords; 426 const char* sInput=rawtext.c_str(); 427 string sResult=ICTsplit(sInput); 428 wstring wsResult=myMultibyteToWideChar(sResult); 429 boost::wregex wreg(L"\\d+");//去掉中文空格 430 wsResult=boost::regex_replace(wsResult,wreg,wstring(L"")); 431 //boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg); 432 boost::split(goodWordstemp,wsResult,boost::is_any_of("|")); 433 434 for(vector::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++) 435 { 436 string temp=myWideCharToMultibyte(*it); 437 trim(temp," "); 438 if(!stopwords.count(temp)&&!temp.empty()) 439 { 440 
goodWords.push_back(temp); 441 } 442 443 444 } 445 446 return goodWords; 447 } 448 void Preprocess::DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold) 449 { 450 int finalKeyWordsCount=0;//计算共取了多少个关键词 451 vectorint> >tempvector; 452 for(mapint,int>>>::iterator it=mymap.begin();it!=mymap.end();++it) 453 { 454 tempvector.push_back(make_pair(it->first,(it->second).size())); 455 } 456 457 stable_sort(tempvector.begin(),tempvector.end(),isLonger); 458 ofstream outfile(featurewordsAddress); 459 for(vectorint> >::iterator it=tempvector.begin();it!=tempvector.end();it++) 460 { 461 if(it->second>=DFthreshold) 462 { 463 //outfile<<it->first<<" "<<it->second<<endl; 464 outfile<<it->first<<endl; 465 finalKeyWordsCount++; 466 467 } 468 469 } 470 outfile.close(); 471 cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl; 472 cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl; 473 474 } 475 vectorPreprocess::GetFinalKeyWords() 476 { 477 vectormyKeys; 478 ifstream infile(featurewordsAddress); 479 while(!infile.eof()) 480 { 481 string temp; 482 infile>>temp; 483 if(temp!="") 484 { 485 myKeys.push_back(temp); 486 } 487 488 489 } 490 return myKeys; 491 } 492 vectorint,int> >Preprocess::GetfinalKeysMaxTFDF(mapint,int>>> &mymap) 493 { 494 vectorint,int> >maxTFandDF; 495 vectormyKeys=GetFinalKeyWords(); 496 for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++) 497 { 498 int DF=mymap[*it].size(); 499 int maxTF=0; 500 for(vectorint,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++) 501 { 502 if(subit->second>maxTF) 503 { 504 maxTF=subit->second; 505 } 506 507 } 508 maxTFandDF.push_back(make_pair(maxTF,DF)); 509 //find_if(mymap[*it].begin(),mymap[*it].end(), 510 } 511 return maxTFandDF; 512 } 513 vectorint,double> >Preprocess::NormalizationVSM(vectorint,double> > tempVSM) 514 { 515 516 double sum=0; 517 for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit) 518 { 519 sum+=pow(vsmit->second,2); 520 } 521 
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit) 522 { 523 vsmit->second/=sqrt(sum); 524 } 525 return tempVSM; 526 527 } 528 string Preprocess::FormatVSMtoString(vectorint,double> > tempVSM) 529 { 530 string ret="{"; 531 int commaindication=0; 532 for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit) 533 { 534 535 ret+=do_fraction(vsmit->first)+" "+do_fraction(vsmit->second,8); 536 if(commaindication 537 { 538 ret+=","; 539 } 540 commaindication++; 541 } 542 ret+="}"; 543 return ret; 544 } 545 void Preprocess::WriteHeadArff() 546 { 547 ofstream ofile(arffFileAddress,ios::binary); 548 ofile<<"@relation aticle"<<endl; 549 ofile<<"\n"; 550 vector myKeys=GetFinalKeyWords(); 551 for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++) 552 { 553 //string temp="@attribute "+"'"+(*it)+"'"+" real"; 554 string temp=""; 555 temp+="@attribute "; 556 temp+="'"; 557 temp+=*(it); 558 temp+="'"; 559 temp+=" real"; 560 561 562 ofile<<temp<<endl; 563 } 564 ofile<<"\n"<<endl; 565 ofile<<"@data"<<endl; 566 ofile.close(); 567 } 568 void Preprocess::VSMFormation(mapint,int>>> &mymap) 569 { int corpus_N=endIndex-beginIndex+1; 570 ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件 571 ofstream ofile2(arffFileAddress,ios::binary|ios::app); 572 573 vector myKeys=GetFinalKeyWords(); 574 vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap); 575 for(int i=beginIndex;i<=endIndex;i++) 576 { vectorint,double> >tempVSM; 577 for(vector::size_type j=0;j 578 { 579 //vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); 580 double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); 581 582 583 TF=0.5+0.5*(double)TF/(maxTFandDF[j].first); 584 TF*=log((double)corpus_N/maxTFandDF[j].second); 585 if(TF!=0) 586 { 587 tempVSM.push_back(make_pair(j,TF)); 588 589 } 590 591 592 593 } 594 if(!tempVSM.empty()) 595 { 596 tempVSM=NormalizationVSM(tempVSM); 
597 string vsmStr=FormatVSMtoString(tempVSM); 598 ofile1<<i<<endl; 599 ofile2<<vsmStr<<endl; 600 } 601 tempVSM.clear(); 602 603 604 605 } 606 ofile1.close(); 607 ofile2.close(); 608 609 610 } 611 void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg) 612 { 613 614 615 mapint,int>>> mymap; 616 if(!isbagOfWordsExist) 617 { 618 ConstructMap(mymap,dbfield,seg); 619 save(mymap); 620 cout<<"词袋子信息已经保存到硬盘"<<endl; 621 } 622 else 623 { 624 load(mymap); 625 } 626 DFcharicteristicWordSelection(mymap,DFthreshold); 627 WriteHeadArff(); 628 VSMFormation(mymap); 629 cout<<"arff文件已经形成"<<endl; 630 631 632 string temp(infoFromWekaAddress); 633 634 cout<<"请您将使用weka聚类,并保存为"<<temp<<endl; 635 } 636 map<</code>int,vector<</code>double> > Preprocess::VSMConstruction(mapint,int>>> &mymap) 637 { 638 int corpus_N=endIndex-beginIndex+1; 639 map<</code>int,vector<</code>double>> vsmMatrix; 640 vector myKeys=GetFinalKeyWords(); 641 vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap); 642 for(int i=beginIndex;i<=endIndex;i++) 643 { 644 vectorint,double> >tempVSM; 645 for(vector::size_type j=0;j 646 { 647 //vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); 648 double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); 649 TF=0.5+(double)TF/(maxTFandDF[j].first); 650 TF*=log((double)corpus_N/maxTFandDF[j].second); 651 tempVSM.push_back(make_pair(j,TF)); 652 653 } 654 if(!tempVSM.empty()) 655 { 656 tempVSM=NormalizationVSM(tempVSM); 657 for(vectorint,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();it++) 658 { 659 vsmMatrix[i].push_back(it->second); 660 } 661 662 663 664 } 665 tempVSM.clear(); 666 667 668 669 } 670 return vsmMatrix; 671 672 } 673 map<</code>double> > Preprocess::GetClusters() 674 { 675 676 map<</code>double> >clusters; 677 ifstream ifile(infoFromWekaAddress); 678 string temp; 679 while(getline(ifile,temp)) 680 { boost::smatch matchcluster; 681 
boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase); 682 if(boost::regex_search(temp,matchcluster,regcluster)) 683 { 684 string clustertmp=matchcluster[0].str(); 685 string ordinates=""; 686 getline(ifile,ordinates); 687 boost::regex regordinates("\\d+(\\.\\d{1,4})?"); 688 boost::smatch matchordinates; 689 std::string::const_iterator it=ordinates.begin(); 690 std::string::const_iterator end=ordinates.end(); 691 while (boost::regex_search(it,end,matchordinates,regordinates)) 692 { 693 string digitstemp=matchordinates[0].str(); 694 double digitval=0.0; 695 std::stringstream ss; 696 ss<<digitstemp; 697 ss>>digitval; 698 clusters[clustertmp].push_back(digitval); 699 it=matchordinates[0].second; 700 } 701 702 703 704 705 706 } 707 } 708 return clusters; 709 } 710 double Preprocess::CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2) 711 { 712 double result = 0.0f; 713 for (int i = 0; i < vector1.size(); i++) 714 result += vector1[i] * vector2[i]; 715 return result; 716 } 717 double Preprocess::CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2) 718 { 719 double numerator=CalDotProductOfVectors(vector1,vector2); 720 double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2); 721 denominator=sqrt(denominator); 722 return numerator/denominator; 723 } 724 vectorint,string> > Preprocess::GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters) 725 { 726 vectorint,string> >resultInfo; 727 for(map<</code>int,vector<</code>double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++) 728 { 729 vectordouble> >clusterDistanceAist; 730 for(map<</code>double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++) 731 { 732 733 double temp=CalCosineofVectors(it->second,clusterit->second); 734 clusterDistanceAist.push_back(make_pair(clusterit->first,temp)); 735 736 } 737 
sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp); 738 vectordouble> >::iterator cDAit=clusterDistanceAist.begin(); 739 740 resultInfo.push_back(make_pair(it->first,cDAit->first)); 741 clusterDistanceAist.clear(); 742 } 743 return resultInfo; 744 745 } 746 map<</code>int> > Preprocess::FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo) 747 { 748 map<</code>int>> articlesInfo; 749 750 for(vectorint,string>>::iterator retit=resultInfo.begin();retit!=resultInfo.end();retit++) 751 { 752 for(map<</code>double> >::iterator it=clusters.begin();it!=clusters.end();it++) 753 { 754 if(retit->second==it->first) 755 { 756 articlesInfo[it->first].push_back(retit->first); 757 } 758 } 759 } 760 761 762 763 764 765 return articlesInfo; 766 767 768 } 769 void Preprocess::RetreiveArticleInfoFromDataBase() 770 { 771 mapint,int>>> mymap; 772 vectorint,string>>resultInfo; 773 map<</code>double> >clusters; 774 map<</code>int,vector<</code>double> >vsmMatrix; 775 map<</code>int>> articlesInfo; 776 ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt"); 777 //boost::regex_replace(strresult) 778 //ConstructMap(mymap,1,500); 779 //save(mymap); 780 load(mymap); 781 vsmMatrix=VSMConstruction(mymap); 782 clusters=GetClusters(); 783 resultInfo=GenerateClusterInfo(vsmMatrix,clusters); 784 articlesInfo=FetchArticlesOFClusters(clusters,resultInfo); 785 786 787 for(map<</code>int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++) 788 { 789 ostringstream out; 790 string selectassist; 791 char *selectsql=new char[5000]; 792 int count=0; 793 CoInitialize(NULL); 794 _ConnectionPtr pConn(__uuidof(Connection)); 795 _RecordsetPtr pRst(__uuidof(Recordset)); 796 pConn->ConnectionString=dbconnection; 797 pConn->Open("","","",adConnectUnspecified); 798 cout <<it->first<<endl; 799 ofile<<it->first<<endl; 800 out<<"("; 801 count=0; 802 for(int i=0;isecond.size();i++) 803 { 804 out<<(it->second)[i]; 805 if(countsecond.size()-1) 806 { 807 out<<","; 
808 } 809 count++; 810 811 812 } 813 out<<")"; 814 selectassist=out.str(); 815 sprintf_s(selectsql,5000,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str()); 816 817 pRst=pConn->Execute(selectsql,NULL,adCmdText); 818 while(!pRst->rsEOF) 819 { 820 //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord"); 821 string title=(_bstr_t)pRst->GetCollect("ArticleTitle"); 822 //string rawtext=(_bstr_t)pRst->GetCollect("ArticleText"); 823 string categorization=(_bstr_t)pRst->GetCollect("class"); 824 cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; 825 ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; 826 827 828 829 830 831 832 pRst->MoveNext(); 833 834 } 835 pRst->Close(); 836 pConn->Close(); 837 pRst.Release(); 838 pConn.Release(); 839 CoUninitialize(); 840 841 } 842 843 844 845 846 ofile.close(); 847 848 849 } 850 vectorPreprocess:: mySplit(string s,set stopwords) 851 { 852 vector wordCollection; 853 trim(s," "); 854 855 int nPosBegin=0; 856 int nPosEnd=s.find(' ',nPosBegin); 857 while(nPosEnd!=string::npos) 858 { 859 string temp=s.substr(nPosBegin,nPosEnd-nPosBegin); 860 trim(temp," "); 861 wordCollection.push_back(temp); 862 nPosBegin=s.find_first_not_of(' ',nPosEnd); 863 nPosEnd=s.find(' ',nPosBegin); 864 } 865 string temp=s.substr(nPosBegin,s.size()-nPosBegin); 866 trim(temp," "); 867 wordCollection.push_back(temp); 868 869 870 return wordCollection; 871 872 } 873