C++ Implementation of the K-means Text Clustering Algorithm


FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html

Header file:
#ifndef _Preprocess_H
#define _Preprocess_H
// standard library headers required by the class below
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <set>
#include <utility>
#include <algorithm>
#include <cmath>
#include <limits>
#include <locale>
#include "ictclas30.h"
#include "boost/tr1/regex.hpp"
#include "boost/algorithm/string.hpp"
#include "windows.h"

// some predicate helpers are defined in the implementation file
using namespace std;

class Preprocess
{
private:
    char *bagofwordsAddress;     // path of the bag-of-words model file
    char *featurewordsAddress;   // path of the feature-word file
    char *arffFileAddress;       // path of the ARFF file
    char *infoFromWekaAddress;   // path of the clustering result saved from Weka
    char *articleIdsAddress;     // path of the file holding the ids of the clustered articles
    char *dbconnection;          // database connection string
    char *dbselect;              // database SELECT statement
    char *dbfield;               // database field to segment
    int beginIndex;              // id of the first article to cluster
    int endIndex;                // id of the last article to cluster
public:
    // pointer to a member function that segments a raw text into words
    typedef vector<string> (Preprocess::*FUNCSEG)(string, set<string>);

    Preprocess(int c_style_stringsize, const char *mydict, const char *keywordsinfo,
               const char *tobeCluster, const char *InfoFromWeka, const char *artileIds,
               const char *conn, const char *selectsql, int beginIndex, int endIndex)
    {
        bagofwordsAddress = new char[c_style_stringsize];
        featurewordsAddress = new char[c_style_stringsize];
        arffFileAddress = new char[c_style_stringsize];
        infoFromWekaAddress = new char[c_style_stringsize];
        articleIdsAddress = new char[c_style_stringsize];
        dbconnection = new char[c_style_stringsize];
        dbselect = new char[c_style_stringsize];
        this->beginIndex = beginIndex;
        this->endIndex = endIndex;
        sprintf_s(bagofwordsAddress, c_style_stringsize, mydict);
        sprintf_s(featurewordsAddress, c_style_stringsize, keywordsinfo);
        sprintf_s(arffFileAddress, c_style_stringsize, tobeCluster);
        sprintf_s(infoFromWekaAddress, c_style_stringsize, InfoFromWeka);
        sprintf_s(articleIdsAddress, c_style_stringsize, artileIds);
        sprintf_s(dbconnection, c_style_stringsize, conn);
        sprintf_s(dbselect, c_style_stringsize, selectsql);
    }

    ~Preprocess()
    {
        delete [] bagofwordsAddress;
        delete [] featurewordsAddress;
        delete [] arffFileAddress;
        delete [] infoFromWekaAddress;
        delete [] articleIdsAddress;
        delete [] dbconnection;
        delete [] dbselect;
    }

    void trim(string &str, const string val); // strip leading and trailing whitespace
    // build the inverted index: key = word, value = a list of (articleId, count) pairs, count = tf
    int ConstructMap(map<string, vector<pair<int, int> > > &mymap, char *dbfield, FUNCSEG seg);
    inline void TruncateArff()
    {
        ofstream ofile;
        ofile.open(arffFileAddress, ios::trunc);
        ofile.close();
    }
    // save the bag-of-words model to disk
    void save(map<string, vector<pair<int, int> > > &mymap);
    // load the bag-of-words model from disk
    void load(map<string, vector<pair<int, int> > > &mymap);
    // print the bag-of-words model
    void print(map<string, vector<pair<int, int> > > &mymap);
    // convert a narrow string to a wide string
    wstring myMultibyteToWideChar(string sResult);
    // convert a wide string to a narrow string
    string myWideCharToMultibyte(wstring wsResult);
    // call the ICTCLAS word segmenter
    string ICTsplit(const char *sInput);
    // build the stop-word set
    set<string> MakeStopSet();
    // remove stop words and noise words
    vector<string> goodWordsinPieceArticle(string rawtext, set<string> stopwords);
    // convert an integer to a string
    string do_fraction(int val);
    // convert a floating-point number to a string
    string do_fraction(double val, int decplaces = 5);
    // feature-word selection by document frequency
    void DFcharicteristicWordSelection(map<string, vector<pair<int, int> > > &mymap, int DFthreshold);
    // read back the selected feature words
    vector<string> GetFinalKeyWords();
    // get maxTF and DF of every feature word
    vector<pair<int, int> > GetfinalKeysMaxTFDF(map<string, vector<pair<int, int> > > &mymap);
    // normalize a document vector to unit length
    vector<pair<int, double> > NormalizationVSM(vector<pair<int, double> > tempVSM);
    // build the vector space model and write it into the ARFF file
    void VSMFormation(map<string, vector<pair<int, int> > > &mymap);

    string FormatVSMtoString(vector<pair<int, double> > tempVSM);
    // write the ARFF file header
    void WriteHeadArff();
    void WriteTotalArff(char *dbfield, int DFthreshold, bool isbagOfwordsexsist, FUNCSEG seg);

    map<int, vector<double> > VSMConstruction(map<string, vector<pair<int, int> > > &mymap);

    map<string, vector<double> > GetClusters();

    double CalDotProductOfVectors(const vector<double> &vector1, const vector<double> &vector2);

    double CalCosineofVectors(const vector<double> &vector1, const vector<double> &vector2);

    vector<pair<int, string> > GenerateClusterInfo(map<int, vector<double> > &vsmMatrix,
                                                   map<string, vector<double> > &clusters);

    map<string, vector<int> > FetchArticlesOFClusters(map<string, vector<double> > &clusters,
                                                      vector<pair<int, string> > &resultInfo);
    void RetreiveArticleInfoFromDataBase();
    vector<string> mySplit(string s, set<string> stopwords); // split a pre-segmented keyword string
};

#endif

Implementation file for the Preprocess class:

#include "stdafx.h"
#include "Preprocess.h"

#pragma comment(lib, "ICTCLAS30.lib")
using namespace std;

// comparison and predicate helpers used by the member functions below
bool isLonger(const pair<string, int> &pair1, const pair<string, int> &pair2)
{
    return pair1.second > pair2.second;
}
bool cntAssist(const pair<string, int> &pair1)
{
    return pair1.second <= 100;
}
bool PredTF(const pair<int, int> &pair1, int articleId)
{
    return pair1.first == articleId;
}
class PredTFclass
{
private:
    const int m;
public:
    PredTFclass(int id) : m(id) {}
    bool operator()(const pair<int, int> &pair1) { return PredTF(pair1, m); }
};
// strict ">" so this is a valid ordering for std::sort
bool myCmp(const pair<string, double> &pair1, const pair<string, double> &pair2)
{
    return pair1.second > pair2.second;
}

void Preprocess::trim(string &str, const string val)
{
    str.erase(0, str.find_first_not_of(val));
    str.erase(str.find_last_not_of(val) + val.size());
}
int Preprocess::ConstructMap(map<string, vector<pair<int, int> > > &mymap, char *dbfield, FUNCSEG seg)
{
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
    pConn->ConnectionString = dbconnection;
    pConn->Open("", "", "", adConnectUnspecified);
    pRst = pConn->Execute(dbselect, NULL, adCmdText);
    set<string> stopwords = MakeStopSet();

    while (!pRst->rsEOF)
    {
        vector<string> wordcollection;
        string rawtext = (_bstr_t)pRst->GetCollect(dbfield);
        if (rawtext != "")
        {
            wordcollection = (this->*seg)(rawtext, stopwords);
            string tempid = (_bstr_t)pRst->GetCollect("ArticleId");
            int articleid = atoi(tempid.c_str());
            for (vector<string>::iterator strit = wordcollection.begin(); strit != wordcollection.end(); strit++)
            {
                vector<pair<int, int> >::iterator it;
                if (mymap[*strit].empty())
                {
                    // first occurrence of this word in the corpus
                    pair<int, int> mytemppair = make_pair(articleid, 1);
                    mymap[*strit].push_back(mytemppair);
                }
                else
                {
                    // look for an existing posting of this word for the current article
                    for (it = mymap[*strit].begin(); it != mymap[*strit].end(); it++)
                    {
                        if (it->first == articleid)
                        {
                            it->second = ++(it->second);
                            break;
                        }
                    }
                    if (it == mymap[*strit].end())
                    {
                        pair<int, int> mytemppair = make_pair(articleid, 1);
                        mymap[*strit].push_back(mytemppair);
                    }
                }
            }
        }
        pRst->MoveNext();
        wordcollection.clear();
    }
    pRst->Close();
    pConn->Close();
    pRst.Release();
    pConn.Release();
    CoUninitialize();

    return 0;
}
void Preprocess::save(map<string, vector<pair<int, int> > > &mymap)
{
    ofstream outfile(bagofwordsAddress, ios::binary);
    outfile << mymap.size() << endl;
    map<string, vector<pair<int, int> > >::iterator it;
    for (it = mymap.begin(); it != mymap.end(); it++)
    {
        outfile << it->first << endl;
        vector<pair<int, int> >::iterator subit;
        outfile << it->second.size() << endl;
        for (subit = (it->second).begin(); subit != (it->second).end(); ++subit)
        {
            outfile << subit->first << " " << subit->second << " " << ";" << " ";
        }
        outfile << endl;
    }
    outfile.close();
}
void Preprocess::load(map<string, vector<pair<int, int> > > &mymap)
{
    std::locale loc1 = std::locale::global(std::locale(".936"));
    {
        // std::ifstream / std::fstream must be used inside this scope
        ifstream infile(bagofwordsAddress, ios::binary);
        int lenMyMap;    // number of entries in the dictionary
        int lenVector;   // number of articles a word occurs in
        string key;      // map key read back from the file
        int articleId;   // article id
        int count;       // term frequency of the word in that article
        string comma;
        string semicolon;
        infile >> lenMyMap;
        while (!infile.eof())
        {
            infile >> key;
            infile >> lenVector;
            vector<pair<int, int> > temp;
            for (int i = 0; i < lenVector; i++)
            {
                infile >> articleId >> count >> semicolon;
                temp.push_back(make_pair(articleId, count));
            }
            mymap[key] = temp;
        }
        infile.close();
    }
    std::locale::global(std::locale(loc1));
}
void Preprocess::print(map<string, vector<pair<int, int> > > &mymap)
{
    cout << mymap.size() << endl;
    map<string, vector<pair<int, int> > >::iterator it;
    for (it = mymap.begin(); it != mymap.end(); it++)
    {
        cout << it->first << endl;
        vector<pair<int, int> >::iterator subit;
        cout << it->second.size() << endl;
        for (subit = (it->second).begin(); subit != (it->second).end(); ++subit)
        {
            cout << subit->first << ',' << subit->second << ";";
        }
        cout << endl;
    }
}
set<string> Preprocess::MakeStopSet()
{
    set<string> stopwordsSet;
    ifstream ifile("stopwords.txt");
    while (!ifile.eof())
    {
        string temp;
        ifile >> temp;
        trim(temp, " ");
        stopwordsSet.insert(temp);
    }
    return stopwordsSet;
}

string Preprocess::do_fraction(int val)
{
    ostringstream out;
    out << val;
    return out.str(); // take the string out of the stream
}
string Preprocess::do_fraction(double val, int decplaces)
{
    char DECIMAL_POINT = '.';
    ostringstream out;
    out << val;
    string str = out.str();
    size_t n = str.find(DECIMAL_POINT);
    if ((n != string::npos) && (n + decplaces < str.size()))
    {
        str = str.substr(0, n + decplaces); // truncate the fractional part to the requested width
    }
    return str;
}
wstring Preprocess::myMultibyteToWideChar(string sResult)
{
    int iWLen = MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0); // length of the converted wide string, excluding the terminator
    wchar_t *lpwsz = new wchar_t[iWLen + 1];
    MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen); // do the conversion
    lpwsz[iWLen] = L'\0';
    wstring wsResult(lpwsz);
    delete [] lpwsz;
    return wsResult;
}
string Preprocess::myWideCharToMultibyte(wstring wsResult)
{
    string sResult;
    int iLen = WideCharToMultiByte(CP_ACP, 0, wsResult.c_str(), -1, NULL, 0, NULL, NULL); // length of the converted string, including the terminator
    char *lpsz = new char[iLen];
    WideCharToMultiByte(CP_ACP, 0, wsResult.c_str(), -1, lpsz, iLen, NULL, NULL); // do the conversion with the same code page as above
    sResult.assign(lpsz, iLen - 1); // assign without the terminator
    delete [] lpsz;
    return sResult;
}
string Preprocess::ICTsplit(const char *sInput)
{
    if (!ICTCLAS_Init())
    {
        printf("ICTCLAS INIT FAILED!\n");
        string strerr(sInput);
        return strerr;
    }
    ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);

    const char *sResult = ICTCLAS_ParagraphProcess(sInput, 0);
    string strresult(sResult);
    // replace the whitespace between segmented words with '|' so the result is easy to split
    wstring wsResult = myMultibyteToWideChar(strresult);
    boost::wregex wreg(L"\\s+");
    wsResult = boost::regex_replace(wsResult, wreg, wstring(L"|"));
    strresult = myWideCharToMultibyte(wsResult);

    ICTCLAS_Exit();

    return strresult;
}
vector<string> Preprocess::goodWordsinPieceArticle(string rawtext, set<string> stopwords)
{
    vector<wstring> goodWordstemp;
    vector<string> goodWords;
    const char *sInput = rawtext.c_str();
    string sResult = ICTsplit(sInput);
    wstring wsResult = myMultibyteToWideChar(sResult);
    boost::wregex wreg(L"\\d+"); // drop runs of digits
    wsResult = boost::regex_replace(wsResult, wreg, wstring(L""));
    boost::split(goodWordstemp, wsResult, boost::is_any_of(L"|"));

    for (vector<wstring>::iterator it = goodWordstemp.begin(); it != goodWordstemp.end(); it++)
    {
        string temp = myWideCharToMultibyte(*it);
        trim(temp, " ");
        if (!stopwords.count(temp) && !temp.empty())
        {
            goodWords.push_back(temp);
        }
    }

    return goodWords;
}
void Preprocess::DFcharicteristicWordSelection(map<string, vector<pair<int, int> > > &mymap, int DFthreshold)
{
    int finalKeyWordsCount = 0; // number of feature words finally kept
    vector<pair<string, int> > tempvector;
    for (map<string, vector<pair<int, int> > >::iterator it = mymap.begin(); it != mymap.end(); ++it)
    {
        tempvector.push_back(make_pair(it->first, (it->second).size()));
    }

    stable_sort(tempvector.begin(), tempvector.end(), isLonger);
    ofstream outfile(featurewordsAddress);
    for (vector<pair<string, int> >::iterator it = tempvector.begin(); it != tempvector.end(); it++)
    {
        if (it->second >= DFthreshold)
        {
            outfile << it->first << endl;
            finalKeyWordsCount++;
        }
    }
    outfile.close();
    cout << "Number of feature words selected: " << finalKeyWordsCount << endl;
    cout << "by the way, DFthreshold equals " << DFthreshold << endl;
}
vector<string> Preprocess::GetFinalKeyWords()
{
    vector<string> myKeys;
    ifstream infile(featurewordsAddress);
    while (!infile.eof())
    {
        string temp;
        infile >> temp;
        if (temp != "")
        {
            myKeys.push_back(temp);
        }
    }
    return myKeys;
}
vector<pair<int, int> > Preprocess::GetfinalKeysMaxTFDF(map<string, vector<pair<int, int> > > &mymap)
{
    vector<pair<int, int> > maxTFandDF;
    vector<string> myKeys = GetFinalKeyWords();
    for (vector<string>::iterator it = myKeys.begin(); it != myKeys.end(); it++)
    {
        int DF = mymap[*it].size();
        int maxTF = 0;
        for (vector<pair<int, int> >::iterator subit = mymap[*it].begin(); subit != mymap[*it].end(); subit++)
        {
            if (subit->second > maxTF)
            {
                maxTF = subit->second;
            }
        }
        maxTFandDF.push_back(make_pair(maxTF, DF));
    }
    return maxTFandDF;
}
vector<pair<int, double> > Preprocess::NormalizationVSM(vector<pair<int, double> > tempVSM)
{
    double sum = 0;
    for (vector<pair<int, double> >::iterator vsmit = tempVSM.begin(); vsmit != tempVSM.end(); ++vsmit)
    {
        sum += pow(vsmit->second, 2);
    }
    for (vector<pair<int, double> >::iterator vsmit = tempVSM.begin(); vsmit != tempVSM.end(); ++vsmit)
    {
        vsmit->second /= sqrt(sum); // scale the vector to unit Euclidean length
    }
    return tempVSM;
}
string Preprocess::FormatVSMtoString(vector<pair<int, double> > tempVSM)
{
    string ret = "{";
    int commaindication = 0;
    for (vector<pair<int, double> >::iterator vsmit = tempVSM.begin(); vsmit != tempVSM.end(); ++vsmit)
    {
        ret += do_fraction(vsmit->first) + " " + do_fraction(vsmit->second, 8);
        if (commaindication < (int)tempVSM.size() - 1)
        {
            ret += ",";
        }
        commaindication++;
    }
    ret += "}";
    return ret;
}
void Preprocess::WriteHeadArff()
{
    ofstream ofile(arffFileAddress, ios::binary);
    ofile << "@relation aticle" << endl;
    ofile << "\n";
    vector<string> myKeys = GetFinalKeyWords();
    for (vector<string>::iterator it = myKeys.begin(); it != myKeys.end(); it++)
    {
        string temp = "";
        temp += "@attribute ";
        temp += "'";
        temp += *(it);
        temp += "'";
        temp += " real";
        ofile << temp << endl;
    }
    ofile << "\n" << endl;
    ofile << "@data" << endl;
    ofile.close();
}
void Preprocess::VSMFormation(map<string, vector<pair<int, int> > > &mymap)
{
    int corpus_N = endIndex - beginIndex + 1;
    ofstream ofile1(articleIdsAddress, ios::binary);          // file holding the ids of the vectorized articles
    ofstream ofile2(arffFileAddress, ios::binary | ios::app);

    vector<string> myKeys = GetFinalKeyWords();
    vector<pair<int, int> > maxTFandDF = GetfinalKeysMaxTFDF(mymap);
    for (int i = beginIndex; i <= endIndex; i++)
    {
        vector<pair<int, double> > tempVSM;
        for (vector<string>::size_type j = 0; j < myKeys.size(); j++)
        {
            // number of postings of keyword j that belong to article i
            double TF = (double)count_if(mymap[myKeys[j]].begin(), mymap[myKeys[j]].end(), PredTFclass(i));
            TF = 0.5 + 0.5 * (double)TF / (maxTFandDF[j].first); // augmented term frequency
            TF *= log((double)corpus_N / maxTFandDF[j].second);  // multiplied by the IDF factor
            if (TF != 0)
            {
                tempVSM.push_back(make_pair(j, TF));
            }
        }
        if (!tempVSM.empty())
        {
            tempVSM = NormalizationVSM(tempVSM);
            string vsmStr = FormatVSMtoString(tempVSM);
            ofile1 << i << endl;
            ofile2 << vsmStr << endl;
        }
        tempVSM.clear();
    }
    ofile1.close();
    ofile2.close();
}
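// In short, the weight written above for keyword j in article i is the augmented TF-IDF
// w(i, j) = (0.5 + 0.5 * tf(i, j) / maxTF(j)) * ln(N / DF(j)), with N = endIndex - beginIndex + 1;
// NormalizationVSM then scales each document vector to unit Euclidean length before it is
// written to the ARFF file.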
void Preprocess::WriteTotalArff(char *dbfield, int DFthreshold, bool isbagOfWordsExist, FUNCSEG seg)
{
    map<string, vector<pair<int, int> > > mymap;
    if (!isbagOfWordsExist)
    {
        ConstructMap(mymap, dbfield, seg);
        save(mymap);
        cout << "The bag-of-words model has been saved to disk" << endl;
    }
    else
    {
        load(mymap);
    }
    DFcharicteristicWordSelection(mymap, DFthreshold);
    WriteHeadArff();
    VSMFormation(mymap);
    cout << "The ARFF file has been generated" << endl;

    string temp(infoFromWekaAddress);
    cout << "Please cluster the ARFF file with Weka and save the result as " << temp << endl;
}
map<int, vector<double> > Preprocess::VSMConstruction(map<string, vector<pair<int, int> > > &mymap)
{
    int corpus_N = endIndex - beginIndex + 1;
    map<int, vector<double> > vsmMatrix;
    vector<string> myKeys = GetFinalKeyWords();
    vector<pair<int, int> > maxTFandDF = GetfinalKeysMaxTFDF(mymap);
    for (int i = beginIndex; i <= endIndex; i++)
    {
        vector<pair<int, double> > tempVSM;
        for (vector<string>::size_type j = 0; j < myKeys.size(); j++)
        {
            double TF = (double)count_if(mymap[myKeys[j]].begin(), mymap[myKeys[j]].end(), PredTFclass(i));
            TF = 0.5 + 0.5 * (double)TF / (maxTFandDF[j].first); // same augmented TF weighting as in VSMFormation
            TF *= log((double)corpus_N / maxTFandDF[j].second);
            tempVSM.push_back(make_pair(j, TF));
        }
        if (!tempVSM.empty())
        {
            tempVSM = NormalizationVSM(tempVSM);
            for (vector<pair<int, double> >::iterator it = tempVSM.begin(); it != tempVSM.end(); it++)
            {
                vsmMatrix[i].push_back(it->second);
            }
        }
        tempVSM.clear();
    }
    return vsmMatrix;
}
map<string, vector<double> > Preprocess::GetClusters()
{
    map<string, vector<double> > clusters;
    ifstream ifile(infoFromWekaAddress);
    string temp;
    while (getline(ifile, temp))
    {
        boost::smatch matchcluster;
        boost::regex regcluster("Cluster\\s+\\d+", boost::regex::icase);
        if (boost::regex_search(temp, matchcluster, regcluster))
        {
            string clustertmp = matchcluster[0].str();
            string ordinates = "";
            getline(ifile, ordinates); // the centroid coordinates follow on the next line
            boost::regex regordinates("\\d+(\\.\\d{1,4})?");
            boost::smatch matchordinates;
            std::string::const_iterator it = ordinates.begin();
            std::string::const_iterator end = ordinates.end();
            while (boost::regex_search(it, end, matchordinates, regordinates))
            {
                string digitstemp = matchordinates[0].str();
                double digitval = 0.0;
                std::stringstream ss;
                ss << digitstemp;
                ss >> digitval;
                clusters[clustertmp].push_back(digitval);
                it = matchordinates[0].second;
            }
        }
    }
    return clusters;
}
double Preprocess::CalDotProductOfVectors(const vector<double> &vector1, const vector<double> &vector2)
{
    double result = 0.0;
    for (vector<double>::size_type i = 0; i < vector1.size(); i++)
        result += vector1[i] * vector2[i];
    return result;
}
double Preprocess::CalCosineofVectors(const vector<double> &vector1, const vector<double> &vector2)
{
    double numerator = CalDotProductOfVectors(vector1, vector2);
    double denominator = CalDotProductOfVectors(vector1, vector1) * CalDotProductOfVectors(vector2, vector2);
    denominator = sqrt(denominator);
    return numerator / denominator;
}
vector<pair<int, string> > Preprocess::GenerateClusterInfo(map<int, vector<double> > &vsmMatrix, map<string, vector<double> > &clusters)
{
    vector<pair<int, string> > resultInfo;
    for (map<int, vector<double> >::iterator it = vsmMatrix.begin(); it != vsmMatrix.end(); it++)
    {
        vector<pair<string, double> > clusterDistanceAist;
        for (map<string, vector<double> >::iterator clusterit = clusters.begin(); clusterit != clusters.end(); clusterit++)
        {
            double temp = CalCosineofVectors(it->second, clusterit->second);
            clusterDistanceAist.push_back(make_pair(clusterit->first, temp));
        }
        sort(clusterDistanceAist.begin(), clusterDistanceAist.end(), myCmp);
        vector<pair<string, double> >::iterator cDAit = clusterDistanceAist.begin();
        resultInfo.push_back(make_pair(it->first, cDAit->first));
        clusterDistanceAist.clear();
    }
    return resultInfo;
}
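// GenerateClusterInfo assigns each document to the centroid with the largest cosine similarity,
// cos(a, b) = (a . b) / (sqrt(a . a) * sqrt(b . b)), i.e. the first element after sorting the
// (cluster, similarity) pairs in descending order of similarity.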
map<string, vector<int> > Preprocess::FetchArticlesOFClusters(map<string, vector<double> > &clusters, vector<pair<int, string> > &resultInfo)
{
    map<string, vector<int> > articlesInfo;

    for (vector<pair<int, string> >::iterator retit = resultInfo.begin(); retit != resultInfo.end(); retit++)
    {
        for (map<string, vector<double> >::iterator it = clusters.begin(); it != clusters.end(); it++)
        {
            if (retit->second == it->first)
            {
                articlesInfo[it->first].push_back(retit->first);
            }
        }
    }

    return articlesInfo;
}
void Preprocess::RetreiveArticleInfoFromDataBase()
{
    map<string, vector<pair<int, int> > > mymap;
    vector<pair<int, string> > resultInfo;
    map<string, vector<double> > clusters;
    map<int, vector<double> > vsmMatrix;
    map<string, vector<int> > articlesInfo;
    ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
    load(mymap);
    vsmMatrix = VSMConstruction(mymap);
    clusters = GetClusters();
    resultInfo = GenerateClusterInfo(vsmMatrix, clusters);
    articlesInfo = FetchArticlesOFClusters(clusters, resultInfo);

    for (map<string, vector<int> >::iterator it = articlesInfo.begin(); it != articlesInfo.end(); it++)
    {
        ostringstream out;
        string selectassist;
        char *selectsql = new char[5000];
        int count = 0;
        CoInitialize(NULL);
        _ConnectionPtr pConn(__uuidof(Connection));
        _RecordsetPtr pRst(__uuidof(Recordset));
        pConn->ConnectionString = dbconnection;
        pConn->Open("", "", "", adConnectUnspecified);
        cout << it->first << endl;
        ofile << it->first << endl;
        out << "(";
        count = 0;
        for (int i = 0; i < (int)it->second.size(); i++)
        {
            out << (it->second)[i];
            if (count < (int)it->second.size() - 1)
            {
                out << ",";
            }
            count++;
        }
        out << ")";
        selectassist = out.str();
        sprintf_s(selectsql, 5000, "%s %s", "Select ArticleTitle,class from News Where ArticleId in ", selectassist.c_str());

        pRst = pConn->Execute(selectsql, NULL, adCmdText);
        while (!pRst->rsEOF)
        {
            string title = (_bstr_t)pRst->GetCollect("ArticleTitle");
            string categorization = (_bstr_t)pRst->GetCollect("class");
            cout << "Article title: " << title << "  original category: " << categorization << endl;
            ofile << "Article title: " << title << "  original category: " << categorization << endl;
            pRst->MoveNext();
        }
        pRst->Close();
        pConn->Close();
        pRst.Release();
        pConn.Release();
        CoUninitialize();
        delete [] selectsql; // release the SQL buffer allocated for this cluster
    }

    ofile.close();
}
vector<string> Preprocess::mySplit(string s, set<string> stopwords)
{
    vector<string> wordCollection;
    trim(s, " ");

    string::size_type nPosBegin = 0;
    string::size_type nPosEnd = s.find(' ', nPosBegin);
    while (nPosEnd != string::npos)
    {
        string temp = s.substr(nPosBegin, nPosEnd - nPosBegin);
        trim(temp, " ");
        wordCollection.push_back(temp);
        nPosBegin = s.find_first_not_of(' ', nPosEnd);
        nPosEnd = s.find(' ', nPosBegin);
    }
    string temp = s.substr(nPosBegin, s.size() - nPosBegin);
    trim(temp, " ");
    wordCollection.push_back(temp);

    return wordCollection;
}
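To show how the pieces fit together, here is a minimal driver sketch (not from the original post): the file paths, ADO connection string, SELECT statement, and DF threshold below are placeholders that have to be adapted to your own database and directories.

#include "stdafx.h"
#include "Preprocess.h"

int main()
{
    // All paths, the connection string, the SQL statement and the DF threshold are placeholders.
    Preprocess p(500,
                 "F:\\cluster\\bagofwords.txt",      // bag-of-words model file
                 "F:\\cluster\\keywords.txt",        // selected feature words
                 "F:\\cluster\\tobecluster.arff",    // ARFF file fed to Weka
                 "F:\\cluster\\infoFromWeka.txt",    // Weka clustering output (centroids)
                 "F:\\cluster\\articleIds.txt",      // ids of the vectorized articles
                 "Provider=SQLOLEDB;...",            // ADO connection string
                 "Select ArticleId,ArticleText from News where ArticleId between 1 and 500",
                 1, 500);

    // Segment the texts with ICTCLAS, build the bag-of-words model, keep words with DF >= 3,
    // and write the ARFF file (pass true instead of false to reuse a previously saved model).
    char field[] = "ArticleText";
    p.WriteTotalArff(field, 3, false, &Preprocess::goodWordsinPieceArticle);

    // ... run Weka's SimpleKMeans on the ARFF file and save its output to infoFromWeka.txt ...

    // Assign every article to its nearest centroid and print the articles of each cluster.
    p.RetreiveArticleInfoFromDataBase();
    return 0;
}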

 
