语料数据库

实验结果以及中间数据

文本预处理开源框架源代码

头文件:

#ifndef _Preprocess_H
#define  _Preprocess_H
#include<iostream>
#include<map>
#include<set>
#include<vector>
#include<string>
#include<iomanip>
#include<fstream>
#include<algorithm>
#include<cmath>
#include<sstream>
#include<limits>
#include <xstring>
#include"ictclas30.h"
#include"boost\tr1\regex.hpp"
#include"boost/algorithm/string.hpp"
#include"windows.h"

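/************************************************************************/
/* The Preprocess class implements the following pipeline:
   segment the document collection -> remove stop words -> build the
   bag-of-words model -> select feature words -> build a VSM (vector space
   model) for each article -> write the data in Weka's ARFF format
   -> output the clustering information                                  */
/************************************************************************/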
//一些谓词函数
using namespace std;

class Preprocess
{       
	//typedef  vector<string>(Preprocess::*FUNCSEG)(string,set<string>);
	private:
		 char *bagofwordsAddress;//存放词袋子模型的位置
		char * featurewordsAddress;//存放特征词文件的位置;
		char *arffFileAddress;//存放ARFF文件的位置
	    char *infoFromWekaAddress;//存放调用weka后的实验结果
		char *articleIdsAddress;//存放被聚类的文章的ID号
		char *dbconnection;//数据库的链接字符串
		char *dbselect;//数据库select语句
		char *dbfield;//数据库字段
		int beginIndex;//开始聚类的文章id
		int endIndex;//结束聚类的文章id 
	public:
		typedef vector<string>(Preprocess::*FUNCSEG)(string,set<string>);
		Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
		{
				bagofwordsAddress=new char[c_style_stringsize];
				featurewordsAddress=new char[c_style_stringsize];
				arffFileAddress=new char[c_style_stringsize];
				infoFromWekaAddress=new char[c_style_stringsize];
				articleIdsAddress=new char[c_style_stringsize];
				dbconnection=new char[c_style_stringsize];
				dbselect=new char[c_style_stringsize];
				this->beginIndex=beginIndex;
				this->endIndex=endIndex;
				sprintf_s(bagofwordsAddress,c_style_stringsize,mydict);
				sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo);
				sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster);
				sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka);
				sprintf_s(articleIdsAddress,c_style_stringsize,artileIds);
				sprintf_s(dbconnection,c_style_stringsize,conn);
				sprintf_s(dbselect,c_style_stringsize,selectsql);
			


		}
		/*Preprocess()
		{

		}*/

		~Preprocess()
		{
			delete []bagofwordsAddress;
			delete []featurewordsAddress;
			delete []arffFileAddress;
			delete [] infoFromWekaAddress;
			delete []articleIdsAddress;
			delete []dbconnection;
			delete []dbselect;
			

		}
		void trim(string  &str,const string val);//去除字符串首尾空白
		//构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf
		int ConstructMap(map<string,vector<pair<int,int>>>&mymap,char *dbfield,FUNCSEG seg);
		inline void TruncateArff()
		{
			ofstream ofile;
			ofile.open(arffFileAddress,ios::trunc);
			ofile.close();
		}
		//保存词袋子到硬盘
		void save(map<string,vector<pair<int,int> > >&mymap);
		//从内存中加载词袋子模型
		void load(map<string,vector<pair<int,int> > >&mymap);
		//打印词袋子模型
		void print(map<string,vector<pair<int,int> > >&mymap);
		//窄字符串转化成宽字符串
		wstring myMultibyteToWideChar(string sResult);
		//宽字符串转化成窄字符串
		string myWideCharToMultibyte(wstring wsResult);
		//调用ICTclass分词
		string ICTsplit(const char *sInput);
		//构造停用词表
		set<string>MakeStopSet();
		//去除停用词,噪声词
		vector<string>goodWordsinPieceArticle(string rawtext,set<string> stopwords);
		//整数转化成字符串
		string do_fraction(int val);
		//浮点数转化成字符串
		string do_fraction(double val, int decplaces=5);
		//特征词选择算法
		void DFcharicteristicWordSelection(map<string,vector<pair<int,int>>> &mymap,int DFthreshold);
		//获取最后的特征词
		vector<string> GetFinalKeyWords();
		//获取特征词的maxTF,DF
		vector<pair<int,int> >GetfinalKeysMaxTFDF(map<string,vector<pair<int,int>>> &mymap);
		//文档向量模型规范化
		vector<pair<int,double> > NormalizationVSM(vector<pair<int,double> > tempVSM);
		//建立文档向量模型并且写到arff文件里
		void VSMFormation(map<string,vector<pair<int,int>>> &mymap);
		/***单个文档向量模型字符串化***/
		string FormatVSMtoString(vector<pair<int,double> > tempVSM);
		//写Arff文件头部
		void WriteHeadArff();
		void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg);
		/******************************************************以下函数完成聚类功能**********************************/
		/***************建立文档向量模型,但是不形成字符串***********/
		map<int,vector<double> >VSMConstruction(map<string,vector<pair<int,int>>> &mymap);
		/************从weka给出的结果中获取聚类中心******/
		map<string,vector<double> > GetClusters();
		/**计算向量的内积*****************8*/
		double CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2);
		/************计算余弦相似度*******/
		double CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2);
		/* 获取聚类信息,即给每篇文章附上一个类别label    */
		vector<pair<int,string> >GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters);
		/****返回聚类中每个类别的文章ID******************/
		map<string,vector<int> >FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo);
		void RetreiveArticleInfoFromDataBase();
		vector<string> mySplit(string s,set<string> stopwords);//分割关键词

		






};



#endif
 
Preprocess类的函数功能实现文件:
#include"stdafx.h"
#include "Preprocess.h"

#pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
/************************************************************************/
/* 去掉字符串首尾空白                                                                     */
/************************************************************************/
bool isLonger(const  pair<string,int> &pair1, const pair<string,int>  &pair2)
{
	return pair1.second>pair2.second;
}
bool cntAssist(const  pair<string,int> &pair1)
{
	return pair1.second<=100;
}
bool PredTF(const pair<int,int>& pair1,int articleId)
{
	return pair1.first==articleId;

}
class PredTFclass
{
private: const int m;
public: 
	PredTFclass(int id):m(id){};
	bool operator()(const pair<int,int>& pair1){return PredTF(pair1,m);};
};
bool myCmp(const pair<string,double>&pair1,const pair<string,double>&pair2 )
{
	return pair1.second>=pair2.second;
}

void Preprocess:: trim(string  &str,const string val)
{
	str.erase(0,str.find_first_not_of(val));
	str.erase(str.find_last_not_of(val)+val.size());
}
/************************************************************************/
/* 建立词袋子模型                                                                     */
/************************************************************************/
int Preprocess::ConstructMap(map<string,vector<pair<int,int>>>&mymap,char *dbfield,FUNCSEG seg)
{
	//set<string>MakeStopSet();
	CoInitialize(NULL);
	_ConnectionPtr pConn(__uuidof(Connection));
	_RecordsetPtr pRst(__uuidof(Recordset));
	pConn->ConnectionString=dbconnection;
	pConn->Open("","","",adConnectUnspecified);
	pRst=pConn->Execute(dbselect,NULL,adCmdText);
	set<string>stopwords=MakeStopSet();
	
	while(!pRst->rsEOF)
	{	vector<string>wordcollection;
	   //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
		string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
		if(rawtext!="")
		{
			wordcollection=(this->*seg)(rawtext,stopwords);
			string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
			int articleid=atoi(tempid.c_str());
			for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
			{
				vector<pair<int,int>>::iterator it;
				if(mymap[*strit].empty())
				{
					pair<int,int>mytemppair=make_pair(articleid,1);
					mymap[*strit].push_back(mytemppair);

				}
				else
				{
					for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
					{  
						if(it->first==articleid)
						{
							it->second=++(it->second);
							break;
						}

				}
				if(it==mymap[*strit].end())
				{
					pair<int,int>mytemppair=make_pair(articleid,1);
					mymap[*strit].push_back(mytemppair);
				}

			}

		}


	}


	pRst->MoveNext();
	wordcollection.clear();
 }
	pRst->Close();
	pConn->Close();
	pRst.Release();
	pConn.Release();
	CoUninitialize();
	
	return 0;

}
/************************************************************************/
/* 保存词袋子模型到硬盘                                                                     */
/************************************************************************/
void Preprocess::save(map<string,vector<pair<int,int> > >&mymap)
{
	ofstream outfile(bagofwordsAddress,ios::binary);
	outfile<<mymap.size()<<endl;
	map<string,vector<pair<int,int> > >::iterator it;
	for (it=mymap.begin();it!=mymap.end();it++)
	{   outfile<<it->first<<endl;
	vector<pair<int,int>>::iterator subit;
	outfile<<it->second.size()<<endl;
	for(subit=(it->second).begin();subit!=(it->second).end();++subit)
	{
		outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
	}
	outfile<<endl;
	}
	//outfile.write((char *)&mymap,sizeof(mymap));

	outfile.close();

}
/************************************************************************/
/* 加载词典信息到内存                                                                     */
/************************************************************************/
void Preprocess::load(map<string,vector<pair<int,int> > >&mymap)
{
	std::locale loc1 = std::locale::global(std::locale(".936"));
	{
		// 在这里使用std::ifstream 或者 std::fstream
		ifstream infile(bagofwordsAddress,ios::binary);
		int lenMyMap;//保存词典长度
		int lenVector;//保存每个词出现的文章数目
		string key;//保存读出的map的键值
		int articleId;//文章标号
		int count;//在该文章中刚出现的数目
		string comma;
		string semicolon;
		infile>>lenMyMap;
		while(!infile.eof())
		{
			infile>>key;
			infile>>lenVector;
			vector<pair<int,int> >temp;
			for (int i=0;i<lenVector;i++)
			{
				infile>>articleId>>count>>semicolon;
				temp.push_back(make_pair(articleId,count));
			}
			mymap[key]=temp;


		}


		infile.close();
	}
	std::locale::global(std::locale(loc1));

}
/************************************************************************/
/* 打印词典信息                                                         */
/************************************************************************/
void print(map<string,vector<pair<int,int> > >&mymap)
{   
	cout<<mymap.size()<<endl;
	map<string,vector<pair<int,int> > >::iterator it;
	for (it=mymap.begin();it!=mymap.end();it++)
	{   cout<<it->first<<endl;
	vector<pair<int,int>>::iterator subit;
	cout<<it->second.size()<<endl;
	for(subit=(it->second).begin();subit!=(it->second).end();++subit)
	{
		cout<<subit->first<<','<<subit->second<<";";
	}
	cout<<endl;
	}

}
/************************************************************************/
/* 构造停用词表                                                                     */
/************************************************************************/
set<string> Preprocess::MakeStopSet()
{
	set<string> stopwordsSet;
	ifstream ifile("stopwords.txt");
	while(!ifile.eof())
	{
		string temp;
		trim(temp," ");
		ifile>>temp;
		stopwordsSet.insert(temp);
	}
	return stopwordsSet;
}
/************************************************************************/
/* 将整数转化成字符串                                                   */
/************************************************************************/

string Preprocess::do_fraction(int val)
{
	ostringstream out;
	out<<val;
	string str= out.str(); //从流中取出字符串
	str.swap(string(str.c_str()));//删除nul之后的多余字符
	return str;

}
/************************************************************************/
/* 将浮点数转化成指定精度的字符串                                       */
/************************************************************************/
string Preprocess::do_fraction(double val,int decplaces)
{
	
	//int prec=numeric_limits<double>::digits10;
	char DECIMAL_POINT='.'; 
	ostringstream out;
	//out.precision(prec);
	out<<val;
	string str=out.str();
	size_t n=str.find(DECIMAL_POINT);
	if((n!=string::npos)&&n+decplaces<str.size())
	{
		str[n+decplaces]='\0';
	}
	str.swap(string(str.c_str()));

	return str;
}
/************************************************************************/
/* 窄字符串砖宽字符串                                                    */
/************************************************************************/
wstring Preprocess::myMultibyteToWideChar(string sResult)
{
	int iWLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0 );// 计算转换后宽字符串的长度。(不包含字符串结束符)
	wchar_t *lpwsz= new wchar_t [iWLen+1];
	MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。
	lpwsz[iWLen] = L'\0'; 
	wstring wsResult(lpwsz);
	delete []lpwsz;
	return wsResult;
}
/************************************************************************/
/* 宽字符串转窄字符串                                                                     */
/************************************************************************/
string Preprocess::myWideCharToMultibyte(wstring wsResult)
{
	string sResult;
	int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。(包含字符串结束符)
	char *lpsz= new char[iLen];
	WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。
	sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值。
	delete []lpsz;
	return sResult;

}
/************************************************************************/
/* 调用ICTclas进行中文分词                                               */
/************************************************************************/
string Preprocess::ICTsplit(const char *sInput)
{
	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		string strerr(sInput);
		return strerr;
	}
	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
	//导入用户词典后
	/*printf("\n导入用户词典后:\n");
	int nCount = ICTCLAS_ImportUserDict("userdic.txt");//覆盖以前的用户词典
	//保存用户词典
	ICTCLAS_SaveTheUsrDic();
	printf("导入%d个用户词。\n", nCount);*/

	const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0);
	string strresult(sResult);
	//printf("%s\n", sResult);
	//把字符串转化成宽字符串
	wstring wsResult=myMultibyteToWideChar(strresult);
	boost::wregex wreg(L"\\s+");
	wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|"));
	strresult=myWideCharToMultibyte(wsResult);



	//ofile<<str1;
	//ofile.close();
	//cout<<str1<<endl;
	//ICTCLAS_FileProcess("text.txt","test_result.txt",1);
	ICTCLAS_Exit();

	return strresult;
}
/************************************************************************/
/* 对每一篇文章去掉噪声词,剩下好词                                     */
/************************************************************************/
vector<string>Preprocess::goodWordsinPieceArticle(string rawtext,set<string> stopwords)
{
	vector<wstring> goodWordstemp;
	vector<string> goodWords;
	const char* sInput=rawtext.c_str();
	string sResult=ICTsplit(sInput);
	wstring wsResult=myMultibyteToWideChar(sResult);
	boost::wregex wreg(L"\\d+");//去掉中文空格
	wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
	//boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
	boost::split(goodWordstemp,wsResult,boost::is_any_of("|"));

	for(vector<wstring>::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
	{
		string temp=myWideCharToMultibyte(*it);
		trim(temp," ");
		if(!stopwords.count(temp)&&!temp.empty())
		{
			goodWords.push_back(temp);
		}


	}

	return goodWords;
}
/************************************************************************/
/* DF特征词选择法                                                       */
/************************************************************************/
void Preprocess::DFcharicteristicWordSelection(map<string,vector<pair<int,int>>> &mymap,int DFthreshold)
{
	int finalKeyWordsCount=0;//计算共取了多少个关键词
	vector<pair<string,int> >tempvector;
	for(map<string,vector<pair<int,int>>>::iterator it=mymap.begin();it!=mymap.end();++it)
	{
		tempvector.push_back(make_pair(it->first,(it->second).size()));
	}

	stable_sort(tempvector.begin(),tempvector.end(),isLonger);
	ofstream outfile(featurewordsAddress);
	for(vector<pair<string,int> >::iterator it=tempvector.begin();it!=tempvector.end();it++)
	{   
		if(it->second>=DFthreshold)
		{
			//outfile<<it->first<<" "<<it->second<<endl;
			outfile<<it->first<<endl;
			finalKeyWordsCount++;

		}

	}
	outfile.close();
	cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
	cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl;

}
/************************************************************************/
/* 获得最终选定的构造文档向量模型的特征词                               */
/************************************************************************/
vector<string>Preprocess::GetFinalKeyWords()
{
	vector<string>myKeys;
	ifstream infile(featurewordsAddress);
	while(!infile.eof())
	{
		string temp;
		infile>>temp;
		if(temp!="")
		{
			myKeys.push_back(temp);
		}


	}
	return myKeys;
}
/************************************************************************/
/* 获得特征词的maxTF,DF                                                 */
/************************************************************************/
vector<pair<int,int> >Preprocess::GetfinalKeysMaxTFDF(map<string,vector<pair<int,int>>> &mymap)
{
	vector<pair<int,int> >maxTFandDF;
	vector<string>myKeys=GetFinalKeyWords();
	for(vector<string>::iterator it=myKeys.begin();it!=myKeys.end();it++)
	{  
		int DF=mymap[*it].size();
		int maxTF=0;
		for(vector<pair<int,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++)
		{
			if(subit->second>maxTF)
			{
				maxTF=subit->second;
			}

		}
		maxTFandDF.push_back(make_pair(maxTF,DF));
		//find_if(mymap[*it].begin(),mymap[*it].end(),
	}
	return maxTFandDF;
}
/************************************************************************/
/* 文档向量模型归一化                                                                     */
/************************************************************************/
vector<pair<int,double> >Preprocess::NormalizationVSM(vector<pair<int,double> > tempVSM)
{

	double sum=0;
	for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
	{
		sum+=pow(vsmit->second,2);
	}
	for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
	{
		vsmit->second/=sqrt(sum);
	}
	return tempVSM;

}
/************************************************************************/
/*              单个文档向量模型字符串化                                                        */
/************************************************************************/
string Preprocess::FormatVSMtoString(vector<pair<int,double> > tempVSM)
{
	string ret="{";
	int commaindication=0;
	for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
	{   

		ret+=do_fraction(vsmit->first)+" "+do_fraction(vsmit->second,8);
		if(commaindication<tempVSM.size()-1)
		{
			ret+=",";
		}
		commaindication++;
	}
	ret+="}";
	return ret;
}
/************************************************************************/
/* 写Arff头文件                                                                     */
/************************************************************************/
void Preprocess::WriteHeadArff()
{
	ofstream ofile(arffFileAddress,ios::binary);
	ofile<<"@relation aticle"<<endl;
	ofile<<"\n";
	vector<string> myKeys=GetFinalKeyWords();
	for(vector<string>::iterator it=myKeys.begin();it!=myKeys.end();it++)
	{
		//string temp="@attribute "+"'"+(*it)+"'"+" real";
		string temp="";
		temp+="@attribute ";
		temp+="'";
		temp+=*(it);
		temp+="'";
		temp+=" real";
		/*strcpy(temp,"@attribute ");
		strcpy(temp,"'");
		strcpy(temp,*(it));
		strcpy(temp,"'");
		strcpy(temp," real");*/

		ofile<<temp<<endl;
	}
	ofile<<"\n"<<endl;
	ofile<<"@data"<<endl;
	ofile.close();
}
/************************************************************************/
/* 将实验数据写成arff @data格式                                                                     */
/************************************************************************/
void Preprocess::VSMFormation(map<string,vector<pair<int,int>>> &mymap)
{   int corpus_N=endIndex-beginIndex+1;
	ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件
	ofstream ofile2(arffFileAddress,ios::binary|ios::app);

	vector<string> myKeys=GetFinalKeyWords();
	vector<pair<int,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
	for(int i=beginIndex;i<=endIndex;i++)
	{   vector<pair<int,double> >tempVSM;
		for(vector<string>::size_type j=0;j<myKeys.size();j++)
		{
		//vector<pair<int,int> >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
			double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));


			TF=0.5+0.5*(double)TF/(maxTFandDF[j].first);
			TF*=log((double)corpus_N/maxTFandDF[j].second);
			if(TF!=0)
			{
				tempVSM.push_back(make_pair(j,TF));

			}



		}
		if(!tempVSM.empty())
		{
			tempVSM=NormalizationVSM(tempVSM);
			string vsmStr=FormatVSMtoString(tempVSM);
			ofile1<<i<<endl;
			ofile2<<vsmStr<<endl;
		}
		tempVSM.clear();



	}
	ofile1.close();
	ofile2.close();


}
void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
{
	
	
	map<string,vector<pair<int,int>>> mymap;
	if(!isbagOfWordsExist)
	{
		ConstructMap(mymap,dbfield,seg);
		save(mymap);
		cout<<"词袋子信息已经保存到硬盘"<<endl;
	}
	else
	{
		load(mymap);
	}
	DFcharicteristicWordSelection(mymap,DFthreshold);
	WriteHeadArff();
	VSMFormation(mymap);
	cout<<"arff文件已经形成"<<endl;
	
	
	string temp(infoFromWekaAddress);

	cout<<"请您将使用weka聚类,并保存为"<<temp<<endl;
}
/*****************以下函数辅助完成聚类功能*********************************************************************8**********************/
/************************************************************************/
/* 建立文档向量模型                                                                     */
/************************************************************************/
map<int,vector<double> > Preprocess::VSMConstruction(map<string,vector<pair<int,int>>> &mymap)
{   
	int corpus_N=endIndex-beginIndex+1;
	map<int,vector<double>> vsmMatrix;
	vector<string> myKeys=GetFinalKeyWords();
	vector<pair<int,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
	for(int i=beginIndex;i<=endIndex;i++)
	{   
		vector<pair<int,double> >tempVSM;
		for(vector<string>::size_type j=0;j<myKeys.size();j++)
		{
			//vector<pair<int,int> >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
			double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
			TF=0.5+(double)TF/(maxTFandDF[j].first);
			TF*=log((double)corpus_N/maxTFandDF[j].second);
			tempVSM.push_back(make_pair(j,TF));

		}
		if(!tempVSM.empty())
		{
			tempVSM=NormalizationVSM(tempVSM);
			for(vector<pair<int,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();it++)
			{
				vsmMatrix[i].push_back(it->second);
			}



		}
		tempVSM.clear();



	}
	return vsmMatrix;

}
/************************************************************************/
/* 获得Weka提供的聚类信息                                                                     */
/************************************************************************/
map<string,vector<double> > Preprocess::GetClusters()
{

	map<string,vector<double> >clusters;
	ifstream ifile(infoFromWekaAddress);
	string temp;
	while(getline(ifile,temp))
	{   boost::smatch matchcluster;
	boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase);
	if(boost::regex_search(temp,matchcluster,regcluster))	
	{   
		string clustertmp=matchcluster[0].str();
		string ordinates="";
		getline(ifile,ordinates);
		boost::regex regordinates("\\d+(\\.\\d{1,4})?");
		boost::smatch matchordinates;
		std::string::const_iterator it=ordinates.begin();  
		std::string::const_iterator end=ordinates.end();
		while (boost::regex_search(it,end,matchordinates,regordinates)) 
		{       
			string digitstemp=matchordinates[0].str();
			double digitval=0.0;
			std::stringstream ss;
			ss<<digitstemp;
			ss>>digitval;
			clusters[clustertmp].push_back(digitval);
			it=matchordinates[0].second; 
		}





	}
	}
	return clusters;
}
/**计算向量内积*/
double Preprocess::CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2)
{
	double result = 0.0f;
	for (int i = 0; i < vector1.size(); i++)
		result += vector1[i] * vector2[i];
	return result;
}
/**计算向量余弦相似度*/
double Preprocess::CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2)
{
	double numerator=CalDotProductOfVectors(vector1,vector2);
	double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2);
	denominator=sqrt(denominator);
	return numerator/denominator;
}
/**为每篇文章打上个类别标签*/
vector<pair<int,string> > Preprocess::GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters)
{
	vector<pair<int,string> >resultInfo;
	for(map<int,vector<double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++)
	{
		vector<pair<string,double> >clusterDistanceAist;
		for(map<string,vector<double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++)
		{

			double temp=CalCosineofVectors(it->second,clusterit->second);
			clusterDistanceAist.push_back(make_pair(clusterit->first,temp));

		}
		sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp);
		vector<pair<string,double> >::iterator cDAit=clusterDistanceAist.begin();

		resultInfo.push_back(make_pair(it->first,cDAit->first));
		clusterDistanceAist.clear();
	}
	return  resultInfo;

}
/************************************************************************/
/* 获取每个类别所包含的文章ID                                           */
/************************************************************************/
map<string,vector<int> > Preprocess::FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string>>&resultInfo)
{
	map<string,vector<int>> articlesInfo;

	for(vector<pair<int,string>>::iterator retit=resultInfo.begin();retit!=resultInfo.end();retit++)
	{
		for(map<string,vector<double> >::iterator it=clusters.begin();it!=clusters.end();it++)
		{
			if(retit->second==it->first)
			{
				articlesInfo[it->first].push_back(retit->first);
			}
		}
	}





	return articlesInfo;


}
void Preprocess::RetreiveArticleInfoFromDataBase()
{
	map<string,vector<pair<int,int>>> mymap;
	vector<pair<int,string>>resultInfo;
	map<string,vector<double> >clusters;
	map<int,vector<double> >vsmMatrix;
	map<string,vector<int>> articlesInfo;
	ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
	//boost::regex_replace(strresult)
	//ConstructMap(mymap,1,500);
	//save(mymap);
	load(mymap);
	vsmMatrix=VSMConstruction(mymap);
	clusters=GetClusters();
	resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
	articlesInfo=FetchArticlesOFClusters(clusters,resultInfo);

	/*for(map<string,vector<int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++)
	{
		ofile<<it->first<<endl;
		int count=0;
		ofile<<"(";
		for(int i=0;i<it->second.size();i++)
		{
			ofile<<(it->second)[i];

			if(count<it->second.size()-1)
			{
				ofile<<",";
			}
			count++;
		}
		ofile<<")";
		ofile<<endl;


	}*/
	for(map<string,vector<int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++)
	{
		ostringstream out;
		string selectassist;
		char *selectsql=new char[5000];
		int count=0;
		CoInitialize(NULL);
		_ConnectionPtr pConn(__uuidof(Connection));
		_RecordsetPtr pRst(__uuidof(Recordset));
		pConn->ConnectionString=dbconnection;
		pConn->Open("","","",adConnectUnspecified);
		cout <<it->first<<endl;
		ofile<<it->first<<endl;
		out<<"(";
		count=0;
		for(int i=0;i<it->second.size();i++)
		{
			out<<(it->second)[i];
			if(count<it->second.size()-1)
			{
				out<<",";
			}
			count++;
			
		
		}
		out<<")";
		selectassist=out.str();
		sprintf_s(selectsql,5000,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str());

		pRst=pConn->Execute(selectsql,NULL,adCmdText);
		while(!pRst->rsEOF)
		{	
		//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
			string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
			//string rawtext=(_bstr_t)pRst->GetCollect("ArticleText");
			string categorization=(_bstr_t)pRst->GetCollect("class");
			cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
			ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;


			
		


			pRst->MoveNext();
			
		}
		pRst->Close();
		pConn->Close();
		pRst.Release();
		pConn.Release();
		CoUninitialize();
	
	}
	
	


ofile.close();	
	
	
}
/********按空白把关键词分割开*****************/
vector<string>Preprocess:: mySplit(string s,set<string> stopwords)
{
	vector<string> wordCollection;
	trim(s," ");

	int nPosBegin=0;
	int nPosEnd=s.find(' ',nPosBegin);
	while(nPosEnd!=string::npos)
	{
		string temp=s.substr(nPosBegin,nPosEnd-nPosBegin);
		trim(temp," ");
		wordCollection.push_back(temp);
		nPosBegin=s.find_first_not_of(' ',nPosEnd);
		nPosEnd=s.find(' ',nPosBegin);
	}
	string temp=s.substr(nPosBegin,s.size()-nPosBegin);
	trim(temp," ");
	wordCollection.push_back(temp);


	return wordCollection;

}
posted on 2010-09-03 19:58  finallyly  阅读(12276)  评论(18)  编辑  收藏  举报