作者:finallyliuyu 出处:博客园
最近在做关于文本分类算法的验证。汉语新闻分类的语料库采用的是我自己爬取的新闻;英文分类语料库考虑采用Reuters,因此需要处理reuters21578文本分类语料库。
下面给出处理reuters21578的代码,主要功能就是从文本中提取新闻标题、内容、类别,并存储到mssql2000中。
把代码拷贝下来,留在这里做份备忘,主要是因为里面涉及了些boost::regex的使用,以及宽窄字符集转换。
尤其是boost::regex的使用,有很多注意事项,比如C#中的\s+,boost要用“\\s+”等。比如boost::regex中的dotmatchnewline 模式是mod_s。这些细节问题,想要全部记住是件很困难的事情,况且也没有必要记住这些东西。用到的时候,想下当时的关键字,在博客里面搜一下就出来了。
提取文章实体
vector<ARTICLE> FindArticles(string rawtext)
{
vector<ARTICLE>articleCollection;
boost::regex regdoc("<REUTERS\\s+TOPICS=\"YES\"\\s+LEWISSPLIT=\"TEST\".*?>(.*?)</REUTERS>",boost::regbase::icase|boost::regbase::mod_s);//获得doc标签内的内容
boost::regex regtitle("<TITLE>(.*?)</TITLE>",boost::regbase::icase|boost::regbase::mod_s);//获得url标签内的内容
boost::regex reglabel("<TOPICS><D>(.*?)</D>.*?</TOPICS>",boost::regbase::icase|boost::regbase::mod_s);//获得标题
boost::regex regcontent("<BODY>(.*?)</BODY>",boost::regbase::icase|boost::regbase::mod_s);//获得内容
ARTICLE article;
boost::smatch mDOC;
boost::smatch mLabel;
boost::smatch mTitle;
boost::smatch mContent;
//rawtext=ProcessSingleline(rawtext);//预处理去掉文本中所有的回车和换行。
string::const_iterator it=rawtext.begin();
string::const_iterator end=rawtext.end();
while(boost::regex_search(it,end,mDOC,regdoc))
{
string doc=mDOC[0];
string label="";
string title="";
string content="";
if(boost ::regex_search(doc,mLabel,reglabel))
{
label=mLabel[1];
}
if(boost::regex_search(doc,mTitle,regtitle))
{
title=mTitle[1];
}
if(boost::regex_search(doc,mContent,regcontent))
{
content=mContent[1];
}
if(content!=""&&title!=""&&label!="")
{
article.ArticleText=content;
article.ArticleTitle=title;
article.Categorization=label;
articleCollection.push_back(article);
}
it=mDOC[0].second;
}
return articleCollection;
}
{
vector<ARTICLE>articleCollection;
boost::regex regdoc("<REUTERS\\s+TOPICS=\"YES\"\\s+LEWISSPLIT=\"TEST\".*?>(.*?)</REUTERS>",boost::regbase::icase|boost::regbase::mod_s);//获得doc标签内的内容
boost::regex regtitle("<TITLE>(.*?)</TITLE>",boost::regbase::icase|boost::regbase::mod_s);//获得url标签内的内容
boost::regex reglabel("<TOPICS><D>(.*?)</D>.*?</TOPICS>",boost::regbase::icase|boost::regbase::mod_s);//获得标题
boost::regex regcontent("<BODY>(.*?)</BODY>",boost::regbase::icase|boost::regbase::mod_s);//获得内容
ARTICLE article;
boost::smatch mDOC;
boost::smatch mLabel;
boost::smatch mTitle;
boost::smatch mContent;
//rawtext=ProcessSingleline(rawtext);//预处理去掉文本中所有的回车和换行。
string::const_iterator it=rawtext.begin();
string::const_iterator end=rawtext.end();
while(boost::regex_search(it,end,mDOC,regdoc))
{
string doc=mDOC[0];
string label="";
string title="";
string content="";
if(boost ::regex_search(doc,mLabel,reglabel))
{
label=mLabel[1];
}
if(boost::regex_search(doc,mTitle,regtitle))
{
title=mTitle[1];
}
if(boost::regex_search(doc,mContent,regcontent))
{
content=mContent[1];
}
if(content!=""&&title!=""&&label!="")
{
article.ArticleText=content;
article.ArticleTitle=title;
article.Categorization=label;
articleCollection.push_back(article);
}
it=mDOC[0].second;
}
return articleCollection;
}
遍历文件夹下面的sgm文件
void FindFile(wchar_t *pFilePath)
{
WIN32_FIND_DATA FindFileData;
HANDLE hFind = INVALID_HANDLE_VALUE;
wchar_t DirSpec[MAX_PATH + 1];// 指定路径
DWORD dwError;
wcsncpy (DirSpec, pFilePath, wcslen(pFilePath) + 1);
wcsncat (DirSpec, L"\\\*", 3);
hFind=FindFirstFile(DirSpec,&FindFileData);
if (hFind == INVALID_HANDLE_VALUE) {
wprintf(L"Invalid file handle. Error is %u ", GetLastError());
return ;
}
bool bFinish=false;
while(!bFinish)
{
if (FindFileData.dwFileAttributes != FILE_ATTRIBUTE_DIRECTORY )
{
wchar_t temp[3000];
memset(temp,0,3000*sizeof(wchar_t));
wcscpy(temp,pFilePath);
wcscat(temp,L"\\");
wcscat(temp,FindFileData.cFileName);
string rawtext="";
string line;
ifstream infile;
infile.open(temp);
if(infile)
{
while(getline(infile,line))
{
rawtext+=line;
}
}
infile.clear();
infile.close();
InsertArticlesToDataBase(rawtext);
wstring path(temp);
string spath=myWideCharToMultibyte(path);
cout<<"finishprocess "<<spath<<endl;
}
bFinish = (FindNextFile(hFind, &FindFileData) == false);
}
}
{
WIN32_FIND_DATA FindFileData;
HANDLE hFind = INVALID_HANDLE_VALUE;
wchar_t DirSpec[MAX_PATH + 1];// 指定路径
DWORD dwError;
wcsncpy (DirSpec, pFilePath, wcslen(pFilePath) + 1);
wcsncat (DirSpec, L"\\\*", 3);
hFind=FindFirstFile(DirSpec,&FindFileData);
if (hFind == INVALID_HANDLE_VALUE) {
wprintf(L"Invalid file handle. Error is %u ", GetLastError());
return ;
}
bool bFinish=false;
while(!bFinish)
{
if (FindFileData.dwFileAttributes != FILE_ATTRIBUTE_DIRECTORY )
{
wchar_t temp[3000];
memset(temp,0,3000*sizeof(wchar_t));
wcscpy(temp,pFilePath);
wcscat(temp,L"\\");
wcscat(temp,FindFileData.cFileName);
string rawtext="";
string line;
ifstream infile;
infile.open(temp);
if(infile)
{
while(getline(infile,line))
{
rawtext+=line;
}
}
infile.clear();
infile.close();
InsertArticlesToDataBase(rawtext);
wstring path(temp);
string spath=myWideCharToMultibyte(path);
cout<<"finishprocess "<<spath<<endl;
}
bFinish = (FindNextFile(hFind, &FindFileData) == false);
}
}
将文本文件中的单引号替换成双引号,否则插入不到数据库中
/**
 * Replaces every single quote (') in the text with a double quote (") so the
 * result can be embedded in a single-quoted SQL string literal.
 *
 * NOTE(review): this substitution is lossy (apostrophes become double
 * quotes). Doubling the quote ('') would keep the original text, but it would
 * change what callers store, so the existing behaviour is kept here.
 *
 * @param src  Text to sanitise (taken by value; the copy is modified).
 * @return The text with all single quotes replaced.
 */
string ProcessforMSSQL(string src)
{
    // Keep the position in size_type: storing find()'s result in an int
    // narrows it and makes the comparison with npos rely on wrap-around.
    string::size_type pos=src.find('\'');
    while(pos!=string::npos)
    {
        src[pos]='"';
        pos=src.find('\'',pos+1); // resume after the character just replaced
    }
    return src;
}
{
int pos=src.find('\'');
while(pos!=string::npos)
{ //string& replace ( size_t pos1, size_t n1, size_t n2, char c );
src=src.replace(pos,1,1,'\"');
pos=src.find('\'',pos);
}
return src;
}
int _tmain(int argc, _TCHAR* argv[])
{
int end;
//DictionaryToDataBase();
FindFile(L"E:\\新闻语料\\reuters21578");
cout<<"finish"<<endl;
cin>>end;
}
{
int end;
//DictionaryToDataBase();
FindFile(L"E:\\新闻语料\\reuters21578");
cout<<"finish"<<endl;
cin>>end;
}
存入数据库
void InsertArticlesToDataBase(string rawtext)
{
vector<ARTICLE> articleCollection=FindArticles(rawtext);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
//_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxxx;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
pConn->Open("","","",adConnectUnspecified);
char *sqlInsert=new char[1000000];
for(vector<ARTICLE>::iterator it=articleCollection.begin();it!=articleCollection.end();++it)
{
_variant_t RecordsAffected;
memset(sqlInsert,0,1000000);
//将其中的带引号换为双引号
string url=ProcessforMSSQL((*it).Categorization);
string title=ProcessforMSSQL((*it).ArticleTitle);
string text=ProcessforMSSQL((*it).ArticleText);
sprintf_s(sqlInsert,1000000,"insert into ReuteursTest(ArticleTitle,ArticleText,Categorization) values('%s','%s','%s')",title.c_str(),text.c_str(),url.c_str());
pConn->Execute(sqlInsert,&RecordsAffected,-1);
cout<<title<<"添加完毕"<<endl;
}
delete sqlInsert;
pConn->Close();
pConn.Release();
CoUninitialize();
}
{
vector<ARTICLE> articleCollection=FindArticles(rawtext);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
//_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxxx;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
pConn->Open("","","",adConnectUnspecified);
char *sqlInsert=new char[1000000];
for(vector<ARTICLE>::iterator it=articleCollection.begin();it!=articleCollection.end();++it)
{
_variant_t RecordsAffected;
memset(sqlInsert,0,1000000);
//将其中的带引号换为双引号
string url=ProcessforMSSQL((*it).Categorization);
string title=ProcessforMSSQL((*it).ArticleTitle);
string text=ProcessforMSSQL((*it).ArticleText);
sprintf_s(sqlInsert,1000000,"insert into ReuteursTest(ArticleTitle,ArticleText,Categorization) values('%s','%s','%s')",title.c_str(),text.c_str(),url.c_str());
pConn->Execute(sqlInsert,&RecordsAffected,-1);
cout<<title<<"添加完毕"<<endl;
}
delete sqlInsert;
pConn->Close();
pConn.Release();
CoUninitialize();
}