C++处理reuters21578（一）

作者:finallyliuyu 出处：博客园

最近在做关于文本分类算法的验证。汉语新闻分类的语料库采用的是我自己爬取的新闻；英文分类语料库考虑采用reuters，因此需要处理reuters21578文本分类语料库。

下面给出处理reuters21578的代码，主要功能就是从文本中提取新闻标题、内容、类别，并存储到mssql2000中。

把代码拷贝下来，留在这里做份备忘，主要是因为里面涉及了些boost::regex的使用，以及宽窄字符集转换。

尤其是boost::regex的使用，有很多注意事项，比如C#中的\s+,boost要用“\\s+”等。比如boost::regex中的dotmatchnewline 模式是mod_s。这些细节问题，想要全部记住是件很困难的事情，况且也没有必要记住这些东西。用到的时候，想下当时的关键字，在博客里面搜一下就出来了。

提取文章实体

vector<ARTICLE> FindArticles(string rawtext)
{
vector<ARTICLE>articleCollection;

boost::regex regdoc("<REUTERS\\s+TOPICS=\"YES\"\\s+LEWISSPLIT=\"TEST\".*?>(.*?)</REUTERS>",boost::regbase::icase|boost::regbase::mod_s);//获得doc标签内的内容
boost::regex regtitle("<TITLE>(.*?)</TITLE>",boost::regbase::icase|boost::regbase::mod_s);//获得url标签内的内容
boost::regex reglabel("<TOPICS><D>(.*?)</D>.*?</TOPICS>",boost::regbase::icase|boost::regbase::mod_s);//获得标题
boost::regex regcontent("<BODY>(.*?)</BODY>",boost::regbase::icase|boost::regbase::mod_s);//获得内容

ARTICLE article;

boost::smatch mDOC;
boost::smatch mLabel;
boost::smatch mTitle;
boost::smatch mContent;
//rawtext=ProcessSingleline(rawtext);//预处理去掉文本中所有的回车和换行。
string::const_iterator  it=rawtext.begin();
string::const_iterator  end=rawtext.end();
while(boost::regex_search(it,end,mDOC,regdoc))
{
     string doc=mDOC[0];


     string label="";
     string title="";
     string content="";
     if(boost ::regex_search(doc,mLabel,reglabel))
     {
        label=mLabel[1];

     }
     if(boost::regex_search(doc,mTitle,regtitle))
     {
         title=mTitle[1];
     }
     if(boost::regex_search(doc,mContent,regcontent))
     {
         content=mContent[1];
     }
     if(content!=""&&title!=""&&label!="")
     {
         article.ArticleText=content;
         article.ArticleTitle=title;
         article.Categorization=label;
         articleCollection.push_back(article);
     }

     it=mDOC[0].second;

}

return articleCollection;

}

遍历文件夹下面的sgm文件

void FindFile(wchar_t *pFilePath)
{
     WIN32_FIND_DATA FindFileData;
     HANDLE hFind = INVALID_HANDLE_VALUE;
     wchar_t  DirSpec[MAX_PATH + 1];// 指定路径
     DWORD dwError;
     wcsncpy (DirSpec, pFilePath, wcslen(pFilePath) + 1);
     wcsncat (DirSpec, L"\\\*", 3);
     hFind=FindFirstFile(DirSpec,&FindFileData);
     if (hFind == INVALID_HANDLE_VALUE) {
         wprintf(L"Invalid file handle. Error is %u ", GetLastError());
         return ;
     }
     bool bFinish=false;
     while(!bFinish)
     {
         if (FindFileData.dwFileAttributes != FILE_ATTRIBUTE_DIRECTORY )
         {

             wchar_t temp[3000];
             memset(temp,0,3000*sizeof(wchar_t));
             wcscpy(temp,pFilePath);
             wcscat(temp,L"\\");
             wcscat(temp,FindFileData.cFileName);
             string rawtext="";
             string line;
             ifstream infile;
             infile.open(temp);
             if(infile)
                {
                    while(getline(infile,line))
                    {
                        rawtext+=line;
                    }


                }

             infile.clear();
             infile.close();
            InsertArticlesToDataBase(rawtext);
            wstring path(temp);
            string spath=myWideCharToMultibyte(path);
            cout<<"finishprocess "<<spath<<endl;

         }
         bFinish = (FindNextFile(hFind, &FindFileData) == false);

     }










}

将文本文件中的单引号替换成双引号，否则插入不到数据库中

/**
 * Returns a copy of src in which every single quote (') is replaced by a
 * double quote ("), so the text can be placed inside a single-quoted SQL
 * string literal without terminating it.
 *
 * @param src  Text to sanitise.
 * @return     src with all ' characters turned into ".
 */
string ProcessforMSSQL(string src)
{
     // Keep the position as string::size_type: storing find()'s result in an
     // int only compared equal to npos through an implicit sign conversion.
     for (string::size_type pos = src.find('\'');
          pos != string::npos;
          pos = src.find('\'', pos + 1))
     {
         src[pos] = '\"';
     }

     return src;
}

// Program entry point: processes every file in the hard-coded corpus
// directory, prints "finish", then waits for a value on standard input
// before returning (presumably to keep the console window open).
int _tmain(int argc, _TCHAR* argv[])
{
    //DictionaryToDataBase();

    // FindFile takes a non-const wchar_t*. Binding a string literal to that
    // parameter is not valid standard C++, so pass a writable array instead.
    wchar_t corpusDirectory[] = L"E:\\新闻语料\\reuters21578";
    FindFile(corpusDirectory);
    cout<<"finish"<<endl;

    // The value read is not used.
    int end = 0;
    cin>>end;

    return 0;
}

存入数据库

void InsertArticlesToDataBase(string rawtext)
{
     vector<ARTICLE> articleCollection=FindArticles(rawtext);
     CoInitialize(NULL);
     _ConnectionPtr pConn(__uuidof(Connection));
     //_RecordsetPtr pRst(__uuidof(Recordset));
     pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxxx;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
     pConn->Open("","","",adConnectUnspecified);
     char *sqlInsert=new char[1000000];
     for(vector<ARTICLE>::iterator it=articleCollection.begin();it!=articleCollection.end();++it)
     {
         _variant_t RecordsAffected;
         memset(sqlInsert,0,1000000);
         //将其中的带引号换为双引号
         string url=ProcessforMSSQL((*it).Categorization);
         string title=ProcessforMSSQL((*it).ArticleTitle);
         string text=ProcessforMSSQL((*it).ArticleText);
         sprintf_s(sqlInsert,1000000,"insert into ReuteursTest(ArticleTitle,ArticleText,Categorization) values('%s','%s','%s')",title.c_str(),text.c_str(),url.c_str());
         pConn->Execute(sqlInsert,&RecordsAffected,-1);
         cout<<title<<"添加完毕"<<endl;

     }
     delete sqlInsert;
     pConn->Close();
     pConn.Release();
     CoUninitialize();

}

posted on 2010-12-16 11:31 finallyly 阅读(1957) 评论(10) 编辑收藏举报

刷新页面返回顶部

C++处理reuters21578（一）

公告