作者finallyliuyu 出处博客园

 通过《C++处理reuters21578(一)》中的代码,已初步得到两张数据表,分别存放训练语料库和测试语料库。由于这两个语料库中的个别类别不一致,需要先求出两个语料库类别的交集,再据此形成最终用于文本分类的训练语料库和测试语料库。以下主函数完成此功能。

 

 

/// Unary predicate that reports whether a string equals a fixed reference value.
///
/// Meant to be handed to standard algorithms such as count_if / find_if.
class GT_clss
{
public:
    /// Stores a copy of `s` as the value every later element is compared with.
    /// The parameter is a const reference so const strings and temporaries
    /// can be used to build the predicate.
    GT_clss(const std::string &s) : comparepart(s) {}

    /// Returns true when `elem` is equal to the stored reference string.
    /// const-qualified: the call does not modify the predicate, so it can be
    /// invoked through a const object.
    bool operator()(const std::string &elem) const
    {
        return elem == comparepart;
    }

private:
    std::string comparepart; // reference value captured at construction
};

 

 

 

// 数据库中共有多少个类别
vector<string>GetLabels(string tablename)
 {   vector
<string>labels;
 
char * selectbySpecificId=new char [1000];
 memset(selectbySpecificId,
0,1000);
 sprintf_s(selectbySpecificId,
1000,"select Categorization from %s ",tablename.c_str());
 CoInitialize(NULL);
 _ConnectionPtr pConn(__uuidof(Connection));
 _RecordsetPtr pRst(__uuidof(Recordset));
 pConn
->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
 pConn
->Open("","","",adConnectUnspecified);
 pRst
=pConn->Execute(selectbySpecificId,NULL,adCmdText);
 
while(!pRst->rsEOF)
 {
     
string label=(_bstr_t)pRst->GetCollect("Categorization");
     
if (!count_if(labels.begin(),labels.end(),GT_clss(label)))
     {
         labels.push_back(label);
     }

     pRst
->MoveNext();

 }
 pRst
->Close();
 pConn
->Close();
 pRst.Release();
 pConn.Release();
 CoUninitialize();
 delete []selectbySpecificId;

 
return labels;



 }

 

 

 

// 主函数
int _tmain(int argc, _TCHAR* argv[])
{
     
int end;
    
//set<string>labels;
    vector<string>labelsTrain=GetLabels("ReteursTrain");
    vector
<string>labelsTest=GetLabels("ReteursTest");
    vector
<string>finalLabels;
    
for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
    {
        trim(
*it," ");
    }
    
for(vector<string>::iterator it=labelsTest.begin();it!=labelsTest.end();it++)
    {
        trim(
*it," ");

    }
    
    
for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
    {
        
if (count_if(labelsTest.begin(),labelsTest.end(),GT_clss(*it)))
        {
            finalLabels.push_back(
*it);
        }
    }

    
char * selectbySpecificId=new char [1000];
    memset(selectbySpecificId,
0,1000);
    sprintf_s(selectbySpecificId,
1000,"select CArticleName,CAbstract,Categorization from ReteursTest");
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
    _ConnectionPtr pConn2(__uuidof(Connection));
    pConn
->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
    pConn2
->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=FinallyCorpus";
    pConn
->Open("","","",adConnectUnspecified);
    pConn2
->Open("","","",adConnectUnspecified);
    pRst
=pConn->Execute(selectbySpecificId,NULL,adCmdText);
    
while(!pRst->rsEOF)
    {
        
string label=(_bstr_t)pRst->GetCollect("Categorization");
        trim(label,
" ");

        
if (count_if(finalLabels.begin(),finalLabels.end(),GT_clss(label)))
        {
            
string ArticleTitle=(_bstr_t)pRst->GetCollect("CArticleName");
            
string ArticleText=(_bstr_t)pRst->GetCollect("CAbstract");
            ArticleTitle
=ProcessforMSSQL(ArticleTitle);
            ArticleText
=ProcessforMSSQL(ArticleText);
            
char *sqlInsert=new char[1000000];
            _variant_t RecordsAffected;
            memset(sqlInsert,
0,1000000);
            sprintf_s(sqlInsert,
1000000,"insert into ReteursTestingCorpus(CArticleName,CAbstract,Categorization) values('%s','%s','%s')",ArticleTitle.c_str(),ArticleText.c_str(),label.c_str());
            pConn2
->Execute(sqlInsert,&RecordsAffected,-1);
            delete []sqlInsert;

            


            
        }
        

        pRst
->MoveNext();

    }
    pRst
->Close();
    pConn
->Close();
    pRst.Release();
    pConn.Release();
    pConn2
->Close();
    pConn2.Release();
    CoUninitialize();
    delete []selectbySpecificId;

    
    cout
<<"两标签集交集为"<<endl;

    cout
<<finalLabels.size()<<endl;

    
//DictionaryToDataBase();
    
    
//FindFile(L"E:\\新闻语料\\reuters21578");
    

    
//pRst=pConn->Execute(,NULL,adCmdText);


    
   cout
<<"finish"<<endl;
    
    
    cin
>>end;






}

 

 

posted on 2010-12-27 16:12  finallyly  阅读(447)  评论(0)  编辑  收藏  举报