原理: 将模式串与匹配串都转成unicode编码,再用正则。 可以用python完成,或者是用c++boost

方案一:解析程序用 C/C++ 编写,中间调用 Python 函数,在 Python 函数中使用正则表达式完成匹配。本地可以运行,但是在 Hadoop 集群上运行不了。

方案二:采用 boost::wregex(宽字符正则),需要先从 C++ 源码编译 Boost 库。

备注: cpp 文件均使用 UTF-8 编码

 

方案一代码:

#-*-coding:UTF-8-*-
import re;
import sys;
import time;
def add(a,b):
    """Return every group-1 match of regex ``a`` in text ``b``, tab-separated.

    Both arguments may be UTF-8 encoded byte strings (what the C caller
    passes) or already-decoded text. Matching is done on decoded text so
    that multi-byte characters count as single characters.

    Returns a UTF-8 encoded byte string: the group-1 captures of all
    non-overlapping matches joined by tab characters. An empty byte string is
    returned when either argument cannot be decoded, when the pattern has no
    group 1, or when group 1 did not participate in a match.

    Raises ``re.error`` when the pattern is not a valid regular expression.
    """
    try:
        text_type = unicode  # Python 2: the decoded-text type
    except NameError:
        text_type = str  # Python 3: str is already the decoded-text type

    empty = u"".encode("UTF-8")

    # Decode the pattern. A failure here used to be swallowed and then
    # surfaced as an unrelated UnboundLocalError on the next line.
    try:
        pattern_text = a
        if not isinstance(pattern_text, text_type):
            pattern_text = pattern_text.decode("UTF-8")
    except (UnicodeError, AttributeError):
        return empty

    pchinese = re.compile(pattern_text)

    try:
        subject = b
        if not isinstance(subject, text_type):
            subject = subject.decode("UTF-8")
        # finditer advances past zero-width matches by itself; the previous
        # manual "search from m.end()" loop spun forever on an empty match.
        pieces = [m.group(1) for m in pchinese.finditer(subject)]
        return u"\t".join(pieces).encode("UTF-8")
    except (UnicodeError, AttributeError, IndexError, TypeError):
        # Best effort, as before: undecodable text, a pattern without
        # group 1, or an unmatched group 1 yields an empty result.
        return empty

if __name__ == "__main__":
    # Manual smoke run: load the sample page and run the extraction pattern
    # over it. The result of add() is not used.
    pattern = "<li><span>字义:</span>(.*?)</li>"
    sample = open("qiming2.txt", "r")
    content = sample.read()
    sample.close()
    add(pattern, content)

  

// Excerpt from the body of the approach-one driver (the enclosing function
// header is not part of this excerpt). It reads all of stdin into `text`,
// calls add(pattern, text) from the Python module "pytest" through the
// embedded Python C API, prints the returned string, and reports elapsed time.
// NOTE(review): `t`, `tv1`, `tv2` and `argv` are used below but declared
// outside this excerpt.
char line[102400]={0};
    char text[102400]={0};
    char pattern[200]={0};
    // NOTE(review): strcpy is unbounded; this assumes t fits in 200 bytes -- confirm.
    strcpy(pattern,t.c_str());
    while(fgets(line,102400,stdin))
    {

            //text.assign(line);
            //wstring wtext = String2Wstringx(t);
            //wstring::const_iterator  it=wtext.begin();
           // wstring::const_iterator  end=wtext.end();
            //while(boost::regex_search(it,end,wm,wreg))
           // {
           //     wstring wtemp=wm[1];
           //     string temp=Wstring2String(wtemp);
           //     results.push_back(temp);
           //     it=wm[1].second;
            //}
          // Append the line just read, followed by an extra "\n", to `text`.
          // NOTE(review): strcat is unbounded; once the accumulated input
          // reaches 102400 bytes this writes past the end of `text`.
          strcat(text,line);
          strcat(text,"\n");
    }
    //string t="刘[^刘]*?,";
    //wstring ws=String2Wstring(s);
    //cout<<p.size()<<endl;
    //cout<<ws.size()<<endl;
    //fprintf(stdout,"输出正则匹配结果\n");
    //for(vector<string>::iterator it=results.begin();it!=results.end();it++)
    //{
    //    printf("%s\n",(*it).c_str());
    //}

     
    Py_Initialize(); 

    // Check whether initialization succeeded.
    if ( !Py_IsInitialized() ) 
    { 
        return -1; 
    } 
    // Add the current directory to the module search path.
    // PyRun_SimpleString runs the given string directly as Python code and
    // returns 0 on success, -1 on error. Most errors are caused by syntax
    // errors in the string.
    PyRun_SimpleString("import sys"); 
    PyRun_SimpleString("sys.path.append('./')"); 
    PyObject *pName,*pModule,*pDict,*pFunc,*pArgs, *ret; 

    // Load the script named pytest.
    pName = PyString_FromString("pytest"); 
    pModule = PyImport_Import(pName); 
    if ( !pModule ) 
    { 
        printf("can't find pytest.py"); 
        return -1; 
    } 
    pDict = PyModule_GetDict(pModule); 
    if ( !pDict ) 
    { 
        return -1; 
    } 

    // Look up the function named add.
    pFunc = PyDict_GetItemString(pDict, "add"); 
    if ( !pFunc || !PyCallable_Check(pFunc) ) 
    { 
        printf("can't find function [add]"); 
        return -1; 
    } 

    // Build the argument tuple.
    // NOTE(review): the next statement is an expression with no effect, and
    // pArgs has not been assigned yet at this point; it looks like a leftover.
    *pArgs; 
    pArgs = PyTuple_New(2); 

    //  PyObject* Py_BuildValue(char *format, ...)
    //  Converts a C++ variable into a Python object. Use it whenever a
    //  variable has to be passed from C++ to Python. It is somewhat like
    //  C's printf, but with different format codes. Common ones are:
    //  s  a string,
    //  i  an integer,
    //  f  a floating-point number,
    //  O  a Python object.

    PyTuple_SetItem(pArgs, 0, Py_BuildValue("s",pattern)); 
    PyTuple_SetItem(pArgs, 1, Py_BuildValue("s",text)); 
    // Call the Python function.
    // NOTE(review): ret is passed to PyString_AsString without a NULL check,
    // and ret is never released with Py_DECREF in this excerpt.
    ret=PyObject_CallObject(pFunc, pArgs); 
    char * str_ret = PyString_AsString(ret);
     printf("result:%s\n", str_ret);
    Py_DECREF(pName); 
    Py_DECREF(pArgs); 
    Py_DECREF(pModule); 
    // Shut down Python.
    Py_Finalize(); 
    // Report the time elapsed between tv1 and tv2, printed in milliseconds.
    gettimeofday(&tv2, NULL);
    fprintf(stderr,"%s has finished congratulations!\n",argv[0]);
    fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
    return 0;

  方案二代码:

// please add your code here!
#include <errno.h>
#include <iconv.h>
#include <locale.h>
#include <math.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <wchar.h>

#include <cstdio>
#include <cstring>
#include <iostream>
#include <set>
#include <string>
#include <vector>

#include <boost/regex.hpp>
using namespace std;

/*
   funcname: PrintUsage
   spec: writes the one-line command-line usage summary to stderr
   parms: none
   returnValue: none
   author liuyu, 20120528
*/
void PrintUsage()
{
    static const char usage[] =
        "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n";
    fprintf( stderr, "%s", usage );
}
/*
   funcname: toAnotherCode
   spec: converts srclen bytes of srcstr from encoding fromCode to encoding
         toCode with iconv and stores the result, followed by a single 0
         byte, in deststr.
   parms:[IN]  toCode   name of the target encoding, as understood by iconv_open
         [IN]  fromCode name of the source encoding, as understood by iconv_open
         [IN]  srcstr   bytes to convert (not modified)
         [OUT] deststr  receives the converted bytes plus one terminating 0
                        byte; the caller must provide at least srclen*5+1 bytes
         [IN]  srclen   number of bytes of srcstr to convert
         [OUT] destlen  number of converted bytes written to deststr (without
                        the terminator); left untouched when the converter
                        cannot be opened
   returnValue: 0 when the whole input was converted, 1 when the converter
                could not be opened or iconv reported an error. After an iconv
                error, deststr/destlen still describe the bytes converted
                before the failure.
*/
int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
{
    const iconv_t convertor = iconv_open(toCode, fromCode);
    if (convertor == iconv_t(-1))
    {
        fprintf(stderr,"convertor device initailization failed!\n");
        return 1;
    }

    // iconv only reads the input and advances these cursors, so the source
    // buffer can be handed over directly without a private copy.
    char *input = srcstr;
    size_t inputsize = srclen;

    // Worst-case output budget kept from the original: five bytes per input
    // byte. The extra element keeps the buffer non-empty when srclen is 0;
    // the old code allocated zero bytes in that case and then wrote to it.
    const size_t capacity = srclen * 5;
    std::vector<char> buffer(capacity + 1);
    char *output = &buffer[0];
    size_t outputsize = capacity;

    const size_t rc = iconv(convertor, &input, &inputsize, &output, &outputsize);
    const bool converted = (rc != size_t(-1));
    if (!converted)
    {
        // Diagnostics belong on stderr; stdout carries the program's data.
        fprintf(stderr, "errno=%d\n", errno);
    }

    // Copy out through the temporary buffer so that deststr may alias srcstr.
    destlen = capacity - outputsize;
    memcpy(deststr, &buffer[0], destlen);
    deststr[destlen] = 0;

    iconv_close(convertor);
    return converted ? 0 : 1;
}
/*
   funcname: MBs2WCs
   spec: converts a 0-terminated multibyte string to a newly allocated
         0-terminated wide-character string using mbstowcs.
   parms:[IN] pszSrc multibyte string to convert; NULL is treated as invalid
   returnValue: array allocated with new[]; the caller releases it with
                delete[]. An empty wide string is returned when pszSrc is
                NULL or contains a sequence mbstowcs cannot decode.
   note: sets the process-wide locale to "zh_CN.UTF8" on every call. When
         that locale is unavailable setlocale fails and the conversion runs
         under whatever locale was already active.
*/
wchar_t * MBs2WCs(const char* pszSrc){
    const size_t failed = static_cast<size_t>(-1);

    setlocale(LC_ALL, "zh_CN.UTF8");

    // With a NULL destination mbstowcs only counts the wide characters
    // needed. It reports undecodable input as (size_t)-1, which must be
    // checked before being used as a length; the old code stored it in an
    // int and then wrote to pwcs[-1].
    const size_t needed = (pszSrc != NULL) ? mbstowcs(NULL, pszSrc, 0) : failed;
    if (needed == failed)
    {
        wchar_t* empty = new wchar_t[1];
        empty[0] = L'\0';
        return empty;
    }

    wchar_t* pwcs = new wchar_t[needed + 1];
    const size_t written = mbstowcs(pwcs, pszSrc, needed + 1);
    pwcs[(written == failed) ? 0 : written] = L'\0';
    return pwcs;
}
 
 /*
    funcname: WCs2MBs
    spec: converts a 0-terminated wide-character string to a newly allocated
          0-terminated multibyte string using wcstombs.
    parms:[IN] wcharStr wide string to convert; NULL is treated as invalid
    returnValue: array allocated with new[]; the caller releases it with
                 delete[]. An empty string is returned when wcharStr is NULL
                 or contains a character wcstombs cannot encode.
    note: sets the process-wide locale to "zh_CN.UTF8" on every call. When
          that locale is unavailable setlocale fails and the conversion runs
          under whatever locale was already active.
 */
 char* WCs2MBs(const wchar_t * wcharStr){
    const size_t failed = static_cast<size_t>(-1);

    setlocale(LC_ALL, "zh_CN.UTF8");

    // With a NULL destination wcstombs only counts the bytes needed. It
    // reports an unencodable character as (size_t)-1; the old code used that
    // value as a length, which allocated 0 bytes, converted with a huge byte
    // limit and then wrote to str[-1].
    const size_t needed = (wcharStr != NULL) ? wcstombs(NULL, wcharStr, 0) : failed;
    if (needed == failed)
    {
        char* empty = new char[1];
        empty[0] = '\0';
        return empty;
    }

    char* str = new char[needed + 1];
    const size_t written = wcstombs(str, wcharStr, needed + 1);
    str[(written == failed) ? 0 : written] = '\0';
    return str;
 }

/*
   funcname: main
   spec: reads all of stdin, converts it to wide characters, searches it
         with a wide-character Boost regex for every
         "<li><span>字义:</span>...</li>" item and prints the captured text of
         each item on its own line on stdout. Timing information goes to
         stderr.
   parms:[IN] argc must be 1; the program takes no command-line arguments
         [IN] argv argv[0] is used in the completion message
   returnValue: 0 on success, 1 when arguments were supplied
*/
int main( int argc, char *argv[] )
{
    timeval tv1, tv2;
    gettimeofday(&tv1, NULL);
    if ( 1 != argc )
    {
        PrintUsage();

        return 1;
    }

    // Compile the pattern as a wide-character regex so that each Chinese
    // character is treated as one character instead of several bytes.
    const char* t="<li><span>字义:</span>(.*?)</li>";
    wchar_t *wt = MBs2WCs(t);
    const std::wstring wpattern(wt);
    delete [] wt;   // MBs2WCs hands back a new[] array
    boost::wregex wreg(wpattern,boost::regbase::icase|boost::regex::perl);

    // Collect stdin in a growable string. The previous fixed 102400-byte
    // buffer filled with strcat overflowed on larger input.
    char line[102400]={0};
    std::string text;
    while(fgets(line,sizeof(line),stdin))
    {
        text += line;
    }

    wchar_t *ws = MBs2WCs(text.c_str());
    const std::wstring wtext(ws);
    delete [] ws;

    boost::wsmatch wm;
    std::wstring::const_iterator it = wtext.begin();
    const std::wstring::const_iterator end = wtext.end();
    std::vector<std::string> results;
    while(boost::regex_search(it,end,wm,wreg))
    {
        const std::wstring wtemp = wm[1];
        char* temp = WCs2MBs(wtemp.c_str());
        results.push_back(temp);
        delete [] temp;   // WCs2MBs hands back a new[] array
        // Resume after the whole match, not after the capture group, so the
        // closing tag of this item is never rescanned.
        it = wm[0].second;
    }

    for (std::vector<std::string>::const_iterator r = results.begin(); r != results.end(); ++r)
    {
        fprintf(stdout,"%s\n",r->c_str());
    }

    gettimeofday(&tv2, NULL);
    fprintf(stderr,"%s has finished congratulations!\n",argv[0]);
    fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
    return 0;
}

  方案一的编译方法:

 

 g++ Python.cpp -o Python -I/usr/include/python2.5 -L/usr/lib/python2.5 -lpython2.5

 

posted on 2015-08-12 15:03  finallyly  阅读(422)  评论(0编辑  收藏  举报