原理:将模式串与待匹配文本都转成 Unicode(宽字符)编码,再进行正则匹配。可以用 Python 完成,也可以用 C++ 的 Boost 库完成。
方案一:解析程序用 C/C++ 编写,中间调用 Python 函数,在 Python 函数中用正则表达式进行解析。本地可以运行,但在 Hadoop 集群上运行不了。
方案二:采用 Boost 的 wregex(C++),需要从源码编译 Boost 库。
备注:cpp 文件都使用 UTF-8 编码。
方案一代码:
#-*-coding:UTF-8-*-
import re
import sys
import time

# Text type of the running interpreter: `unicode` on Python 2, `str` on Python 3.
_TEXT_TYPE = type(u"")


def _to_text(value):
    """Return `value` as text, decoding byte strings as UTF-8."""
    if isinstance(value, _TEXT_TYPE):
        return value
    return value.decode("UTF-8")


def add(a, b):
    """Collect capture group 1 of every match of pattern `a` in text `b`.

    Both arguments may be UTF-8 encoded byte strings (what the C++ caller
    passes on Python 2) or text. Matching is done on decoded text so that
    multi-byte characters behave as single characters in the regex.

    Returns the captured groups joined by tabs as the interpreter's native
    `str` (UTF-8 bytes on Python 2, text on Python 3). Returns "" when an
    input is not valid UTF-8, the pattern does not compile, or the pattern
    has no usable group 1; the function never raises for those cases,
    because the embedding C++ code does not expect an exception.
    """
    try:
        regex = re.compile(_to_text(a))
        text = _to_text(b)
    except (UnicodeError, re.error, AttributeError, TypeError):
        return ""

    try:
        # finditer also steps past zero-length matches, which a manual
        # search(text, m.end()) loop would repeat forever.
        joined = u"\t".join([m.group(1) for m in regex.finditer(text)])
    except (IndexError, TypeError):
        # IndexError: the pattern has no group 1.
        # TypeError: group 1 did not take part in a match (None).
        return ""

    if str is _TEXT_TYPE:
        return joined
    return joined.encode("UTF-8")


if __name__ == "__main__":
    t = "<li><span>字义:</span>(.*?)</li>"
    fid = open("qiming2.txt", "r")
    try:
        s = fid.read()
    finally:
        fid.close()
    # Show the extracted fields instead of discarding them.
    sys.stdout.write(add(t, s) + "\n")
char line[102400]={0}; char text[102400]={0}; char pattern[200]={0}; strcpy(pattern,t.c_str()); while(fgets(line,102400,stdin)) { //text.assign(line); //wstring wtext = String2Wstringx(t); //wstring::const_iterator it=wtext.begin(); // wstring::const_iterator end=wtext.end(); //while(boost::regex_search(it,end,wm,wreg)) // { // wstring wtemp=wm[1]; // string temp=Wstring2String(wtemp); // results.push_back(temp); // it=wm[1].second; //} strcat(text,line); strcat(text,"\n"); } //string t="刘[^刘]*?,"; //wstring ws=String2Wstring(s); //cout<<p.size()<<endl; //cout<<ws.size()<<endl; //fprintf(stdout,"输出正则匹配结果\n"); //for(vector<string>::iterator it=results.begin();it!=results.end();it++) //{ // printf("%s\n",(*it).c_str()); //} Py_Initialize(); // 检查初始化是否成功 if ( !Py_IsInitialized() ) { return -1; } // 添加当前路径 //把输入的字符串作为Python代码直接运行,返回0 //表示成功,-1表示有错。大多时候错误都是因为字符串 //中有语法错误。 PyRun_SimpleString("import sys"); PyRun_SimpleString("sys.path.append('./')"); PyObject *pName,*pModule,*pDict,*pFunc,*pArgs, *ret; // 载入名为pytest的脚本 pName = PyString_FromString("pytest"); pModule = PyImport_Import(pName); if ( !pModule ) { printf("can't find pytest.py"); return -1; } pDict = PyModule_GetDict(pModule); if ( !pDict ) { return -1; } // 找出函数名为add的函数 pFunc = PyDict_GetItemString(pDict, "add"); if ( !pFunc || !PyCallable_Check(pFunc) ) { printf("can't find function [add]"); return -1; } // 参数进栈 *pArgs; pArgs = PyTuple_New(2); // PyObject* Py_BuildValue(char *format, ...) 
// 把C++的变量转换成一个Python对象。当需要从 // C++传递变量到Python时,就会使用这个函数。此函数 // 有点类似C的printf,但格式不同。常用的格式有 // s 表示字符串, // i 表示整型变量, // f 表示浮点数, // O 表示一个Python对象。 PyTuple_SetItem(pArgs, 0, Py_BuildValue("s",pattern)); PyTuple_SetItem(pArgs, 1, Py_BuildValue("s",text)); // 调用Python函数 ret=PyObject_CallObject(pFunc, pArgs); char * str_ret = PyString_AsString(ret); printf("result:%s\n", str_ret); Py_DECREF(pName); Py_DECREF(pArgs); Py_DECREF(pModule); // 关闭Python Py_Finalize(); gettimeofday(&tv2, NULL); fprintf(stderr,"%s has finished congratulations!\n",argv[0]); fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000); return 0;
方案二代码:
// please add your code here! #include <iostream> #include <stdlib.h> #include <math.h> #include<time.h> #include <set> #include <string> #include <sys/time.h> #include<locale.h> #include<boost/regex.hpp> #include <wchar.h> #include <iconv.h> #include <errno.h> using namespace std; /* funcname: spec: parms:[IN] [IN] [OUT] returnValue: author liuyu, 20120528 */ void PrintUsage() { fprintf( stderr, "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n" ); } int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen) { iconv_t convertor=iconv_open(toCode,fromCode); size_t inputsize; size_t outputsize; size_t oldoutputsize; char *input, *inputold; char *output=NULL; char *outputold=NULL; int flag=0; if(convertor==iconv_t(-1)) { fprintf(stderr,"convertor device initailization failed!\n"); return 1; } else { inputsize=srclen; input=new char[inputsize+1]; memcpy(input,srcstr,inputsize); input[inputsize]='\0'; inputold=input; outputsize=inputsize*5; oldoutputsize=outputsize; output=new char[outputsize]; output[0]=0; outputold=output; size_t rc = iconv(convertor,&input,&inputsize,&output,&outputsize); if (rc==size_t(-1)) { fprintf(stdout, "errno=%d\n",errno); } destlen=oldoutputsize-outputsize; memcpy(deststr,outputold,destlen); deststr[destlen]=0; if(rc!=size_t(-1)) { flag=1; } delete []inputold; delete []outputold; } iconv_close(convertor); if(flag==1) { return 0; } else { return 1; } } wchar_t * MBs2WCs(const char* pszSrc){ wchar_t* pwcs = NULL; int size = 0; setlocale(LC_ALL, "zh_CN.UTF8"); size = mbstowcs(NULL,pszSrc,0); pwcs = new wchar_t[size+1]; size = mbstowcs(pwcs, pszSrc, size+1); pwcs[size] = 0; return pwcs; } char* WCs2MBs(const wchar_t * wcharStr){ char* str = NULL; int size = 0; setlocale(LC_ALL, "zh_CN.UTF8"); size = wcstombs( NULL, wcharStr, 0); str = new char[size + 1]; wcstombs( str, wcharStr, size); str[size] = '\0'; return str; } int main( int argc, char *argv[] ) { timeval 
tv1, tv2; gettimeofday(&tv1, NULL); if ( 1 != argc ) { PrintUsage(); return 1; } /* char *s="刘禹,刘德华,刘佳佳。。。王大虎。。。刘长春,xixi"; char *t="(刘[^刘]*?),"; wchar_t *ws =MBs2WCs(s); wchar_t *wt =MBs2WCs(t); wstring wstr1=ws; wstring wstr2=wt; boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl); boost::wsmatch wm; wstring::const_iterator it=wstr1.begin(); wstring::const_iterator end=wstr1.end(); while(boost::regex_search(it,end,wm,wreg)) { wstring wtemp=wm[1]; char* temp=WCs2MBs(wtemp.c_str()); printf("%s\n",temp); it=wm[0].second; } */ char line[102400]={0}; char text[102400]={0}; char* t="<li><span>字义:</span>(.*?)</li>"; wchar_t *wt =MBs2WCs(t); boost::wsmatch wm; boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl); while(fgets(line,102400,stdin)) { strcat(text,line); } wchar_t * ws = MBs2WCs(text); wstring wtext=ws; wstring::const_iterator it=wtext.begin(); wstring::const_iterator end=wtext.end(); vector<string> results; while(boost::regex_search(it,end,wm,wreg)) { wstring wtemp=wm[1]; char* temp=WCs2MBs(wtemp.c_str()); results.push_back(temp); it=wm[1].second; } for (vector<string>::iterator it = results.begin(); it!=results.end(); it++) { fprintf(stdout,"%s\n",(*it).c_str()); } gettimeofday(&tv2, NULL); fprintf(stderr,"%s has finished congratulations!\n",argv[0]); fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000); return 0; }
方案一的编译方法:
g++ Python.cpp -o Python -I/usr/include/python2.5 -L/usr/lib/python2.5 -lpython2.5