需求:从格式化数据中抽取html网页,并抽取网页中的全部汉字内容。需要的开源库:iconv、boost。
数据格式如下:两个^\r\n锁定头部,之后是html网页。 其中第一个头部Store-Size部分保存网页的字节数。
/*=============================================================== * Copyright (C) 2013 All rights reserved. * * 文件名称:ProcessWeiboCorpora.cpp * 创 建 者:刘禹 finallyly * 创建日期:2013年04月24日 * 描 述:从信息文件中找到html源码,抽取网页源码,找到网页文件编码,转换成GBK,从网页中取出全部的汉字过滤掉其他文本,将汉字串按照标点切割 * 备 注: * 更新日志: * ================================================================*/ #include<stdio.h> #include<string.h> #include<zlib.h> #include<stdlib.h> #include <sys/time.h> #include<ctype.h> #include<locale.h> #include<boost/regex.hpp> #include </usr/local/include/iconv.h> #include <errno.h> #include<algorithm> // please add your code here! using namespace std; #define MAX_LINE_LENGTH 1048576 #define TAGLEN 50 /************************************************************ * @brief <funcName:trim> Author:刘禹 finallyly 20130425 去掉字符串首尾空格 ================================================== * @param s ================================================== **********************************************************/ void trim(char *s) { char *start; char *end; int len=strlen(s); start=s; end=s+len-1; while(1) { char c=*start; if(!isspace(c)) { break; } start++; if(start>end) { s[0]='\0'; return ; } } while(1) { char c=*end; if(!isspace(c)) { break; } end --; if(start>end) { s[0]='\0'; return; } } memmove(s,start,end-start+1); s[end-start+1]='\0'; return; } inline bool strTolower( char* str ) { if ( !str ) return false; int i = 0; bool flag = true; while ( str[i] ) { if ( 'A' <= str[i] && 'Z' >= str[i] ) { str[i] += 32; } else if ( 'a' <= str[i] && 'z' >= str[i] ) { } else { flag = false; } ++i; } return flag; } /************************************************************ * @brief <funcName:UnCompressData> Author:刘禹 finallyly 20130424 * 对指定字符串调用zlib函数进行解压 ================================================== * @param input 待解压的字符串 ================================================== * @param output 解压后的字符串 ================================================== * @return **********************************************************/ 
int UnCompressData(char * SourceBuffer, char * DestBuffer,int &SourceBufferLength, int &DestBufferLength) { int err=uncompress((Bytef*)DestBuffer,(uLongf*)&DestBufferLength,(const Bytef*)SourceBuffer,(uLongf)SourceBufferLength); if(Z_OK!=err) { fprintf(stderr,"解压失败\t%d\n",err); return 1; } else { return 0; } } /************************************************************ * @brief <funcName:CompressData> Author:刘禹 finallyly 20130424 压缩文件 ================================================== * @param SourceBuffer ================================================== * @param DestBuffer ================================================== * @param SourceBufferLength ================================================== * @param DestBufferLength ================================================== * @return **********************************************************/ int CompressData(char * SourceBuffer, char * DestBuffer,unsigned long &SourceBufferLength, unsigned long &DestBufferLength) { int err=compress((Bytef*)DestBuffer,(uLongf*)&DestBufferLength,(const Bytef*)SourceBuffer,(uLongf)SourceBufferLength); if(Z_OK!=err) { fprintf(stderr,"压缩失败\n"); return 1; } else { return 0; } } /************************************************************ * @brief <funcName:> Author:刘禹 20121219 ================================================== * @param Tag 计算Tag数组的长度,结尾符为0,长度不包括0 类似strlen,只是我们这里用int代替char ================================================== * @return 数组的长度 **********************************************************/ int strlenEx( const unsigned char *Tag) { if(NULL==Tag) { return 0; } int len=0; for(;*Tag!='\0';Tag++) { len++; } return len; } /************************************************************ * @brief <funcName:IsEqual> Author:刘禹 20121219 输入两个Tag计算这两个Tag是否相同 ================================================== * @param Tag 输入的Tag ================================================== * @param TagCand 模板Tag ================================================== * @return 1,相等 0 不相等 
**********************************************************/ bool IsEqual(unsigned char *Tag, const unsigned char *TagCand) { int lenTag=strlenEx(Tag); int lenTagCand=strlenEx(TagCand); int i=0; bool isequal=0; if(lenTag!=lenTagCand) { return 0; } else { for(i=0;i<lenTag;i++) { if(Tag[i]==TagCand[i]) { isequal=1; } else { isequal=0; break; } } return isequal; } } /************************************************************ * @brief <funcName:FormatTags> Author:刘禹 20121219 根据给定字符c确定nType,返回特定信息 ================================================== * @param c [IN]输入字符 ================================================== * @param Tag [INOUT]将输入字符形成Tag, ================================================== * @param nType [INOUT]起始值是-1 * nType=0正在构建Tag; * nType=1Tag构建结束 ================================================== * @return 函数运行成功返回0,函数运行失败返回1 **********************************************************/ int FormatTags(unsigned char c, unsigned char * Tag,int &nType) { int len=strlenEx(Tag); if(c=='<')//检测到tag的开始符号 { Tag[0]='<'; Tag[len+1]=0; nType=0; } else if(c=='>')//检测到tag的中间符号 { if(len>0&&nType!=-1) { Tag[len]=c; Tag[len+1]=0; nType=1; } else//检测到空tag"<>" { Tag[0]=0; nType=-1; } } else { if(0==nType)//要填充tag,长度大于8说明此tag不是<html>或者</html> { Tag[len]=c; Tag[len+1]=0; } else { Tag[0]=0; nType=-1;//重新开始定位tag; } } return 0; } /************************************************************ * @brief <funcName:GetMainText> Author:刘禹 finallyly 20130501 *从xml 格式文档中抽取需要的正文和来源字段 ================================================== * * @param inputstr ================================================== * @param fout ================================================== * * @return **********************************************************/ int GetMainText(const char *inputstr,FILE *fout) { unsigned char Tag[TAGLEN]={0}; unsigned char text[MAX_LINE_LENGTH]={0}; unsigned char startTag1[TAGLEN]={0}; unsigned char endTag1[TAGLEN]={0}; //不是gbk编码退出 //抽取<text>字段 int nType=-1; bool Writing=0; bool 
endWriting=0; int buflen=0; //抽取<source>字段 int ret=0; size_t len; startTag1[0]='<'; startTag1[1]='h'; startTag1[2]='t'; startTag1[3]='t'; startTag1[4]='p'; startTag1[5]='>'; startTag1[6]=0; endTag1[0]='<'; endTag1[1]='/'; endTag1[2]='h'; endTag1[3]='t'; endTag1[4]='t'; endTag1[5]='p'; endTag1[6]='>'; endTag1[7]=0; //strLowercase(input,rawtext) if(0==strstr(inputstr,"encoding=\"gb18030\"")) { return 1; } unsigned char *temp=(unsigned char *)inputstr; len=strlenEx(temp); //抽取text字段 for(int i=0;i<len;i++) { int c=temp[i]; FormatTags(c,Tag,nType); if(1==nType)//检测到一个完整的tag标记 { if(!Writing) { Writing=IsEqual(Tag,startTag1); } if(!endWriting) { endWriting=IsEqual(Tag,endTag1); } Tag[0]=0; nType=-1; } if(endWriting) { for(int i=0;i<buflen;i++) { fputc(text[i],fout); } //初始化现场 buflen=0; text[0]=0; nType=-1; Tag[0]=0; Writing=0; endWriting=0; } if(Writing) { text[buflen]=c; text[buflen+1]=0; buflen++; } } } /************************************************************ * @brief <funcName:> Author:刘禹 finallyly * 从系统默认的汉字编码本机是GBK转unicode,宽字符保存 ================================================== * @param sToMatch ================================================== * @return **********************************************************/ wstring String2Wstring(string sToMatch) { wstring wsToMatch; setlocale( LC_CTYPE, "" ); // 很重要,没有这一句,转换会失败。 int iWLen = mbstowcs( NULL, sToMatch.c_str(), sToMatch.length() ); // 计算转换后宽字符串的长度。(不包含字符串结束符) if(iWLen>0) { wchar_t *lpwsz = new wchar_t[iWLen + 1]; int i = mbstowcs( lpwsz, sToMatch.c_str(), sToMatch.length() ); // 转换。(转换后的字符串有结束符) wsToMatch.assign(lpwsz); delete []lpwsz; } else { wsToMatch=L""; } return wsToMatch; } /************************************************************ * @brief <funcName:> Author:刘禹 finallyly * Unicode转系统自带编码,用于输出 ================================================== * @param sToMatch ================================================== * @return **********************************************************/ string 
Wstring2String(wstring sToMatch) { string sResult; int iLen = wcstombs( NULL, sToMatch.c_str(), 0 ); // 计算转换后字符串的长度。(不包含字符串结束符) if(iLen>0) { char *lpsz = new char[iLen + 1]; int i = wcstombs( lpsz, sToMatch.c_str(), iLen ); // 转换。(没有结束符) lpsz[iLen] = '\0'; sResult.assign(lpsz); delete []lpsz; } else { sResult=""; } return sResult; } /************************************************************ * @brief <funcName:> Author:刘禹 finallyly * 从指定编码转换到目标编码 ================================================== * @param toCode ================================================== * @param fromCode ================================================== * @param srcstr ================================================== * @param deststr ================================================== * @param srclen ================================================== * @param destlen ================================================== * @return **********************************************************/ int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen) { iconv_t convertor=iconv_open(toCode,fromCode); size_t inputsize; size_t outputsize; size_t oldoutputsize; char *input, *inputold; char *output=NULL; char *outputold=NULL; int flag=0; if(convertor==iconv_t(-1)) { fprintf(stderr,"convertor device initailization failed!\n"); return 1; } else { inputsize=srclen; input=new char[inputsize+1]; memcpy(input,srcstr,inputsize); input[inputsize]='\0'; inputold=input; outputsize=inputsize*5; oldoutputsize=outputsize; output=new char[outputsize]; output[0]=0; outputold=output; size_t rc = iconv(convertor,&input,&inputsize,&output,&outputsize); memcpy(deststr,outputold,oldoutputsize-outputsize); deststr[destlen]=0; destlen=oldoutputsize-outputsize; if(rc>0) { flag=1; } delete []inputold; delete []outputold; } iconv_close(convertor); if(flag==1) { return 0; } else { return 1; } } /************************************************************ * @brief 
<funcName:PrintUsage> Author:刘禹 finallyly 20130424 ================================================== **********************************************************/ void PrintUsage() { fprintf( stderr, "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n" ); } /* void PrintError(char error_text[]) { fprintf(stderr,"liuyusi0121 lib run-time error...\n"); fprintf(stderr,"%s\n",error_text); fprintf(stderr,"...now exiting to system...\n"); exit(1); } */ /************************************************************ * @brief <funcName:> Author:刘禹 finallyly ================================================== * @param encodeingfield ================================================== * * @return **********************************************************/ string ParseHtmlEncoding(string encodingfield) { char tempchar[3000]={0}; int i=0; string itsencoding=""; strcpy(tempchar,encodingfield.c_str()); while(tempchar[i]) { tempchar[i] = tolower(tempchar[i]); i++; } if(strstr(tempchar,"utf8")or strstr(tempchar,"utf-8")) { itsencoding="UTF-8"; } if(strstr(tempchar,"gbk")) { itsencoding="GBK"; } if(strstr(tempchar,"gb2312")) { itsencoding="GB2312"; } if(strstr(tempchar,"gb18030")) { itsencoding="GB18030"; } if(strstr(tempchar,"big5")or strstr(tempchar,"big-5")) { itsencoding="BIG5"; } return itsencoding; } /************************************************************ * @brief <funcName:loadFile> Author:刘禹 finallyly 20130510 * 从搜索部门返回的繁体语料中抽取html网页,只处理gbk,gb2312,gb18030,utf-8,big5五种编码的网页,都转成gbk编码 ================================================== * @param inputfilename 待处理文件 ================================================== * @param outputfilename 输出文件夹 ================================================== ================================================== * @return **********************************************************/ int LoadFile(char* inputfilename,char *outputfilename) { FILE *fout=NULL; FILE *fin=NULL; char line[MAX_LINE_LENGTH]={0}; char 
src[MAX_LINE_LENGTH]={0}; char des[MAX_LINE_LENGTH]={0}; char url[8000]={0}; size_t srclen; size_t deslen=50000; char *p1; char *p2; bool stopflag=false; unsigned long count=0;//每隔10万个数据数据打印一次行号 unsigned long rcount=0;//没两个rcount表明读完了信息文件的头部 unsigned long StoreSize=0; unsigned long OriginalSize=0; unsigned long origArticleCount=0; unsigned long processedArticleCount=0; int linecount=0; //正则用到的变量事先声明 string::const_iterator strit; string::const_iterator strend; wstring::const_iterator wstrit; wstring::const_iterator wstrend; wstring wpattern; wstring rawtext; wstring wresult; string srcstr; string temp; wstring wtemp; string pattern="[,「 」《》# ; ' ' ` ` :!?.。"()@… ~、“”【】]"; //获取网页编码的正则,窄正则 boost::regex reg("<meta[^>]*charset=([^>]+)>",boost::regex::perl|boost::regbase::icase); //获取网页中汉字的正则,宽正则 boost::wregex wreg(L"([\u2E80-\u9FFF]+)",boost::regex::perl|boost::regbase::icase); boost::smatch what; boost::wsmatch wswhat; string charset;//网页编码 wpattern=String2Wstring(pattern); //分割汉字的正则 boost::wregex wreg_split(wpattern); //文件读写 fin=fopen(inputfilename,"rb"); fout=fopen(outputfilename,"w"); if(NULL==fin) { fprintf(stderr,"can not open inputfile:%s\n",inputfilename); return 1; } if(NULL==fout) { fprintf(stderr,"can not open outputfile:%s\n",outputfilename); return 1; } while(true) { do { fgets(line,MAX_LINE_LENGTH,fin); if(feof(fin)) { stopflag=1; break; } size_t len=strlen(line); line[len-1]='\0'; if(line[0]=='\r') { rcount++; } //读取正文URL if(line[0]=='h'&& line[1]=='t' && line[2]=='t' && line[3]=='p' && line[4]==':') { strcpy(url,line); } //读取正文压缩后的数据 if(line[0]=='S' and line[1]=='t' and line[2]=='o' and line[3]=='r'and line[4]=='e') { p1=strtok(line,":"); p2=strtok(NULL,":"); trim(p2); StoreSize=atoi(p2); } //读取正文原文字符数 if(line[0]=='O' and line[1]=='r' and line[2]=='i' and line[3]=='g'and line[4]=='i') { p1=strtok(line,":"); p2=strtok(NULL,":"); trim(p2); OriginalSize=atoi(p2); } linecount++; count++; if(0==count%100000) { fprintf(stderr,"count=%lu\n",count); } 
}while(rcount!=2); rcount=0; if(stopflag) { break; } //每读到开头为'\r'两次,表明已经读完了头部,开始抽取html文件 origArticleCount++; if(OriginalSize+1<MAX_LINE_LENGTH) { fread(src,sizeof(char),(OriginalSize+1),fin); } else { fseek(fin,OriginalSize+1,SEEK_CUR); } if(feof(fin)) { break; } srcstr=""; srcstr.assign(src); strit=srcstr.begin(); strend=srcstr.end(); //获得网页编码 if(boost::regex_search(strit,strend,what,reg)) { temp=what[1]; } //判断用正则抽取到的编码是否有效 //printf("%s\n",temp.c_str()); charset=ParseHtmlEncoding(temp); if(charset.size()>10 or charset=="") { continue; } srclen=strlen(src); //需要进行编码转换 if(charset=="UTF-8"or charset=="UTF8" or charset=="BIG5" or charset=="BIG-5" or charset=="GB18030") { if(srclen==0) { continue; } if(0==toAnotherCode("GBK//IGNORE",charset.c_str(),src,des,srclen,deslen)) { src[0]=0;//重新初始化,后续留作其他用途 srcstr=""; srcstr.assign(des);//注意要对转成GBK后的网页文件进行处理 des[0]=0;//重新初始化,后续留作其他用途 rawtext=String2Wstring(srcstr); if(rawtext==L"") { continue; } wstrit=rawtext.begin(); wstrend=rawtext.end(); //只取汉字 while(boost::regex_search(wstrit,wstrend,wswhat,wreg)) { wtemp=wswhat[1]; //按指定分割符号将汉字切割 boost::wsregex_token_iterator sentences(wtemp.begin(), wtemp.end(), wreg_split, -1); boost::wsregex_token_iterator sentend; for(boost::wsregex_token_iterator begin=sentences;begin!=sentend;begin++) { wresult=*begin; temp=Wstring2String(wresult); if(temp!="") { strcat(des,temp.c_str()); strcat(des,"\n"); } } wstrit=wswhat[0].second; } //输出URL,保存URL的原因,台湾正体和香港繁体不一样 fprintf(fout,"%s\n",url); fprintf(fout,des); des[0]=0; url[0]=0; processedArticleCount++; } } //不需要进行编码转换 if(charset=="GBK") { memcpy(des,src,srclen); des[srclen]=0; src[0]=0;//重新初始化,后续留作其他用途 srcstr=""; srcstr.assign(des);//注意要对转成GBK后的网页文件进行处理 des[0]=0;//重新初始化,后续留作其他用途 rawtext=String2Wstring(srcstr); wstrit=rawtext.begin(); wstrend=rawtext.end(); //只取汉字 while(boost::regex_search(wstrit,wstrend,wswhat,wreg)) { wtemp=wswhat[1]; //按指定分割符号将汉字切割 boost::wsregex_token_iterator sentences(wtemp.begin(), wtemp.end(), wreg_split, -1); 
boost::wsregex_token_iterator sentend; for(boost::wsregex_token_iterator begin=sentences;begin!=sentend;begin++) { wresult=*begin; temp=Wstring2String(wresult); if(temp!="") { strcat(des,temp.c_str()); strcat(des,"\n"); } } wstrit=wswhat[0].second; } //输出URL,保存URL的原因,台湾正体和香港繁体不一样 fprintf(fout,"%s\n",url); fprintf(fout,des); des[0]=0; url[0]=0; processedArticleCount++; } //读取了一个网页数据,状态被重置: linecount=0; count++; if(0==count%1000000) { fprintf(stderr,"count=%lu\n",count); } fprintf(stdout,"原始文章数%lu\t处理后的文章数%lu\n",origArticleCount,processedArticleCount); } fprintf(stdout,"原始文章数%lu\t处理后的文章数%lu\n",origArticleCount,processedArticleCount); if(NULL!=fin) { fclose(fin); fin=NULL; } if(NULL!=fout) { fclose(fout); fout=NULL; } return 0; } int main( int argc, char *argv[] ) { timeval tv1, tv2; gettimeofday(&tv1, NULL); if ( 3 != argc ) { PrintUsage(); return 1; } LoadFile(argv[1],argv[2]); gettimeofday(&tv2, NULL); fprintf(stderr,"%s has finished congratulations!\n",argv[0]); fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000); return 0; }
功能代码说明:
toAnotherCode 是进行编码转换的函数
LoadFile先用boost正则从网页的meta字段中取出charset的内容,再由ParseHtmlEncoding把它规范化为编码名称;我们把BIG5、UTF-8、GB18030等编码的网页通通转成GBK。
String2Wstring和Wstring2String实现宽窄字符转换,具体而言是将GBK编码转换成UNICODE编码,或者将UNICODE编码转成GBK。我的linux服务器的汉字默认编码是GBK。
主要的功能函数LoadFile
另外的一些函数本项目中没有使用,以前的项目中使用过。其实就是从html源码中,按照指定的tag完成内容抽取。
再贴一下我的makefile.am
bin_PROGRAMS+=ExtractTradition
INCLUDES=-I /home/liuyu/MyTars/boost_1_53_0/libs/regex/src
ExtractTradition_SOURCES=ExtractTradition.cpp winstances.cpp wide_posix_api.cpp wc_regex_traits.cpp w32_regex_traits.cpp usinstances.cpp static_mutex.cpp regex_traits_defaults.cpp regex_raw_buffer.cpp regex_debug.cpp regex.cpp posix_api.cpp instances.cpp icu.cpp cpp_regex_traits.cpp cregex.cpp c_regex_traits.cpp fileiter.cpp
ExtractTradition_LDADD= -lACE -lm -liconv
ExtractTradition_LDFLAGS=-static-libtool-libs
注意:SOURCES字段中的若干cpp是boost的libs/regex/src目录下面的cpp,这里采用的是boost_regex的源码编译方式。以动态链接库的形式链接,在我的电脑上一直失败。
附图见下。