需求:从格式化数据中抽取html网页,并抽取网页中的全部汉字内容。需要的开源库:iconv、boost。

数据格式如下:用两个匹配 ^\r\n 的行(即以\r\n开头的空行)来确定头部的结束位置,之后是html网页。其中第一个头部Store-Size部分保存网页的字节数。

/*===============================================================
*   Copyright (C) 2013 All rights reserved.
*   
*   文件名称:ProcessWeiboCorpora.cpp
*   创 建 者:刘禹 finallyly 
*   创建日期:2013年04月24日
*   描    述:从信息文件中找到html源码,抽取网页源码,找到网页文件编码,转换成GBK,从网页中取出全部的汉字过滤掉其他文本,将汉字串按照标点切割
*   备    注: 
*   更新日志:
*
================================================================*/
#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <algorithm>
#include <string>
#include <vector>

#include <zlib.h>
#include <boost/regex.hpp>
#include </usr/local/include/iconv.h>
// please add your code here!
using namespace std;
#define MAX_LINE_LENGTH 1048576
#define TAGLEN 50
/************************************************************
* @brief <funcName:trim> Author: 刘禹 finallyly 20130425
* Strips leading and trailing whitespace from a NUL-terminated string in place.
==================================================
* @param s  string to trim; modified in place. A NULL pointer is ignored.
*           A string that is empty or consists only of whitespace becomes "".
==================================================
**********************************************************/
void trim(char *s)
{
    if (NULL == s)
    {
        return;
    }
    const size_t len = strlen(s);

    // Index of the first character that is not whitespace.
    // The cast to unsigned char keeps isspace() defined for bytes >= 0x80
    // (e.g. GBK lead/trail bytes), which are negative as plain char.
    size_t first = 0;
    while (first < len && isspace((unsigned char)s[first]))
    {
        ++first;
    }

    // One past the last character that is not whitespace. Never steps below
    // 'first', so an empty string never reads s[-1].
    size_t last = len;
    while (last > first && isspace((unsigned char)s[last - 1]))
    {
        --last;
    }

    const size_t kept = last - first;
    if (first > 0 && kept > 0)
    {
        memmove(s, s + first, kept);
    }
    s[kept] = '\0';
}

/**
 * Lower-cases the ASCII letters 'A'-'Z' of a NUL-terminated string in place.
 *
 * @param str  string to convert; may be NULL.
 * @return false when str is NULL or when the string contains any character
 *         that is not an ASCII letter; true otherwise (including for "").
 *         The whole string is always processed, even after a non-letter
 *         has been seen.
 */
inline bool strTolower( char* str )
{
    if (str == NULL)
    {
        return false;
    }

    bool onlyLetters = true;
    for (char *cursor = str; *cursor != '\0'; ++cursor)
    {
        const char ch = *cursor;
        const bool isUpper = (ch >= 'A' && ch <= 'Z');
        const bool isLower = (ch >= 'a' && ch <= 'z');
        if (isUpper)
        {
            // 'a' - 'A' == 32 in ASCII
            *cursor = static_cast<char>(ch + 32);
        }
        else if (!isLower)
        {
            onlyLetters = false;
        }
    }
    return onlyLetters;
}
/************************************************************
* @brief <funcName:UnCompressData> Author:刘禹 finallyly 20130424
* 对指定字符串调用zlib函数进行解压
==================================================
* @param input 待解压的字符串
==================================================
* @param output 解压后的字符串
==================================================
* @return 
**********************************************************/

int UnCompressData(char * SourceBuffer, char * DestBuffer,int &SourceBufferLength, int  &DestBufferLength)
{
    int err=uncompress((Bytef*)DestBuffer,(uLongf*)&DestBufferLength,(const Bytef*)SourceBuffer,(uLongf)SourceBufferLength);
    if(Z_OK!=err)
    {
        fprintf(stderr,"解压失败\t%d\n",err);
        return 1;
    }
    else
    {
        return 0;
    }
}

/************************************************************
* @brief <funcName:CompressData> Author:刘禹 finallyly 20130424 压缩文件
==================================================
* @param SourceBuffer
==================================================
* @param DestBuffer
==================================================
* @param SourceBufferLength
==================================================
* @param DestBufferLength
==================================================
* @return 
**********************************************************/
int CompressData(char * SourceBuffer, char * DestBuffer,unsigned long &SourceBufferLength, unsigned long &DestBufferLength)
{
    int err=compress((Bytef*)DestBuffer,(uLongf*)&DestBufferLength,(const Bytef*)SourceBuffer,(uLongf)SourceBufferLength);
    if(Z_OK!=err)
    {
        fprintf(stderr,"压缩失败\n");
        return 1;
    }
    else
    {
        return 0;
    }
}

/************************************************************
* @brief <funcName:strlenEx> Author: 刘禹 20121219
* strlen() counterpart for unsigned char strings.
==================================================
* @param Tag  0-terminated byte string; may be NULL
==================================================
* @return number of bytes before the terminating 0; 0 when Tag is NULL
**********************************************************/
int strlenEx( const unsigned char *Tag)
{
    int count=0;
    if(Tag!=NULL)
    {
        while(Tag[count]!='\0')
        {
            ++count;
        }
    }
    return count;
}
/************************************************************
* @brief <funcName:IsEqual> Author: 刘禹 20121219
* Tells whether two 0-terminated tags are byte-for-byte identical.
==================================================
* @param Tag      the tag that was read
==================================================
* @param TagCand  the template tag to compare against
==================================================
* @return true when both tags have the same length and the same bytes
*         (two empty tags are equal); false otherwise, and false whenever
*         either pointer is NULL
**********************************************************/
bool IsEqual(unsigned char *Tag, const unsigned char *TagCand)
{
    if(NULL==Tag || NULL==TagCand)
    {
        return false;
    }
    // strcmp() compares the bytes as unsigned char and stops at the first
    // difference or at the terminator, so it covers the length check too.
    return 0==strcmp((const char*)Tag,(const char*)TagCand);
}
/************************************************************
* @brief <funcName:FormatTags> Author: 刘禹 20121219
* Character-at-a-time tag recogniser: feeds one input byte into the tag
* that is currently being assembled in Tag and updates the state in nType.
*
* State transitions:
*   '<'                     -> Tag restarts as "<", nType = 0
*   '>' while a tag is open -> '>' is appended, nType = 1 (tag complete)
*   '>' with no open tag    -> Tag is cleared, nType = -1
*   other byte, nType == 0  -> byte is appended to Tag
*   other byte otherwise    -> Tag is cleared, nType = -1
* A tag that would no longer fit into the buffer is dropped
* (Tag cleared, nType = -1) instead of being written past the end.
==================================================
* @param c            [IN] the input byte
==================================================
* @param Tag          [INOUT] 0-terminated buffer holding the tag built so far
==================================================
* @param nType        [INOUT] starts at -1 (no tag in progress);
*                     0 = a tag is being built; 1 = a complete tag is in Tag.
*                     The caller is expected to reset Tag/nType after
*                     consuming a complete tag.
==================================================
* @param tagCapacity  [IN] size of the Tag buffer in bytes, including the
*                     terminator; defaults to TAGLEN
==================================================
* @return 0 on success, 1 when Tag is NULL or tagCapacity is too small to
*         hold even "<>"
**********************************************************/
#ifndef TAGLEN
#define TAGLEN 50 /* fallback only; normally defined at the top of this file */
#endif
int FormatTags(unsigned char c, unsigned char * Tag,int &nType, int tagCapacity = TAGLEN)
{
    if(NULL==Tag || tagCapacity<3)
    {
        nType=-1;
        return 1;
    }

    const int len=(int)strlen((const char*)Tag);
    // Appending one byte writes Tag[len] and the terminator at Tag[len+1].
    const bool hasRoom=(len+1<tagCapacity);

    if(c=='<')
    {
        // Start of a tag: always restart from scratch, even when a previous
        // tag was still open, so that "<a<b>" yields "<b>".
        Tag[0]='<';
        Tag[1]=0;
        nType=0;
    }
    else if(c=='>' && len>0 && nType!=-1 && hasRoom)
    {
        // End of an open tag.
        Tag[len]=c;
        Tag[len+1]=0;
        nType=1;
    }
    else if(c!='>' && 0==nType && hasRoom)
    {
        // Inside a tag: keep collecting its bytes.
        Tag[len]=c;
        Tag[len+1]=0;
    }
    else
    {
        // Stray '>', a byte outside of any tag, or a tag that is too long
        // for the buffer: start looking for the next tag.
        Tag[0]=0;
        nType=-1;
    }
    return 0;
}
/************************************************************
* @brief <funcName:GetMainText> Author: 刘禹 finallyly 20130501
* Scans an XML-like document and writes the bytes found between each
* "<http>" start tag and the matching "</http>" end tag to fout.
*
* The document is only processed when it contains the literal text
* encoding="gb18030".
*
* NOTE(review): as in the original implementation, each emitted span starts
* with the '>' that closes the start tag and ends with the text "</http" of
* the end tag, because collection begins on the byte that completes the start
* tag and stops on the byte that completes the end tag. Confirm with the
* consumers of the output before changing this.
==================================================
* @param inputstr  0-terminated document text
==================================================
* @param fout      stream that receives the extracted spans
==================================================
* @return 0 when the document was scanned; 1 when an argument is NULL or the
*         document does not declare encoding="gb18030"
**********************************************************/
int GetMainText(const char *inputstr,FILE *fout)
{
    static const unsigned char startTag1[]="<http>";
    static const unsigned char endTag1[]="</http>";

    if(NULL==inputstr || NULL==fout)
    {
        return 1;
    }
    // Only gb18030 documents are handled.
    if(NULL==strstr(inputstr,"encoding=\"gb18030\""))
    {
        return 1;
    }

    unsigned char Tag[TAGLEN]={0};
    int nType=-1;           // tag recogniser state, see FormatTags
    bool Writing=false;     // true while between a start tag and its end tag
    std::string text;       // bytes collected for the current span

    for(const unsigned char *cursor=(const unsigned char *)inputstr;*cursor!='\0';++cursor)
    {
        const unsigned char c=*cursor;

        // FormatTags may append one byte plus a terminator to Tag; drop a tag
        // that has grown to the point where that would no longer fit.
        if(strlenEx(Tag)>=TAGLEN-1)
        {
            Tag[0]=0;
            nType=-1;
        }
        FormatTags(c,Tag,nType);

        bool endWriting=false;
        if(1==nType)// a complete tag has been recognised
        {
            if(!Writing)
            {
                Writing=IsEqual(Tag,startTag1);
            }
            endWriting=IsEqual(Tag,endTag1);
            Tag[0]=0;
            nType=-1;
        }

        if(endWriting)
        {
            // End tag reached: flush what was collected and start over.
            if(!text.empty())
            {
                fwrite(text.data(),1,text.size(),fout);
            }
            text.clear();
            Writing=false;
        }
        if(Writing)
        {
            text.push_back((char)c);
        }
    }
    return 0;
}

/************************************************************
* @brief <funcName:String2Wstring> Author: 刘禹 finallyly
* Converts a multibyte string in the encoding of the process locale
* (per the author, GBK on the original machine) to a wide string using mbstowcs().
*
* Side effect: calls setlocale(LC_CTYPE, "") so that the conversion uses the
* locale from the environment; the conversion fails without it.
==================================================
* @param sToMatch  multibyte text; only the part up to the first NUL byte is
*                  converted
==================================================
* @return the wide-character text, or an empty string when the input is empty
*         or is not valid in the current locale
**********************************************************/
std::wstring String2Wstring(std::string sToMatch)
{
    setlocale( LC_CTYPE, "" );

    // First pass: number of wide characters needed (terminator not included).
    const size_t wideLen = mbstowcs( NULL, sToMatch.c_str(), 0 );
    if(wideLen==(size_t)-1 || wideLen==0)
    {
        return std::wstring();
    }

    // Second pass: convert straight into the result. The limit passed to
    // mbstowcs() is a count of wide characters, not of input bytes, and no
    // terminator is written once that limit is reached, so the result length
    // is taken from the return value rather than from a terminator.
    std::wstring wide(wideLen, L'\0');
    const size_t written = mbstowcs( &wide[0], sToMatch.c_str(), wideLen );
    if(written==(size_t)-1)
    {
        return std::wstring();
    }
    wide.resize(written);
    return wide;
}
/************************************************************
* @brief <funcName:Wstring2String> Author: 刘禹 finallyly
* Converts a wide string to a multibyte string with wcstombs(), i.e. into
* the encoding of the current LC_CTYPE locale; used for output.
==================================================
* @param sToMatch  wide text to convert
==================================================
* @return the multibyte text, or an empty string when the input is empty or
*         wcstombs() reports that it cannot be converted
**********************************************************/
string Wstring2String(wstring sToMatch)
{
    const wchar_t *wide = sToMatch.c_str();

    // Byte count of the converted text, without the terminator.
    const int needed = wcstombs( NULL, wide, 0 );
    if(needed<=0)
    {
        return string("");
    }

    char *narrow = new char[needed + 1];
    wcstombs( narrow, wide, needed ); // writes no terminator when the limit is reached
    narrow[needed] = '\0';
    string sResult(narrow);
    delete []narrow;
    return sResult;
}
/************************************************************
* @brief <funcName:toAnotherCode> Author: 刘禹 finallyly
* Converts a byte buffer from one character encoding to another with iconv.
==================================================
* @param toCode   iconv name of the target encoding (may carry a suffix such
*                 as "//IGNORE")
==================================================
* @param fromCode iconv name of the source encoding
==================================================
* @param srcstr   input bytes; not modified
==================================================
* @param deststr  receives the converted bytes followed by a 0 terminator.
*                 The caller must provide enough room for the converted text
*                 plus the terminator; at most srclen*5+1 bytes are written.
==================================================
* @param srclen   number of input bytes
==================================================
* @param destlen  [OUT] number of converted bytes written to deststr, not
*                 counting the terminator. The incoming value is not used.
==================================================
* @return 0 on success, 1 on failure.
*         An invalid or truncated input sequence (errno EILSEQ / EINVAL) is
*         treated as success as long as some output was produced, because
*         iconv can report EILSEQ even when "//IGNORE" merely skipped
*         characters; deststr then holds what was converted. A failure to
*         open the converter or an exhausted output buffer is a failure.
**********************************************************/
int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
{
    if(NULL==toCode || NULL==fromCode || NULL==srcstr || NULL==deststr)
    {
        return 1;
    }

    iconv_t convertor=iconv_open(toCode,fromCode);
    if(convertor==(iconv_t)(-1))
    {
        fprintf(stderr,"convertor device initailization failed!\n");
        return 1;
    }

    // Scratch output buffer sized for the worst case assumed by this file.
    const size_t capacity=srclen*5;
    char *scratch=new char[capacity+1];

    // iconv() advances these cursors and decrements the counters.
    char *in=srcstr;
    size_t inLeft=srclen;
    char *out=scratch;
    size_t outLeft=capacity;

    errno=0;
    const size_t rc=iconv(convertor,&in,&inLeft,&out,&outLeft);
    // iconv() returns (size_t)-1 on error; any other value is the number of
    // non-reversible conversions and means the whole input was consumed.
    const int convErr=(rc==(size_t)-1)?errno:0;
    if(0==convErr)
    {
        // Emit any pending shift sequence of a stateful target encoding.
        iconv(convertor,NULL,NULL,&out,&outLeft);
    }
    iconv_close(convertor);

    const size_t produced=capacity-outLeft;
    memcpy(deststr,scratch,produced);
    deststr[produced]=0;
    destlen=produced;
    delete []scratch;

    if(0==convErr)
    {
        return 0;
    }
    if((EILSEQ==convErr || EINVAL==convErr) && produced>0)
    {
        // Best effort: keep what could be converted.
        return 0;
    }
    return 1;
}
/************************************************************
* @brief <funcName:PrintUsage> Author: 刘禹 finallyly 20130424
* Prints the command-line synopsis to stderr. The program takes exactly two
* arguments: the input file and the output file.
==================================================
**********************************************************/
void PrintUsage()
{
    fprintf( stderr, "prog [IN]input_file [OUT]output_file\n" );
}
/*
void PrintError(char error_text[])
{
    fprintf(stderr,"liuyusi0121 lib run-time error...\n");     
    fprintf(stderr,"%s\n",error_text);
    fprintf(stderr,"...now exiting to system...\n");
    exit(1);                     

}
*/
/************************************************************
* @brief <funcName:ParseHtmlEncoding> Author: 刘禹 finallyly
* Maps the text captured after "charset=" in an HTML meta tag to one of the
* encoding names handled by this program.
*
* The match is a case-insensitive substring search, so surrounding quotes or
* other attribute text do not matter. When several names occur, the one that
* appears latest in the table below wins (same precedence as before:
* UTF-8 < GBK < GB2312 < GB18030 < BIG5).
==================================================
* @param encodingfield  raw charset text; only the part up to the first NUL
*                       byte is examined. Any length is accepted.
==================================================
* @return "UTF-8", "GBK", "GB2312", "GB18030" or "BIG5"; an empty string when
*         none of them is recognised
**********************************************************/
std::string ParseHtmlEncoding(std::string encodingfield)
{
    // Lower-cased copy of the C-string part of the field.
    std::string lowered(encodingfield.c_str());
    for(size_t i=0;i<lowered.size();++i)
    {
        // The cast keeps tolower() defined for bytes >= 0x80.
        lowered[i]=(char)tolower((unsigned char)lowered[i]);
    }

    struct Alias
    {
        const char *needle; // lower-case spelling to look for
        const char *name;   // canonical name to report
    };
    static const Alias kAliases[]=
    {
        {"utf8","UTF-8"},
        {"utf-8","UTF-8"},
        {"gbk","GBK"},
        {"gb2312","GB2312"},
        {"gb18030","GB18030"},
        {"big5","BIG5"},
        {"big-5","BIG5"}
    };

    std::string itsencoding;
    for(size_t i=0;i<sizeof(kAliases)/sizeof(kAliases[0]);++i)
    {
        if(lowered.find(kAliases[i].needle)!=std::string::npos)
        {
            itsencoding=kAliases[i].name; // later entries override earlier ones
        }
    }
    return itsencoding;
}

/************************************************************
* @brief <funcName:loadFile> Author:刘禹 finallyly 20130510 
* 从搜索部门返回的繁体语料中抽取html网页,只处理gbk,gb2312,gb18030,utf-8,big5五种编码的网页,都转成gbk编码
==================================================
* @param inputfilename 待处理文件
==================================================
* @param outputfilename 输出文件夹
==================================================
==================================================
* @return 
**********************************************************/
int LoadFile(char* inputfilename,char *outputfilename)
{
    FILE *fout=NULL;
    FILE *fin=NULL;
    char line[MAX_LINE_LENGTH]={0};
    char src[MAX_LINE_LENGTH]={0};
    char des[MAX_LINE_LENGTH]={0};
    char url[8000]={0};
    size_t srclen;
    size_t deslen=50000;
    char *p1;
    char *p2;
    bool stopflag=false;

    unsigned long  count=0;//每隔10万个数据数据打印一次行号
    unsigned long rcount=0;//没两个rcount表明读完了信息文件的头部
    unsigned long StoreSize=0;
    unsigned long OriginalSize=0;
    unsigned long origArticleCount=0;
    unsigned long processedArticleCount=0;
    int linecount=0;

    //正则用到的变量事先声明
    string::const_iterator strit;
    string::const_iterator strend;
    wstring::const_iterator wstrit;
    wstring::const_iterator wstrend;
    wstring wpattern;
    wstring rawtext;
    wstring wresult;
    string srcstr;
    string temp;
    wstring wtemp;
    string pattern="[,「 」《》# ; ' ' ` ` :!?.。"()@… ~、“”【】]";
//获取网页编码的正则,窄正则
    boost::regex reg("<meta[^>]*charset=([^>]+)>",boost::regex::perl|boost::regbase::icase);
    //获取网页中汉字的正则,宽正则
    boost::wregex wreg(L"([\u2E80-\u9FFF]+)",boost::regex::perl|boost::regbase::icase);
    boost::smatch what;
    boost::wsmatch wswhat;
    string charset;//网页编码
    wpattern=String2Wstring(pattern);
    //分割汉字的正则
    boost::wregex wreg_split(wpattern);
    //文件读写
    fin=fopen(inputfilename,"rb");
    fout=fopen(outputfilename,"w");
    if(NULL==fin)
    {
        fprintf(stderr,"can not open inputfile:%s\n",inputfilename);
        return 1;
    }
    if(NULL==fout)
    {
        fprintf(stderr,"can not open outputfile:%s\n",outputfilename);
        return 1;
    }

    while(true)
    {
        do
        {
            fgets(line,MAX_LINE_LENGTH,fin);
            if(feof(fin))
            {
                stopflag=1;
                break;
            }
            size_t len=strlen(line);
            line[len-1]='\0';
            if(line[0]=='\r')
            {
                rcount++;
            }
            //读取正文URL
            if(line[0]=='h'&& line[1]=='t' && line[2]=='t' && line[3]=='p' && line[4]==':')
            {
                strcpy(url,line);
            }
            //读取正文压缩后的数据
            if(line[0]=='S' and line[1]=='t' and line[2]=='o' and line[3]=='r'and line[4]=='e')
            {
                p1=strtok(line,":");
                p2=strtok(NULL,":");
                trim(p2);
                StoreSize=atoi(p2);
            }
            //读取正文原文字符数
            if(line[0]=='O' and line[1]=='r' and line[2]=='i' and line[3]=='g'and line[4]=='i')
            {
                p1=strtok(line,":");
                p2=strtok(NULL,":");
                trim(p2);
                OriginalSize=atoi(p2);
            }
            linecount++;
            count++;
            if(0==count%100000)
            {
                fprintf(stderr,"count=%lu\n",count);
            }
        }while(rcount!=2);
        rcount=0;
        if(stopflag)
        {
            break;
        }
        //每读到开头为'\r'两次,表明已经读完了头部,开始抽取html文件
        origArticleCount++;
        if(OriginalSize+1<MAX_LINE_LENGTH)
        {
            fread(src,sizeof(char),(OriginalSize+1),fin);
        }
        else
        {
            fseek(fin,OriginalSize+1,SEEK_CUR);
        }
        if(feof(fin))
        {
            break;
        }
        srcstr="";
        srcstr.assign(src);
        strit=srcstr.begin();
        strend=srcstr.end();
        //获得网页编码
        if(boost::regex_search(strit,strend,what,reg))
        {
            temp=what[1];
        }
        //判断用正则抽取到的编码是否有效
        //printf("%s\n",temp.c_str());
        charset=ParseHtmlEncoding(temp);
        if(charset.size()>10 or charset=="")
        {
            continue;
        }
        srclen=strlen(src);
        //需要进行编码转换
        if(charset=="UTF-8"or charset=="UTF8" or charset=="BIG5" or charset=="BIG-5" or charset=="GB18030")
        {
            if(srclen==0)
            {
                continue;
            }
            
            if(0==toAnotherCode("GBK//IGNORE",charset.c_str(),src,des,srclen,deslen))
            {
                src[0]=0;//重新初始化,后续留作其他用途
                srcstr="";
                srcstr.assign(des);//注意要对转成GBK后的网页文件进行处理
                des[0]=0;//重新初始化,后续留作其他用途
                rawtext=String2Wstring(srcstr);
                if(rawtext==L"")
                {
                    continue;
                }
                wstrit=rawtext.begin();
                wstrend=rawtext.end();
                //只取汉字
                while(boost::regex_search(wstrit,wstrend,wswhat,wreg))
                {
                    wtemp=wswhat[1];
                    //按指定分割符号将汉字切割 
                    boost::wsregex_token_iterator sentences(wtemp.begin(), wtemp.end(), wreg_split, -1);
                    boost::wsregex_token_iterator sentend;
                    for(boost::wsregex_token_iterator begin=sentences;begin!=sentend;begin++)
                    {
                        wresult=*begin;
                        temp=Wstring2String(wresult);
                        if(temp!="")
                        {
                            strcat(des,temp.c_str());
                            strcat(des,"\n");
                        }
                    }
                    wstrit=wswhat[0].second;
                }
                //输出URL,保存URL的原因,台湾正体和香港繁体不一样
                fprintf(fout,"%s\n",url);
                fprintf(fout,des);
                des[0]=0;
                url[0]=0;
                processedArticleCount++;

            }
        }
        //不需要进行编码转换
        if(charset=="GBK")
        {
            memcpy(des,src,srclen);
            des[srclen]=0;
            src[0]=0;//重新初始化,后续留作其他用途
            srcstr="";
            srcstr.assign(des);//注意要对转成GBK后的网页文件进行处理
            des[0]=0;//重新初始化,后续留作其他用途
            rawtext=String2Wstring(srcstr);
            wstrit=rawtext.begin();
            wstrend=rawtext.end();
            //只取汉字
            while(boost::regex_search(wstrit,wstrend,wswhat,wreg))
            {
                wtemp=wswhat[1];
                //按指定分割符号将汉字切割 
                boost::wsregex_token_iterator sentences(wtemp.begin(), wtemp.end(), wreg_split, -1);
                boost::wsregex_token_iterator sentend;
                for(boost::wsregex_token_iterator begin=sentences;begin!=sentend;begin++)
                {
                    wresult=*begin;
                    temp=Wstring2String(wresult);
                    if(temp!="")
                    {
                        strcat(des,temp.c_str());
                        strcat(des,"\n");
                    }
                }
                wstrit=wswhat[0].second;
            }
            //输出URL,保存URL的原因,台湾正体和香港繁体不一样
            fprintf(fout,"%s\n",url);
            fprintf(fout,des);
            des[0]=0;
            url[0]=0;
            processedArticleCount++;
        }
        
        //读取了一个网页数据,状态被重置:
        linecount=0;
        count++;
        if(0==count%1000000)
        {
            fprintf(stderr,"count=%lu\n",count);
        }
        fprintf(stdout,"原始文章数%lu\t处理后的文章数%lu\n",origArticleCount,processedArticleCount);
    }

    fprintf(stdout,"原始文章数%lu\t处理后的文章数%lu\n",origArticleCount,processedArticleCount);
    if(NULL!=fin)
    {
        fclose(fin);
        fin=NULL;
    }
    if(NULL!=fout)
    {
        fclose(fout);
        fout=NULL;
    }
    return 0;

}
/**
 * Entry point: prog input_file output_file
 *
 * Runs LoadFile() on the two arguments and reports the elapsed wall-clock
 * time on stderr.
 *
 * @return 0 on success, 1 on a usage error or when LoadFile() fails
 */
int main( int argc, char *argv[] )
{
    timeval tv1, tv2;
    gettimeofday(&tv1, NULL);
    if ( 3 != argc )
    {
        PrintUsage();
        return 1;
    }
    const int status = LoadFile(argv[1],argv[2]);
    gettimeofday(&tv2, NULL);
    if ( 0 != status )
    {
        fprintf(stderr,"%s failed\n",argv[0]);
        return 1;
    }
    fprintf(stderr,"%s has finished congratulations!\n",argv[0]);
    // Computed in double: a float cannot hold a microsecond count exactly
    // once the run takes more than a few seconds.
    const double elapsedMs = (tv2.tv_sec - tv1.tv_sec)*1000.0 + (tv2.tv_usec - tv1.tv_usec)/1000.0;
    fprintf( stderr,"time elapsed: %.2f ms\n", elapsedMs);
    return 0;
}
功能代码

功能代码说明:

toAnotherCode 是进行编码转换的函数

LoadFile先用boost正则从网页的meta字段中取出charset的内容,再由ParseHtmlEncoding把它归一化成具体的编码名称(UTF-8、GBK、GB2312、GB18030、BIG5)。我们把BIG5,UTF8,GB2312等编码的网页通通转成GBK。

 String2Wstring和Wstring2String实现宽窄字符转换,具体而言是将GBK编码转换成UNICODE编码,或者将UNICODE编码转成GBK。 我linux服务器的汉字默认编码是GBK。

主要的功能函数LoadFile

另外的一些函数本项目中没有使用,以前的项目中使用过。其实就是从html源码中,按照指定的tag完成内容抽取。


再贴一下我的makefile.am

bin_PROGRAMS+=ExtractTradition
INCLUDES=-I /home/liuyu/MyTars/boost_1_53_0/libs/regex/src
ExtractTradition_SOURCES=ExtractTradition.cpp winstances.cpp wide_posix_api.cpp wc_regex_traits.cpp w32_regex_traits.cpp usinstances.cpp static_mutex.cpp regex_traits_defaults.cpp regex_raw_buffer.cpp regex_debug.cpp regex.cpp posix_api.cpp instances.cpp icu.cpp  cpp_regex_traits.cpp cregex.cpp  c_regex_traits.cpp fileiter.cpp

 ExtractTradition_LDADD= -lACE -lm  -liconv                                                                        
 ExtractTradition_LDFLAGS=-static-libtool-libs 


 注意:ExtractTradition_SOURCES字段中除ExtractTradition.cpp以外的若干cpp,是boost的libs/regex/src下面的cpp,这里采用的是把boost_regex的源码一起编译的方式。以动态链接库的形式链接,在我的电脑上一直失败。

 

 

 

附图见下。

posted on 2013-05-28 09:44  finallyly  阅读(489)  评论(0)  编辑  收藏  举报