DocSegment.cpp

  1 /*
  2  * DocSegment.cpp
  3  * Created on: 2011-11-10
  4  *   function:分析网页算法的实现
  5  *   将原始网页库中存储的网页转化为
  6  *   一组词的集合.
  7  */
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "Md5.h"
#include "Url.h"
#include "Document.h"
#include "ChSeg/Dict.h"
#include "ChSeg/HzSeg.h"
#include "StrFun.h"
 20 
 21 CDict iDict;
 22 
 23 using namespace std;
 24 
 25 const unsigned int HEADER_BUF_SIZE = 1024;//记录头和网页头信息的最大长度
 26 //const unsigned int MAX_DOC_ID = 12932;        // you should change according "Doc.idx"
 27 const unsigned int MAX_DOC_ID = 767;
 28 //所要处理的原始网页库中文档的个数
 29 //不同的原始网页库文档数不同,这个值
 30 //需要更改,可以通过URL索引文件[Url.idx]得到
 31 
 32 int main(int argc, char* argv[])
 33 {
 34     string strLine, strFileName=argv[1];
 35     CUrl iUrl;
 36     vector<CUrl> vecCUrl;//为什么不是map
 37     CDocument iDocument;
 38     vector<CDocument> vecCDocument;//vector容器保存文档对象
 39     unsigned int docId = 0;
 40 
 41     //ifstream ifs("Tianwang.raw.2559638448");
 42     ifstream ifs(strFileName.c_str());//文档对象
 43 
 44     if (!ifs) {
 45         cerr << "Cannot open tianwang.img.info for input\n";
 46         return -1;
 47     }
 48 
 49     ifstream ifsUrl("Url.idx.sort_uniq");
 50     if (!ifsUrl) {
 51         cerr << "Cannot open Url.idx.sort_uniq for input\n";
 52         return -1;
 53     }
 54     ifstream ifsDoc("Doc.idx");
 55     if (!ifsDoc) {
 56         cout<<"不能打开网页索引文件"<<endl;
 57         cerr << "Cannot open Doc.idx for input\n";
 58         return -1;
 59     }
 60 
 61     while (getline(ifsUrl, strLine)) {//读入md5到id的映射
 62         char chksum[33];
 63         int  docid;
 64 
 65         memset(chksum, 0, 33);
 66         //cout<<strLine.c_str()<<endl<<endl;
 67         sscanf( strLine.c_str(), "%s%d", chksum, &docid );
 68         //cout<<strLine.c_str();
 69         iUrl.m_sChecksum = chksum;
 70         iUrl.m_nDocId = docid;
 71         vecCUrl.push_back(iUrl);
 72     }
 73 
 74     while (getline(ifsDoc,strLine)){
 75         /* docid:文档编号
 76          * pos:偏移
 77          * length:好像和DocIndex的版本有点问题*/
 78 
 79         int docid,pos,length;
 80         char chksum[33];
 81 
 82         //cout<<strLine<<endl<<endl;
 83 
 84         memset(chksum, 0, 33);
 85         sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
 86         //cout<<endl<<docid<< pos<<length<<endl;
 87         iDocument.m_nDocId = docid;
 88         iDocument.m_nPos = pos;
 89         iDocument.m_nLength = length;
 90         iDocument.m_sChecksum = chksum;//网页的MD5
 91         vecCDocument.push_back(iDocument);
 92     }
 93 
 94 
 95 
 96     //保存网页分析的结果
 97     strFileName += ".seg";
 98     ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
 99     for ( docId=0; docId<MAX_DOC_ID; docId++ ){
100 
101         // find document according to docId
102         int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;//当前网页文件的长度
103         char *pContent = new char[length+1];//记录内容
104         memset(pContent, 0, length+1);
105         ifs.seekg(vecCDocument[docId].m_nPos);//移动读取位置
106         ifs.read(pContent, length);
107 
108         char *s;
109         s = pContent;
110 
111         // 过滤记录头
112         int bytesRead = 0,newlines = 0;
113         while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
114             if (*s == '\n')
115                 newlines++;
116             else
117                 newlines = 0;
118             s++;
119             bytesRead++;
120         }
121         if (bytesRead == HEADER_BUF_SIZE-1) continue;
122 
123 
124         // 过滤网页头部信息
125         bytesRead = 0,newlines = 0;
126         while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
127             if (*s == '\n')
128                 newlines++;
129             else
130                 newlines = 0;
131             s++;
132             bytesRead++;
133         }
134         if (bytesRead == HEADER_BUF_SIZE-1) continue;//一般没有1024??
135 
136         //iDocument.m_sBody = s;
137         //过滤网页体的正文信息
138         iDocument.RemoveTags(s);
139         iDocument.m_sBodyNoTags = s;
140 
141         delete[] pContent;
142         string strLine = iDocument.m_sBodyNoTags;
143 
144         CStrFun::ReplaceStr(strLine, "&nbsp;", " ");//将网页体正文中的"&nbsp"替换成" "
145         CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
146 
147 
148         //将网页体正文信息进行分词
149         CHzSeg iHzSeg;
150         //cout<<strLine<<endl<<endl;
151         strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
152         cout<<docId<<";";
153         cout<<strLine;
154         fout << docId << endl << strLine;
155         fout << endl;
156         
157     }
158 
159     return(0);
160 }

posted on 2012-07-14 13:37  kakamilan  阅读(329)  评论(0编辑  收藏  举报

导航