1 /*
2 * DocSegment.cpp
3 * Created on: 2011-11-10
4 * function:分析网页算法的实现
5 * 将原始网页库中存储的网页转化为
6 * 一组词的集合.
7 */
8 #include <algorithm>
9 #include <fstream>
10 #include <iostream>
11 #include <map>
12 #include <vector>
13
14 #include "Md5.h"
15 #include "Url.h"
16 #include "Document.h"
17 #include "ChSeg/Dict.h"
18 #include "ChSeg/HzSeg.h"
19 #include "StrFun.h"
20
21 CDict iDict;
22
23 using namespace std;
24
25 const unsigned int HEADER_BUF_SIZE = 1024;//记录头和网页头信息的最大长度
26 //const unsigned int MAX_DOC_ID = 12932; // you should change according "Doc.idx"
27 const unsigned int MAX_DOC_ID = 767;
28 //所要处理的原始网页库中文档的个数
29 //不同的原始网页库文档数不同,这个值
30 //需要更改,可以通过URL索引文件[Url.idx]得到
31
32 int main(int argc, char* argv[])
33 {
34 string strLine, strFileName=argv[1];
35 CUrl iUrl;
36 vector<CUrl> vecCUrl;//为什么不是map
37 CDocument iDocument;
38 vector<CDocument> vecCDocument;//vector容器保存文档对象
39 unsigned int docId = 0;
40
41 //ifstream ifs("Tianwang.raw.2559638448");
42 ifstream ifs(strFileName.c_str());//文档对象
43
44 if (!ifs) {
45 cerr << "Cannot open tianwang.img.info for input\n";
46 return -1;
47 }
48
49 ifstream ifsUrl("Url.idx.sort_uniq");
50 if (!ifsUrl) {
51 cerr << "Cannot open Url.idx.sort_uniq for input\n";
52 return -1;
53 }
54 ifstream ifsDoc("Doc.idx");
55 if (!ifsDoc) {
56 cout<<"不能打开网页索引文件"<<endl;
57 cerr << "Cannot open Doc.idx for input\n";
58 return -1;
59 }
60
61 while (getline(ifsUrl, strLine)) {//读入md5到id的映射
62 char chksum[33];
63 int docid;
64
65 memset(chksum, 0, 33);
66 //cout<<strLine.c_str()<<endl<<endl;
67 sscanf( strLine.c_str(), "%s%d", chksum, &docid );
68 //cout<<strLine.c_str();
69 iUrl.m_sChecksum = chksum;
70 iUrl.m_nDocId = docid;
71 vecCUrl.push_back(iUrl);
72 }
73
74 while (getline(ifsDoc,strLine)){
75 /* docid:文档编号
76 * pos:偏移
77 * length:好像和DocIndex的版本有点问题*/
78
79 int docid,pos,length;
80 char chksum[33];
81
82 //cout<<strLine<<endl<<endl;
83
84 memset(chksum, 0, 33);
85 sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
86 //cout<<endl<<docid<< pos<<length<<endl;
87 iDocument.m_nDocId = docid;
88 iDocument.m_nPos = pos;
89 iDocument.m_nLength = length;
90 iDocument.m_sChecksum = chksum;//网页的MD5
91 vecCDocument.push_back(iDocument);
92 }
93
94
95
96 //保存网页分析的结果
97 strFileName += ".seg";
98 ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
99 for ( docId=0; docId<MAX_DOC_ID; docId++ ){
100
101 // find document according to docId
102 int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;//当前网页文件的长度
103 char *pContent = new char[length+1];//记录内容
104 memset(pContent, 0, length+1);
105 ifs.seekg(vecCDocument[docId].m_nPos);//移动读取位置
106 ifs.read(pContent, length);
107
108 char *s;
109 s = pContent;
110
111 // 过滤记录头
112 int bytesRead = 0,newlines = 0;
113 while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
114 if (*s == '\n')
115 newlines++;
116 else
117 newlines = 0;
118 s++;
119 bytesRead++;
120 }
121 if (bytesRead == HEADER_BUF_SIZE-1) continue;
122
123
124 // 过滤网页头部信息
125 bytesRead = 0,newlines = 0;
126 while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
127 if (*s == '\n')
128 newlines++;
129 else
130 newlines = 0;
131 s++;
132 bytesRead++;
133 }
134 if (bytesRead == HEADER_BUF_SIZE-1) continue;//一般没有1024??
135
136 //iDocument.m_sBody = s;
137 //过滤网页体的正文信息
138 iDocument.RemoveTags(s);
139 iDocument.m_sBodyNoTags = s;
140
141 delete[] pContent;
142 string strLine = iDocument.m_sBodyNoTags;
143
144 CStrFun::ReplaceStr(strLine, " ", " ");//将网页体正文中的" "替换成" "
145 CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
146
147
148 //将网页体正文信息进行分词
149 CHzSeg iHzSeg;
150 //cout<<strLine<<endl<<endl;
151 strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
152 cout<<docId<<";";
153 cout<<strLine;
154 fout << docId << endl << strLine;
155 fout << endl;
156
157 }
158
159 return(0);
160 }