1 #ifndef _Crawl_H_031104_
2 #define _Crawl_H_031104_
3
4 //#include <openssl/md5.h>
5 #include <zlib.h>
6
7 #include "Tse.h"
8 #include "Http.h"
9 #include "StrFun.h"
10 #include "Url.h"
11 #include "Page.h"
12 #include "TianwangFile.h"
13 #include "IsamFile.h"
14 #include "Link4SEFile.h"
15
16 using namespace std;
17
18 class CCrawl
19 {
20 public:
21 string m_sInputFileName; //种子URL的文件名字: tse_seed.pku
22 string m_sOutputFileName; //保存我们已经访问过的URL的文件名字: visited.all
23
24 CIsamFile m_isamFile; // ISAM file handle
25
26 ofstream m_ofsVisitedUrlFile; //visited.all的文件句柄
27 ofstream m_ofsLink4SEFile; //link4SE.url的文件句柄
28 ofstream m_ofsLink4HistoryFile; //link4History.url的文件句柄
29 ofstream m_ofsUnreachHostFile; //tse_unreachHost.list的文件句柄
30
31 ofstream m_ofsVisitedUrlMD5File;//tse_md5.visitedurl的文件句柄
32 ofstream m_ofsVisitedPageMD5File;//tse_md5.visitedpage
33
34 ofstream m_ofsUnreachUrlFile; // unreach URL file handle
35
36
37 public:
38 CCrawl();//无参构造函数 "tse_seed.pku" "visited.all"
39 CCrawl(string strInputFile, string strOutputFile);
40 ~CCrawl();
41
42 //CCrawl类中最重要的函数
43 void DoCrawl();
44
45 //根据URL以及套接字文件描述符抓取URL对应的网页
46 void DownloadFile( CTianwangFile *pTianwangFile,CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock);
47
48 //每个线程函数start()都调用这个函数
49 void fetch(void *arg);
50
51 //如果url满足条件加到mmapUrls[待访问的url]容器中
52 void AddUrl(const char *url);
53
54 void GetVisitedUrlMD5();//得到已经访问过的URL对应的MD5值,放入open list[setVisitedUrlMD5]中
55 void GetVisitedPageMD5();//得到已经访问过的web网页体对应的MD5值,放入setVisitedPageMD5中
56
57 void GetIpBlock();//得到阻塞的IP,放入mapIpBlock容器中
58
59 void GetUnreachHostMD5();//得到不可到达的主机号,放入setUnreachHostMD5中
60 void OpenFilesForOutput();//打开所有的输出流
61
62 // save in the process
63 void SaveTianwangRawData(CTianwangFile *pTianwangFile,CUrl *pUrl, CPage *pPage);//将抓取的网页以天网格式存储
64 void SaveLink4SERawData(CLink4SEFile *pLink4SEFile,CUrl *pUrl, CPage *pPage);//将抓取的网页从中提取超链接信息建立网页结构库
65
66 void SaveIsamRawData(CUrl *pUrl, CPage *Page);
67 void SaveVisitedUrl(string url);//保存已经访问过的URL
68 void SaveUnreachHost(string host);//保存不可到达的主机号
69 void SaveLink4SE(CPage *Page);//保存为搜索引擎准备的超链接信息
70 bool SaveLink4SE031121(void *arg);
71 void SaveLink4History(CPage *Page)//保存为历史网页存档准备的超链接信息
72
73 // save while the program running
74 void SaveVisitedUrlMD5(string md5);//保存已经访问过的URL对应的MD5值
75 void SaveVisitedPageMD5(string md5);//得到已经访问过的web网页体对应的MD5值
76
77 };
78
#endif /* _Crawl_H_031104_ */
1 #include "Crawl.h"
2 #include "Url.h"
3 #include "Md5.h"
4
5 #include <list.h>
6 #include <hlink.h>
7 #include <uri.h>
8
extern pthread_mutex_t mymutex;
extern map<string, string> mapCacheHostLookup; // DNS cache: host name -> resolved address text
extern vector<string> vsUnreachHost;
extern char **ParseRobot(char *data, char len);

set<string> setVisitedUrlMD5; // open list: MD5 digests of already-visited URLs
set<string> setVisitedPageMD5; // MD5 digests of already-visited page bodies
set<string> setUnvisitedUrlMD5; // close list: MD5 digests of not-yet-visited URLs

set<string> setUnreachHostMD5; // MD5 digests of unreachable host names

multimap<string, string, less<string> > replicas; // page-body MD5 -> URL(s) sharing that body (mirror detection)

// Thread mutexes, statically initialized.
pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER; // guards mmapUrls
pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER; // guards setUnreachHostMD5 && m_ofsUnreachHostFile
pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; // guards setUnvisitedUrlMD5
pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; // guards setVisitedUrlMD5 && m_ofsVisitedUrlMD5File
pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER; // guards setVisitedPageMD5 && m_ofsVisitedPageMD5File

pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER; // serializes link detection (vsParsedLinks)
pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER; // guards m_ofsLink4SEFile
pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER;
//pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER;

map<unsigned long, unsigned long> mapIpBlock; // blocked IP ranges (start addr -> end addr)
bool b_fOver; // crawl-finished flag controlling worker-thread shutdown
//multimap<string,string, less<string> > mmapUrls;
multimap<string, string> mmapUrls; // URLs waiting to be crawled: host -> URL

typedef map<unsigned long, unsigned long>::value_type valTypeIpBlock;
typedef map<string, string>::value_type mvalType;

void SaveReplicas(const char* filename); // dump the URLs of duplicated (mirror) pages to a file

// Context passed into the link-parser callback onfind().
struct package {
	CCrawl *crawl;
	CPage *page;
};

vector<string> vsParsedLinks; // links found on the current page (guarded by mutexDetect)
54
55 int onfind(const char *elem, const char *attr, struct uri *uri, void *arg) {
56 struct package *p = (struct package*) arg;
57 char buff[URL_LEN + 1];
58
59 if (uri_recombine(uri, buff, URL_LEN + 1,
60 C_SCHEME | C_AUTHORITY | C_PATH | C_QUERY) >= 0)
61
62 {
63 vsParsedLinks.push_back(buff);
64 if (!p->page->IsFilterLink(buff)) {
65 // accept "a,link,frame,iframe,img,area"
66
67 if (strcasecmp(elem, "img") == 0) {
68 pthread_mutex_lock(&mutexLink4HistoryFile);
69 if (p->crawl->m_ofsLink4HistoryFile) {
70 p->crawl->m_ofsLink4HistoryFile << buff << endl;
71 }
72 pthread_mutex_unlock(&mutexLink4HistoryFile);
73
74 } else {
75 p->crawl->AddUrl(buff);
76 }
77 /*
78 else if (strcasecmp(elem, "img") == 0)
79 {
80 pthread_mutex_lock(&mutexLink4HistoryFile);
81 if( p->crawl->m_ofsLink4HistoryFile ){
82 p->crawl->m_ofsLink4HistoryFile << p->page->m_sUrl << endl;;
83 }
84 pthread_mutex_unlock(&mutexLink4HistoryFile);
85 }
86 */
87 }
88 }
89
90 uri_destroy(uri);
91 free(uri);
92 return 1;
93 }
94
95 /***********************************************************************
96 * Function name: start
97 * Input argv:
98 * -- arg: the CCrawl handle
99 * Output argv:
100 * --
101 * Return:
102 ***********************************************************************/
103 //线程函数-->每个线程函数调用fetch(void*arg)函数
104 void* start(void *arg) {
105 ((CCrawl*) arg)->fetch(arg);
106 }
107
108 /*
109 这个函数设计的很巧妙,这里说的巧妙不是函数写有多hi:
110 我们知道spider在最开始抓取网页的时候需要种子url,
111 我们这个spider的种子url文件库是tse_seed.ur文件
112 而这个函数正好在我们强制中断程序的时候,将mmapUrls
113 中没有访问完的url放入tse_unvisited.url文件中,
114 扩充了我们的种子URL库!*/
115 void SaveUnvisitedUrl() {
116 ofstream ofsUnvisitedUrl;
117 ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(),
118 ios::in | ios::out | ios::trunc | ios::binary); //以二进制可追加写方式打开文件
119 if (!ofsUnvisitedUrl) //打开失败
120 {
121 cerr << "cannot open " << UNVISITED_FILE << "for output" << endl;
122 exit(-1);
123 }
124
125 //将mmapUrls中没有访问完的url放入tse_unvisited.url文件中,扩充了我们的URL种子库!
126 multimap<string, string>::iterator it = mmapUrls.begin();
127 for (; it != mmapUrls.end(); it++) {
128 ofsUnvisitedUrl << ((*it).second).c_str() << "\n";
129 }
130
131 ofsUnvisitedUrl << endl;
132 ofsUnvisitedUrl.close();
133
134 }
135
136 /***********************************************************************
137 * Function name: fetch
138 * Input argv:
139 * -- arg: the CCrawl handle
140 * Output argv:
141 * --
142 * Return:
143 ***********************************************************************/
144 void CCrawl::fetch(void *arg) //每个线程都执行这个函数
145 {
146 string strUrl, host;
147
148 int nGSock = -1; //之前的套接字文件描述符
149 string strGHost = ""; //字前的主机号
150
151 // create a Tianwang file for output the raw page data
152 string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self()); //Tianwang.raw+"线程号"
153 CTianwangFile tianwangFile(ofsName); //创建一个天网格式的文件,保存为原始网页库
154
155 // create a Link4SE file for output the raw link data
156 ofsName = DATA_LINK4SE_FILE + "." + CStrFun::itos(pthread_self()); //Link4SE.raw+"线程号"
157 CLink4SEFile link4SEFile(ofsName); //创建一个网页结构库
158
159 int iSleepCnt = 0; //线程运行控制参数
160 for (;;) {
161 pthread_mutex_lock(&mutexCollection); //互斥的锁定函数
162 //if( !mmapUrls.empty() ){
163 int cnt = mmapUrls.size();
164 if (cnt > 0) {
165 //已经收集的没有访问的url
166 cout << "collection has: " << cnt << " unvisited urls" << endl;
167 multimap<string, string>::iterator it = mmapUrls.begin();
168 if (it != mmapUrls.end()) {
169 // get an URL
170 strUrl = (*it).second; //从待访问的URL队列中得到一个URL进行访问
171
172 // remove it from the collection
173 mmapUrls.erase(it); //删除迭代器所指的元素
174
175 pthread_mutex_unlock(&mutexCollection); //互斥的解锁函数
176
177 // parse URL
178 CUrl iUrl; //关键是看看strUrl是否有http://协议号,没有返回false
179 if (iUrl.ParseUrlEx(strUrl) == false) {
180 cout << "ParseUrlEx error in fetch(): " << strUrl << endl;
181 continue;
182 }
183
184 //表明现在抓取的网页所在的主机,同之前抓取的网页所在的主机不同
185 //故,我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的
186 //套接字文件描述符进行通信,这是由于循环导致的
187 if (strGHost != iUrl.m_sHost) {
188 close(nGSock);
189 nGSock = -1;
190 strGHost = iUrl.m_sHost;
191 }
192
193 //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库
194 ((CCrawl*) arg)->DownloadFile(&tianwangFile, &link4SEFile, iUrl,
195 nGSock);
196
197 cnt = 0;
198 } else {
199 pthread_mutex_unlock(&mutexCollection);
200 }
201 } else {
202 //待访问的URL队列mmapUrls中没有URL了,这个时候我们必须挂起线程进行等待
203 pthread_mutex_unlock(&mutexCollection);
204 usleep(1000);
205 iSleepCnt++;
206 }
207
208 if (b_fOver == true && iSleepCnt == 200) //当URL队列mmapUrls有200次都是空的时候就结束这个线程调用的fetch()函数
209 break;
210 /*
211 if( b_fOver == true ){
212 break;
213 } else if( cnt == 100 ) {
214 cout << "w.";
215 cnt = 0;
216 }
217 */
218 }
219
220 tianwangFile.Close();
221 link4SEFile.Close();
222 }
223
224 /***********************************************************************
225 * Function name: DownloadFile
226 * Input argv:
227 * -- pTianwang: the CCrawl handle
228 * -- pLink4SE: the CCrawl handle
229 * -- iUrl: the URL for crawling
230 * -- nGSock: the previous global socket
231 * Output argv:
232 * --
233 * Return:
234 ***********************************************************************/
235
236 //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库
237 void CCrawl::DownloadFile(CTianwangFile *pTianwangFile,
238 CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock) {
239 char *downloaded_file = NULL, //网页体信息
240 *fileHead = NULL, //网页头信息
241 *location = NULL; //网页的重定向信息
242 int file_length = 0; //网页体真实的字节长度
243 string strUrlLocation = ""; //保存网页的重定向超链接
244
245 //之后请求的网页和之前请求的网页位于同一个主机上,我们可以利用之前的套接字文件描述符进行通信,这样我们可以节约带宽,节省时间
246 int nSock = nGSock; //将之前的套接字文件描述符赋值给nSock
247
248 cout << "1. pid=" << pthread_self() << " sock = " << nGSock << endl;
249
250 CHttp http;
251
252 //这是一个真正的抓取网页的函数,有了URL搜集系统可以根据URL的标识抓取其对应的网页
253 file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead,
254 &location, &nSock);
255
256 int nCount = 0; //用来标识URL重定向的次数,如果重定向了3次,我们就不要抓取它对应的网页
257
258 while (file_length == -300) //表明该iUrl.m_sUrl对应的网页重定向了
259 { // moved to an another place
260 if (strlen(location) > URL_LEN - 1 || nCount == 3
261 || strlen(location) == 0) {
262 if (location) {
263 //pthread_mutex_lock(&mutexMemory);
264 free(location);
265 location = NULL;
266 //pthread_mutex_unlock(&mutexMemory);
267 }
268 file_length = -1;
269 break;
270 }
271
272 //将获取到的重定向的URL给strUrlLocation为下次抓取网页做准备
273 strUrlLocation = location;
274 if (location) {
275 //pthread_mutex_lock(&mutexMemory);
276 free(location);
277 location = NULL;
278 //pthread_mutex_unlock(&mutexMemory);
279 }
280
281 //这个地方要注意,因为重定向的URL可能是相对路径,所以我们必须将它转化为绝对路径
282 //跟CPage类中提取超链接信息一样
283 string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http");
284
285 if (idx1 != 0) { //没有找"http://"协议号
286
287 char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length() - 1);
288 char c2 = strUrlLocation.at(0);
289
290 if (c2 == '/') //重定向的URL一定是相对路径
291 {
292 strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation;
293 } else if (c1 != '/' && c2 != '/') {
294 string::size_type idx;
295
296 idx = iUrl.m_sUrl.rfind('/');
297 if (idx != string::npos) {
298 if (idx > 6) { // > strlen("http://..")
299 strUrlLocation = iUrl.m_sUrl.substr(0, idx + 1)
300 + strUrlLocation;
301 } else {
302 strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
303 }
304
305 } else {
306 file_length = -1;
307 break;
308 }
309 } else {
310 if (c1 == '/') {
311 strUrlLocation = iUrl.m_sUrl + strUrlLocation;
312 } else {
313 strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
314 }
315 }
316 }
317
318 CPage iPage;
319 if (iPage.IsFilterLink(strUrlLocation)) { //如果得到的重定向URL是要过滤的URL,我们立刻结束不再抓取
320 file_length = -1;
321 break;
322 }
323
324 cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl;
325 file_length = http.Fetch(strUrlLocation, &downloaded_file, &fileHead,
326 &location, &nSock);
327 nCount++;
328 }
329
330 nGSock = nSock; //将新得到的套接字文件描述符给之前的套接字文件描述符,为下次重用做准备
331
332 if (file_length == -1) { //其他的各种错误,这个错误的原因在http.Fetch()中
333 cout << "!-: " << iUrl.m_sUrl << endl;
334 //pthread_mutex_lock(&mutexMemory);
335 if (fileHead) {
336 free(fileHead);
337 fileHead = NULL;
338 }
339 if (downloaded_file) {
340 free(downloaded_file);
341 downloaded_file = NULL;
342 }
343 //pthread_mutex_unlock(&mutexMemory);
344
345 cout << "-unreach host: " << iUrl.m_sHost << endl;
346 ;
347 return;
348 }
349
350 if (file_length == -2) { // out of ip block .//在IP阻塞范围内
351 //pthread_mutex_lock(&mutexMemory);
352 if (fileHead) {
353 free(fileHead);
354 fileHead = NULL;
355 }
356 if (downloaded_file) {
357 free(downloaded_file);
358 downloaded_file = NULL;
359 }
360 //pthread_mutex_unlock(&mutexMemory);
361
362 // save unreach host
363 SaveUnreachHost(iUrl.m_sHost);
364
365 cout << "-out of block host: " << iUrl.m_sHost << endl;
366 ;
367 return;
368 }
369
370 if (file_length == -3) { // invalid host or ip//URL的主机号是无效的主机号
371 //pthread_mutex_lock(&mutexMemory);
372 if (fileHead) {
373 free(fileHead);
374 fileHead = NULL;
375 }
376 if (downloaded_file) {
377 free(downloaded_file);
378 downloaded_file = NULL;
379 }
380 //pthread_mutex_unlock(&mutexMemory);
381 cout << "-invalid host: " << iUrl.m_sHost << endl;
382 return;
383 }
384
385 if (file_length == -4) { // MIME is image/xxx//图片类型的网页
386 //pthread_mutex_lock(&mutexMemory);
387 if (fileHead) {
388 free(fileHead);
389 fileHead = NULL;
390 }
391 if (downloaded_file) {
392 free(downloaded_file);
393 downloaded_file = NULL;
394 }
395 //pthread_mutex_unlock(&mutexMemory);
396
397 if (m_ofsLink4HistoryFile) { //为历史网页存档准备的链接
398 pthread_mutex_lock(&mutexLink4HistoryFile);
399 m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;
400 ; //将该URL保存在link4History.url文件中
401 pthread_mutex_unlock(&mutexLink4HistoryFile);
402 }
403
404 cout << "-imgage host: " << iUrl.m_sHost << endl;
405 return;
406 }
407
408 /* still experiment
409 char **dir;
410 dir = ParseRobot( downloaded_file, file_length);
411 for( int i = 0; dir[i] != NULL ; i++){
412 cout << dir[i] << endl;
413 free( dir[i] );
414 }
415
416 exit(1);
417 */
418
419 // so small, maybe some unuseful info, skipped
420 //if(file_length < 40){ // for ImgSE,
421 /*
422 if(file_length < 256){ // for SE
423 //pthread_mutex_lock(&mutexMemory);
424 if (fileHead)
425 {
426 free(fileHead); fileHead=NULL;
427 }
428 if (downloaded_file)
429 {
430 free(downloaded_file); downloaded_file=NULL;
431 }
432 //pthread_mutex_unlock(&mutexMemory);
433 cout << "#";
434 return;
435 }
436 */
437
438 // deal with normal page
439
440 //处理正常的网页[网页头信息和网页体信息只要有一个是NULL,我们就认为它不是正常的网页]
441 if (!fileHead || !downloaded_file) //不能获得网页头信息或者网页体信息
442 {
443 //pthread_mutex_lock(&mutexMemory);
444 if (fileHead) {
445 free(fileHead);
446 fileHead = NULL;
447 }
448 if (downloaded_file) {
449 free(downloaded_file);
450 downloaded_file = NULL;
451 }
452 //pthread_mutex_unlock(&mutexMemory);
453 close(nGSock);
454 nGSock = -1;
455 cout << "-size0 host: " << iUrl.m_sHost << endl;
456 return;
457 }
458
459 //这里很重要,将抓取到的网页信息全部放入CPage类中
460 CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file,
461 file_length);
462 //pthread_mutex_lock(&mutexMemory);
463 if (fileHead) {
464 free(fileHead);
465 fileHead = NULL;
466 }
467 if (downloaded_file) {
468 free(downloaded_file);
469 downloaded_file = NULL;
470 }
471 //pthread_mutex_unlock(&mutexMemory);
472
473 //解析网页头信息
474 iPage.ParseHeaderInfo(iPage.m_sHeader);
475
476 if (iPage.m_bConnectionState == false) {
477 close(nGSock);
478 nGSock = -1;
479 }
480
481 // when crawling images for ImgSE, remember to comment the paragraph
482 // when crawling plain text for SE, remember to open the paragraph
483 // paragraph begin
484
485 // iPage.m_sContentType != "text/css" &&
486
487 //过滤掉不是我们想要的网页体的类型
488 if (iPage.m_sContentType != "text/html"
489 && iPage.m_sContentType != "text/plain"
490 && iPage.m_sContentType != "text/xml"
491 && iPage.m_sContentType != "application/msword"
492 && iPage.m_sContentType != "application/pdf"
493 && iPage.m_sContentType != "text/rtf"
494 && iPage.m_sContentType != "application/postscript"
495 && iPage.m_sContentType != "application/vnd.ms-execl"
496 && iPage.m_sContentType != "application/vnd.ms-powerpoint") {
497
498 cout << "-unwant type host: " << iUrl.m_sHost << endl;
499 return;
500 }
501
502 // paragraph end
503
504 //解压缩开始
505 //如果是gzip编码,要解压缩,然后提取超链接信息,现在门户网站的首页有增大趋势
506 //为了加快传输速度,通常采用gzip编码压缩后传输
507 char sUnzipContent[1024000]; //1000K<1M
508 int nUnzipLength = 0;
509 if (iPage.m_sContentEncoding == "gzip"
510 && iPage.m_sContentType == "text/html") {
511
512 gzFile zip;
513 //这是一个过渡文件,将没有解压缩的网页体信息放入到这个文件中
514 string ofsGzipName;
515
516 ofsGzipName = CStrFun::itos(pthread_self()) + ".gz";
517
518 //以二进制截断的方式打开文件
519 //ios::trunc 如果文件存在,则将文件长度截断为0,并清除文件的内容,如果文件不存在,则创建该文件
520 ofstream ofsDownloadFile(ofsGzipName.c_str(), ios::trunc | ios::binary);
521
522 cout << "file_length: " << file_length << endl;
523 ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent);
524 ofsDownloadFile.close();
525
526 zip = gzopen(ofsGzipName.c_str(), "rb");
527 if (zip == NULL) {
528 cout << "Open zip file " << ofsGzipName.c_str() << " error."
529 << endl;
530 exit(-1);
531 }
532
533 //解压缩过程,将解压缩后的网页体信息放入到缓冲区域sUnzipContent
534 nUnzipLength = gzread(zip, sUnzipContent, 1024000);
535 if (nUnzipLength == -1) {
536 cout << "Read zip file " << ofsGzipName.c_str() << " error."
537 << endl;
538 exit(-1);
539 }
540
541 sUnzipContent[nUnzipLength] = 0;
542
543 gzclose(zip);
544
545 //将解压缩后的网页体信息覆盖原来的没有解压缩的网页体信息
546 //iPage.m_sContent.assign(sUnzipContent,nUnzipLength);
547 //iPage.m_nLenContent=nUnzipLength;
548 }
549 //解压缩结束
550
551 CMD5 iMD5;
552 string strDigest;
553
554 /////////////////////////////
555 // because we can make sure the url in the setVisitedUrlMd5
556 // is not same(we have check it before insert it to the collection),
557 // we intert it directly. however...
558 //iMD5.GenerateMD5( (unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length() );
559
560 //判断该URL是否在open list[setVisitedUrlMD5]中,在返回;不在加到open list中,并保存
561 iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),iUrl.m_sUrl.length());
562 strDigest = iMD5.ToString();
563
564 pthread_mutex_lock(&mutexVisitedUrlMD5);
565 if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) //已经抓取过了!
566 {
567 cout << "!vurl: "; //1.crawled already
568 pthread_mutex_unlock(&mutexVisitedUrlMD5);
569 return;
570 }
571
572 //不在setVisitedUrlMD5中,现在必须插入setVisitedUrlMD5中
573 //因为该URL现在已经访问过了
574 setVisitedUrlMD5.insert(strDigest);
575 SaveVisitedUrlMD5(strDigest);
576 pthread_mutex_unlock(&mutexVisitedUrlMD5);
577
578 /////////////////////////////
579 // whether it is a visited page
580 // for ImgSE, should comment this paragraph
581 // for SE, should uncomment this paragraph
582
583 // begin
584
585 //判断该网页体是否已经访问过,访问过返回,没有访问过加到setVisitedPageMD5集合中
586 iMD5.GenerateMD5((unsigned char*) iPage.m_sContent.c_str(),iPage.m_sContent.length());
587 strDigest = iMD5.ToString();
588 pthread_mutex_lock(&mutexVisitedPageMD5);
589 //网页体MD5同URL的关系插入到容器replicas中
590 replicas.insert(pair<string, string>(strDigest, iPage.m_sUrl));
591 if (setVisitedPageMD5.find(strDigest) != setVisitedPageMD5.end()) //在setVisitedPageMD5中:表明出现了镜像网页
592 {
593 cout << "!vpage: "; // crawled already
594 pthread_mutex_unlock(&mutexVisitedPageMD5);
595 return;
596 }
597 setVisitedPageMD5.insert(strDigest);
598
599 SaveVisitedPageMD5(strDigest);
600 pthread_mutex_unlock(&mutexVisitedPageMD5);
601
602 // end
603
604 cout << "+";
605
606 ////////////////////
607 // save as Tianwang format
608 //将抓取到的网页以天网格式放到原始网页库中
609 SaveTianwangRawData(pTianwangFile, &iUrl, &iPage);
610
611 ////////////////////
612 // save visited Urls
613 if (iPage.m_sLocation.length() < 1) {
614 SaveVisitedUrl(iUrl.m_sUrl);
615 } else {
616 SaveVisitedUrl(iPage.m_sLocation);
617 }
618
619 //return; // just crawl seeds
620
621 /////////////////////////////////////
622 // Parse hyperlinks
623 if (iPage.m_sContentType != "text/html") { // we can only find links in tex/html
624 return;
625 }
626
627 /*
628 if (iPage.ParseHyperLinks() == false){
629 return;
630 }
631
632 SaveLink4SE( &iPage);
633 SaveLink4History( &iPage);
634
635 map<string,string>::iterator it4SE = iPage.m_mapLink4SE.begin();
636 string str;
637 for( ; it4SE!= iPage.m_mapLink4SE.end(); ++it4SE ){
638 str = (*it4SE).first;
639 AddUrl( str.c_str() );
640
641 }
642 */
643 // using XIE Han's link parser
644
645 struct uri page_uri;
646 //FILE *tmp;
647
648 //tmp = tmpfile();
649
650 //fwrite(iPage.m_sContent.c_str(), iPage.m_nLenContent, 1, tmp);
651 //fseek(tmp, 0, SEEK_SET);
652 //fclose(tmp);
653
654 pthread_mutex_lock(&mutexDetect);
655
656 if (iPage.m_sLocation.empty()) {
657 uri_parse_string(iPage.m_sUrl.c_str(), &page_uri);
658 } else {
659 uri_parse_string(iPage.m_sLocation.c_str(), &page_uri);
660 }
661
662 struct package p = { this, &iPage };
663 //hlink_detect(tmp, &page_uri, onfind, &p);
664
665 hlink_detect_string(iPage.m_sContent.c_str(), &page_uri, onfind, &p);
666
667 struct file_arg pLinks = { &iUrl, &iPage };
668
669 SaveLink4SE031121(&pLinks);
670
671 // save as Link4SE format
672 //SaveLink4SERawData(pLink4SEFile, &iUrl, &iPage);
673
674 pthread_mutex_unlock(&mutexDetect);
675
676 uri_destroy(&page_uri);
677 cout << "Parse End......" << endl;
678
679 return;
680 }
681
682 void SaveReplicas(const char* filename) {
683 //ofstream ofs(filename, ios::out|ios::app);
684 ofstream ofs(filename, ios::out | ios::binary | ios::app);
685 if (!ofs) {
686 cout << "error open file " << endl;
687 }
688 string md5;
689
690 pthread_mutex_lock(&mutexReplicas);
691 multimap<string, string, less<string> >::const_iterator it;
692 ostringstream *oss = 0;
693 int i = 0;
694 for (it = replicas.begin(); it != replicas.end(); it++) {
695 if (!md5.empty() && md5 != it->first) {
696 if (i >= 2)
697 ofs << (*oss).str() << endl;
698 //pthread_mutex_lock(&mutexMemory);
699 delete (oss);
700 oss = new ostringstream;
701 //pthread_mutex_unlock(&mutexMemory);
702 (*oss) << it->first << endl;
703 i = 0;
704 md5 = it->first;
705 } else if (md5.empty()) {
706 md5 = it->first;
707 //pthread_mutex_lock(&mutexMemory);
708 oss = new ostringstream;
709 //pthread_mutex_unlock(&mutexMemory);
710 (*oss) << it->first << endl;
711 i = 0;
712 }
713 if (oss != 0)
714 (*oss) << it->second << endl;
715 i++;
716 }
717
718 pthread_mutex_unlock(&mutexReplicas);
719 }
720
721 ////////////////////////////////////////////////////////////////////////////
722 // Construction/Destruction
723 ////////////////////////////////////////////////////////////////////////////
724
// Default constructor: file-name members stay empty; set them (or use
// the two-argument constructor) before starting a crawl.
CCrawl::CCrawl() {
}
727
728 CCrawl::CCrawl(string inputFileName, string outputFileName) {
729 m_sInputFileName = inputFileName;
730 m_sOutputFileName = outputFileName; // + ".txt"
731 }
732
733 CCrawl::~CCrawl() {
734 m_ofsVisitedUrlFile.close();
735 m_ofsLink4SEFile.close();
736 m_ofsLink4HistoryFile.close();
737 m_isamFile.Close();
738 m_ofsVisitedUrlMD5File.close();
739 m_ofsVisitedPageMD5File.close();
740 }
741
742 /*****************************************************************
743 ** Function name: SigTerm
744 ** Input argv:
745 ** --
746 ** Output argv:
747 ** --
748 ** Return:
749 ** Function Description: signal function
750 ** Version: 1.0
751 ** Be careful:
752 *****************************************************************/
// Termination signal handler: persist the crawl state (the queue of
// unvisited URLs and the duplicate-page report) before exiting.
static void SigTerm(int x) {//signal handler
	SaveUnvisitedUrl();
	SaveReplicas("repli");

	cout << "Terminated!" << endl;
	exit(0);
}
760
761 void CCrawl::GetVisitedUrlMD5() {//得到已经访问过的URL对应的MD5值,放入open list[setVisitedUrlMD5]中
762 ifstream ifsMD5(URL_MD5_FILE.c_str(), ios::binary);
763 if (!ifsMD5) {
764 //cerr << "did not find " << UrlMD5_FILE << " for iutput" << endl;
765 return;
766 }
767
768 string strMD5;
769 while (getline(ifsMD5, strMD5)) {
770 setVisitedUrlMD5.insert(strMD5);
771 }
772
773 ifsMD5.close();
774 cout << "got " << setVisitedUrlMD5.size() << " md5 values of visited urls"
775 << endl;
776 }
777
778 void CCrawl::GetVisitedPageMD5() {//得到已经访问过的web网页体对应的MD5值,放入setVisitedPageMD5中
779 ifstream ifsMD5(PAGE_MD5_FILE.c_str(), ios::binary);
780 if (!ifsMD5) {
781 //cerr << "did not find " << PageMD5_FILE << " for iutput" << endl;
782 return;
783 }
784
785 string strMD5;
786 while (getline(ifsMD5, strMD5)) {
787 setVisitedPageMD5.insert(strMD5);
788 }
789
790 ifsMD5.close();
791 cout << "got " << setVisitedPageMD5.size() << " md5 values of visited pages"
792 << endl;
793 }
794
795 void CCrawl::GetIpBlock() {//得到阻塞的IP,放入mapIpBlock容器中
796 ifstream ifsIpBlock(IP_BLOCK_FILE.c_str());
797 if (!ifsIpBlock) {
798 //cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl;
799 return;
800 }
801 string strIpBlock;
802 while (getline(ifsIpBlock, strIpBlock)) {
803 if (strIpBlock[0] == '\0' || strIpBlock[0] == '#'
804 || strIpBlock[0] == '\n') {
805
806 continue;
807 }
808
809 char buf1[64], buf2[64];
810
811 buf1[0] = '\0';
812 buf2[0] = '\0';
813 sscanf(strIpBlock.c_str(), "%s %s", buf1, buf2);
814
815 mapIpBlock.insert(valTypeIpBlock(inet_addr(buf1), inet_addr(buf2)));
816 }
817 ifsIpBlock.close();
818
819 }
820
821 void CCrawl::GetUnreachHostMD5() {//得到不可到达的主机号,放入setUnreachHostMD5中
822 //vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM);
823 ifstream ifsUnreachHost(UNREACH_HOST_FILE.c_str());
824 if (!ifsUnreachHost) {
825 cerr << "Cannot open " << UNREACH_HOST_FILE << " for input." << endl;
826 return;
827 }
828
829 string strUnreachHost;
830 //int i=0;
831 while (getline(ifsUnreachHost, strUnreachHost)) {
832 if (strUnreachHost[0] == '\0' || strUnreachHost[0] == '#'
833 || strUnreachHost[0] == '\n') {
834
835 continue;
836 }
837
838 CStrFun::Str2Lower(strUnreachHost, strUnreachHost.size());
839 //vsUnreachHost.push_back(strUnreachHost);
840 CMD5 iMD5;
841 iMD5.GenerateMD5((unsigned char*) strUnreachHost.c_str(),
842 strUnreachHost.size());
843 string strDigest = iMD5.ToString();
844 setUnreachHostMD5.insert(strDigest);
845 //i++;
846 //if(i == MAX_UNREACHABLE_HOST_NUM) break;
847 }
848
849 ifsUnreachHost.close();
850
851 }
852
853 /**************************************************************************************
854 * Function name: SaveTianwangRawData
855 * Input argv:
856 * -- pTianwangFile: tianwang file handle
857 * -- pUrl: url
858 * -- pPage: web page
859 * Output argv:
860 * --
861 * Return:
862 * Function Description: save raw page data as tianwang file
863 **************************************************************************************/
864 void CCrawl::SaveTianwangRawData(CTianwangFile *pTianwangFile, CUrl *pUrl,CPage *pPage) {//将抓取的网页以天网格式存储
865 if (!pTianwangFile || !pUrl || !pPage) {
866 return;
867 }
868
869 file_arg arg;
870 arg.pUrl = pUrl;
871 arg.pPage = pPage;
872
873 // each thread writes itself, so dnnot need mutex
874 pTianwangFile->Write((void*) &arg);
875 }
876
877 /**************************************************************************************
878 * Function name: SaveLink4SERawData
879 * Input argv:
880 * -- pLink4SEFile: link4SE file handle
881 * -- pUrl: url
882 * -- pPage: web page
883 * Output argv:
884 * --
885 * Return:
886 * Function Description: save raw page data as tianwang file
887 **************************************************************************************/
888 void CCrawl::SaveLink4SERawData(CLink4SEFile *pLink4SEFile, CUrl *pUrl,
889 CPage *pPage) { //将抓取的网页从中提取超链接信息建立网页结构库
890 if (!pLink4SEFile || !pUrl || !pPage) {
891 return;
892 }
893
894 file_arg arg;
895 arg.pUrl = pUrl;
896 arg.pPage = pPage;
897
898 // each thread writes itself, so dnnot need mutex
899 pLink4SEFile->Write((void*) &arg);
900 }
901
902 /**************************************************************************************
903 * Function name: SaveIsamRawData
904 * Input argv:
905 * -- pUrl: url
906 * -- pPage: web page
907 * Output argv:
908 * --
909 * Return:
910 * Function Description: save raw page data as ISAM file
911 **************************************************************************************/
912 void CCrawl::SaveIsamRawData(CUrl *pUrl, CPage *pPage) {
913 if (!pUrl || !pPage) {
914 return;
915 }
916
917 file_arg arg;
918 arg.pUrl = pUrl;
919 arg.pPage = pPage;
920
921 pthread_mutex_lock(&mutexIsamFile);
922
923 m_isamFile.Write((void *) &arg);
924
925 pthread_mutex_unlock(&mutexIsamFile);
926 }
927
928 /**************************************************************************************
929 * Function name: SaveVisitedUrl
930 * Input argv:
931 * -- url: url
932 * Output argv:
933 * --
934 * Return:
935 * Function Description: save raw the Visited Url
936 **************************************************************************************/
937 void CCrawl::SaveVisitedUrl(string url) {
938 if (m_ofsVisitedUrlFile) {
939 pthread_mutex_lock(&mutexVisitedUrlFile);
940
941 m_ofsVisitedUrlFile << url << endl;
942
943 pthread_mutex_unlock(&mutexVisitedUrlFile);
944 }
945 }
946
947 void CCrawl::SaveUnreachHost(string host) {
948 CMD5 iMD5;
949 iMD5.GenerateMD5((unsigned char*) host.c_str(), host.size());
950 string strDigest = iMD5.ToString();
951 if (setUnreachHostMD5.find(strDigest) == setUnreachHostMD5.end()) {
952 pthread_mutex_lock(&mutexUnreachHost);
953
954 setUnreachHostMD5.insert(strDigest);
955 if (m_ofsUnreachHostFile) {
956 m_ofsUnreachHostFile << host << endl;
957 }
958
959 pthread_mutex_unlock(&mutexUnreachHost);
960 }
961 }
962
963 void CCrawl::SaveLink4SE(CPage *iPage) {
964 if (m_ofsLink4SEFile && iPage->m_nRefLink4SENum > 0) {
965 pthread_mutex_lock(&mutexLink4SEFile);
966
967 m_ofsLink4SEFile << "root_url: " << iPage->m_sUrl << endl;
968 m_ofsLink4SEFile << "charset: " << iPage->m_sCharset << endl;
969 m_ofsLink4SEFile << "number: " << iPage->m_nRefLink4SENum << endl;
970 m_ofsLink4SEFile << "link_anchortext: " << endl;
971
972 map<string, string>::iterator it4SE = iPage->m_mapLink4SE.begin();
973 for (; it4SE != iPage->m_mapLink4SE.end(); ++it4SE) {
974
975 m_ofsLink4SEFile << (*it4SE).first << '\t' << (*it4SE).second
976 << endl;
977 ;
978
979 }
980
981 pthread_mutex_unlock(&mutexLink4SEFile);
982 }
983 }
984
// Write one link-structure record (header plus every hyperlink parsed
// from the current page, taken from the global vsParsedLinks) to the
// link-for-SE file. The caller (DownloadFile) holds mutexDetect, which
// serializes access to vsParsedLinks; the per-file lock below is
// intentionally disabled.
// Returns false when there is nothing to write, true on success.
bool CCrawl::SaveLink4SE031121(void *arg) {
	if (!arg || !m_ofsLink4SEFile)
		return false;

	//pthread_mutex_lock(&mutexLink4SEFile);

	if (vsParsedLinks.size() == 0)
		return false;

	file_arg *pFile = (file_arg *) arg;

	CUrl *iUrl = pFile->pUrl;
	CPage *iPage = pFile->pPage;

	// RFC-1123 style GMT timestamp of the download.
	// NOTE(review): gmtime() returns a shared static buffer and is not
	// thread-safe; confirm whether concurrent callers can race here.
	char strDownloadTime[128];
	time_t tDate;

	memset(strDownloadTime, 0, 128);
	time(&tDate);
	strftime(strDownloadTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));

	// Concatenate all parsed links, one per line.
	string links;
	vector<string>::iterator it = vsParsedLinks.begin();
	for (; it != vsParsedLinks.end(); ++it) {
		links = links + *it + "\n";
	}

	// Record header: final URL (and original URL when redirected).
	m_ofsLink4SEFile << "version: 1.0\n";
	if (iPage->m_sLocation.size() == 0) {
		m_ofsLink4SEFile << "url: " << iPage->m_sUrl;
	} else {
		m_ofsLink4SEFile << "url: " << iPage->m_sLocation;
		m_ofsLink4SEFile << "\norigin: " << iUrl->m_sUrl;
	}

	m_ofsLink4SEFile << "\ndate: " << strDownloadTime;

	// "ip:" field -- the cached DNS answer when available, otherwise
	// the host name itself.
	if (mapCacheHostLookup.find(iUrl->m_sHost) == mapCacheHostLookup.end()) {
		m_ofsLink4SEFile << "\nip: " << iUrl->m_sHost;
	} else {
		m_ofsLink4SEFile << "\nip: "
				<< (*(mapCacheHostLookup.find(iUrl->m_sHost))).second;
	}

	// Body: out-degree, total length, the raw HTTP header, then the
	// link list built above.
	m_ofsLink4SEFile << "\noutdegree: " << vsParsedLinks.size();
	m_ofsLink4SEFile << "\nlength: " << iPage->m_nLenHeader + links.size() + 1
			<< "\n\n" << iPage->m_sHeader << "\n";
	m_ofsLink4SEFile << links;
	m_ofsLink4SEFile << endl;

	// The link buffer belongs to the page just written; clear it for
	// the next page (still under the caller's mutexDetect).
	vsParsedLinks.clear();
	//pthread_mutex_unlock(&mutexLink4SEFile);

	return true;
}
1040
1041 // not well
1042 void CCrawl::SaveLink4History(CPage *iPage) {//保存为历史网页存档准备的超链接信息
1043 if (m_ofsLink4HistoryFile && iPage->m_nRefLink4HistoryNum > 0) {
1044 pthread_mutex_lock(&mutexLink4HistoryFile);
1045
1046 //m_ofsLink4HistoryFile << "root_url: " << iPage->m_sUrl << endl;
1047 //m_ofsLink4HistoryFile << "charset: " << iPage->m_sCharset << endl;
1048 //m_ofsLink4HistoryFile << "number: " << iPage->m_nRefLink4HistoryNum << endl;
1049 //m_ofsLink4HistoryFile << "link: " << endl;
1050
1051 vector<string>::iterator it4History = iPage->m_vecLink4History.begin();
1052 for (; it4History != iPage->m_vecLink4History.end(); ++it4History) {
1053 string s = *it4History;
1054 m_ofsLink4HistoryFile << s << endl;
1055 }
1056
1057 pthread_mutex_unlock(&mutexLink4HistoryFile);
1058 }
1059 }
1060
1061 /**************************************************************************************
1062 * Function name: SaveVisitedUrlMd5
1063 * Input argv:
1064 * -- md5: page md5 value
1065 * Output argv:
1066 * --
1067 * Return:
1068 * Function Description: save the visited url Md5
1069 **************************************************************************************/
1070 void CCrawl::SaveVisitedUrlMD5(string md5) {
1071 if (m_ofsVisitedUrlMD5File) {
1072 m_ofsVisitedUrlMD5File << md5 << endl;
1073 }
1074 }
1075
1076 /**************************************************************************************
1077 * Function name: SaveVisitedPageMd5
1078 * Input argv:
1079 * -- md5: page md5 value
1080 * Output argv:
1081 * --
1082 * Return:
1083 * Function Description: save the visited url Md5
1084 **************************************************************************************/
1085 void CCrawl::SaveVisitedPageMD5(string md5) {
1086 if (m_ofsVisitedPageMD5File) {
1087 m_ofsVisitedPageMD5File << md5 << endl;
1088 }
1089 }
1090
1091 /**************************************************************************************
1092 * Function name: OpenFileForOutput
1093 * Input argv:
1094 * --
1095 * Output argv:
1096 * --
1097 * Return:
1098 * Function Description: Open the files for output
1099 **************************************************************************************/
1100 void CCrawl::OpenFilesForOutput() {
1101 // open isam file for output
1102 m_isamFile.Open(DATA_FILE_NAME, INDEX_FILE_NAME);
1103
1104 // open visited.url file for output
1105 m_ofsVisitedUrlFile.open(m_sOutputFileName.c_str(),
1106 ios::out | ios::app | ios::binary);
1107 if (!m_ofsVisitedUrlFile) {
1108 cerr << "cannot open " << VISITED_FILE << " for output\n" << endl;
1109 }
1110
1111 // open link4SE.url file for output
1112 m_ofsLink4SEFile.open(LINK4SE_FILE.c_str(),
1113 ios::out | ios::app | ios::binary);
1114 if (!m_ofsLink4SEFile) {
1115 cerr << "cannot open " << LINK4SE_FILE << " for output\n" << endl;
1116 }
1117
1118 // open link4History.url file for output
1119 m_ofsLink4HistoryFile.open(LINK4History_FILE.c_str(),
1120 ios::out | ios::app | ios::binary);
1121 if (!m_ofsLink4HistoryFile) {
1122 cerr << "cannot open " << LINK4History_FILE << " for output\n" << endl;
1123 }
1124
1125 // open unreach host file for output
1126 m_ofsUnreachHostFile.open(UNREACH_HOST_FILE.c_str(),
1127 ios::out | ios::app | ios::binary);
1128 if (!m_ofsUnreachHostFile) {
1129 cerr << "cannot open " << UNREACH_HOST_FILE << " for output\n" << endl;
1130 }
1131
1132 // open visited url md5 file for output
1133 m_ofsVisitedUrlMD5File.open(URL_MD5_FILE.c_str(),
1134 ios::out | ios::app | ios::binary);
1135 if (!m_ofsVisitedUrlMD5File) {
1136 cerr << "cannot open " << URL_MD5_FILE << " for output\n" << endl;
1137 }
1138
1139 // open visited page md5 file for output
1140 m_ofsVisitedPageMD5File.open(PAGE_MD5_FILE.c_str(),
1141 ios::out | ios::app | ios::binary);
1142 if (!m_ofsVisitedPageMD5File) {
1143 cerr << "cannot open " << PAGE_MD5_FILE << " for output\n" << endl;
1144 }
1145 }
1146
1147 /***************************************************************************************
1148 * Function name: DoCrawl
1149 * Input argv:
1150 * --
1151 * Output argv:
1152 * --
1153 * Return:
1154 * Function Description: the main function for crawl
1155 * Be careful:
1156 ***************************************************************************************/
1157 void CCrawl::DoCrawl() {//CCrawl类中的总控函数
1158 /* set the signal function */
1159 signal(SIGTERM, SigTerm);
1160 signal(SIGKILL, SigTerm);
1161 signal(SIGINT, SigTerm);
1162 signal(SIGPIPE, SIG_IGN);
1163 signal(SIGCHLD, SIG_IGN);
1164
1165 // output the begin time
1166 char strTime[128];
1167 time_t tDate;
1168
1169 memset(strTime, 0, 128);
1170 time(&tDate);
1171 strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
1172 cout << "\n\nBegin at: " << strTime << "\n\n";
1173
1174 // get the other info from file
1175 GetVisitedUrlMD5();
1176 GetVisitedPageMD5();
1177
1178 GetIpBlock();
1179
1180 GetUnreachHostMD5();
1181
1182 // open the seed url file
1183 ifstream ifsSeed(m_sInputFileName.c_str());
1184 if (!ifsSeed) {
1185 cerr << "Cannot open " << m_sInputFileName << " for input\n";
1186 return;
1187 }
1188
1189 // open the files for output
1190 OpenFilesForOutput();
1191
1192 // Create thread ID structures.
1193 pthread_t *tids = (pthread_t*) malloc(NUM_WORKERS * sizeof(pthread_t));
1194 if (tids == NULL) {
1195 cerr << "malloc error" << endl;
1196 }
1197
1198 for (unsigned int i = 0; i < NUM_WORKERS; i++) {
1199 if (pthread_create(&tids[i], NULL, start, this))
1200 cerr << "create threads error" << endl;
1201 }
1202
1203 string strUrl;
1204 CPage iCPage;
1205 while (getline(ifsSeed, strUrl)) {
1206 string::size_type idx;
1207
1208 if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') {
1209 continue;
1210 }
1211
1212 idx = strUrl.find('\t');
1213 if (idx != string::npos) {
1214 strUrl = strUrl.substr(0, idx);
1215 }
1216
1217 //idx = strUrl.find("http");
1218 idx = CStrFun::FindCase(strUrl, "http");
1219 if (idx == string::npos) {
1220 //continue;
1221 idx = strUrl.find('/');
1222 if (idx == string::npos) {
1223 strUrl = "http://" + strUrl + "/";
1224 } else {
1225 strUrl = "http://" + strUrl;
1226 }
1227 }
1228
1229 //if( strUrl.length() < 8 ) continue;
1230
1231 if (iCPage.IsFilterLink(strUrl))
1232 continue;
1233 AddUrl(strUrl.c_str());
1234 }
1235
1236 // Get the unvisited URL
1237 ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str());
1238 if (ifsUnvisitedUrl) {
1239 while (getline(ifsUnvisitedUrl, strUrl)) {
1240 string::size_type idx;
1241
1242 if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') {
1243 continue;
1244 }
1245
1246 idx = strUrl.find('\t');
1247 if (idx != string::npos) {
1248 strUrl = strUrl.substr(0, idx);
1249 }
1250
1251 // filter invalid urls
1252 if (iCPage.IsFilterLink(strUrl))
1253 continue;
1254
1255 AddUrl(strUrl.c_str());
1256 }
1257 } else {
1258 //cerr << "Cannot open " << UNVISITED_FILE << " for input\n";
1259 }
1260
1261 // sleep(30);
1262 b_fOver = true;
1263 cout << "finished to get all unvisited urls." << endl;
1264
1265 // Wait for the threads.
1266 for (unsigned int i = 0; i < NUM_WORKERS; ++i) {
1267 (void) pthread_join(tids[i], NULL);
1268 }
1269
1270 cout << "closed " << NUM_WORKERS << " threads." << endl;
1271
1272 SaveUnvisitedUrl();
1273 SaveReplicas("repli");
1274
1275 memset(strTime, 0, 128);
1276 time(&tDate);
1277 strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
1278 cout << "\n\nEnd at: " << strTime << "\n\n";
1279 }
1280
1281 /*****************************************************************
1282 ** Function name: AddUrl
1283 ** Input argv:
1284 ** --
1285 ** Output argv:
1286 ** --
1287 ** Return:
1288 ** Function Description: Add a parsed url into the collection
1289 ** Version: 1.0
1290 ** Be careful: An important function!!!
1291 *****************************************************************/
1292 void CCrawl::AddUrl(const char * url) {
1293 string strUrl = url;
1294 if (strUrl.empty() || strUrl.size() < 8) { //invalid url
1295 cout << "!so small!" << strUrl << endl;
1296 return;
1297 }
1298
1299 CPage iCPage;
1300 if (iCPage.NormalizeUrl(strUrl) == false) {
1301 // cout << "!normalize fail!" << strUrl << endl;
1302 return;
1303 }
1304
1305 CUrl iUrl;
1306
1307 // for ImgSE, comment the paragraph
1308 // if image/xxx url, store it to link4History.url
1309 // begin
1310 if (iUrl.IsImageUrl(strUrl)) {
1311 if (m_ofsLink4HistoryFile) {
1312 pthread_mutex_lock(&mutexLink4HistoryFile);
1313 m_ofsLink4HistoryFile << strUrl << endl;
1314 ;
1315 pthread_mutex_unlock(&mutexLink4HistoryFile);
1316 }
1317 return;
1318 }
1319 // end
1320
1321 if (iUrl.ParseUrlEx(strUrl) == false) {
1322 cout << "ParseUrlEx error in AddUrl(): " << strUrl << endl;
1323 return;
1324 }
1325
1326 // if it is an invalid host, discard it
1327 if (iUrl.IsValidHost(iUrl.m_sHost.c_str()) == false) {
1328 cout << "!invalid host: " << iUrl.m_sHost << endl;
1329 return;
1330 }
1331
1332 // filter foreign hosts
1333 if (iUrl.IsForeignHost(iUrl.m_sHost)) {
1334 cout << "!foreign hosts: " << iUrl.m_sHost << endl;
1335 return;
1336 }
1337
1338 // if it is a block ip, discard it
1339 // this work is left in the CreatSocket()
1340 // because the work of getting ip is inevitable in the CreatSocket function
1341 // and this work is expensive
1342 // if it is an unreach host, discard it
1343 // here we only deal with numbers-and-dots notations
1344 unsigned long inaddr = 0;
1345 char *ip = NULL;
1346
1347 inaddr = (unsigned long) inet_addr(iUrl.m_sHost.c_str());
1348 if (inaddr != INADDR_NONE) { // host is just ip
1349 //pthread_mutex_lock(&mutexMemory);
1350 ip = new char[iUrl.m_sHost.size() + 1];
1351 //pthread_mutex_unlock(&mutexMemory);
1352 memset(ip, 0, iUrl.m_sHost.size() + 1);
1353 memcpy(ip, iUrl.m_sHost.c_str(), iUrl.m_sHost.size());
1354
1355 if (!iUrl.IsValidIp(ip)) { // out of ip block
1356 //pthread_mutex_lock(&mutexMemory);
1357 delete[] ip;
1358 ip = NULL;
1359 //pthread_mutex_unlock(&mutexMemory);
1360 //cout << "!unreach hosts: " << iUrl.m_sHost << endl;
1361 return;
1362 }
1363 //pthread_mutex_lock(&mutexMemory);
1364 delete[] ip;
1365 ip = NULL;
1366 //pthread_mutex_unlock(&mutexMemory);
1367 }
1368
1369 CStrFun::Str2Lower(iUrl.m_sHost, iUrl.m_sHost.size());
1370 CMD5 iMD5;
1371 iMD5.GenerateMD5((unsigned char*) iUrl.m_sHost.c_str(),
1372 iUrl.m_sHost.size());
1373 string strDigest = iMD5.ToString();
1374 if (setUnreachHostMD5.find(strDigest) != setUnreachHostMD5.end()) {
1375 //cout << "!unreach host! " << iUrl.m_sHost << endl;
1376 return;
1377 }
1378
1379 // if crawled, discard it
1380 iMD5.GenerateMD5((unsigned char*) strUrl.c_str(), strUrl.size());
1381 strDigest = iMD5.ToString();
1382
1383 if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) {
1384 // cout << "!visited! " << strUrl << endl;
1385 return;
1386 }
1387
1388 // if already in the collection, discard it
1389 if (setUnvisitedUrlMD5.find(strDigest) != setUnvisitedUrlMD5.end()) {
1390 // cout << "!in collection! " << strUrl << endl;
1391 return;
1392 } else {
1393 pthread_mutex_lock(&mutexUnvisitedUrlMD5);
1394 setUnvisitedUrlMD5.insert(strDigest);
1395 pthread_mutex_unlock(&mutexUnvisitedUrlMD5);
1396 }
1397
1398 // add
1399 // make sure limited threads crawling on a site
1400 int cnt = 0;
1401 for (;;) {
1402 //if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){
1403
1404 if (1) {
1405 //pthread_mutex_lock(&mutexVisitedUrlMD5);
1406
1407 // if crawled, discard it :) double secure
1408 //if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {
1409 //cout << "!v! " << strUrl << endl;
1410 //pthread_mutex_unlock(&mutexVisitedUrlMD5);
1411 //return;
1412 //} else {
1413
1414 pthread_mutex_lock(&mutexVisitedUrlMD5);
1415 mmapUrls.insert(mvalType(iUrl.m_sHost, strUrl));
1416 pthread_mutex_unlock(&mutexVisitedUrlMD5);
1417 break;
1418 //}
1419 } else {
1420 cnt++;
1421 if (cnt % 100 == 0) {
1422 cout << "~";
1423 //cnt = 0;
1424 }
1425
1426 // If we have waiting so long, we may remove it
1427 if (cnt == 50000) {
1428 cout << "romove it!!!!!!!!!!!!!!!!!!!" << endl;
1429 break;
1430 }
1431 usleep(4000);
1432 }
1433
1434 }
1435
1436 }