CCrawl

 1 #ifndef _Crawl_H_031104_
 2 #define _Crawl_H_031104_
 3 
 4 //#include <openssl/md5.h>
 5 #include <zlib.h>
 6 
 7 #include "Tse.h"
 8 #include "Http.h"
 9 #include "StrFun.h"
10 #include "Url.h"
11 #include "Page.h"
12 #include "TianwangFile.h"
13 #include "IsamFile.h"
14 #include "Link4SEFile.h"
15 
16 using namespace std;
17 
18 class CCrawl
19 {
20 public:
21     string m_sInputFileName;    // name of the seed-URL file: tse_seed.pku
22     string m_sOutputFileName;    // name of the file recording the URLs we have already visited: visited.all
23 
24     CIsamFile m_isamFile;        // ISAM file handle
25 
26     ofstream m_ofsVisitedUrlFile;    // file handle for visited.all
27     ofstream m_ofsLink4SEFile;    // file handle for link4SE.url
28     ofstream m_ofsLink4HistoryFile;    // file handle for link4History.url
29     ofstream m_ofsUnreachHostFile;    // file handle for tse_unreachHost.list
30 
31     ofstream m_ofsVisitedUrlMD5File;// file handle for tse_md5.visitedurl
32     ofstream m_ofsVisitedPageMD5File;// file handle for tse_md5.visitedpage
33 
34     ofstream m_ofsUnreachUrlFile;    // unreach URL file handle
35 
36 
37 public:
38     CCrawl();// default constructor; the defaults are "tse_seed.pku" and "visited.all"
39     CCrawl(string strInputFile, string strOutputFile);
40     ~CCrawl();
41 
42     // the most important function in the CCrawl class
43     void DoCrawl();
44 
45     // fetch the page for a URL, given the URL and a socket file descriptor
46     void DownloadFile( CTianwangFile *pTianwangFile,CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock);
47 
48     // every thread routine start() calls this function
49     void fetch(void *arg);
50 
51     // add the url to the mmapUrls container (URLs waiting to be visited) if it meets the conditions
52     void AddUrl(const char *url);
53 
54     void GetVisitedUrlMD5();// load the MD5 values of visited URLs into the open list [setVisitedUrlMD5]
55     void GetVisitedPageMD5();// load the MD5 values of visited page bodies into setVisitedPageMD5
56 
57     void GetIpBlock();// load the blocked IP ranges into mapIpBlock
58 
59     void GetUnreachHostMD5();// load the unreachable hosts into setUnreachHostMD5
60     void OpenFilesForOutput();// open all output streams
61 
62     // save in the process
63     void SaveTianwangRawData(CTianwangFile *pTianwangFile,CUrl *pUrl, CPage *pPage);// store a crawled page in Tianwang format (raw page repository)
64     void SaveLink4SERawData(CLink4SEFile *pLink4SEFile,CUrl *pUrl, CPage *pPage);// extract hyperlink info from a crawled page to build the link-structure repository
65 
66     void SaveIsamRawData(CUrl *pUrl, CPage *Page);
67     void SaveVisitedUrl(string url);// save a URL that has been visited
68     void SaveUnreachHost(string host);// save an unreachable host
69     void SaveLink4SE(CPage *Page);// save hyperlink info prepared for the search engine
70     bool SaveLink4SE031121(void *arg);
71     void SaveLink4History(CPage *Page);// save hyperlink info prepared for the web-history archive
72 
73     // save while the program running
74     void SaveVisitedUrlMD5(string md5);// save the MD5 value of a visited URL
75     void SaveVisitedPageMD5(string md5);// save the MD5 value of a visited page body
76 
77 };
78 
79 #endif /* _Crawl_H_031104_ */
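A minimal usage sketch (the seed and output file names follow the defaults mentioned in the comments above; the actual TSE entry point may wire this up differently):

#include "Crawl.h"

int main() {
    // tse_seed.pku holds the seed URLs; visited.all records the URLs already crawled
    CCrawl crawl("tse_seed.pku", "visited.all");
    crawl.DoCrawl();    // spawns the worker threads and blocks until they all finish
    return 0;
}

The implementation follows.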
   1 #include "Crawl.h"
   2 #include "Url.h"
   3 #include "Md5.h"
   4 
   5 #include <list.h>
   6 #include <hlink.h>
   7 #include <uri.h>
   8 
   9 extern pthread_mutex_t mymutex;
  10 extern map<string, string> mapCacheHostLookup; // DNS cache
  11 extern vector<string> vsUnreachHost;
  12 extern char **ParseRobot(char *data, char len);
  13 
  14 set<string> setVisitedUrlMD5; // open list [MD5 values of URLs already visited]
  15 set<string> setVisitedPageMD5; // MD5 values of page bodies already visited
  16 set<string> setUnvisitedUrlMD5; // close list [MD5 values of URLs not yet visited]
  17 
  18 set<string> setUnreachHostMD5; // MD5 values of unreachable hosts
  19 
  20 multimap<string, string, less<string> > replicas; // page-body MD5 value <-> URL of that page body
  21 
  22 // define and initialize the thread mutexes
  23 pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER; // protects mmapUrls
  24 pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER; // protects setUnreachHostMD5 and m_ofsUnreachHostFile
  25 pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; // protects setUnvisitedUrlMD5
  26 pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; // protects setVisitedUrlMD5 and m_ofsVisitedUrlMD5File
  27 pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER; // protects setVisitedPageMD5 and m_ofsVisitedPageMD5File
  28 
  29 pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
  30 pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER; // protects m_ofsLink4SEFile
  31 pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER;
  32 pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER;
  33 pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER;
  34 pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER;
  35 pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER;
  36 //pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER;
  37 
  38 map<unsigned long, unsigned long> mapIpBlock; // blocked IP ranges
  39 bool b_fOver; // thread-termination flag
  40 //multimap<string,string, less<string> > mmapUrls;
  41 multimap<string, string> mmapUrls; // host of an unvisited URL <-> the URL itself
  42 
  43 typedef map<unsigned long, unsigned long>::value_type valTypeIpBlock;
  44 typedef map<string, string>::value_type mvalType;
  45 
  46 void SaveReplicas(const char* filename); // save the URLs of duplicate (mirror) pages to the named file
  47 
  48 struct package {
  49     CCrawl *crawl;
  50     CPage *page;
  51 };
  52 
  53 vector<string> vsParsedLinks;
  54 
  55 int onfind(const char *elem, const char *attr, struct uri *uri, void *arg) {
  56     struct package *p = (struct package*) arg;
  57     char buff[URL_LEN + 1];
  58 
  59     if (uri_recombine(uri, buff, URL_LEN + 1,
  60             C_SCHEME | C_AUTHORITY | C_PATH | C_QUERY) >= 0)
  61 
  62     {
  63         vsParsedLinks.push_back(buff);
  64         if (!p->page->IsFilterLink(buff)) {
  65             // accept "a,link,frame,iframe,img,area"
  66 
  67             if (strcasecmp(elem, "img") == 0) {
  68                 pthread_mutex_lock(&mutexLink4HistoryFile);
  69                 if (p->crawl->m_ofsLink4HistoryFile) {
  70                     p->crawl->m_ofsLink4HistoryFile << buff << endl;
  71                 }
  72                 pthread_mutex_unlock(&mutexLink4HistoryFile);
  73 
  74             } else {
  75                 p->crawl->AddUrl(buff);
  76             }
  77             /*
  78              else if (strcasecmp(elem, "img") == 0)
  79              {
  80              pthread_mutex_lock(&mutexLink4HistoryFile);
  81              if( p->crawl->m_ofsLink4HistoryFile ){
  82              p->crawl->m_ofsLink4HistoryFile << p->page->m_sUrl << endl;;
  83              }
  84              pthread_mutex_unlock(&mutexLink4HistoryFile);
  85              }
  86              */
  87         }
  88     }
  89 
  90     uri_destroy(uri);
  91     free(uri);
  92     return 1;
  93 }
  94 
  95 /***********************************************************************
  96  * Function name: start
  97  * Input argv:
  98  *     -- arg: the CCrawl handle
  99  * Output argv:
 100  *     --
 101  * Return:
 102  ***********************************************************************/
 103 // thread routine --> every worker thread calls fetch(void *arg)
 104 void* start(void *arg) {
 105     ((CCrawl*) arg)->fetch(arg); return NULL; // a pthread start routine must return a value; the result is unused
 106 }
 107 
 108 /*
 109  This function is placed rather cleverly (clever in design, not in how it is written):
 110  the spider needs seed URLs when it first starts crawling, and the seed-URL
 111  file of this spider is tse_seed.pku. When the program is forcibly interrupted,
 112  this function writes the URLs still waiting in mmapUrls into the
 113  tse_unvisited.url file,
 114  which extends our seed-URL collection for the next run! */
 115 void SaveUnvisitedUrl() {
 116     ofstream ofsUnvisitedUrl;
 117     ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(),
 118             ios::in | ios::out | ios::trunc | ios::binary); // open in binary mode, truncating any existing contents
 119     if (!ofsUnvisitedUrl) // open failed
 120     {
 121         cerr << "cannot open " << UNVISITED_FILE << " for output" << endl;
 122         exit(-1);
 123     }
 124 
 125     // write the URLs still waiting in mmapUrls to tse_unvisited.url, extending our seed-URL collection
 126     multimap<string, string>::iterator it = mmapUrls.begin();
 127     for (; it != mmapUrls.end(); it++) {
 128         ofsUnvisitedUrl << ((*it).second).c_str() << "\n";
 129     }
 130 
 131     ofsUnvisitedUrl << endl;
 132     ofsUnvisitedUrl.close();
 133 
 134 }
 135 
 136 /***********************************************************************
 137  * Function name: fetch
 138  * Input argv:
 139  *     -- arg: the CCrawl handle
 140  * Output argv:
 141  *     --
 142  * Return:
 143  ***********************************************************************/
 144 void CCrawl::fetch(void *arg) // every worker thread runs this function
 145         {
 146     string strUrl, host;
 147 
 148     int nGSock = -1; // socket descriptor from the previous request
 149     string strGHost = ""; // host of the previous request
 150 
 151     // create a Tianwang file for outputting the raw page data
 152     string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self()); // Tianwang.raw.<thread id>
 153     CTianwangFile tianwangFile(ofsName); // a Tianwang-format file serving as the raw page repository
 154 
 155     // create a Link4SE file for outputting the raw link data
 156     ofsName = DATA_LINK4SE_FILE + "." + CStrFun::itos(pthread_self()); // Link4SE.raw.<thread id>
 157     CLink4SEFile link4SEFile(ofsName); // the link-structure repository
 158 
 159     int iSleepCnt = 0; // counts how long this thread has been idle
 160     for (;;) {
 161         pthread_mutex_lock(&mutexCollection); // lock the collection
 162         //if( !mmapUrls.empty() ){
 163         int cnt = mmapUrls.size();
 164         if (cnt > 0) {
 165             // unvisited URLs collected so far
 166             cout << "collection has: " << cnt << " unvisited urls" << endl;
 167             multimap<string, string>::iterator it = mmapUrls.begin();
 168             if (it != mmapUrls.end()) {
 169                 // get an URL
 170                 strUrl = (*it).second; // take one URL from the queue of URLs waiting to be visited
 171 
 172                 // remove it from the collection
 173                 mmapUrls.erase(it); // erase the element the iterator points to
 174 
 175                 pthread_mutex_unlock(&mutexCollection); // unlock the collection
 176 
 177                 // parse URL
 178                 CUrl iUrl; // ParseUrlEx mainly checks that strUrl carries the http:// scheme; it returns false if not
 179                 if (iUrl.ParseUrlEx(strUrl) == false) {
 180                     cout << "ParseUrlEx error in fetch(): " << strUrl << endl;
 181                     continue;
 182                 }
 183 
 184                 // the page we are about to fetch lives on a different host than the previous one,
 185                 // so the old socket descriptor cannot be reused for the client-server exchange;
 186                 // a new socket descriptor has to be created (a consequence of looping over mixed hosts)
 187                 if (strGHost != iUrl.m_sHost) {
 188                     close(nGSock);
 189                     nGSock = -1;
 190                     strGHost = iUrl.m_sHost;
 191                 }
 192 
 193                 // fetch the page for this URL over the socket and store it in the raw page and link-structure repositories
 194                 ((CCrawl*) arg)->DownloadFile(&tianwangFile, &link4SEFile, iUrl,
 195                         nGSock);
 196 
 197                 cnt = 0;
 198             } else {
 199                 pthread_mutex_unlock(&mutexCollection);
 200             }
 201         } else {
 202             // the queue of URLs waiting to be visited (mmapUrls) is empty; the thread has to sleep and wait
 203             pthread_mutex_unlock(&mutexCollection);
 204             usleep(1000);
 205             iSleepCnt++;
 206         }
 207 
 208         if (b_fOver == true && iSleepCnt == 200) // once seeding is done and this thread has found mmapUrls empty 200 times, leave fetch()
 209             break;
 210         /*
 211          if( b_fOver == true ){
 212          break;
 213          } else if( cnt == 100 ) {
 214          cout << "w.";
 215          cnt = 0;
 216          }
 217          */
 218     }
 219 
 220     tianwangFile.Close();
 221     link4SEFile.Close();
 222 }
 223 
 224 /***********************************************************************
 225  * Function name: DownloadFile
 226  * Input argv:
 227  *     -- pTianwang: the CCrawl handle
 228  *     -- pLink4SE: the CCrawl handle
 229  *     -- iUrl: the URL for crawling
 230  *     -- nGSock: the previous global socket
 231  * Output argv:
 232  *     --
 233  * Return:
 234  ***********************************************************************/
 235 
 236 // fetch the page for a URL over the given socket and store it in the raw page repository and the link-structure repository
 237 void CCrawl::DownloadFile(CTianwangFile *pTianwangFile,
 238         CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock) {
 239     char *downloaded_file = NULL, // page body
 240             *fileHead = NULL, // page header
 241             *location = NULL; // redirect target of the page, if any
 242     int file_length = 0; // real length of the page body in bytes
 243     string strUrlLocation = ""; // holds the redirect URL of the page
 244 
 245     // if this request goes to the same host as the previous one, the previous socket descriptor can be reused, saving connection set-up time and bandwidth
 246     int nSock = nGSock; // start from the previous socket descriptor
 247 
 248     cout << "1. pid=" << pthread_self() << " sock = " << nGSock << endl;
 249 
 250     CHttp http;
 251 
 252     // this is the function that actually fetches a page: given a URL, the gathering system downloads the document it identifies
 253     file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead,
 254             &location, &nSock);
 255 
 256     int nCount = 0; // number of redirects followed; after 3 redirects we give up on this page
 257 
 258     while (file_length == -300) // the page at iUrl.m_sUrl has been redirected
 259     { // moved to another place
 260         if (strlen(location) > URL_LEN - 1 || nCount == 3
 261                 || strlen(location) == 0) {
 262             if (location) {
 263                 //pthread_mutex_lock(&mutexMemory); 
 264                 free(location);
 265                 location = NULL;
 266                 //pthread_mutex_unlock(&mutexMemory);
 267             }
 268             file_length = -1;
 269             break;
 270         }
 271 
 272         // keep the redirect URL in strUrlLocation so the next fetch can follow it
 273         strUrlLocation = location;
 274         if (location) {
 275             //pthread_mutex_lock(&mutexMemory);
 276             free(location);
 277             location = NULL;
 278             //pthread_mutex_unlock(&mutexMemory);
 279         }
 280 
 281         // note: the redirect URL may be a relative path, so it must be converted to an absolute one,
 282         // exactly as when extracting hyperlinks in the CPage class
 283         string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http");
 284 
 285         if (idx1 != 0) { // the "http://" scheme is not at the start of the redirect URL
 286 
 287             char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length() - 1);
 288             char c2 = strUrlLocation.at(0);
 289 
 290             if (c2 == '/') // the redirect URL is a root-relative path (relative to the host)
 291                     {
 292                 strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation;
 293             } else if (c1 != '/' && c2 != '/') {
 294                 string::size_type idx;
 295 
 296                 idx = iUrl.m_sUrl.rfind('/');
 297                 if (idx != string::npos) {
 298                     if (idx > 6) { // > strlen("http://..")
 299                         strUrlLocation = iUrl.m_sUrl.substr(0, idx + 1)
 300                                 + strUrlLocation;
 301                     } else {
 302                         strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
 303                     }
 304 
 305                 } else {
 306                     file_length = -1;
 307                     break;
 308                 }
 309             } else {
 310                 if (c1 == '/') {
 311                     strUrlLocation = iUrl.m_sUrl + strUrlLocation;
 312                 } else {
 313                     strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
 314                 }
 315             }
 316         }
 317 
 318         CPage iPage;
 319         if (iPage.IsFilterLink(strUrlLocation)) { // if the redirect URL is one we filter out, stop and do not fetch it
 320             file_length = -1;
 321             break;
 322         }
 323 
 324         cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl;
 325         file_length = http.Fetch(strUrlLocation, &downloaded_file, &fileHead,
 326                 &location, &nSock);
 327         nCount++;
 328     }
 329 
 330     nGSock = nSock; // hand the current socket descriptor back to the caller so it can be reused next time
 331 
 332     if (file_length == -1) { // any other error; the cause is determined inside http.Fetch()
 333         cout << "!-: " << iUrl.m_sUrl << endl;
 334         //pthread_mutex_lock(&mutexMemory);
 335         if (fileHead) {
 336             free(fileHead);
 337             fileHead = NULL;
 338         }
 339         if (downloaded_file) {
 340             free(downloaded_file);
 341             downloaded_file = NULL;
 342         }
 343         //pthread_mutex_unlock(&mutexMemory);
 344 
 345         cout << "-unreach host: " << iUrl.m_sHost << endl;
 346
 347         return;
 348     }
 349 
 350     if (file_length == -2) { // out of ip block: the host's IP falls outside the allowed range
 351         //pthread_mutex_lock(&mutexMemory);
 352         if (fileHead) {
 353             free(fileHead);
 354             fileHead = NULL;
 355         }
 356         if (downloaded_file) {
 357             free(downloaded_file);
 358             downloaded_file = NULL;
 359         }
 360         //pthread_mutex_unlock(&mutexMemory);
 361 
 362         // save unreach host
 363         SaveUnreachHost(iUrl.m_sHost);
 364 
 365         cout << "-out of block host: " << iUrl.m_sHost << endl;
 366
 367         return;
 368     }
 369 
 370     if (file_length == -3) { // invalid host or ip: the URL's host is not a valid host
 371         //pthread_mutex_lock(&mutexMemory);
 372         if (fileHead) {
 373             free(fileHead);
 374             fileHead = NULL;
 375         }
 376         if (downloaded_file) {
 377             free(downloaded_file);
 378             downloaded_file = NULL;
 379         }
 380         //pthread_mutex_unlock(&mutexMemory);
 381         cout << "-invalid host: " << iUrl.m_sHost << endl;
 382         return;
 383     }
 384 
 385     if (file_length == -4) { // MIME type is image/xxx (an image page)
 386         //pthread_mutex_lock(&mutexMemory);
 387         if (fileHead) {
 388             free(fileHead);
 389             fileHead = NULL;
 390         }
 391         if (downloaded_file) {
 392             free(downloaded_file);
 393             downloaded_file = NULL;
 394         }
 395         //pthread_mutex_unlock(&mutexMemory);
 396 
 397         if (m_ofsLink4HistoryFile) { // links kept for the web-history archive
 398             pthread_mutex_lock(&mutexLink4HistoryFile);
 399             m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;
 400             // record this URL in the link4History.url file
 401             pthread_mutex_unlock(&mutexLink4HistoryFile);
 402         }
 403 
 404         cout << "-image host: " << iUrl.m_sHost << endl;
 405         return;
 406     }
 407 
 408     /* still experimental
 409      char **dir;
 410      dir =  ParseRobot( downloaded_file, file_length);
 411      for( int i = 0; dir[i] != NULL ; i++){
 412      cout << dir[i] << endl;
 413      free( dir[i] );
 414      }
 415 
 416      exit(1);
 417      */
 418 
 419     // too small, probably useless info; skipped
 420     //if(file_length < 40){    // for ImgSE, 
 421     /*
 422      if(file_length < 256){    // for SE
 423      //pthread_mutex_lock(&mutexMemory);
 424      if (fileHead)
 425      {
 426      free(fileHead); fileHead=NULL;
 427      }
 428      if (downloaded_file)
 429      {
 430      free(downloaded_file); downloaded_file=NULL;
 431      }
 432      //pthread_mutex_unlock(&mutexMemory);
 433      cout << "#";
 434      return;
 435      }
 436      */
 437 
 438     // deal with normal page
 439 
 440     // deal with a normal page [if either the header or the body is NULL, the page is not considered normal]
 441     if (!fileHead || !downloaded_file) // failed to obtain the page header or the page body
 442             {
 443         //pthread_mutex_lock(&mutexMemory);
 444         if (fileHead) {
 445             free(fileHead);
 446             fileHead = NULL;
 447         }
 448         if (downloaded_file) {
 449             free(downloaded_file);
 450             downloaded_file = NULL;
 451         }
 452         //pthread_mutex_unlock(&mutexMemory);
 453         close(nGSock);
 454         nGSock = -1;
 455         cout << "-size0 host: " << iUrl.m_sHost << endl;
 456         return;
 457     }
 458 
 459     // important: wrap all the information fetched for this page in a CPage object
 460     CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file,
 461             file_length);
 462     //pthread_mutex_lock(&mutexMemory);
 463     if (fileHead) {
 464         free(fileHead);
 465         fileHead = NULL;
 466     }
 467     if (downloaded_file) {
 468         free(downloaded_file);
 469         downloaded_file = NULL;
 470     }
 471     //pthread_mutex_unlock(&mutexMemory);
 472 
 473     // parse the page header
 474     iPage.ParseHeaderInfo(iPage.m_sHeader);
 475 
 476     if (iPage.m_bConnectionState == false) {
 477         close(nGSock);
 478         nGSock = -1;
 479     }
 480 
 481     // when crawling images for ImgSE, remember to comment the paragraph
 482     // when crawling plain text for SE, remember to open the paragraph
 483     // paragraph begin
 484 
 485     // iPage.m_sContentType != "text/css" &&
 486 
 487     // filter out pages whose content type is not one we want
 488     if (iPage.m_sContentType != "text/html"
 489             && iPage.m_sContentType != "text/plain"
 490             && iPage.m_sContentType != "text/xml"
 491             && iPage.m_sContentType != "application/msword"
 492             && iPage.m_sContentType != "application/pdf"
 493             && iPage.m_sContentType != "text/rtf"
 494             && iPage.m_sContentType != "application/postscript"
 495             && iPage.m_sContentType != "application/vnd.ms-excel"
 496             && iPage.m_sContentType != "application/vnd.ms-powerpoint") {
 497 
 498         cout << "-unwanted type host: " << iUrl.m_sHost << endl;
 499         return;
 500     }
 501 
 502     // paragraph end
 503 
 504     // decompression begins
 505     // if the body is gzip-encoded it must be decompressed before hyperlinks can be extracted;
 506     // portal front pages keep growing, so they are often served gzip-compressed to speed up transfer
 507     char sUnzipContent[1024000]; // ~1000 KB buffer (just under 1 MB) for the decompressed body
 508     int nUnzipLength = 0;
 509     if (iPage.m_sContentEncoding == "gzip"
 510             && iPage.m_sContentType == "text/html") {
 511 
 512         gzFile zip;
 513         // a temporary file: the still-compressed page body is written into it first
 514         string ofsGzipName;
 515 
 516         ofsGzipName = CStrFun::itos(pthread_self()) + ".gz";
 517 
 518         // open the file in binary mode, truncated
 519         // ios::trunc: if the file exists its length is cut to 0 and its contents are cleared; if it does not exist it is created
 520         ofstream ofsDownloadFile(ofsGzipName.c_str(), ios::trunc | ios::binary);
 521 
 522         cout << "file_length: " << file_length << endl;
 523         ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent);
 524         ofsDownloadFile.close();
 525 
 526         zip = gzopen(ofsGzipName.c_str(), "rb");
 527         if (zip == NULL) {
 528             cout << "Open zip file " << ofsGzipName.c_str() << " error."
 529                     << endl;
 530             exit(-1);
 531         }
 532 
 533         // decompress: the decompressed page body goes into the buffer sUnzipContent
 534         nUnzipLength = gzread(zip, sUnzipContent, sizeof(sUnzipContent) - 1); // leave room for the trailing '\0'
 535         if (nUnzipLength == -1) {
 536             cout << "Read zip file " << ofsGzipName.c_str() << " error."
 537                     << endl;
 538             exit(-1);
 539         }
 540 
 541         sUnzipContent[nUnzipLength] = 0;
 542 
 543         gzclose(zip);
 544 
 545         // (disabled) overwrite the compressed body with the decompressed one:
 546         //iPage.m_sContent.assign(sUnzipContent,nUnzipLength);
 547         //iPage.m_nLenContent=nUnzipLength;
 548     }
 549     // decompression ends
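    /* A possible refinement (an untested sketch, not part of TSE): the round trip
     * through a temporary .gz file above can be avoided by feeding the body straight
     * to zlib's inflate with a gzip window (16 + MAX_WBITS). The helper name
     * GunzipInMemory is hypothetical.
     *
     * static bool GunzipInMemory(const string &in, string &out) {
     *     z_stream strm;
     *     memset(&strm, 0, sizeof(strm));
     *     if (inflateInit2(&strm, 16 + MAX_WBITS) != Z_OK) // expect a gzip header
     *         return false;
     *     strm.next_in = (Bytef*) in.data();
     *     strm.avail_in = (uInt) in.size();
     *     char buf[16384];
     *     int ret = Z_OK;
     *     do {
     *         strm.next_out = (Bytef*) buf;
     *         strm.avail_out = sizeof(buf);
     *         ret = inflate(&strm, Z_NO_FLUSH);
     *         if (ret != Z_OK && ret != Z_STREAM_END) {
     *             inflateEnd(&strm);      // truncated or corrupt gzip data
     *             return false;
     *         }
     *         out.append(buf, sizeof(buf) - strm.avail_out); // copy what inflate produced
     *     } while (ret != Z_STREAM_END);
     *     inflateEnd(&strm);
     *     return true;
     * }
     */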
 550 
 551     CMD5 iMD5;
 552     string strDigest;
 553 
 554     /////////////////////////////
 555     // because we made sure the URLs in setVisitedUrlMD5 are distinct
 556     // (they were checked before being inserted into the collection),
 557     // we could insert directly here.  however...
 558     //iMD5.GenerateMD5( (unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length() );
 559 
 560     // check whether this URL is already in the open list [setVisitedUrlMD5]: if so, return; if not, add it and save it
 561     iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),iUrl.m_sUrl.length());
 562     strDigest = iMD5.ToString();
 563 
 564     pthread_mutex_lock(&mutexVisitedUrlMD5);
 565     if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) // crawled already
 566             {
 567         cout << "!vurl: "; //1.crawled already
 568         pthread_mutex_unlock(&mutexVisitedUrlMD5);
 569         return;
 570     }
 571 
 572     // the URL is not in setVisitedUrlMD5 yet, so insert it now,
 573     // because from this point on it counts as visited
 574     setVisitedUrlMD5.insert(strDigest);
 575     SaveVisitedUrlMD5(strDigest);
 576     pthread_mutex_unlock(&mutexVisitedUrlMD5);
 577 
 578     /////////////////////////////
 579     // whether it is a visited page
 580     // for ImgSE, should comment this paragraph
 581     // for SE, should uncomment this paragraph
 582 
 583     // begin
 584 
 585     // check whether this page body has been visited before: if so, return; if not, add it to setVisitedPageMD5
 586     iMD5.GenerateMD5((unsigned char*) iPage.m_sContent.c_str(),iPage.m_sContent.length());
 587     strDigest = iMD5.ToString();
 588     pthread_mutex_lock(&mutexVisitedPageMD5);
 589     // record the page-body MD5 <-> URL pair in the replicas container
 590     replicas.insert(pair<string, string>(strDigest, iPage.m_sUrl));
 591     if (setVisitedPageMD5.find(strDigest) != setVisitedPageMD5.end()) // already in setVisitedPageMD5: a mirror (duplicate) page
 592             {
 593         cout << "!vpage: "; // crawled already
 594         pthread_mutex_unlock(&mutexVisitedPageMD5);
 595         return;
 596     }
 597     setVisitedPageMD5.insert(strDigest);
 598 
 599     SaveVisitedPageMD5(strDigest);
 600     pthread_mutex_unlock(&mutexVisitedPageMD5);
 601 
 602     // end
 603 
 604     cout << "+";
 605 
 606     ////////////////////
 607     // save as Tianwang format
 608     // store the crawled page in the raw page repository in Tianwang format
 609     SaveTianwangRawData(pTianwangFile, &iUrl, &iPage);
 610 
 611     ////////////////////
 612     // save visited Urls
 613     if (iPage.m_sLocation.length() < 1) {
 614         SaveVisitedUrl(iUrl.m_sUrl);
 615     } else {
 616         SaveVisitedUrl(iPage.m_sLocation);
 617     }
 618 
 619     //return;    // just crawl seeds
 620 
 621     /////////////////////////////////////
 622     // Parse hyperlinks
 623     if (iPage.m_sContentType != "text/html") { // we can only find links in text/html
 624         return;
 625     }
 626 
 627     /*
 628      if (iPage.ParseHyperLinks() == false){
 629      return;
 630      }
 631 
 632      SaveLink4SE( &iPage);
 633      SaveLink4History( &iPage);
 634 
 635      map<string,string>::iterator it4SE = iPage.m_mapLink4SE.begin();
 636      string str;
 637      for( ; it4SE!= iPage.m_mapLink4SE.end(); ++it4SE ){
 638      str = (*it4SE).first;
 639      AddUrl( str.c_str() );
 640 
 641      }
 642      */
 643     // using XIE Han's link parser
 644 
 645     struct uri page_uri;
 646     //FILE *tmp;
 647 
 648     //tmp = tmpfile();
 649 
 650     //fwrite(iPage.m_sContent.c_str(), iPage.m_nLenContent, 1, tmp);
 651     //fseek(tmp, 0, SEEK_SET);
 652     //fclose(tmp);
 653 
 654     pthread_mutex_lock(&mutexDetect);
 655 
 656     if (iPage.m_sLocation.empty()) {
 657         uri_parse_string(iPage.m_sUrl.c_str(), &page_uri);
 658     } else {
 659         uri_parse_string(iPage.m_sLocation.c_str(), &page_uri);
 660     }
 661 
 662     struct package p = { this, &iPage };
 663     //hlink_detect(tmp, &page_uri, onfind, &p);
 664 
 665     hlink_detect_string(iPage.m_sContent.c_str(), &page_uri, onfind, &p);
 666 
 667     struct file_arg pLinks = { &iUrl, &iPage };
 668 
 669     SaveLink4SE031121(&pLinks);
 670 
 671     // save as Link4SE format
 672     //SaveLink4SERawData(pLink4SEFile, &iUrl, &iPage);
 673 
 674     pthread_mutex_unlock(&mutexDetect);
 675 
 676     uri_destroy(&page_uri);
 677     cout << "Parse End......" << endl;
 678 
 679     return;
 680 }
 681 
 682 void SaveReplicas(const char* filename) {
 683     //ofstream ofs(filename, ios::out|ios::app);
 684     ofstream ofs(filename, ios::out | ios::binary | ios::app);
 685     if (!ofs) {
 686         cout << "error opening file " << filename << endl;
 687     }
 688     string md5;
 689 
 690     pthread_mutex_lock(&mutexReplicas);
 691     multimap<string, string, less<string> >::const_iterator it;
 692     ostringstream *oss = 0;
 693     int i = 0;
 694     for (it = replicas.begin(); it != replicas.end(); it++) {
 695         if (!md5.empty() && md5 != it->first) {
 696             if (i >= 2)
 697                 ofs << (*oss).str() << endl;
 698             //pthread_mutex_lock(&mutexMemory);
 699             delete (oss);
 700             oss = new ostringstream;
 701             //pthread_mutex_unlock(&mutexMemory);
 702             (*oss) << it->first << endl;
 703             i = 0;
 704             md5 = it->first;
 705         } else if (md5.empty()) {
 706             md5 = it->first;
 707             //pthread_mutex_lock(&mutexMemory);
 708             oss = new ostringstream;
 709             //pthread_mutex_unlock(&mutexMemory);
 710             (*oss) << it->first << endl;
 711             i = 0;
 712         }
 713         if (oss != 0)
 714             (*oss) << it->second << endl;
 715         i++;
 716     }
 717 
 718     pthread_mutex_unlock(&mutexReplicas);
 719 }
 720 
 721 ////////////////////////////////////////////////////////////////////////////
 722 // Construction/Destruction
 723 ////////////////////////////////////////////////////////////////////////////
 724 
 725 CCrawl::CCrawl() {
 726 }
 727 
 728 CCrawl::CCrawl(string inputFileName, string outputFileName) {
 729     m_sInputFileName = inputFileName;
 730     m_sOutputFileName = outputFileName; // + ".txt"
 731 }
 732 
 733 CCrawl::~CCrawl() {
 734     m_ofsVisitedUrlFile.close();
 735     m_ofsLink4SEFile.close();
 736     m_ofsLink4HistoryFile.close();
 737     m_isamFile.Close();
 738     m_ofsVisitedUrlMD5File.close();
 739     m_ofsVisitedPageMD5File.close();
 740 }
 741 
 742 /*****************************************************************
 743  ** Function name: SigTerm
 744  ** Input argv:
 745  **      --
 746  ** Output argv:
 747  **      --
 748  ** Return:
 749  ** Function Description: signal function
 750  ** Version: 1.0
 751  ** Be careful:
 752  *****************************************************************/
 753 static void SigTerm(int x) {// signal handler
 754     SaveUnvisitedUrl();
 755     SaveReplicas("repli");
 756 
 757     cout << "Terminated!" << endl;
 758     exit(0);
 759 }
 760 
 761 void CCrawl::GetVisitedUrlMD5() {// load the MD5 values of visited URLs into the open list [setVisitedUrlMD5]
 762     ifstream ifsMD5(URL_MD5_FILE.c_str(), ios::binary);
 763     if (!ifsMD5) {
 764         //cerr << "did not find " << UrlMD5_FILE << " for iutput" << endl;
 765         return;
 766     }
 767 
 768     string strMD5;
 769     while (getline(ifsMD5, strMD5)) {
 770         setVisitedUrlMD5.insert(strMD5);
 771     }
 772 
 773     ifsMD5.close();
 774     cout << "got " << setVisitedUrlMD5.size() << " md5 values of visited urls"
 775             << endl;
 776 }
 777 
 778 void CCrawl::GetVisitedPageMD5() {// load the MD5 values of visited page bodies into setVisitedPageMD5
 779     ifstream ifsMD5(PAGE_MD5_FILE.c_str(), ios::binary);
 780     if (!ifsMD5) {
 781         //cerr << "did not find " << PageMD5_FILE << " for iutput" << endl;
 782         return;
 783     }
 784 
 785     string strMD5;
 786     while (getline(ifsMD5, strMD5)) {
 787         setVisitedPageMD5.insert(strMD5);
 788     }
 789 
 790     ifsMD5.close();
 791     cout << "got " << setVisitedPageMD5.size() << " md5 values of visited pages"
 792             << endl;
 793 }
 794 
 795 void CCrawl::GetIpBlock() {// load the blocked IP ranges into the mapIpBlock container
 796     ifstream ifsIpBlock(IP_BLOCK_FILE.c_str());
 797     if (!ifsIpBlock) {
 798         //cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl;
 799         return;
 800     }
 801     string strIpBlock;
 802     while (getline(ifsIpBlock, strIpBlock)) {
 803         if (strIpBlock[0] == '\0' || strIpBlock[0] == '#'
 804                 || strIpBlock[0] == '\n') {
 805 
 806             continue;
 807         }
 808 
 809         char buf1[64], buf2[64];
 810 
 811         buf1[0] = '\0';
 812         buf2[0] = '\0';
 813         sscanf(strIpBlock.c_str(), "%s %s", buf1, buf2);
 814 
 815         mapIpBlock.insert(valTypeIpBlock(inet_addr(buf1), inet_addr(buf2)));
 816     }
 817     ifsIpBlock.close();
 818 
 819 }
 820 
 821 void CCrawl::GetUnreachHostMD5() {// load the unreachable hosts into setUnreachHostMD5
 822     //vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM);
 823     ifstream ifsUnreachHost(UNREACH_HOST_FILE.c_str());
 824     if (!ifsUnreachHost) {
 825         cerr << "Cannot open " << UNREACH_HOST_FILE << " for input." << endl;
 826         return;
 827     }
 828 
 829     string strUnreachHost;
 830     //int i=0;
 831     while (getline(ifsUnreachHost, strUnreachHost)) {
 832         if (strUnreachHost[0] == '\0' || strUnreachHost[0] == '#'
 833                 || strUnreachHost[0] == '\n') {
 834 
 835             continue;
 836         }
 837 
 838         CStrFun::Str2Lower(strUnreachHost, strUnreachHost.size());
 839         //vsUnreachHost.push_back(strUnreachHost);
 840         CMD5 iMD5;
 841         iMD5.GenerateMD5((unsigned char*) strUnreachHost.c_str(),
 842                 strUnreachHost.size());
 843         string strDigest = iMD5.ToString();
 844         setUnreachHostMD5.insert(strDigest);
 845         //i++;
 846         //if(i == MAX_UNREACHABLE_HOST_NUM) break;
 847     }
 848 
 849     ifsUnreachHost.close();
 850 
 851 }
 852 
 853 /**************************************************************************************
 854  *  Function name: SaveTianwangRawData
 855  *  Input argv:
 856  *      --    pTianwangFile: tianwang file handle
 857  *      --    pUrl: url
 858  *      --    pPage: web page
 859  *  Output argv:
 860  *      --
 861  *  Return:
 862  *  Function Description: save raw page data as tianwang file
 863  **************************************************************************************/
 864 void CCrawl::SaveTianwangRawData(CTianwangFile *pTianwangFile, CUrl *pUrl,CPage *pPage) {// store a crawled page in Tianwang format
 865     if (!pTianwangFile || !pUrl || !pPage) {
 866         return;
 867     }
 868 
 869     file_arg arg;
 870     arg.pUrl = pUrl;
 871     arg.pPage = pPage;
 872 
 873     // each thread writes to its own file, so no mutex is needed
 874     pTianwangFile->Write((void*) &arg);
 875 }
 876 
 877 /**************************************************************************************
 878  *  Function name: SaveLink4SERawData
 879  *  Input argv:
 880  *      --    pLink4SEFile: link4SE file handle
 881  *      --    pUrl: url
 882  *      --    pPage: web page
 883  *  Output argv:
 884  *      --
 885  *  Return:
 886  *  Function Description: save the raw link data as a link4SE file
 887  **************************************************************************************/
 888 void CCrawl::SaveLink4SERawData(CLink4SEFile *pLink4SEFile, CUrl *pUrl,
 889         CPage *pPage) { // extract hyperlink info from the crawled page to build the link-structure repository
 890     if (!pLink4SEFile || !pUrl || !pPage) {
 891         return;
 892     }
 893 
 894     file_arg arg;
 895     arg.pUrl = pUrl;
 896     arg.pPage = pPage;
 897 
 898     // each thread writes to its own file, so no mutex is needed
 899     pLink4SEFile->Write((void*) &arg);
 900 }
 901 
 902 /**************************************************************************************
 903  *  Function name: SaveIsamRawData
 904  *  Input argv:
 905  *      --    pUrl: url
 906  *      --    pPage: web page
 907  *  Output argv:
 908  *      --
 909  *  Return:
 910  *  Function Description: save raw page data as ISAM file
 911  **************************************************************************************/
 912 void CCrawl::SaveIsamRawData(CUrl *pUrl, CPage *pPage) {
 913     if (!pUrl || !pPage) {
 914         return;
 915     }
 916 
 917     file_arg arg;
 918     arg.pUrl = pUrl;
 919     arg.pPage = pPage;
 920 
 921     pthread_mutex_lock(&mutexIsamFile);
 922 
 923     m_isamFile.Write((void *) &arg);
 924 
 925     pthread_mutex_unlock(&mutexIsamFile);
 926 }
 927 
 928 /**************************************************************************************
 929  *  Function name: SaveVisitedUrl
 930  *  Input argv:
 931  *      --    url: url
 932  *  Output argv:
 933  *      --
 934  *  Return:
 935  *  Function Description: save raw the Visited Url
 936  **************************************************************************************/
 937 void CCrawl::SaveVisitedUrl(string url) {
 938     if (m_ofsVisitedUrlFile) {
 939         pthread_mutex_lock(&mutexVisitedUrlFile);
 940 
 941         m_ofsVisitedUrlFile << url << endl;
 942 
 943         pthread_mutex_unlock(&mutexVisitedUrlFile);
 944     }
 945 }
 946 
 947 void CCrawl::SaveUnreachHost(string host) {
 948     CMD5 iMD5;
 949     iMD5.GenerateMD5((unsigned char*) host.c_str(), host.size());
 950     string strDigest = iMD5.ToString();
 951     if (setUnreachHostMD5.find(strDigest) == setUnreachHostMD5.end()) {
 952         pthread_mutex_lock(&mutexUnreachHost);
 953 
 954         setUnreachHostMD5.insert(strDigest);
 955         if (m_ofsUnreachHostFile) {
 956             m_ofsUnreachHostFile << host << endl;
 957         }
 958 
 959         pthread_mutex_unlock(&mutexUnreachHost);
 960     }
 961 }
 962 
 963 void CCrawl::SaveLink4SE(CPage *iPage) {
 964     if (m_ofsLink4SEFile && iPage->m_nRefLink4SENum > 0) {
 965         pthread_mutex_lock(&mutexLink4SEFile);
 966 
 967         m_ofsLink4SEFile << "root_url: " << iPage->m_sUrl << endl;
 968         m_ofsLink4SEFile << "charset: " << iPage->m_sCharset << endl;
 969         m_ofsLink4SEFile << "number: " << iPage->m_nRefLink4SENum << endl;
 970         m_ofsLink4SEFile << "link_anchortext: " << endl;
 971 
 972         map<string, string>::iterator it4SE = iPage->m_mapLink4SE.begin();
 973         for (; it4SE != iPage->m_mapLink4SE.end(); ++it4SE) {
 974 
 975             m_ofsLink4SEFile << (*it4SE).first << '\t' << (*it4SE).second
 976                     << endl;
 977
 978 
 979         }
 980 
 981         pthread_mutex_unlock(&mutexLink4SEFile);
 982     }
 983 }
 984 
 985 bool CCrawl::SaveLink4SE031121(void *arg) {
 986     if (!arg || !m_ofsLink4SEFile)
 987         return false;
 988 
 989     //pthread_mutex_lock(&mutexLink4SEFile);
 990 
 991     if (vsParsedLinks.size() == 0)
 992         return false;
 993 
 994     file_arg *pFile = (file_arg *) arg;
 995 
 996     CUrl *iUrl = pFile->pUrl;
 997     CPage *iPage = pFile->pPage;
 998 
 999     char strDownloadTime[128];
1000     time_t tDate;
1001 
1002     memset(strDownloadTime, 0, 128);
1003     time(&tDate);
1004     strftime(strDownloadTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
1005 
1006     string links;
1007     vector<string>::iterator it = vsParsedLinks.begin();
1008     for (; it != vsParsedLinks.end(); ++it) {
1009         links = links + *it + "\n";
1010     }
1011 
1012     m_ofsLink4SEFile << "version: 1.0\n";
1013     if (iPage->m_sLocation.size() == 0) {
1014         m_ofsLink4SEFile << "url: " << iPage->m_sUrl;
1015     } else {
1016         m_ofsLink4SEFile << "url: " << iPage->m_sLocation;
1017         m_ofsLink4SEFile << "\norigin: " << iUrl->m_sUrl;
1018     }
1019 
1020     m_ofsLink4SEFile << "\ndate: " << strDownloadTime;
1021 
1022     if (mapCacheHostLookup.find(iUrl->m_sHost) == mapCacheHostLookup.end()) {
1023         m_ofsLink4SEFile << "\nip: " << iUrl->m_sHost;
1024     } else {
1025         m_ofsLink4SEFile << "\nip: "
1026                 << (*(mapCacheHostLookup.find(iUrl->m_sHost))).second;
1027     }
1028 
1029     m_ofsLink4SEFile << "\noutdegree: " << vsParsedLinks.size();
1030     m_ofsLink4SEFile << "\nlength: " << iPage->m_nLenHeader + links.size() + 1
1031             << "\n\n" << iPage->m_sHeader << "\n";
1032     m_ofsLink4SEFile << links;
1033     m_ofsLink4SEFile << endl;
1034 
1035     vsParsedLinks.clear();
1036     //pthread_mutex_unlock(&mutexLink4SEFile);
1037 
1038     return true;
1039 }
1040 
1041 // not well
1042 void CCrawl::SaveLink4History(CPage *iPage) {// save hyperlink info prepared for the web-history archive
1043     if (m_ofsLink4HistoryFile && iPage->m_nRefLink4HistoryNum > 0) {
1044         pthread_mutex_lock(&mutexLink4HistoryFile);
1045 
1046         //m_ofsLink4HistoryFile << "root_url: " << iPage->m_sUrl << endl;
1047         //m_ofsLink4HistoryFile << "charset: " << iPage->m_sCharset << endl;    
1048         //m_ofsLink4HistoryFile << "number: " << iPage->m_nRefLink4HistoryNum << endl;
1049         //m_ofsLink4HistoryFile << "link: " << endl;
1050 
1051         vector<string>::iterator it4History = iPage->m_vecLink4History.begin();
1052         for (; it4History != iPage->m_vecLink4History.end(); ++it4History) {
1053             string s = *it4History;
1054             m_ofsLink4HistoryFile << s << endl;
1055         }
1056 
1057         pthread_mutex_unlock(&mutexLink4HistoryFile);
1058     }
1059 }
1060 
1061 /**************************************************************************************
1062  *  Function name: SaveVisitedUrlMd5
1063  *  Input argv:
1064  *      --    md5: page md5 value
1065  *  Output argv:
1066  *      --
1067  *  Return:
1068  *  Function Description: save the visited url Md5
1069  **************************************************************************************/
1070 void CCrawl::SaveVisitedUrlMD5(string md5) {
1071     if (m_ofsVisitedUrlMD5File) {
1072         m_ofsVisitedUrlMD5File << md5 << endl;
1073     }
1074 }
1075 
1076 /**************************************************************************************
1077  *  Function name: SaveVisitedPageMd5
1078  *  Input argv:
1079  *      --    md5: page md5 value
1080  *  Output argv:
1081  *      --
1082  *  Return:
1083  *  Function Description: save the visited page Md5
1084  **************************************************************************************/
1085 void CCrawl::SaveVisitedPageMD5(string md5) {
1086     if (m_ofsVisitedPageMD5File) {
1087         m_ofsVisitedPageMD5File << md5 << endl;
1088     }
1089 }
1090 
1091 /**************************************************************************************
1092  *  Function name: OpenFileForOutput
1093  *  Input argv:
1094  *      --
1095  *  Output argv:
1096  *      --
1097  *  Return:
1098  *  Function Description: Open the files for output
1099  **************************************************************************************/
1100 void CCrawl::OpenFilesForOutput() {
1101     // open isam file for output
1102     m_isamFile.Open(DATA_FILE_NAME, INDEX_FILE_NAME);
1103 
1104     // open visited.url file for output
1105     m_ofsVisitedUrlFile.open(m_sOutputFileName.c_str(),
1106             ios::out | ios::app | ios::binary);
1107     if (!m_ofsVisitedUrlFile) {
1108         cerr << "cannot open " << VISITED_FILE << " for output\n" << endl;
1109     }
1110 
1111     // open link4SE.url file for output
1112     m_ofsLink4SEFile.open(LINK4SE_FILE.c_str(),
1113             ios::out | ios::app | ios::binary);
1114     if (!m_ofsLink4SEFile) {
1115         cerr << "cannot open " << LINK4SE_FILE << " for output\n" << endl;
1116     }
1117 
1118     // open link4History.url file for output
1119     m_ofsLink4HistoryFile.open(LINK4History_FILE.c_str(),
1120             ios::out | ios::app | ios::binary);
1121     if (!m_ofsLink4HistoryFile) {
1122         cerr << "cannot open " << LINK4History_FILE << " for output\n" << endl;
1123     }
1124 
1125     // open unreach host file for output
1126     m_ofsUnreachHostFile.open(UNREACH_HOST_FILE.c_str(),
1127             ios::out | ios::app | ios::binary);
1128     if (!m_ofsUnreachHostFile) {
1129         cerr << "cannot open " << UNREACH_HOST_FILE << " for output\n" << endl;
1130     }
1131 
1132     // open visited url md5 file for output
1133     m_ofsVisitedUrlMD5File.open(URL_MD5_FILE.c_str(),
1134             ios::out | ios::app | ios::binary);
1135     if (!m_ofsVisitedUrlMD5File) {
1136         cerr << "cannot open " << URL_MD5_FILE << " for output\n" << endl;
1137     }
1138 
1139     // open visited page md5 file for output
1140     m_ofsVisitedPageMD5File.open(PAGE_MD5_FILE.c_str(),
1141             ios::out | ios::app | ios::binary);
1142     if (!m_ofsVisitedPageMD5File) {
1143         cerr << "cannot open " << PAGE_MD5_FILE << " for output\n" << endl;
1144     }
1145 }
1146 
1147 /***************************************************************************************
1148  *  Function name: DoCrawl
1149  *  Input argv:
1150  *      --
1151  *  Output argv:
1152  *      --
1153  *  Return:
1154  *  Function Description: the main function for crawl
1155  *  Be careful:
1156  ***************************************************************************************/
1157 void CCrawl::DoCrawl() {// the top-level control function of the CCrawl class
1158     /* set the signal function */
1159     signal(SIGTERM, SigTerm);
1160     signal(SIGKILL, SigTerm); // note: SIGKILL cannot be caught, so this registration has no effect
1161     signal(SIGINT, SigTerm);
1162     signal(SIGPIPE, SIG_IGN);
1163     signal(SIGCHLD, SIG_IGN);
1164 
1165     // output the begin time
1166     char strTime[128];
1167     time_t tDate;
1168 
1169     memset(strTime, 0, 128);
1170     time(&tDate);
1171     strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
1172     cout << "\n\nBegin at: " << strTime << "\n\n";
1173 
1174     // get the other info from file
1175     GetVisitedUrlMD5();
1176     GetVisitedPageMD5();
1177 
1178     GetIpBlock();
1179 
1180     GetUnreachHostMD5();
1181 
1182     // open the seed url file
1183     ifstream ifsSeed(m_sInputFileName.c_str());
1184     if (!ifsSeed) {
1185         cerr << "Cannot open " << m_sInputFileName << " for input\n";
1186         return;
1187     }
1188 
1189     // open the files for output
1190     OpenFilesForOutput();
1191 
1192     // Create thread ID structures. 
1193     pthread_t *tids = (pthread_t*) malloc(NUM_WORKERS * sizeof(pthread_t));
1194     if (tids == NULL) {
1195         cerr << "malloc error" << endl;
1196     }
1197 
1198     for (unsigned int i = 0; i < NUM_WORKERS; i++) {
1199         if (pthread_create(&tids[i], NULL, start, this))
1200             cerr << "create threads error" << endl;
1201     }
1202 
1203     string strUrl;
1204     CPage iCPage;
1205     while (getline(ifsSeed, strUrl)) {
1206         string::size_type idx;
1207 
1208         if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') {
1209             continue;
1210         }
1211 
1212         idx = strUrl.find('\t');
1213         if (idx != string::npos) {
1214             strUrl = strUrl.substr(0, idx);
1215         }
1216 
1217         //idx = strUrl.find("http");
1218         idx = CStrFun::FindCase(strUrl, "http");
1219         if (idx == string::npos) {
1220             //continue;
1221             idx = strUrl.find('/');
1222             if (idx == string::npos) {
1223                 strUrl = "http://" + strUrl + "/";
1224             } else {
1225                 strUrl = "http://" + strUrl;
1226             }
1227         }
1228 
1229         //if( strUrl.length() < 8 ) continue;
1230 
1231         if (iCPage.IsFilterLink(strUrl))
1232             continue;
1233         AddUrl(strUrl.c_str());
1234     }
1235 
1236     // Get the unvisited URL
1237     ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str());
1238     if (ifsUnvisitedUrl) {
1239         while (getline(ifsUnvisitedUrl, strUrl)) {
1240             string::size_type idx;
1241 
1242             if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') {
1243                 continue;
1244             }
1245 
1246             idx = strUrl.find('\t');
1247             if (idx != string::npos) {
1248                 strUrl = strUrl.substr(0, idx);
1249             }
1250 
1251             // filter invalid urls
1252             if (iCPage.IsFilterLink(strUrl))
1253                 continue;
1254 
1255             AddUrl(strUrl.c_str());
1256         }
1257     } else {
1258         //cerr << "Cannot open " << UNVISITED_FILE << " for input\n";
1259     }
1260 
1261     // sleep(30);
1262     b_fOver = true;
1263     cout << "finished to get all unvisited urls." << endl;
1264 
1265     // Wait for the threads. 
1266     for (unsigned int i = 0; i < NUM_WORKERS; ++i) {
1267         (void) pthread_join(tids[i], NULL);
1268     }
1269 
1270     cout << "closed " << NUM_WORKERS << " threads." << endl;
1271 
1272     SaveUnvisitedUrl();
1273     SaveReplicas("repli");
1274 
1275     memset(strTime, 0, 128);
1276     time(&tDate);
1277     strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
1278     cout << "\n\nEnd at: " << strTime << "\n\n";
1279 }
1280 
1281 /*****************************************************************
1282  ** Function name: AddUrl
1283  ** Input argv:
1284  **      --
1285  ** Output argv:
1286  **      --
1287  ** Return:
1288  ** Function Description: Add a parsed url into the collection
1289  ** Version: 1.0
1290  ** Be careful:   An important function!!!
1291  *****************************************************************/
1292 void CCrawl::AddUrl(const char * url) {
1293     string strUrl = url;
1294     if (strUrl.empty() || strUrl.size() < 8) { //invalid url
1295         cout << "!so small!" << strUrl << endl;
1296         return;
1297     }
1298 
1299     CPage iCPage;
1300     if (iCPage.NormalizeUrl(strUrl) == false) {
1301         // cout << "!normalize fail!" << strUrl << endl;
1302         return;
1303     }
1304 
1305     CUrl iUrl;
1306 
1307     // for ImgSE, comment the paragraph
1308     // if image/xxx url, store it to link4History.url
1309     // begin
1310     if (iUrl.IsImageUrl(strUrl)) {
1311         if (m_ofsLink4HistoryFile) {
1312             pthread_mutex_lock(&mutexLink4HistoryFile);
1313             m_ofsLink4HistoryFile << strUrl << endl;
1314
1315             pthread_mutex_unlock(&mutexLink4HistoryFile);
1316         }
1317         return;
1318     }
1319     // end
1320 
1321     if (iUrl.ParseUrlEx(strUrl) == false) {
1322         cout << "ParseUrlEx error in AddUrl(): " << strUrl << endl;
1323         return;
1324     }
1325 
1326     // if it is an invalid host, discard it
1327     if (iUrl.IsValidHost(iUrl.m_sHost.c_str()) == false) {
1328         cout << "!invalid host: " << iUrl.m_sHost << endl;
1329         return;
1330     }
1331 
1332     // filter foreign hosts
1333     if (iUrl.IsForeignHost(iUrl.m_sHost)) {
1334         cout << "!foreign hosts: " << iUrl.m_sHost << endl;
1335         return;
1336     }
1337 
1338     // if it is a block ip, discard it
1339     // this work is left in the CreatSocket()
1340     // because the work of getting ip is inevitable in the CreatSocket function
1341     //     and this work is expensive
1342     // if it is an unreach host, discard it
1343     // here we only deal with numbers-and-dots notations
1344     unsigned long inaddr = 0;
1345     char *ip = NULL;
1346 
1347     inaddr = (unsigned long) inet_addr(iUrl.m_sHost.c_str());
1348     if (inaddr != INADDR_NONE) { // host is just ip
1349         //pthread_mutex_lock(&mutexMemory);
1350         ip = new char[iUrl.m_sHost.size() + 1];
1351         //pthread_mutex_unlock(&mutexMemory);
1352         memset(ip, 0, iUrl.m_sHost.size() + 1);
1353         memcpy(ip, iUrl.m_sHost.c_str(), iUrl.m_sHost.size());
1354 
1355         if (!iUrl.IsValidIp(ip)) { // out of ip block
1356             //pthread_mutex_lock(&mutexMemory);
1357             delete[] ip;
1358             ip = NULL;
1359             //pthread_mutex_unlock(&mutexMemory);
1360             //cout << "!unreach hosts: " << iUrl.m_sHost << endl;
1361             return;
1362         }
1363         //pthread_mutex_lock(&mutexMemory);
1364         delete[] ip;
1365         ip = NULL;
1366         //pthread_mutex_unlock(&mutexMemory);
1367     }
1368 
1369     CStrFun::Str2Lower(iUrl.m_sHost, iUrl.m_sHost.size());
1370     CMD5 iMD5;
1371     iMD5.GenerateMD5((unsigned char*) iUrl.m_sHost.c_str(),
1372             iUrl.m_sHost.size());
1373     string strDigest = iMD5.ToString();
1374     if (setUnreachHostMD5.find(strDigest) != setUnreachHostMD5.end()) {
1375         //cout << "!unreach host! " << iUrl.m_sHost << endl;    
1376         return;
1377     }
1378 
1379     // if crawled, discard it
1380     iMD5.GenerateMD5((unsigned char*) strUrl.c_str(), strUrl.size());
1381     strDigest = iMD5.ToString();
1382 
1383     if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) {
1384         // cout << "!visited! " << strUrl << endl;    
1385         return;
1386     }
1387 
1388     // if already in the collection, discard it
1389     if (setUnvisitedUrlMD5.find(strDigest) != setUnvisitedUrlMD5.end()) {
1390         // cout << "!in collection! " << strUrl << endl;    
1391         return;
1392     } else {
1393         pthread_mutex_lock(&mutexUnvisitedUrlMD5);
1394         setUnvisitedUrlMD5.insert(strDigest);
1395         pthread_mutex_unlock(&mutexUnvisitedUrlMD5);
1396     }
1397 
1398     // add
1399     // make sure limited threads crawling on a site
1400     int cnt = 0;
1401     for (;;) {
1402         //if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){
1403 
1404         if (1) {
1405             //pthread_mutex_lock(&mutexVisitedUrlMD5);
1406 
1407             // if crawled, discard it :) double secure
1408             //if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {
1409             //cout << "!v! " << strUrl << endl;
1410             //pthread_mutex_unlock(&mutexVisitedUrlMD5);
1411             //return;
1412             //} else {
1413 
1414             pthread_mutex_lock(&mutexCollection); // mmapUrls is guarded by mutexCollection, as in fetch()
1415             mmapUrls.insert(mvalType(iUrl.m_sHost, strUrl));
1416             pthread_mutex_unlock(&mutexCollection);
1417             break;
1418             //}
1419         } else {
1420             cnt++;
1421             if (cnt % 100 == 0) {
1422                 cout << "~";
1423                 //cnt = 0;
1424             }
1425 
1426             // If we have been waiting this long, give up and drop it
1427             if (cnt == 50000) {
1428                 cout << "remove it!!!!!!!!!!!!!!!!!!!" << endl;
1429                 break;
1430             }
1431             usleep(4000);
1432         }
1433 
1434     }
1435 
1436 }
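Note on reuse: AddUrl() above reads setVisitedUrlMD5 and setUnvisitedUrlMD5 without holding the mutexes that, per the comments at the top of the file, guard them. A minimal sketch of the same duplicate check and enqueue done entirely under those locks (AddUrlLocked is a hypothetical helper, not part of TSE):

void AddUrlLocked(const string &strHost, const string &strUrl, const string &strDigest) {
    // skip URLs that were already crawled
    pthread_mutex_lock(&mutexVisitedUrlMD5);
    bool bVisited = setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end();
    pthread_mutex_unlock(&mutexVisitedUrlMD5);
    if (bVisited) return;

    // insert() reports whether the digest was already queued, so the membership
    // test and the insertion happen under a single lock
    pthread_mutex_lock(&mutexUnvisitedUrlMD5);
    bool bAlreadyQueued = !setUnvisitedUrlMD5.insert(strDigest).second;
    pthread_mutex_unlock(&mutexUnvisitedUrlMD5);
    if (bAlreadyQueued) return;

    // mutexCollection guards mmapUrls, matching its use in fetch()
    pthread_mutex_lock(&mutexCollection);
    mmapUrls.insert(mvalType(strHost, strUrl));
    pthread_mutex_unlock(&mutexCollection);
}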
