Implementing a Site-Wide Crawler in Python
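The script below implements a simple site-wide crawler: it takes a start domain from the command line, fetches each page with requests, extracts every <a href> attribute with lxml's XPath, normalizes relative and protocol-relative links, and keeps following newly discovered links until just over 300 pages have been crawled.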

 

from lxml import etree
import requests
import sys
import optparse
import time
from urllib.parse import urljoin


class MySpider:
    def __init__(self) -> None:
        self.start_url = self.get_params()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0'
        }
        self.links = []          # every link discovered so far
        self.crawled_links = []  # links that have already been fetched

    def get_params(self):
        parser = optparse.OptionParser('Usage: <Program> -d domain')
        parser.add_option('-d', '--domain', dest='domain', type='string', help='Specify domain to crawl')
        options, args = parser.parse_args()
        if options.domain is None:
            print(parser.usage)
            sys.exit(0)
        if options.domain.startswith(('http://', 'https://')):
            return options.domain
        # No scheme given: default to http://
        return 'http://' + options.domain

    def request_page(self, url):
        try:
            response = requests.get(url=url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.text
            return None
        except requests.RequestException:
            # Network errors (DNS failure, timeout, refused connection) are skipped
            return None

    def extract_link(self, url, response):
        if not response:
            return
        html = etree.HTML(response.encode('utf-8'))
        for link in html.xpath('//a/@href'):
            if link.startswith('#'):
                continue  # in-page anchor, not a new page
            if link.startswith('//'):
                link = 'http:' + link  # protocol-relative URL
            elif not link.startswith(('http://', 'https://')):
                # Relative URL: resolve it against the current page
                link = urljoin(url, link)
            if link not in self.links:
                self.links.append(link)

    def run(self):
        response = self.request_page(self.start_url)
        self.extract_link(self.start_url, response)
        self.crawled_links.append(self.start_url)

        # self.links grows while we iterate over it, so the crawl keeps
        # going until no new links appear or the 300-page cap is hit
        for url in self.links:
            if url in self.crawled_links:
                continue
            response = self.request_page(url)
            self.extract_link(url, response)
            time.sleep(1)  # be polite: one request per second
            self.crawled_links.append(url)
            print(url)
            if len(self.crawled_links) > 300:
                break

        for url in self.links:
            print(url)


if __name__ == '__main__':
    myspider = MySpider()
    myspider.run()
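To try it, pass a target domain on the command line; assuming the script is saved as spider.py (the filename is my assumption, not stated in the post):

    python spider.py -d example.com

Because newly discovered links are appended to self.links while the run loop iterates over that same list, the crawler naturally walks outward from the start page; the 300-page cap and the one-second delay keep a runaway crawl in check.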

 
