利用 Python 实现全站爬虫（Implementing a Full-Site Crawler in Python）
from lxml import etree
import requests
import sys
import optparse
import time
from urllib.parse import urljoin


class MySpider:
    """Breadth-first site crawler.

    Starts from a domain given on the command line (-d/--domain), fetches
    pages one per second, harvests every <a href> it sees, and stops after
    roughly 300 pages have been crawled. All discovered links are printed
    at the end.
    """

    def __init__(self) -> None:
        self.start_url = self.get_params()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0'
        }
        self.links = []          # discovery frontier; also the final report list
        self.crawled_links = []  # URLs already fetched

    def get_params(self):
        """Parse -d/--domain from argv and normalize it to a full URL.

        Exits with the usage string when no domain was supplied; prefixes
        'http://' when the argument carries no scheme.
        """
        parser = optparse.OptionParser('Usage: <Program> -d domain')
        parser.add_option('-d', '--domain', dest='domain', type='string',
                          help='Specify domain to crawl')
        options, _args = parser.parse_args()
        if options.domain is None:
            print(parser.usage)
            sys.exit(0)
        # startswith accepts a tuple — one call covers both schemes.
        if options.domain.startswith(('http://', 'https://')):
            return options.domain
        return 'http://' + options.domain

    def request_page(self, url):
        """GET *url* and return the body text on HTTP 200, else None.

        A timeout keeps one unresponsive host from hanging the whole
        crawl; only network/HTTP errors are swallowed (the original bare
        `except: pass` also ate KeyboardInterrupt and real bugs).
        """
        try:
            response = requests.get(url=url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.text
            return None
        except requests.RequestException:
            return None

    def extract_link(self, url, response):
        """Collect every unique link found in *response* into self.links.

        *url* is the page the response came from and is used as the base
        for resolving relative hrefs.
        """
        if not response:
            return
        html = etree.HTML(response.encode('utf-8'))
        for link in html.xpath('//a/@href'):
            if link.startswith('#'):
                continue  # in-page anchor, not a new page
            # urljoin resolves absolute, scheme-relative (//host/...) and
            # relative hrefs correctly. The original concatenated
            # `url + link` for relative hrefs, which produced broken URLs
            # (e.g. http://host/a.html + b.html -> http://host/a.htmlb.html).
            absolute = urljoin(url, link)
            if absolute not in self.links:
                self.links.append(absolute)

    def run(self):
        """Crawl from start_url, printing each page as it is fetched."""
        response = self.request_page(self.start_url)
        self.extract_link(self.start_url, response)
        self.crawled_links.append(self.start_url)

        # self.links grows while we iterate it, so this is effectively a
        # breadth-first walk over the discovery frontier.
        for url in self.links:
            if url in self.crawled_links:
                continue
            response = self.request_page(url)
            self.extract_link(url, response)
            time.sleep(1)  # politeness delay: at most one request per second
            self.crawled_links.append(url)
            print(url)
            if len(self.crawled_links) > 300:
                break  # safety cap so the crawl always terminates

        # Final report: everything that was discovered.
        for url in self.links:
            print(url)


if __name__ == '__main__':
    myspider = MySpider()
    myspider.run()
STRIVE FOR PROGRESS, NOT FOR PERFECTION