利用Collection实现网站全部链接的提取

import requests
from collections import deque
import optparse
import sys
from lxml import etree
import re
import time

class MySpider:
    def __init__(self) -> None:
        self.start_url = self.url_prefix(self.get_params())
        self.url_list = deque([self.start_url])
        self.scraped_urls = set()
        # self.emails = set()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:101.0) Gecko/20100101 Firefox/101.0"
        }

    def get_params(self):
        parser = optparse.OptionParser('Usage: < Program > -u url')
        parser.add_option('-u', '--url', dest='url', type='string', help='Specify url to scrape')
        options, args = parser.parse_args()
        if options.url is None:
            print(parser.usage)
            sys.exit()
        return options.url 
    
    def url_prefix(self,url):
        if url.startswith('http://'):
            return url
        elif url.startswith('https://'):
            return url
        else:
            return 'http://' + url
    

    def retrieve_web_page(self,url):
        try:
            print('[-] Scraping %s' % url)
            response = requests.get(url=url,headers=self.headers)         
            self.scraped_urls.add(url)
            if response.status_code == 200:
                return  response.text
        
        except Exception as e:
            print(e)
            pass
    
    def retrieve_links(self, response):
        html = etree.HTML(response)
        links = html.xpath('//a/@href')
        for link in links:
            # if link.startswith('/'):
            #     link = url+link            
            if link.startswith('#'):
                continue
            if link not in self.url_list and link not in self.scraped_urls:
                print(link)
                self.url_list.append(link)
    
    def run(self):
        i = 0
        while len(self.url_list):
            print("[-] Scraping %d" % i)
            i += 1
            url = self.url_list.popleft()
            response = self.retrieve_web_page(url)
            self.retrieve_links(response)
            time.sleep(2)
        
        print(self.scraped_urls)
            

if __name__ == '__main__':
    myspider = MySpider()
    myspider.run()

 

posted @ 2022-06-18 18:36  Jason_huawen  阅读(54)  评论(0编辑  收藏  举报