1.4.4链接爬虫(每天一更)
# -*- coding: utf-8 -*-
"""A minimal link crawler built on the standard library.

Created on 2019-05-07

@author: 薛卫卫
"""
import re
import urllib.error
import urllib.parse
import urllib.request

# Extracts the href value of every <a ...> tag; compiled once at import time
# because get_links() is called for every crawled page.
_ANCHOR_HREF_RE = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)


def download(url, user_agent="wswp", num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts to make when the server answers
            with a 5xx HTTP status.  Other failures are not retried.

    Returns:
        The response body as ``bytes``, or ``None`` when the download failed.
    """
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        # Only HTTPError carries a ``code``; 5xx means the server may recover,
        # so those are the only errors worth retrying.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
        return None


def link_crawler(seek_url, link_regex):
    """Crawl from the seed URL, following links that match *link_regex*.

    Args:
        seek_url: URL the crawl starts from.
        link_regex: Pattern applied with ``re.match`` to each raw ``href``
            value; only matching links are followed.

    Pages that fail to download are skipped, and every URL is queued at most
    once so that pages linking to each other cannot cause an endless crawl.
    """
    link_pattern = re.compile(link_regex)
    crawl_queue = [seek_url]
    seen = {seek_url}
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # Download failed (already reported by download()); move on.
            continue
        # Decode leniently: one badly encoded page should not stop the crawl.
        text = html.decode("UTF-8", errors="replace")
        for link in get_links(text):
            if not link_pattern.match(link):
                continue
            # Resolve relative to the page the link was found on.
            absolute = urllib.parse.urljoin(url, link)
            if absolute not in seen:
                seen.add(absolute)
                crawl_queue.append(absolute)


def get_links(html):
    """Return the list of ``href`` values of all ``<a>`` tags in *html*.

    Args:
        html: Page markup as ``str``.

    Returns:
        The captured ``href`` strings, in document order.
    """
    return _ANCHOR_HREF_RE.findall(html)


if __name__ == "__main__":
    link_crawler("http://example.webscraping.com", '/(index|view)')