1.4.3 ID遍历爬虫(每天一更)
# -*- coding: utf-8 -*-
"""ID-iteration crawler.

Downloads the pages of a site by walking its numeric record IDs one after
another (1, 2, 3, ...) until downloads start failing.

Created on 2019-05-07

@author: 薛卫卫
"""
import itertools
import re
import urllib.error
import urllib.request


def download(url, user_agent="wswp", num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts are allowed when the server
            answers with a 5xx status code.

    Returns:
        The response body as ``bytes``, or ``None`` when the download
        failed (after any permitted retries).
    """
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the response (and its socket) even if
        # read() raises, which the bare urlopen(...).read() did not.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        # Only 5xx responses are retried: they signal a server-side problem
        # that may be transient.  Errors without a status code, and 4xx
        # responses, are reported as a failure right away.
        is_server_error = hasattr(e, 'code') and 500 <= e.code < 600
        if num_retries > 0 and is_server_error:
            return download(url, user_agent, num_retries - 1)
        return None


def crawl_ids(url_template='http://example.webscraping.com/view/-%d',
              max_errors=1, start=1):
    """Yield ``(page_id, html)`` for consecutive IDs until downloads fail.

    Args:
        url_template: ``%``-format string with a single ``%d`` placeholder
            that receives the page ID.
        max_errors: Number of consecutive failed downloads after which the
            crawl stops.  The default of 1 stops at the first failure;
            a larger value (e.g. 5) lets the crawl step over gaps in the
            ID sequence.  Values below 1 behave like 1.
        start: First ID to request.

    Yields:
        A ``(page_id, html)`` tuple for every page downloaded successfully,
        where ``html`` is the ``bytes`` returned by :func:`download`.
    """
    num_errors = 0  # current run of consecutive download errors
    for page in itertools.count(start):
        html = download(url_template % page)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors >= max_errors:
                # reached the maximum number of consecutive errors, so exit
                break
        else:
            # a success ends the current run of errors
            num_errors = 0
            yield page, html


def main():
    """Crawl the example site by ID, stopping at the first failed page."""
    for page, html in crawl_ids():
        # success - can scrape the result
        pass


if __name__ == '__main__':
    main()