python 爬虫下载网页
# Download a web page with simple retry logic.
import urllib.request
import urllib.error


def download(url, num_retries=2):
    """Download *url* and return the page body as bytes, or None on failure.

    Retries up to *num_retries* additional times, but only for 5xx server
    errors. 4xx errors (e.g. 404 Not Found) mean the page does not exist,
    so retrying would be pointless; non-HTTP errors (no `code` attribute)
    are not retried either.
    """
    print('Downloading:', url)  # log the URL being fetched
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        # BUG FIX: the original wrote `except urllib.error as e`, which
        # tries to catch the *module* and raises TypeError at runtime.
        # URLError is the base class of all urllib errors (incl. HTTPError).
        print('download error:', e.reason)
        html = None
        if num_retries > 0:
            # Only HTTPError instances carry `code`; retry solely on
            # 500 <= code < 600 (transient server-side failures).
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html


if __name__ == '__main__':
    # Demo calls guarded so importing this module performs no network I/O.
    download('http://example.webscraping.com/')
    download('http://httpstat.us/500')  # exercises the 5xx retry path
#运行后输出结果为:
Downloading: http://example.webscraping.com/
Downloading: http://httpstat.us/500
download error: Internal Server Error
Downloading: http://httpstat.us/500
download error: Internal Server Error
Downloading: http://httpstat.us/500
download error: Internal Server Error