Python crawler: downloading web pages

# Download a web page, retrying when the server returns a 5xx error
import urllib.request
import urllib.error

def download(url, num_retries=2):
    print('Downloading:', url)  # show which URL is being fetched
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0:  # retry when the download fails
            # Only retry on 5xx server errors. A 4xx error such as
            # 404 Not Found means the page does not exist, so there is
            # no point in requesting it again.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html
download('http://example.webscraping.com/')
download('http://httpstat.us/500')  # test a 500 Internal Server Error response
Running this produces the output below. The 500 URL is requested three times in all: the initial attempt plus the two retries allowed by num_retries=2.

Downloading: http://example.webscraping.com/
Downloading: http://httpstat.us/500
download error: Internal Server Error
Downloading: http://httpstat.us/500
download error: Internal Server Error
Downloading: http://httpstat.us/500
download error: Internal Server Error
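
Note that urlopen(...).read() returns raw bytes, and download returns None once all retries are exhausted, so a caller usually checks for None and decodes the result before using it. The snippet below is a minimal sketch of that pattern; hard-coding UTF-8 is an assumption made for illustration, since the real encoding should come from the response headers or the page itself.

html = download('http://example.webscraping.com/')
if html is not None:
    # Assumption: the page is UTF-8 encoded. A more robust crawler would
    # read the charset from the Content-Type response header instead.
    text = html.decode('utf-8', errors='replace')
    print(text[:200])  # preview the first 200 characters of the page
else:
    print('download failed after all retries')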

 
