
# -*- coding: utf-8 -*-
"""
Created on 2019年5月6日

@author: 薛卫卫
"""

import urllib.request
import re

def download(url, user_agent="wswp", num_retries=2):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: Number of further attempts allowed when the server
            answers with a 5xx status code.

    Returns:
        The response body as ``bytes``, or ``None`` when the download
        failed and no retry succeeded.
    """
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.request.URLError as e:
        print('Download error:', e.reason)
        # Only errors carrying an HTTP status in the 5xx range are retried;
        # errors without a ``code`` attribute and other statuses give up.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return None

def crawl_sitemap(url):
    """Download a sitemap and then download every page it lists.

    Args:
        url: Address of the sitemap file.

    Returns:
        None. The downloaded pages are not processed yet; see the
        placeholder inside the loop.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # The sitemap could not be fetched, so there is nothing to parse.
        return
    # Keep the regex as a str pattern; instead decode the bytes returned by
    # urlopen().read() so the pattern can be applied to text.
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


# posted @ 2019-05-07 21:59  小菜鸡的梦想  阅读(390)  评论(0)  编辑  收藏  举报