TypeError: cannot use a string pattern on a bytes-like object

Downloading: http://example.webscraping.com/sitemap.xml
Traceback (most recent call last):
  File "/Users/mac126/111/网站地图爬虫.py", line 14, in <module>
    crawler_sitemap('http://example.webscraping.com/sitemap.xml')
  File "/Users/mac126/111/网站地图爬虫.py", line 8, in crawler_sitemap
    links=re.findall('<loc>(.*?)</loc>',sitemap)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/re.py", line 222, in findall
    return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object
#网站地图爬虫.py源码
import re

from 爬虫下载网页 import download


def crawler_sitemap(url):
    """Download a sitemap and then download every page it lists.

    Args:
        url: Address of the sitemap XML document.

    Returns:
        None. Each URL found inside a ``<loc>`` element is passed to
        ``download``; the downloaded pages are not kept.
    """
    sitemap = download(url)
    if sitemap is None:
        # download() reports failure by returning None; nothing to crawl.
        return
    if isinstance(sitemap, bytes):
        # The raw HTTP body is bytes, while the pattern below is a str.
        # Mixing the two raises "TypeError: cannot use a string pattern on
        # a bytes-like object", so decode first. Decoding (rather than
        # switching to a bytes pattern) also keeps the extracted links as
        # str, which is what download() needs for the follow-up requests.
        # errors='replace' keeps one stray byte from aborting the crawl.
        sitemap = sitemap.decode('utf-8', errors='replace')

    links = re.findall(r'<loc>(.*?)</loc>', sitemap)

    for link in links:
        download(link)

# Crawl the example site's sitemap as soon as this module is executed.
crawler_sitemap('http://example.webscraping.com/sitemap.xml')
#下载网页
#具有功能:捕获异常,重试下载并设置用户代理
import urllib.request
import urllib.error
#下载网页
#wscp:默认用户代理 web scraping with python 缩写
def download(url, user_agent='wscp', num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to download.
        user_agent: Value sent in the ``User-Agent`` request header
            ('wscp' abbreviates "web scraping with python").
        num_retries: How many more attempts are allowed when the server
            answers with a 5xx status.

    Returns:
        The response body as ``bytes``, or ``None`` when the download
        failed. Callers that need text must decode the result themselves.
    """
    print('Downloading:', url)  # show which address is being fetched
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        # Only HTTPError carries a status code. Retry on 5xx (server-side
        # trouble) but not on 4xx such as 404, where the page simply does
        # not exist and asking again cannot help.
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return download(url, user_agent, num_retries - 1)
        return None
# download('http://example.webscraping.com/')
# download('http://httpstat.us/500')#测试错误500
# print(dir(urllib))

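解决方法:download() 返回的是 bytes 类型(urlopen().read() 的结果),而 re.findall 使用的是 str 类型的正则模式,二者不能混用。先把下载结果解码为 str 再进行匹配即可,例如:sitemap = download(url).decode('utf-8')。如图: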

 

posted @ 2018-12-22 12:25  青春叛逆者  阅读(1870)  评论(0编辑  收藏  举报