python中‘can't use a string pattern on a bytes-like object’错误
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 14 16:39:25 2017
@author: masserd
"""
from atexit import register
from re import compile
from threading import Thread
from time import ctime
from urllib.request import urlopen,Request
# Matches the sales-rank text on a product page, e.g. "#1,234 in Books";
# group 1 captures the rank (digits and thousands separators).
# A raw string is used so that "\d" reaches the regex engine untouched instead
# of being parsed as an (invalid) string escape, which newer Python versions
# warn about.
REGEX = compile(r'#([\d,]+) in Books')

# Base URL of an Amazon product page; the ISBN is appended to it.
AMZN = 'https://www.amazon.com/dp/'

# ISBN -> book title for the books whose ranking is looked up.
ISBNs = {
    '0132269937': 'Core Python Programming',
    '0132356139': 'Python Web Development with Django',
    '0137143419': 'Python Fundamentals',
}

# Browser-like User-Agent; per the accompanying article the site rejected the
# default urllib client, so requests imitate a regular browser.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
)
# Ready-made header mapping built from user_agent.
headers = {'User-Agent': user_agent}
def getRanking(isbn):
    """Return the Amazon "Books" sales rank for *isbn* as a text string.

    The product page ``AMZN + isbn`` is downloaded with browser-like request
    headers and the first ``#<rank> in Books`` occurrence is extracted with
    the module-level ``REGEX``.

    Args:
        isbn: ISBN string appended to the ``AMZN`` base URL.

    Returns:
        The captured rank, e.g. ``'1,234'`` (str, thousands separators kept).

    Raises:
        IndexError: if the page contains no text matching ``REGEX``.
        Any error raised by ``urlopen`` propagates unchanged.
    """
    url = '%s%s' % (AMZN, isbn)
    req = Request(url)
    req.add_header("User-Agent", user_agent)
    # NOTE(review): this adds a literal header named "GET"; it does not set the
    # HTTP method. Kept as-is because the request is known to work with it.
    req.add_header("GET", url)
    req.add_header("Host", "www.amazon.com")
    req.add_header("Referer", "https://www.amazon.com/dp/0132269937")
    # The context manager closes the response even if read() raises.
    with urlopen(req) as page:
        data = page.read()
    # read() returns bytes while REGEX is compiled from a str pattern; matching
    # them directly raises "TypeError: can't use a string pattern on a
    # bytes-like object". Decode first. The pattern is pure ASCII, so replacing
    # undecodable bytes elsewhere in the page cannot affect the match.
    text = data.decode('utf-8', errors='replace')
    return REGEX.findall(text)[0]
def showRanking(isbn):
    """Print one line with the title registered for *isbn* and its rank.

    The title is looked up in ``ISBNs`` before the rank is fetched through
    ``getRanking``; the line has the form ``- 'Title' ranked <rank>``.
    """
    title = ISBNs[isbn]
    rank = getRanking(isbn)
    print('- %r ranked %s' % (title, rank))
def main():
    """Look up every book in ``ISBNs`` concurrently and print the results.

    One thread per ISBN runs ``showRanking``; all threads are started first
    and then joined, so the closing timestamp is printed only after every
    lookup has finished.
    """
    print('At', ctime(), 'on Amazon...')
    threads = [Thread(target=showRanking, args=(isbn,)) for isbn in ISBNs]
    # Start everything before joining so the downloads overlap.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    # NOTE(review): the @register'd atexit() hook in this file prints the same
    # message again at interpreter exit, so it appears twice in the output.
    print('all DONE at:', ctime())
@register
def atexit():
    """Exit hook: print the current time when the interpreter shuts down.

    Registered through ``atexit.register`` (imported as ``register``).
    """
    finished_at = ctime()
    print('all DONE at:', finished_at)
# Run the lookup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
先上代码。上面的代码片段用来查询亚马逊网站上指定图书的排名。刚开始照着书上的代码写,运行时总是提示无法访问,而用浏览器却能正常打开;查资料后得知,某些网站有自动反爬虫机制,于是添加了 header 来模拟浏览器,才成功爬取到网页(到这里都是题外话)。但是在用正则表达式提取排名时,就出现了如题的错误,为什么呢?
通过书上后续的说明和网上查到的资料得知,这是类型(编码)不匹配的问题:正则表达式的模式是一个 str(Unicode)字符串,而 urlopen() 返回的类文件对象经 read() 方法得到的是一个 bytes 对象,re 模块不允许用 str 模式去匹配 bytes 数据。书上的修复方案是把正则表达式的模式写成 bytes 字面量来编译,而不是文本字符串。因此修改该行
REGEX = compile('#([\d,]+) in Books')
为
REGEX = compile(b'#([\d,]+) in Books')
在模式字符串前面加一个 b,让 re.compile() 编译一个 bytes 模式。但是这样匹配出来的结果也是 bytes 对象,打印时会显示成 b'...' 的形式(前面带一个 b)。为了解决这个问题,需要将最后的返回语句改成
return str(REGEX.findall(data)[0],'utf-8')
来将其转换成一个(Unicode)字符串。
还有第二种方法:
这里不修改正则表达式,修改输出的结果,将urlopen().read()返回的data进行解码,
data = data.decode('utf-8')
加在return语句前面即可