解决爬虫response.text后中文的乱码问题
有两种解决方式
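1.使用response.encoding = 'utf-8'(必须在读取response.text之前设置,并且要设置为网页的实际编码;本例的网页是gbk编码,应设置为'gbk')
2.使用.encode('iso-8859-1').decode('gbk')(响应头没有声明charset时,requests默认按iso-8859-1解码;先用iso-8859-1还原出原始字节,再按网页的实际编码gbk解码)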
爬取美女壁纸缩略图并解决标题乱码问题
http://pic.netbian.com/4kmeinv/
http://pic.netbian.com/4kmeinv/index_2.html
import os
from urllib import request

import requests
from lxml import etree
# Headers sent with every listing-page request. A browser-like User-Agent
# is supplied instead of the default python-requests agent string.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/120.0.0.0 Safari/537.36'),
}

# Generic url template for page 2 onwards (must not be modified).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
# Page 1 has no index suffix and lives at the section root.
FIRST_PAGE_URL = 'http://pic.netbian.com/4kmeinv/'
# Prefix joined with the site-relative src attribute of each thumbnail.
SITE_ROOT = 'http://pic.netbian.com'
# Directory the thumbnails are written to.
SAVE_DIR = './meinvs'

# Characters that cannot appear in a file name on common file systems.
_UNSAFE_CHARS = str.maketrans({char: '_' for char in '\\/:*?"<>|'})


def build_page_url(page):
    """Return the listing url for the given 1-based page number.

    Page 1 is the section root; every later page uses the index_N template.
    """
    if page == 1:
        return FIRST_PAGE_URL
    return url % page


def decode_title(raw_title):
    """Repair a title that was decoded as ISO-8859-1 but is really GBK.

    The text is turned back into its original bytes with ISO-8859-1 and
    decoded again as GBK (the "second approach"). If the text cannot make
    that round trip, it is returned unchanged instead of raising.
    """
    try:
        return raw_title.encode('iso-8859-1').decode('gbk')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError.
        return raw_title


def safe_file_name(title):
    """Replace characters that are invalid in file names with underscores."""
    return title.translate(_UNSAFE_CHARS)


def main():
    """Prompt for a page range and download every thumbnail in that range.

    Each image is saved in SAVE_DIR under its (decoded) alt text plus '.jpg'.
    """
    start_page = int(input('start page num:'))
    end_page = int(input('end page num:'))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(SAVE_DIR, exist_ok=True)

    for page in range(start_page, end_page + 1):
        response = requests.get(url=build_page_url(page), headers=headers)
        # First approach (not used here): assign the page's real charset to
        # response.encoding before reading response.text.
        page_text = response.text

        # Parse the title and the src attribute of every thumbnail.
        tree = etree.HTML(page_text)
        for li in tree.xpath('//div[@class="slist"]/ul/li'):
            titles = li.xpath('./a/img/@alt')
            sources = li.xpath('./a/img/@src')
            if not titles or not sources:
                # Skip list items that do not hold an image.
                continue

            # Second approach: repair the mis-decoded title text.
            img_name = safe_file_name(decode_title(titles[0])) + '.jpg'
            img_src = SITE_ROOT + sources[0]
            img_path = os.path.join(SAVE_DIR, img_name)
            request.urlretrieve(img_src, img_path)
            print(img_name, '下载成功!!!')


if __name__ == '__main__':
    main()