Python 最小白的爬虫 ~ 爬取豆瓣小说
爬取豆瓣图书 Top250 第一页的封面图片,并以书名作为文件名(书名.jpg)保存到当前目录。
# Author:li
#
# Download the cover image of every book listed on the first page of
# Douban's book Top 250 and save each one as "<title>.jpg" in the current
# working directory.
import re

# Browser-like User-Agent, sent so the site does not reject the request
# as coming from a bot.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.33 Safari/537.36'}
url = 'https://book.douban.com/top250'

# Seconds to wait for the server before giving up; without a timeout
# requests may block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# One pattern with two capture groups (image URL, title).  Capturing both in
# a single pass keeps each cover paired with its own title instead of
# scanning the page twice and re-joining the results by index.
# re.S lets "." match newlines, because one list item spans several lines.
COVER_PATTERN = re.compile(
    r'<.*?class="item">.*?<.*?src="(.*?)" width=.*?>.*?'
    r'<a href=".*?".*?onclick=.*?title="(.*?)"',
    re.S,
)

# Characters that are path separators or are rejected in file names on
# common file systems, plus ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_covers(html):
    """Return the ``(image_url, title)`` pairs found in *html*.

    Args:
        html: Source text of a Top 250 list page.

    Returns:
        A list of 2-tuples in page order; empty when nothing matches.
    """
    return COVER_PATTERN.findall(html)


def safe_filename(title):
    """Return *title* turned into a name that is safe to pass to ``open``.

    Unsafe characters are replaced with ``_`` so a title such as ``"a/b"``
    cannot point into another directory.  Surrounding whitespace and trailing
    dots are dropped; if nothing is left, ``"untitled"`` is returned.
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', title).strip().rstrip('.')
    return cleaned or 'untitled'


def main():
    """Fetch the list page and write every cover image to disk.

    Raises:
        requests.RequestException: if the list page itself cannot be fetched.
            A failure on a single image is reported and that image skipped.
    """
    # Imported here so the pure parsing helpers above can be imported and
    # tested without the third-party dependency installed.
    import requests

    page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()

    for image_url, title in extract_covers(page.text):
        print(title)
        try:
            response = requests.get(
                image_url, headers=headers, timeout=REQUEST_TIMEOUT
            )
            # Do not save an HTML error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as exc:
            print('skipped {}: {}'.format(title, exc))
            continue
        with open('{}.jpg'.format(safe_filename(title)), 'wb') as f:
            f.write(response.content)


if __name__ == '__main__':
    main()
然后加上翻页功能,