爬取网站多页图片
概述:
当已经能够爬取一页内容时,爬取多页通常并不困难:关键是找出页码变化时各页 URL 之间的规律,用一个变量代替 URL 中变化的部分,再写一个 for 循环依次请求每一页即可。
demo:
# Download every thumbnail listed on pages 2-5 of the pic.netbian.com
# "4kmeinv" gallery and save each one as a .jpg file in a local folder.
from lxml import etree
import requests
import os

# Create the output folder; exist_ok avoids the check-then-create race.
dirName = 'GirlsImage'
os.makedirs(dirName, exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44'
}

# URL pattern of the listing pages; %d is replaced by the page number.
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# Inclusive page range to crawl.
FIRST_PAGE = 2
LAST_PAGE = 5

# Seconds to wait for a response before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 10

# Characters that cannot appear in a file name (on Windows, plus '/'
# everywhere) are replaced by '_' so that open() does not fail on a title.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

# Crawl pages 2 to 5. range() excludes its stop value, hence the "+ 1".
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    new_url = url % page
    response = requests.get(url=new_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # NOTE(review): presumably the site serves GBK-encoded HTML; confirm.
    response.encoding = 'gbk'
    page_text = response.text

    # Each <li> under div.slist holds one thumbnail: name + image data.
    tree = etree.HTML(page_text)
    if tree is None:
        # Nothing parseable came back for this page.
        continue
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    for li in li_list:
        alt_values = li.xpath('./a/img/@alt')
        src_values = li.xpath('./a/img/@src')
        if not alt_values or not src_values:
            # This <li> has no <a><img> child with both attributes; skip it
            # rather than raising IndexError.
            continue

        title = alt_values[0].translate(_UNSAFE_FILENAME_CHARS) + '.jpg'
        img_src = 'http://pic.netbian.com' + src_values[0]

        img_response = requests.get(url=img_src, headers=headers, timeout=REQUEST_TIMEOUT)
        # Do not write an HTTP error body to disk as if it were an image.
        img_response.raise_for_status()
        img_data = img_response.content

        img_path = os.path.join(dirName, title)
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(title, '保存成功!!')