import os
import requests
from lxml import etree
from urllib import request
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
def parse(url):
    """Parse the page and return the extracted data packaged as a list of dicts."""
    page_source = requests.get(url, headers=headers).text
    html = etree.HTML(page_source)
    uiboxs = html.xpath("//div[@class='uibox']")[1:]  # skip the first uibox
    items = []
    for uibox in uiboxs:
        category = uibox.xpath("./div[@class='uibox-title']/a/text()")[0]
        img_urls = uibox.xpath(".//li//img/@src")
        # The src attributes are protocol-relative, so prepend the scheme;
        # map() returns an iterator, so convert it back to a list.
        img_urls = list(map(lambda u: "https:" + u, img_urls))
        # Package the category and its image URLs as a dict and collect it.
        item = {'category': category, 'img_urls': img_urls}
        items.append(item)
    return items
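# Shape of parse()'s return value (illustrative only; the actual category names
# and URL patterns depend on the live autohome.com.cn markup):
#     [{'category': '<uibox title>', 'img_urls': ['https://...jpg', ...]}, ...]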
def pipeline(url):
    """Automatically create a folder for each category and save its images."""
    abspath = os.path.dirname(__file__)  # directory containing this script
    imgpath = os.path.join(abspath, "images")  # top-level image directory
    if not os.path.exists(imgpath):  # create it if it does not exist yet
        os.mkdir(imgpath)
    for item in parse(url):
        category = item['category']
        img_urls = item['img_urls']
        category_path = os.path.join(imgpath, category)
        if not os.path.exists(category_path):  # create the category folder if needed
            os.mkdir(category_path)
        for img_url in img_urls:
            img_name = img_url.split('_')[-1]  # last underscore-separated segment as the file name
            savepath = os.path.join(category_path, img_name)
            request.urlretrieve(img_url, savepath)
            print(img_name, "download complete")
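# Sketch of an alternative downloader (not part of the original script):
# urllib's request.urlretrieve does not send the custom headers defined above,
# so a server that checks the User-Agent may reject those requests. This
# hypothetical helper fetches one image with requests, reusing the same headers.
def download_with_requests(img_url, savepath):
    """Download a single image via requests with the module-level headers."""
    response = requests.get(img_url, headers=headers)
    response.raise_for_status()  # fail loudly on HTTP errors
    with open(savepath, 'wb') as f:
        f.write(response.content)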
if __name__ == "__main__":
    url = "https://car.autohome.com.cn/pic/series/65.html#pvareaid=3454507"
    pipeline(url)