'''
Parts to adjust before each run:
1. style_list              -- which category slug(s) to crawl
2. skudic['type']          -- the category id written into every SKU record
3. the output paths in `with open('spulist_1.json','wt') as f00:` -- where the JSON result files are stored
'''
from selenium import webdriver
from selenium.webdriver.common.by import By # 通过什么
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC #期望的条件达到
from selenium.webdriver.support.wait import WebDriverWait # 等待
from selenium.webdriver.chrome.options import Options
import uuid
import json
import requests
# Optional headless mode (needed on Linux hosts without a display):
# chrome_options = Options()
# chrome_options.add_argument('--headless')  # run Chrome without a visible window
# driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()
# style_list = ['mg/','bhh/','knx/','xrk/','zll/','mtx/']
style_list = ['mg/']
base_url = 'https://www.haohua.com/xianhua/'
total_url = []      # one list of product-detail URLs per style category
main_imgs = []      # main-image URLs to download later
big_lists = []      # raw per-product records: [title, desc, price, sku_dic, spu_imgs]
spulist = []        # SPU records destined for spulist_1.json
skulist = []        # SKU records destined for skulist_1.json
spu_pic_list = []   # gallery-picture records destined for spu_pic_list_1.json
index = 0           # running global index over all gallery pictures
# SPU record fields: title, detail, spu_main_img, price (a uuid string ties the SPU to its SKUs)
# SPU-pictures record fields: uid, img (the several gallery images)
# SKU record fields: uid, type, name, price, img
# Crawl each category listing page and collect every product-detail URL.
for style in style_list:
    url = base_url + style
    driver.get(url)
    # Explicit wait: block (up to 20s) until the product thumbnails exist.
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "imghover")))
    # Smooth-scroll to the page bottom so lazy-loaded items are rendered.
    height = driver.execute_script("return document.body.clientHeight")
    driver.execute_script("""
    window.scrollTo({
    top: %s,
    behavior:"smooth"
    });
    """ % height)
    # Each <a class="imghover"> href is a product detail page, e.g.
    # https://www.haohua.com/xianhua/45177.html
    # find_elements(By.CLASS_NAME, ...) replaces the find_elements_by_*
    # helpers that were removed in Selenium 4.
    tags = driver.find_elements(By.CLASS_NAME, 'imghover')
    urls = [tag.get_attribute('href') for tag in tags]
    total_url.append(urls)
# 给定一个url, 进行爬取数据
def task(single_url):
    """Scrape one product detail page.

    Returns [title, desc, price, sku_dic, spu_imgs] where sku_dic maps
    SKU title -> SKU image URL and spu_imgs lists the gallery image URLs.
    Side effect: appends the first gallery image URL to the global
    main_imgs list. Uses the module-level Selenium driver.
    """
    driver.get(single_url)
    # find_element(By.CLASS_NAME, ...) replaces the find_element_by_*
    # helpers that were removed in Selenium 4.
    title = driver.find_element(By.CLASS_NAME, 'shop-title').text
    desc = driver.find_element(By.CLASS_NAME, 'shop-description').text
    price = driver.find_element(By.CLASS_NAME, 'sell-val').text
    # Every <img> in the specs block is one SKU variant: title = name, src = image.
    skus = driver.find_element(By.CLASS_NAME, 'specs-item').find_elements(By.TAG_NAME, 'img')
    sku_dic = {sku.get_attribute('title'): sku.get_attribute('src') for sku in skus}
    # Gallery preview images of the SPU.
    spu_img_tags = driver.find_element(By.CLASS_NAME, 'shop-preview-item').find_elements(By.TAG_NAME, 'li')
    spu_imgs = [tag.find_element(By.TAG_NAME, 'img').get_attribute('src') for tag in spu_img_tags]
    # Guard: a product without any gallery image must not crash the crawl.
    # NOTE(review): main() additionally appends spu_imgs[1] to main_imgs, so
    # two images per product land in the download list — confirm intended.
    if spu_imgs:
        main_imgs.append(spu_imgs[0])
    return [title, desc, price, sku_dic, spu_imgs]
from concurrent.futures import ThreadPoolExecutor
import time  # NOTE(review): appears unused in this file — verify before removing
# Three workers are created, but main() waits on each future immediately and
# every task shares the single global driver, so crawling is effectively serial.
pool = ThreadPoolExecutor(3)
type_urls = total_url[0]  # detail URLs of the first (only) crawled category
def main():
    """Crawl every collected product detail page and accumulate the results.

    Appends each task() result to the global big_lists, and records the
    second gallery image URL in main_imgs (task() itself already recorded
    the first one).
    """
    for type_url in type_urls:
        # submit() followed immediately by result() runs serially, which is
        # required anyway because all tasks share the one global driver.
        lis = pool.submit(task, type_url).result()
        big_lists.append(lis)
        # Guard: some products expose fewer than two gallery images.
        if len(lis[4]) > 1:
            main_imgs.append(lis[4][1])
        print(lis)  # nested lists; main images are downloaded later
if __name__ == '__main__':
    # Run the crawl, then wait for the pool to drain so the post-processing
    # below sees fully populated big_lists / main_imgs.
    main()
    pool.shutdown(wait=True)
# Pair each main-image URL with its bare file name for the download step.
img_name_url = [[img.split('/')[-1], img] for img in main_imgs]
sku_name_urls = []
spu_name_urls = []
for record in big_lists:
    title, detail, price, sku_map, gallery = record
    # One uuid per SPU ties the SPU record to its SKU and picture records.
    uid = str(uuid.uuid4())
    # SPU record: title keeps only the last '-'-separated segment; the main
    # image is the first gallery image, stored under main_img/.
    spulist.append({
        'title': title.split('-')[-1],
        'detail': detail,
        'price': price,
        'spu_main_img': 'main_img/' + gallery[0].split('/')[-1],
        'uid': uid,
    })
    # sku_map looks like {'12枝紫罗兰': img_url, ...}; one SKU record per
    # variant, and every SKU image URL is queued for download into SKUimg/.
    for variant_name, variant_img in sku_map.items():
        sku_name_urls.append(variant_img)
        skulist.append({
            'uid': uid,
            'type': 1,
            'name': variant_name,
            'price': price,
            'img': 'SKUimg/' + variant_img.split('/')[-1],
        })
    # One picture record per gallery image; images are queued for download
    # into SPUimg/ and index keeps a single running order across all SPUs.
    for picture in gallery:
        spu_name_urls.append(picture)
        index += 1
        spu_pic_list.append({
            'uid': uid,
            'img': 'SPUimg/' + picture.split('/')[-1],
            'index': index,
        })
def _download_image(img_url, save_dir):
    """Fetch img_url and store it under save_dir using the URL's file name.

    Opens the file with 'wb' so a re-run overwrites the previous copy — the
    original 'ab' mode appended bytes to existing files and corrupted the
    images on repeated runs. Non-200 responses are skipped silently, as before.
    """
    r = requests.get(img_url)
    if r.status_code == 200:
        save_url = save_dir + img_url.split('/')[-1]
        with open(save_url, 'wb') as f:
            f.write(r.content)

# Main image (one entry per product).
for name_url in img_name_url:
    _download_image(name_url[1], 'F:/期中架构/practice2/main_img/')
# SKU variant images.
for sku_img_url in sku_name_urls:
    _download_image(sku_img_url, 'F:/期中架构/practice2/SKUimg/')
# SPU gallery images.
for spu_img_url in spu_name_urls:
    _download_image(spu_img_url, 'F:/期中架构/practice2/SPUimg/')
# Persist the three result sets (spulist, skulist, spu_pic_list) to JSON.
# encoding='utf-8' pins the file encoding (the platform default may not be
# UTF-8) and ensure_ascii=False keeps the Chinese titles human-readable
# instead of \uXXXX escapes.
with open('spulist_1.json', 'w', encoding='utf-8') as f00:
    json.dump(spulist, f00, ensure_ascii=False)
with open('skulist_1.json', 'w', encoding='utf-8') as f11:
    json.dump(skulist, f11, ensure_ascii=False)
with open('spu_pic_list_1.json', 'w', encoding='utf-8') as f22:
    json.dump(spu_pic_list, f22, ensure_ascii=False)