【爬虫】Python获取星巴克所有产品
视频只介绍了 BS4 的简单用法,但我想把全部产品都获取出来。
其实翻看接口会发现,有一个 JSON 资源直接提供了这些数据,但其中没有分类信息。
"""Scrape the Starbucks China menu page and dump every product to a JSON file.

For each product category found on the menu page, the script collects the
product name and the product image URL, prints them, and finally writes the
whole list to a timestamped JSON file in the current working directory.
"""
import datetime
import json
import re
import urllib.parse
import urllib.request
from typing import Optional

from bs4 import BeautifulSoup

# Regex capturing the text between the first pair of double quotes; applied
# to the tile's inline ``style`` attribute to pull out the image path.
REGEXP1 = r'"([^"]*)"'

# Site root (used to resolve image paths) and the menu page to scrape.
SOURCE = 'https://www.starbucks.com.cn'
API = 'https://www.starbucks.com.cn/menu/'

# Seconds to wait for the menu page before giving up instead of hanging.
REQUEST_TIMEOUT = 30


def _fetch_menu_html(url: str = API, timeout: float = REQUEST_TIMEOUT) -> str:
    """Download *url* and return its body decoded as UTF-8.

    The page is requested without any special headers, as the original
    script did. The response is closed as soon as the body has been read.

    Raises:
        urllib.error.URLError: if the request fails or times out.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode('UTF-8')


def _extract_image_url(a_tag) -> Optional[str]:
    """Return the absolute image URL of a product link, or ``None``.

    The image path is the first double-quoted string inside the ``style``
    attribute of the first ``<div>`` under *a_tag*. ``None`` is returned when
    the div, the attribute, or a quoted string is missing, so that one
    malformed tile does not abort the whole scrape.
    """
    tile = a_tag.select_one('div')
    if tile is None:
        return None
    style = tile.get('style')
    if not style:
        return None
    match = re.search(REGEXP1, style)
    if match is None:
        return None
    # urljoin gives the same result as SOURCE + path for root-relative paths
    # and additionally copes with absolute URLs and paths lacking a leading '/'.
    return urllib.parse.urljoin(SOURCE, match.group(1))


def _parse_products(html: str) -> list:
    """Parse the menu page and return a list of product dicts.

    Each dict has the keys ``type`` (category heading text), ``name``
    (stripped link text) and ``image`` (absolute image URL, or ``None`` when
    it could not be extracted). Categories and products are also printed as
    they are found.
    """
    soup = BeautifulSoup(html, 'lxml')
    products = []
    # Walk every product <ul> first to obtain the category name.
    for ul in soup.select('ul[class="grid padded-3 product"]'):
        category = ul.select_one('h3.caption')
        if category is None:
            # No heading in this list: there is no category to file under.
            continue
        category_name = category.text
        print(category_name)
        # Then walk the <a> tags to get each product and its image URL.
        for a_tag in ul.select('li > a'):
            name = a_tag.text.strip()
            image_url = _extract_image_url(a_tag)
            print(f'{name} {image_url}')
            products.append({
                'type': category_name,
                'name': name,
                'image': image_url,
            })
    return products


def _save_products(products: list) -> str:
    """Write *products* as JSON to a timestamped file and return its name."""
    # ensure_ascii=False keeps the Chinese names readable in the output; the
    # file is written as UTF-8, so escaping to \uXXXX is unnecessary.
    json_data = json.dumps(products, ensure_ascii=False)
    # The file name carries the time of the dump.
    now_time = datetime.datetime.now().strftime('%Y年%m月%d日%H时%M分%S秒')
    file_name = f'星巴克产品菜单-{now_time}.json'
    # The context manager guarantees the data is flushed and the handle closed.
    with open(file=file_name, mode='w', encoding='UTF-8') as fp:
        fp.write(json_data)
    return file_name


def main() -> None:
    """Fetch the menu page, extract all products and save them to disk."""
    html = _fetch_menu_html()
    products = _parse_products(html)
    _save_products(products)


if __name__ == '__main__':
    main()