【爬虫】Python获取星巴克所有产品

 

视频只介绍了 BS4 的简单用法,但我想把所有产品全部获取出来。

其实翻看接口可以发现,有一个 JSON 资源直接提供了这些数据,但其中没有分类信息。

import re
import urllib.request
from bs4 import BeautifulSoup
import json
import datetime

# Pattern that captures whatever sits between a pair of double quotes;
# the script uses it to extract a URL.
REGEXP1 = r'"([^"]*)"'
# Site root and the menu page address.
SOURCE = 'https://www.starbucks.com.cn'
API = 'https://www.starbucks.com.cn/menu/'

# The page can be read directly with no restrictions. The context manager
# closes the HTTP response as soon as the body has been read, instead of
# leaving the connection open for the rest of the script.
with urllib.request.urlopen(API) as response:
    html = response.read().decode('UTF-8')
# print(html)

# Parse the page with BeautifulSoup (lxml parser) and collect every <ul>
# whose class attribute is exactly "grid padded-3 product".
soupObject = BeautifulSoup(html, 'lxml')
ulList = soupObject.select('ul[class="grid padded-3 product"]')
# print(ulList)

# Container for the product records; one dict is appended per product.
productList = []

# Walk every selected <ul>; the category name is read from the h3.caption
# element found inside it.
for ul in ulList:
    category = ul.select_one(selector='h3.caption')
    if category is None:
        # No caption means no category name to attach, so skip this list.
        continue

    categoryName = category.text
    print(categoryName)

    # Each "li > a" anchor is one product: its text is the product name.
    for aTag in ul.select(selector='li > a'):
        name = aTag.text.strip()

        # The image path is the double-quoted value inside the inline style
        # of the anchor's first <div>. If the <div>, its style attribute, or
        # the quoted value is missing, keep the product and record the image
        # as None instead of aborting the whole scrape with an exception.
        imgUrl = None
        divTag = aTag.select_one(selector='div')
        if divTag is not None:
            urlMatch = re.search(REGEXP1, divTag.get('style') or '')
            if urlMatch is not None:
                imgUrl = SOURCE + urlMatch.group(1)
        print(f'{name} {imgUrl}')

        # Package the record.
        product = {
            'type': categoryName,
            'name': name,
            'image': imgUrl
        }
        productList.append(product)

# Serialize the collected records to JSON. ensure_ascii=False keeps
# non-ASCII text (e.g. Chinese product names) as readable characters rather
# than \uXXXX escapes; the parsed result is the same either way.
jsonData = json.dumps(productList, ensure_ascii=False)

# Write to disk, with the current local time embedded in the file name.
nowTime = datetime.datetime.now()
nowTime = nowTime.strftime('%Y年%m月%d日%H时%M分%S秒')
# The context manager guarantees the file is flushed and closed even if the
# write raises.
with open(file=f'星巴克产品菜单-{nowTime}.json', mode='w', encoding='UTF-8') as fp:
    fp.write(jsonData)

  

posted @ 2022-07-09 23:34  emdzz  阅读(280)  评论(0)  编辑  收藏  举报