爬虫实践06 | 爬取某网站的参展客户和展位信息
完整代码
# Final version, 2023-10-11
"""Collect exhibitor names, addresses and booth codes from the Canton Fair shop search API.

The script requests a fixed range of result pages for one product category,
extracts one row per shop, and writes all rows to an Excel workbook.
"""
import json
import time

import pandas as pd
import requests

# Endpoint without any query string; every query parameter is sent through
# ``params`` so that each key appears exactly once in the final URL.
API_URL = (
    'https://www.cantonfair.org.cn/b2bshop/api/themeRos/public/'
    'productShops/searchByVariables'
)

# Category whose shops are listed and whose booth codes are kept.
CATEGORY_ID = '461148159452647424'

HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    ),
}

# Query parameters shared by every request; ``page`` is added per request.
BASE_PARAMS = {
    'productSearchable': 'true',
    'industrySiteId': '461110967833538560',
    'unbox': 'true',
    'lang': 'zh-CN,en-US',
    'categoryId': CATEGORY_ID,
    'size': '40',
    'scoreStrategy': 'shop',
    'productFilter': (
        "salesInfo.status eq 'LISTED' and "
        "salesInfo.shop.id ne '665972138192240640'"
    ),
}

# Pages requested, inclusive (the original loop was ``range(1, 15)``).
# NOTE(review): whether the API numbers pages from 0 or 1 is not visible
# here -- confirm that page 0 does not hold additional results.
FIRST_PAGE = 1
LAST_PAGE = 14

REQUEST_TIMEOUT = 30  # seconds; prevents a stalled connection from hanging the run
PAUSE_SECONDS = 2     # pause after each page request
OUTPUT_FILE = '展会终版.xlsx'


def fetch_shops(page):
    """Return the list of shop dicts found on result page *page*.

    Returns an empty list when the response body is empty or does not
    contain the ``_embedded`` / ``b2b:shops`` structure.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Previously the page number was embedded in the URL while ``params``
    # also carried a hard-coded "page": "0", so two conflicting values were
    # sent. The page is now sent once.
    params = {**BASE_PARAMS, 'page': str(page)}
    response = requests.get(
        API_URL, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    payload = response.json()
    if not payload:
        return []
    embedded = payload.get('_embedded') or {}
    return embedded.get('b2b:shops') or []


def extract_row(item):
    """Build a ``[name, address, booths]`` row from one shop dict.

    ``booths`` is a comma-separated string of the distinct booth codes whose
    category id equals ``CATEGORY_ID``, in first-seen order, or ``None`` when
    the shop has no such code.
    """
    address_info = item.get('address')
    address = address_info.get('fullAddress') if address_info else None

    codes = []
    for shop in item.get('offlineShops') or []:
        category = shop.get('category')
        if not category or category.get('id') != CATEGORY_ID:
            continue
        code = shop.get('code')
        if code is not None:
            codes.append(str(code))

    # dict.fromkeys removes duplicates while keeping a stable order, so the
    # output is the same from run to run.
    unique_codes = list(dict.fromkeys(codes))
    booths = ','.join(unique_codes) if unique_codes else None
    return [item.get('name'), address, booths]


def main():
    """Fetch all configured pages and save the collected rows to Excel."""
    rows = []
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        rows.extend(extract_row(item) for item in fetch_shops(page))
        time.sleep(PAUSE_SECONDS)

    df = pd.DataFrame(rows, columns=['名称', '地址', '展位'])
    df.to_excel(OUTPUT_FILE, index=False)
    print("数据已保存到展会终版.xlsx 文件中。")


if __name__ == '__main__':
    main()
代码不够简洁,还是努力学习中……