爬虫实践05 | 爬取参展公司信息
完整代码:
# 2023-08-09  Scrape exhibitor (brand) listings filtered to the "Apparel"
# category and "China" from the Balluun search API, then export to Excel.
import requests
import json
import pandas as pd
import time

# customfield_id 7901 carries the product category (e.g. "Apparel").
CATEGORY_FIELD_ID = 7901

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Balluun-Client-Id': '516332e69e7a9d219ac62c42046feae522a1',
    'Balluun-Domain-Id': '60332E414BC8F8D1D068FB422E0329BA'
}

# Server-side filters: category "Apparel" (field 7901), country "China"
# (field 15138), brands only, English locale, the two subscription tiers.
PAYLOAD = {
    "advanced": True,
    "type": "brand",
    "filters": {
        "customfield": {"7901": ["Apparel"], "15138": ["China"]},
        "only_brand": True,
        "locale": ["en-us", "en-US"],
        "subscription": [2601, 2602],
    },
    "sort": [],
    "locale": "en-US",
}


def _parse_item(item):
    """Return one row [name, state, address1, category, city] for a search hit.

    Any missing address field and a missing category customfield fall back
    to "N/A".
    """
    # BUG FIX: initialize the category before searching. The original code
    # left customfield_value unbound when no field with id 7901 was present
    # (NameError on the first such item, or silently reusing the previous
    # item's value on later ones).
    category = "N/A"
    for customfield in item.get('customfields', []):
        if customfield.get('customfield_id') == CATEGORY_FIELD_ID:
            category = customfield.get('customfield_value', "N/A")
            break

    name = item.get('name')

    # The original checked len(company_address) > 0 three separate times
    # (for city, state, address1); one check covers all three fields.
    addresses = item.get('company_address') or []
    if addresses:
        first = addresses[0]
        city = first.get('city', "N/A")
        state = first.get('state', "N/A")
        address1 = first.get('address1', "N/A")
    else:
        city = state = address1 = "N/A"

    return [name, state, address1, category, city]


def main():
    """Fetch pages 1-10 of the brand search and save the rows to data8.xlsx."""
    data = []
    for page in range(1, 11):  # pages 1 through 10 inclusive
        url = f'https://api.balluun.com/search?page={page}&per_page=48'
        res = requests.post(url=url, json=PAYLOAD, headers=HEADERS).json()
        # Skip pages with no results (e.g. error responses) — matches the
        # original's best-effort behavior.
        if 'search_results' not in res:
            continue
        # Be polite to the server: pause 2 seconds per 48-item page.
        # (The original comment claimed 1 second; the code sleeps 2.)
        time.sleep(2)
        for item in res['search_results']['hits']:
            data.append(_parse_item(item))

    df = pd.DataFrame(
        data,
        columns=['Name', 'State', 'address1', 'Custom Field Value', 'city'],
    )
    df.to_excel('data8.xlsx', index=False)
    print("数据已保存到 data8.xlsx 文件中。")


if __name__ == "__main__":
    main()