多线程爬取黑猫投诉平台上的阿里投诉信息。
仅供学习使用,请适度控制线程数量,避免给目标网站造成压力。
一.代码
# Standard library.
import json
import time
from concurrent.futures import ThreadPoolExecutor

# Third party.
import requests
from requests_html import HTMLSession

# 30 worker threads; keep this modest so the target site is not hammered.
pool = ThreadPoolExecutor(30)
# Accumulates every enriched complaint dict, appended to from worker threads.
big_list = []
# Futures returned by pool.submit(), awaited in run() before saving.
pool_name_list = []
# Shared session used to fetch and render complaint detail pages.
session = HTMLSession()
def dewu_company(x):
    """Fetch page ``x + 1`` of the company's complaint listing, enrich each
    complaint with data scraped from its detail page (fields, images, videos),
    and append the result to the module-level ``big_list``.

    :param x: zero-based page index.

    Any failure while processing the page is caught and reported so one bad
    page does not kill the whole crawl.
    """
    try:
        print(f'第{x+1}页')
        params = {
            'couid': '1878960481',
            'type': '1',
            # NOTE(review): page_size grows with the page index — this looks
            # like it should be a fixed '10'; kept as-is to preserve behavior.
            # TODO confirm against the API.
            'page_size': f'{(x + 1) * 10}',
            'page': f'{x + 1}',
            # 'callback':'jQuery11',
        }
        url = 'https://tousu.sina.com.cn/api/company/received_complaints'
        res = requests.get(url, params=params, verify=False)
        info_list = res.json()['result']['data']['complaints']
        for dict_info in info_list:
            # The API returns protocol-relative URLs; make them absolute.
            dict_info['main']['url'] = 'https:' + dict_info['main']['url']
            dict_info['author']['avatar'] = 'https:' + dict_info['author']['avatar']
            info_url = dict_info['main']['url']
            print(info_url)
            # Separate name so the listing response above is not clobbered
            # (the original reused `res` here and in the video loop below).
            detail_res = session.get(info_url, verify=False)
            new_dict = dict()
            new_dict['投诉编号'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[1]/text()')[0]
            new_dict['投诉对象'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[2]/a/text()')[0]
            new_dict['投诉问题'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[3]/text()')[0]
            new_dict['投诉要求'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[4]/text()')[0]
            new_dict['涉诉金额'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[5]/text()')[0]
            new_dict['投诉进度'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[6]/b/text()')[0]
            new_dict['投诉进程详情'] = detail_res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
            # Image links are also protocol-relative; prefix each with https:.
            not_have_http_img_list = detail_res.html.xpath('//*[@class="example-image-link"]/@href')
            new_dict['投诉图片'] = ['https:' + a for a in not_have_http_img_list]
            vide_id_list = detail_res.html.xpath('//*[@class="video-ico"]/@data-id')
            print(vide_id_list)
            new_vide_list = []
            for vide_id in vide_id_list:
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                video_res = session.get(vide_info_url, verify=False)
                try:
                    new_vide_list.append(video_res.json())
                except ValueError:
                    # Response body was not valid JSON; skip this video.
                    pass
            new_dict['投诉视频详情'] = new_vide_list
            dict_info['投诉详情'] = new_dict
            big_list.append(dict_info)
    except Exception as e:
        # Narrowed from a bare except; report what went wrong instead of
        # silently discarding the traceback.
        print('错误跳过这一页')
        print(repr(e))
def run(page):
    """Submit one crawl task per page to the thread pool, wait for all of
    them to finish, then dump the accumulated ``big_list`` to a local
    JSON file.

    :param page: number of listing pages to crawl (pages 1..page).
    """
    for x in range(page):
        future = pool.submit(dewu_company, x)
        pool_name_list.append(future)
    # Block until every submitted page has been processed.
    for future in pool_name_list:
        future.result()
    print('全部结束开始保存本地')
    # ensure_ascii=False keeps the Chinese keys/values human-readable in the
    # output file instead of \uXXXX escapes.
    with open('阿里投诉信息.json', "w", encoding='utf8') as fw:
        json.dump(big_list, fw, ensure_ascii=False)
    print('保存完毕')
if __name__ == '__main__':
    # Crawl a single page when run as a script.
    run(1)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理