Python3 多线程爬取京东投诉信息
开启线程池示例
import time
import threading
from concurrent.futures import ThreadPoolExecutor
# Shared worker pool for the demo: at most 100 threads run submitted tasks.
pool = ThreadPoolExecutor(max_workers=100)
# Futures returned by pool.submit(), kept so the script can wait on them.
spider_list = list()
# Crawl task (demo stub).
# `page` is the list-page number to crawl; the same pattern works when the
# work item is a detail URL instead of a page number.
def func1(page):
    """Stand-in for a real crawl task: print the work item it was given.

    Args:
        page: Page number (or any other work item) submitted to the pool.
    """
    print("a", page)
# Number of work items; this could equally be a list of list-page numbers or
# product/detail URLs.
pages = 50
for page in range(pages):
    # Report the id of the thread that is doing the submitting.
    print('running thread id : %d now=%d' % (threading.get_ident(), page))
    # Hand the work item to the pool; submit() returns a Future immediately.
    future = pool.submit(func1, page)
    # Keep the Future so its completion can be awaited below.
    spider_list.append(future)
print("spider_list=", spider_list)
# Block until every task has finished; result() re-raises a task's exception.
# The loop variable is deliberately not called `list`, which would shadow the
# builtin for the rest of the script.
for future in spider_list:
    future.result()
print('线程全部执行完毕')
一、多线程爬取京东投诉信息
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings
# Silence every warning category for the whole process.
warnings.filterwarnings(action="ignore")
# HTTP session object used by every request in this script.
session = HTMLSession()
# Proxy mapping passed to session.get(); None means no proxy is configured.
proxies = None
# Thread pool that runs the per-page crawl tasks.
pool = ThreadPoolExecutor(max_workers=30)
# Parsed complaint records collected by parse_detail.
big_list = list()
# Futures for the tasks submitted to `pool`.
pool_list = list()
def dewu_company(pages):
    """Fetch one page of the complaint list and parse every complaint on it.

    Despite its name, ``pages`` is a single page number: it is sent as the
    ``page`` query parameter of the list API.

    Args:
        pages: Page number of the complaint list to fetch.

    Raises:
        Whatever ``session.get`` raises for a failed request, ``ValueError``
        if the response body is not JSON, and ``KeyError``/``TypeError`` if
        the payload has no ``result.data.complaints`` entry.
    """
    print("第"+str(pages)+"页")
    # Millisecond timestamp sent as the "_" parameter (copied from the
    # browser's jQuery request; presumably a cache-buster).
    t = str(int(time.time() * 1000))
    # Company ids: 1878960481 = Alibaba, 5650743478 = JD.com, 7046706808 = Dewu.
    couid = "5650743478"
    url = "https://tousu.sina.com.cn/api/company/received_complaints"
    # Only genuine request headers are sent. The HTTP/2 pseudo-headers that
    # browser dev tools display (authority/method/path/scheme) are not real
    # header fields, and the copied values named a different page and company.
    headers = {
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        # Referer matches the company actually being queried.
        "referer": f"https://tousu.sina.com.cn/company/view/?couid={couid}",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }
    params = {
        "couid": couid,
        "type": "1",
        "page_size": "10",
        "page": pages,
        "_": t,
    }
    res = session.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=5)
    # A null "complaints" value is treated as an empty page.
    info_list = res.json()["result"]["data"]["complaints"] or []
    for info in info_list:
        # The list API returns protocol-relative detail URLs ("//tousu...").
        info_url = 'https:' + info['main']['url']
        # Parse every complaint; returning from inside this loop would stop
        # after the first one and silently drop the rest of the page.
        parse_detail(info_url, info)
# Serialises appends to the output file; parse_detail runs on pool threads.
_WRITE_LOCK = threading.Lock()


def parse_detail(info_url, info):
    """Scrape one complaint detail page and append the record to the output.

    The parsed record is attached to ``info`` under ``'投诉详情'``, appended to
    the module-level ``big_list``, and written to ``京东投诉信息.json`` as one
    line holding a JSON array with just that record. If fetching or parsing
    fails, the error is printed and nothing is stored or written.

    Args:
        info_url: Absolute URL of the complaint page, e.g.
            https://tousu.sina.com.cn/complaint/view/17349163730/
        info: The complaint's dict from the list API; mutated in place.
    """
    try:
        # A timeout keeps a stalled connection from hanging the worker forever.
        res = session.get(info_url, proxies=proxies, verify=False, timeout=10)
        new_dict = dict()
        new_dict['投诉编号'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[1]/text()')[0]
        new_dict['投诉对象'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[2]/a//text()')[0]
        new_dict['投诉问题'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[3]/text()')[0]
        new_dict['投诉要求'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[4]/text()')[0]
        new_dict['涉诉金额'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[5]/text()')[0]
        new_dict['投诉进度'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[6]/b/text()')[0]
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
        # Complaint images: hrefs are protocol-relative, so prefix the scheme.
        img_url = res.html.xpath('//*[@class="example-image-link"]/@href')
        new_dict['投诉图片'] = ["https:" + href for href in img_url]
        # Complaint videos: look up the play info for each video id.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        new_vide_list = []
        for vide_id in vide_id_list:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            # Separate name so the detail-page response is not overwritten.
            video_res = session.get(vide_info_url, proxies=proxies, verify=False, timeout=10)
            new_vide_list.append(json.loads(video_res.text))
        if new_vide_list:
            new_dict['投诉视频详情'] = new_vide_list
        # Attach the detail whether or not the complaint has videos.
        info['投诉详情'] = new_dict
    except Exception as e:
        # Best-effort worker boundary: report the failure and skip the record.
        print(f"parse_detail failed for {info_url}: {e!r}")
        return
    big_list.append(new_dict)
    print("big_list==", big_list)
    # Write only the new record. Dumping the whole of big_list on every call
    # would repeat all earlier records on every line of the file.
    with _WRITE_LOCK:
        with open('京东投诉信息.json', "a", encoding='utf-8') as fw:
            fw.write(json.dumps([new_dict], ensure_ascii=False) + '\n')
def main(pages):
    """Crawl list pages 1..pages on the thread pool and print the time taken.

    Args:
        pages: Number of list pages to crawl; values below 1 submit nothing.
    """
    start_time = time.time()
    # Page numbers start at 1, so cover 1..pages rather than 0..pages-1.
    futures = [pool.submit(dewu_company, page) for page in range(1, pages + 1)]
    # Still recorded in the module-level list, but only this call's futures
    # are awaited below.
    pool_list.extend(futures)
    for page, future in enumerate(futures, start=1):
        try:
            future.result()
        except Exception as e:
            # result() re-raises the task's exception; report it and keep
            # waiting for the remaining pages instead of aborting the run.
            print(f"page {page} failed: {e!r}")
    print("全部结束并保存本地")
    end_time = time.time()
    print('Done, Time cost: %s ' % (end_time - start_time))


if __name__ == '__main__':
    # Number of list pages to crawl.
    main(20)
20页数据爬取时间:Done, Time cost: 1.6854908466339111
二、多线程爬取阿里投诉详情信息
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings
# Silence every warning category for the whole process.
warnings.filterwarnings(action="ignore")
# HTTP session object used by every request in this script.
session = HTMLSession()
# Proxy mapping passed to session.get(); None means no proxy is configured.
proxies = None
def dewu_company(pages=20):
    """Crawl list pages 1..pages, parsing each page's complaints on threads.

    Pages are fetched one after another. For every complaint on a page a
    ``threading.Thread`` running ``parse_detail`` is started, and all of the
    page's threads are joined before the next page is requested. A page whose
    list request fails is reported and skipped.

    Args:
        pages: Number of list pages to crawl, starting from page 1.
    """
    # Upper bound on detail threads alive at once.
    max_threads = 10
    # Company ids: 1878960481 = Alibaba, 5650743478 = JD.com, 7046706808 = Dewu.
    couid = "1878960481"
    url = "https://tousu.sina.com.cn/api/company/received_complaints"
    # Only genuine request headers are sent. The HTTP/2 pseudo-headers that
    # browser dev tools display (authority/method/path/scheme) are not real
    # header fields, and the copied values named a different page and company.
    headers = {
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        # Referer matches the company actually being queried.
        "referer": f"https://tousu.sina.com.cn/company/view/?couid={couid}",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }
    for page in range(1, pages + 1):
        print("第"+str(page)+"页")
        params = {
            "couid": couid,
            "type": "1",
            "page_size": "10",
            "page": page,
            # Millisecond timestamp (copied from the browser's jQuery request;
            # presumably a cache-buster).
            "_": str(int(time.time() * 1000)),
        }
        try:
            res = session.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=5)
            # A null "complaints" value is treated as an empty page.
            info_list = res.json()["result"]["data"]["complaints"] or []
        except (OSError, ValueError, KeyError, TypeError) as e:
            # OSError covers requests' network errors, ValueError a non-JSON
            # body, KeyError/TypeError an unexpected payload shape.
            print(f"page {page} failed: {e!r}")
            continue
        # One thread per complaint detail page.
        ths = []
        for info in info_list:
            # The list API returns protocol-relative detail URLs ("//tousu...").
            info_url = 'https:' + info['main']['url']
            th = threading.Thread(target=parse_detail, args=(info_url, info))
            th.start()
            ths.append(th)
            # Once the batch is full, wait for it before starting more threads.
            if len(ths) >= max_threads:
                for th_one in ths:
                    th_one.join()
                ths = []
        # Wait for whatever is left of the last batch on this page.
        for th_one in ths:
            th_one.join()
# Serialises appends to the output file; parse_detail runs on several threads.
_WRITE_LOCK = threading.Lock()


def parse_detail(info_url, info):
    """Scrape one complaint detail page and append the record to the output.

    The parsed record is attached to ``info`` under ``'投诉详情'`` and written
    to ``阿里投诉信息.json`` as one line holding a JSON array with just that
    record. If fetching or parsing fails, the error is printed and nothing is
    written.

    Args:
        info_url: Absolute URL of the complaint page, e.g.
            https://tousu.sina.com.cn/complaint/view/17349163730/
        info: The complaint's dict from the list API; mutated in place.
    """
    try:
        # A timeout keeps a stalled connection from hanging the thread forever.
        res = session.get(info_url, proxies=proxies, verify=False, timeout=10)
        new_dict = dict()
        new_dict['投诉编号'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[1]/text()')[0]
        new_dict['投诉对象'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[2]/a//text()')[0]
        new_dict['投诉问题'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[3]/text()')[0]
        new_dict['投诉要求'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[4]/text()')[0]
        new_dict['涉诉金额'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[5]/text()')[0]
        new_dict['投诉进度'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[6]/b/text()')[0]
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
        # Complaint images: hrefs are protocol-relative, so prefix the scheme.
        img_url = res.html.xpath('//*[@class="example-image-link"]/@href')
        new_dict['投诉图片'] = ["https:" + href for href in img_url]
        # Complaint videos: look up the play info for each video id.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        new_vide_list = []
        for vide_id in vide_id_list:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            # Separate name so the detail-page response is not overwritten.
            video_res = session.get(vide_info_url, proxies=proxies, verify=False, timeout=10)
            new_vide_list.append(json.loads(video_res.text))
        if new_vide_list:
            new_dict['投诉视频详情'] = new_vide_list
        # Attach the detail whether or not the complaint has videos.
        info['投诉详情'] = new_dict
    except Exception as e:
        # Best-effort thread boundary: report the failure and skip the record
        # rather than writing an empty "[]" line to the output file.
        print(f"parse_detail failed for {info_url}: {e!r}")
        return
    # One-record list: this is the shape written to each line of the file.
    big_list = [new_dict]
    print("big_list==", big_list, len(big_list))
    with _WRITE_LOCK:
        with open('阿里投诉信息.json', "a", encoding='utf-8') as fw:
            fw.write(json.dumps(big_list, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # Time the whole crawl with wall-clock timestamps.
    started_at = time.time()
    dewu_company()
    elapsed = time.time() - started_at
    print('Done, Time cost: %s ' % elapsed)
20页数据爬取时间:Done, Time cost: 20.348562240600586