python3多线程爬取京东投诉信息

开启线程池示例

import time
import threading
from concurrent.futures import ThreadPoolExecutor


# Shared worker pool for the demo (at most 100 threads), plus the list that
# collects the future returned by each submit() call.
pool = ThreadPoolExecutor(max_workers=100)
spider_list = list()

# Spider worker.
# The argument stands for the list-page number to crawl; a detail URL could
# be passed instead (e.g. ``def func1(url)``).
def func1(page):
    """Demo task executed in the pool: print the marker "a" and *page*."""
    print("a", page, sep=" ")

pages = 50  # number of work items; could equally be a list of product/detail URLs
for page in range(pages):
    # Report the id of the thread doing the submitting and the current item.
    print('running thread id : %d   now=%d' % (threading.get_ident(), page))
    # Hand the page number (or a detail URL) to the worker function.
    str_page = pool.submit(func1, page)
    # Keep the future so its completion can be awaited below.
    spider_list.append(str_page)
    print("spider_list=", spider_list)

# Block until every submitted task has finished. The loop variable is named
# `future` instead of `list` so the built-in `list` type is not rebound for
# the rest of the module.
for future in spider_list:
    future.result()
print('线程全部执行完毕')

一、多线程爬取京东投诉信息

#!/usr/bin/env python
# -*- coding=utf-8 -*-

import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings
warnings.filterwarnings("ignore")

# Proxy mapping passed as `proxies=` to the requests below (none configured).
proxies = None
# Single HTTP session reused by every request in this script.
session = HTMLSession()

# Worker pool (at most 30 threads).
pool = ThreadPoolExecutor(max_workers=30)
# Futures returned by pool.submit(), and the detail records parsed so far.
pool_list = list()
big_list = list()

def dewu_company(pages):
    """Fetch one page of the complaint list and parse every complaint on it.

    Args:
        pages: Page number sent as the ``page`` query parameter.

    Returns:
        None. Each complaint found on the page is handed to ``parse_detail``.

    Exceptions raised by the list request, by decoding its JSON body, or by
    indexing ``["result"]["data"]["complaints"]`` are not caught here.
    """
    # Report which page is being crawled.
    print("第"+str(pages)+"页")
    # Millisecond timestamp sent as the "_" query parameter
    # (presumably a cache-buster -- TODO confirm).
    t = str(int(time.time() * 1000))

    url = "https://tousu.sina.com.cn/api/company/received_complaints"
    headers = {
        "authority": "tousu.sina.com.cn",
        "method": "GET",
        "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
        "scheme": "https",
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }

    params = {
        # Company id. Per the original author's note:
        # 1878960481 = Alibaba, 5650743478 = JD, 7046706808 = Dewu.
        "couid": "5650743478",
        "type": "1",
        "page_size": "10",
        "page": pages,
        "_": t,
    }

    res = session.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=5)

    info_list = res.json()["result"]["data"]["complaints"]
    for info in info_list:
        info_url = 'https:' + info['main']['url']
        # No `return` here: returning from inside the loop would stop after
        # the first complaint and silently drop the rest of the page.
        parse_detail(info_url, info)


# Serializes updates of the shared output file across the pool's worker
# threads, so two records cannot interleave inside one line.
_WRITE_LOCK = threading.Lock()


def parse_detail(info_url, info):
    """Scrape one complaint detail page and append the record to the JSON file.

    Example detail URL: https://tousu.sina.com.cn/complaint/view/17349163730/

    Args:
        info_url: Absolute URL of the complaint detail page.
        info: The complaint's entry from the list API. When the page has
            videos, the parsed record is stored on it under ``'投诉详情'``.

    Returns:
        None. On success the record is appended to the module-level
        ``big_list`` and written to ``京东投诉信息.json`` as one line holding
        a one-element JSON list (only the new record, not the whole
        accumulated list). If fetching or parsing fails, the error is printed
        and nothing is stored or written.
    """
    try:
        res = session.get(info_url, proxies=proxies, verify=False)

        # (record key, XPath) pairs; the first match of each XPath is kept.
        text_fields = (
            ('投诉编号', '//div[@class="ts-d-question"]/ul/li[1]/text()'),
            ('投诉对象', '//div[@class="ts-d-question"]/ul/li[2]/a//text()'),
            ('投诉问题', '//div[@class="ts-d-question"]/ul/li[3]/text()'),
            ('投诉要求', '//div[@class="ts-d-question"]/ul/li[4]/text()'),
            ('涉诉金额', '//div[@class="ts-d-question"]/ul/li[5]/text()'),
            ('投诉进度', '//div[@class="ts-d-question"]/ul/li[6]/b/text()'),
        )
        new_dict = {key: res.html.xpath(path)[0] for key, path in text_fields}
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text

        # Complaint images: the key is only present when at least one exists.
        img_url = res.html.xpath('//*[@class="example-image-link"]/@href')
        if img_url:
            new_dict['投诉图片'] = ["https:" + href for href in img_url]

        # Complaint videos: fetch the play-info JSON for every video id.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        if vide_id_list:
            new_vide_list = []
            for vide_id in vide_id_list:
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                video_res = session.get(vide_info_url, verify=False)
                new_vide_list.append(json.loads(video_res.text))
            new_dict['投诉视频详情'] = new_vide_list
            info['投诉详情'] = new_dict
    except Exception as e:
        # Best-effort scrape: report the failure and skip this complaint
        # instead of writing a partial or duplicate entry.
        print(e)
        return

    with _WRITE_LOCK:
        big_list.append(new_dict)
        # One line per complaint; the line contains just this record.
        with open('京东投诉信息.json', "a+", encoding='utf-8') as fw:
            fw.write(json.dumps([new_dict], ensure_ascii=False) + '\n')
    print("big_list==", big_list)


def main(pages):
    """Submit one list-page task per page, wait for all of them, report time.

    Args:
        pages: Number of tasks to submit; task ``i`` is given page number
            ``i`` for every ``i`` in ``range(pages)``.
    """
    started = time.time()
    # NOTE(review): page numbers start at 0 here, while the other crawler in
    # this post starts at page 1 -- confirm which the API expects.
    pool_list.extend(pool.submit(dewu_company, page) for page in range(pages))
    # Wait for each task to finish.
    for future in pool_list:
        future.result()
    print("全部结束并保存本地")

    # The author's original note here: dumping big_list to the JSON file at
    # this point produced output without line breaks, so that write was left
    # commented out.
    elapsed = time.time() - started
    print('Done, Time cost: %s ' % elapsed)

if __name__ == "__main__":
    # Number of list pages to crawl.
    main(pages=20)

20页数据爬取时间:Done, Time cost: 1.6854908466339111

二、多线程爬取阿里详情投诉信息

#!/usr/bin/env python
# -*- coding=utf-8 -*-


import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings
warnings.filterwarnings("ignore")

# Proxy mapping passed as `proxies=` to the requests below (none configured).
proxies = None
# Single HTTP session shared by the list requests and the detail threads.
session = HTMLSession()


def dewu_company(pages=19, couid="1878960481"):
    """Crawl the complaint list page by page, parsing details in threads.

    Args:
        pages: Number of list pages to crawl; pages ``1..pages`` are
            requested. The default of 19 matches the previously hard-coded
            ``range(1, 20)``.
        couid: Company id sent to the list API. Per the original author's
            note: 1878960481 = Alibaba (default), 5650743478 = JD,
            7046706808 = Dewu.

    Returns:
        None. Every complaint is handed to ``parse_detail`` on its own thread.

    Exceptions raised by a list request or by decoding/indexing its JSON
    response are not caught here and abort the crawl.
    """
    # Request constants are identical for every page, so build them once.
    url = "https://tousu.sina.com.cn/api/company/received_complaints"
    headers = {
        "authority": "tousu.sina.com.cn",
        "method": "GET",
        "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
        "scheme": "https",
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }

    for page in range(1, pages + 1):
        print("第"+str(page)+"页")
        # Millisecond timestamp sent as the "_" query parameter
        # (presumably a cache-buster -- TODO confirm).
        t = str(int(time.time()*1000))

        params = {
            "couid": couid,
            "type": "1",
            "page_size": "10",
            "page": page,
            "_": t,
        }

        res = session.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=5)

        # Detail-page threads started for this list page and not yet joined.
        ths = []

        info_list = res.json()["result"]["data"]["complaints"]
        for info in info_list:
            info_url = 'https:' + info['main']['url']

            # Parse each complaint's detail page on its own thread.
            th = threading.Thread(target=parse_detail, args=(info_url, info))
            th.start()
            ths.append(th)
            # Once more than 10 threads are outstanding, wait for the batch
            # before starting more.
            if len(ths) > 10:
                for th_one in ths:
                    th_one.join()
                ths = []
        # Wait for whatever is left before moving on to the next page.
        for th_one in ths:
            th_one.join()


# Serializes appends to the output file across the detail-page threads, so
# two records cannot interleave inside one line.
_WRITE_LOCK = threading.Lock()


def parse_detail(info_url, info):
    """Scrape one complaint detail page and append the record to the JSON file.

    Example detail URL: https://tousu.sina.com.cn/complaint/view/17349163730/

    Args:
        info_url: Absolute URL of the complaint detail page.
        info: The complaint's entry from the list API. When the page has
            videos, the parsed record is stored on it under ``'投诉详情'``.

    Returns:
        None. On success one line holding a one-element JSON list with the
        record is appended to ``阿里投诉信息.json``. If fetching or parsing
        fails, the error is printed and nothing is written (previously an
        empty ``[]`` line was appended in that case).
    """
    try:
        res = session.get(info_url, proxies=proxies, verify=False)

        # (record key, XPath) pairs; the first match of each XPath is kept.
        text_fields = (
            ('投诉编号', '//div[@class="ts-d-question"]/ul/li[1]/text()'),
            ('投诉对象', '//div[@class="ts-d-question"]/ul/li[2]/a//text()'),
            ('投诉问题', '//div[@class="ts-d-question"]/ul/li[3]/text()'),
            ('投诉要求', '//div[@class="ts-d-question"]/ul/li[4]/text()'),
            ('涉诉金额', '//div[@class="ts-d-question"]/ul/li[5]/text()'),
            ('投诉进度', '//div[@class="ts-d-question"]/ul/li[6]/b/text()'),
        )
        new_dict = {key: res.html.xpath(path)[0] for key, path in text_fields}
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text

        # Complaint images: the key is only present when at least one exists.
        img_url = res.html.xpath('//*[@class="example-image-link"]/@href')
        if img_url:
            new_dict['投诉图片'] = ["https:" + href for href in img_url]

        # Complaint videos: fetch the play-info JSON for every video id.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        if vide_id_list:
            new_vide_list = []
            for vide_id in vide_id_list:
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                video_res = session.get(vide_info_url, verify=False)
                new_vide_list.append(json.loads(video_res.text))
            new_dict['投诉视频详情'] = new_vide_list
            info['投诉详情'] = new_dict

        # The file format is one single-record list per line.
        big_list = [new_dict]
        print("big_list==", big_list, len(big_list))
    except Exception as e:
        # Best-effort scrape: report the failure and skip this complaint
        # rather than appending an empty entry to the file.
        print(e)
        return

    with _WRITE_LOCK:
        with open('阿里投诉信息.json', "a+", encoding='utf-8') as fw:
            fw.write(json.dumps(big_list, ensure_ascii=False) + '\n')



if __name__ == "__main__":
    # Time the whole crawl from the first request to the last joined thread.
    started = time.time()
    dewu_company()
    elapsed = time.time() - started
    print('Done, Time cost: %s ' % elapsed)

19页数据爬取时间(代码中 range(1,20) 实际爬取第1~19页):Done, Time cost: 20.348562240600586

posted @ 2020-03-22 22:21  莫贞俊晗  阅读(302)  评论(0)  编辑  收藏  举报