多线程爬取黑猫投诉平台上的阿里投诉信息。
仅供学习使用,请适度控制线程数量,避免给目标网站造成压力。
一.代码
# Standard library.
import json
import time
from concurrent.futures import ThreadPoolExecutor

# Third party.
import requests
from requests_html import HTMLSession

# 30 worker threads; keep this modest so the target site is not hammered.
pool = ThreadPoolExecutor(30)
# Accumulates every enriched complaint dict, appended to from worker threads.
big_list = []
# Futures returned by pool.submit(), awaited in run() before saving.
pool_name_list = []
# Shared session used to fetch and render complaint detail pages.
session = HTMLSession()
def dewu_company(x):
    """Fetch page ``x + 1`` of the company's complaint listing, enrich each
    complaint with data scraped from its detail page (fields, images, videos),
    and append the result to the module-level ``big_list``.

    :param x: zero-based page index.

    Any failure while processing the page is caught and reported so one bad
    page does not kill the whole crawl.
    """
    try:
        print(f'第{x+1}页')
        params = {
            'couid': '1878960481',
            'type': '1',
            # NOTE(review): page_size grows with the page index — this looks
            # like it should be a fixed '10'; kept as-is to preserve behavior.
            # TODO confirm against the API.
            'page_size': f'{(x + 1) * 10}',
            'page': f'{x + 1}',
            # 'callback':'jQuery11',
        }
        url = 'https://tousu.sina.com.cn/api/company/received_complaints'
        res = requests.get(url, params=params, verify=False)
        info_list = res.json()['result']['data']['complaints']
        for dict_info in info_list:
            # The API returns protocol-relative URLs; make them absolute.
            dict_info['main']['url'] = 'https:' + dict_info['main']['url']
            dict_info['author']['avatar'] = 'https:' + dict_info['author']['avatar']
            info_url = dict_info['main']['url']
            print(info_url)
            # Separate name so the listing response above is not clobbered
            # (the original reused `res` here and in the video loop below).
            detail_res = session.get(info_url, verify=False)
            new_dict = dict()
            new_dict['投诉编号'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[1]/text()')[0]
            new_dict['投诉对象'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[2]/a/text()')[0]
            new_dict['投诉问题'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[3]/text()')[0]
            new_dict['投诉要求'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[4]/text()')[0]
            new_dict['涉诉金额'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[5]/text()')[0]
            new_dict['投诉进度'] = detail_res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[6]/b/text()')[0]
            new_dict['投诉进程详情'] = detail_res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
            # Image links are also protocol-relative; prefix each with https:.
            not_have_http_img_list = detail_res.html.xpath('//*[@class="example-image-link"]/@href')
            new_dict['投诉图片'] = ['https:' + a for a in not_have_http_img_list]
            vide_id_list = detail_res.html.xpath('//*[@class="video-ico"]/@data-id')
            print(vide_id_list)
            new_vide_list = []
            for vide_id in vide_id_list:
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                video_res = session.get(vide_info_url, verify=False)
                try:
                    new_vide_list.append(video_res.json())
                except ValueError:
                    # Response body was not valid JSON; skip this video.
                    pass
            new_dict['投诉视频详情'] = new_vide_list
            dict_info['投诉详情'] = new_dict
            big_list.append(dict_info)
    except Exception as e:
        # Narrowed from a bare except; report what went wrong instead of
        # silently discarding the traceback.
        print('错误跳过这一页')
        print(repr(e))
def run(page):
    """Submit one crawl task per page to the thread pool, wait for all of
    them to finish, then dump the accumulated ``big_list`` to a local
    JSON file.

    :param page: number of listing pages to crawl (pages 1..page).
    """
    for x in range(page):
        future = pool.submit(dewu_company, x)
        pool_name_list.append(future)
    # Block until every submitted page has been processed.
    for future in pool_name_list:
        future.result()
    print('全部结束开始保存本地')
    # ensure_ascii=False keeps the Chinese keys/values human-readable in the
    # output file instead of \uXXXX escapes.
    with open('阿里投诉信息.json', "w", encoding='utf8') as fw:
        json.dump(big_list, fw, ensure_ascii=False)
    print('保存完毕')
if __name__ == '__main__':
    # Crawl a single page when run as a script.
    run(1)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理