# Scrape a Zhihu topic feed using asyncio coroutines (爬取知乎话题：使用 async 协程)

import requests
import json
import time
from pyquery import PyQuery
import pandas as pd
from collections import OrderedDict
import multiprocessing
import asyncio
from functools import partial
# Disabled alternative: prompt the user for the cookie and start URL
# instead of hard-coding them below.
# cookies = input('Enter Cookie: ')
# url = input('Enter url: ')

# First page of the topic feed to crawl; later pages are discovered through
# the `paging.next` link returned by each response.
init_url = 'https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity?offset=5&limit=10'
# HTTP headers sent with every request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    # NOTE(review): '**' looks like a redacted placeholder - presumably a real
    # session cookie has to be supplied here before running; confirm.
    'Cookie': '**',
    # NOTE(review): the topic id in this Referer (19606409) differs from the
    # one in init_url (19562045) - confirm whether that is intentional.
    'Referer': 'https://www.zhihu.com/topic/19606409/hot',
    'Host': 'www.zhihu.com',
    'X-UDID': 'AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA='
}

def get_all_url(url, timeout=10):
    """Collect the URLs of every feed page reachable from *url*.

    Starting at *url*, each page is fetched and its ``paging.next`` link is
    followed until a page reports ``paging.is_end``.  Every URL is appended
    to the module-level ``url_list``.

    Args:
        url: URL of the first feed page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        The module-level ``url_list`` (also mutated in place).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx response.
        KeyError: If a response has no ``paging`` / ``next`` / ``is_end``.
    """
    # The starting page carries feed items as well; without queueing it here
    # it would only be used for its paging links and its items would be lost.
    url_list.append(url)

    # Iterate instead of recursing: one stack frame per page would hit the
    # recursion limit on long feeds, and the recursive form dropped the
    # return value on every call except the last.
    while True:
        res = requests.get(url, headers=headers, timeout=timeout)
        res.raise_for_status()
        paging = json.loads(res.text)['paging']

        # As before, the `next` link is queued even when this page is the
        # last one; what that URL returns is not verifiable from here.
        url = paging['next']
        url_list.append(url)
        print(len(url_list))  # progress indicator

        if paging['is_end']:
            return url_list



async def get_all_data(url, timeout=10):
    """Fetch one feed page and append its answers to ``data_list``.

    The blocking ``requests.get`` call runs in the event loop's default
    executor so that several pages can be downloaded concurrently.  For each
    feed item whose target type is ``'answer'`` an ``OrderedDict`` with the
    keys ``title``, ``content``, ``comment_count`` and ``voteup_count`` is
    appended to the module-level ``data_list``; other item types are skipped.

    Args:
        url: URL of the feed page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx response.
        KeyError: If the response or an answer lacks an expected field.
    """
    # Inside a coroutine this returns the loop that is running it, so the
    # function no longer depends on a module-level `loop` variable.
    running_loop = asyncio.get_event_loop()
    res = await running_loop.run_in_executor(
        None, partial(requests.get, url, headers=headers, timeout=timeout))
    res.raise_for_status()

    items = json.loads(res.text)['data']
    print(len(data_list))  # progress indicator: rows collected so far

    for item in items:
        target = item['target']
        if target['type'] != 'answer':
            continue

        final_data = OrderedDict()
        final_data['title'] = target['question']['title'] or ''
        try:
            final_data['content'] = PyQuery(target['content']).text()
        except Exception:
            # Deliberately broad: fall back to the excerpt whether the
            # content is missing or the HTML parser rejects it.
            final_data['content'] = PyQuery(target['excerpt']).text()
        final_data['comment_count'] = target['comment_count']
        final_data['voteup_count'] = target['voteup_count']
        data_list.append(final_data)

if __name__ == '__main__':
    # Module-level accumulators filled by get_all_url / get_all_data.
    data_list = []
    url_list = []
    get_all_url(init_url)

    # Create the loop explicitly *before* creating tasks: relying on an
    # implicit current loop is deprecated and fails on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        tasks = [loop.create_task(get_all_data(url)) for url in url_list]
        if tasks:
            # return_exceptions=True keeps the crawl best-effort (one bad page
            # does not abort the rest) while still reporting every failure,
            # which asyncio.wait() silently dropped.
            results = loop.run_until_complete(
                asyncio.gather(*tasks, return_exceptions=True))
            for url, result in zip(url_list, results):
                if isinstance(result, BaseException):
                    print('failed: {} ({!r})'.format(url, result))
    finally:
        loop.close()

    # Dump everything that was collected into a timestamped Excel file.
    df1 = pd.DataFrame(data_list)
    df1.to_excel('保险'+time.strftime("%Y%m%d%H%M%S")+'.xlsx', index=False)
    print('done')

 

# posted @ 2018-08-03 18:21  Erick-LONG  阅读(331)  评论(0)  编辑  收藏  举报