# 通过 pyppeteer 库获取请求携带的相关参数

#!/usr/bin/env python
# -*- coding:utf-8 -*-
#--author: Baozi

import asyncio
from pyppeteer import launch
import time
import re


# Module-level state shared between the request handler and get_url():
# holds the URL of the first matching request that intercept_response sees,
# and stays empty until one is captured.
url_params = ''
# NOTE(review): doc_id is not referenced anywhere in the visible code —
# confirm whether another part of the project uses it before removing.
doc_id = ''
async def intercept_response(res):
    """Remember the first Facebook ``ajax/bz`` URL that carries ``__dyn``.

    Used as a page event handler. ``res`` only needs a ``url`` attribute.
    The URL is stored in the module-level ``url_params`` and printed; once a
    URL has been stored, later matching events are ignored.
    """
    global url_params
    seen_url = res.url
    if '__dyn' not in seen_url:
        return
    if 'https://www.facebook.com/ajax/bz' not in seen_url:
        return
    if url_params:
        # A URL was already captured; keep the first one.
        return
    url_params = seen_url
    print(url_params)


async def request_check(req):
    """Request filter: abort heavy resource types, let everything else pass.

    ``req`` must expose ``resourceType`` plus awaitable ``abort()`` and
    ``continue_()`` methods. Requests whose type is image, media,
    eventsource or websocket are aborted; all others are continued.
    """
    blocked_types = ('image', 'media', 'eventsource', 'websocket')
    if req.resourceType not in blocked_types:
        await req.continue_()
        return
    await req.abort()

async def main(url,proxy,ua):
    """Open ``url`` in a browser so the request handler can capture a URL.

    Launches a non-headless browser through pyppeteer using ``proxy``,
    registers ``intercept_response`` on the page's ``request`` event, loads
    the page with user agent ``ua`` and reloads it up to three times while
    the module-level ``url_params`` is still empty. Finally it clicks the
    first link matched by the comment XPath (when there is one) and waits
    briefly so that follow-up requests can be observed.

    All waits use ``asyncio.sleep`` so the event loop keeps running and the
    request handler can fire while this coroutine is waiting. The browser is
    always closed, even if a step raises.

    Args:
        url: Page address to open.
        proxy: Value passed to Chromium's ``--proxy-server`` switch.
        ua: User-Agent string set on the page.
    """
    # Start pyppeteer; the browser is driven through an in-memory session.
    browser = await launch({'headless': False, 'args': [ '--proxy-server={}'.format(proxy),'--disable-infobars'] })
    try:
        await asyncio.sleep(10)
        page = await browser.newPage()
        page.on('request', intercept_response)

        # Set the User-Agent request header.
        await page.setUserAgent(ua)
        await page.goto(url, {'timeout': 1000 * 20})

        for _ in range(3):
            if url_params:
                break
            await asyncio.sleep(10)
            # The handler may have captured the URL during the wait; only
            # reload when it is still missing.
            if url_params:
                break
            await page.goto(url, {'timeout': 1000 * 20})

        comment_links = await page.xpath('//form[@rel="async"]//div[@class="_4vn1"]/span[@class="_4vn2"]/a')
        if comment_links:
            await comment_links[0].click()
            # Give requests triggered by the click time to be observed.
            await asyncio.sleep(2.5)
    finally:
        await browser.close()

# Query-string parameters copied from the captured request URL, in the order
# they are placed into the result dictionary.
_CAPTURED_PARAM_NAMES = (
    '__dyn', '__csr', '__req', '__beoa', '__pc', 'dpr', '__ccg', '__rev',
    '__s', '__hsi', '__comet_req', 'lsd', 'jazoest',
    '__spin_r', '__spin_b', '__spin_t',
)


def _query_value(request_url, name):
    """Return the raw (not URL-decoded) value of ``name`` in ``request_url``.

    The name must start right after ``?`` or ``&`` (or at the start of the
    string), so ``dpr`` cannot match inside a longer parameter name.

    Raises:
        IndexError: if the parameter is not present.
    """
    found = re.search(r'(?:^|[?&])' + re.escape(name) + r'=([^&]*)', request_url)
    if found is None:
        raise IndexError(
            'parameter {!r} not found in captured URL {!r}'.format(name, request_url)
        )
    return found.group(1)


def get_url(url, proxy, user_agent):
    """Browse ``url`` and return the parameters of the captured request.

    Runs ``main`` to completion on the asyncio event loop, then extracts the
    query parameters from the URL stored in the module-level ``url_params``.
    ``__user`` and ``__a`` are fixed to ``'0'`` and ``'1'``.

    Args:
        url: Page address to open.
        proxy: Proxy server address forwarded to ``main``.
        user_agent: User-Agent string forwarded to ``main``.

    Returns:
        dict mapping parameter names to their string values.

    Raises:
        IndexError: if no request URL was captured (chained to the browsing
            error, when there was one) or a required parameter is missing.
    """
    global url_params
    # Start from a clean slate so a URL captured by a previous call is never
    # reused for a different page.
    url_params = ''

    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No current event loop (newer Python versions, or a non-main thread).
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    browse_error = None
    try:
        loop.run_until_complete(main(url, proxy, user_agent))
    except Exception as exc:
        # Best effort: the URL may already have been captured before the
        # failure, so only report the error if nothing was captured.
        browse_error = exc

    if not url_params:
        raise IndexError(
            'no matching request URL was captured for {!r}'.format(url)
        ) from browse_error

    params = {'__user': '0', '__a': '1'}
    for name in _CAPTURED_PARAM_NAMES:
        params[name] = _query_value(url_params, name)
    return params

if __name__ == '__main__':
    # Example run: fetch the request parameters for one public post through
    # a local proxy and print the resulting dictionary.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3965.0 Safari/537.36'
    proxy = 'http://172.16.7.14:13512'
    url = 'https://www.facebook.com/news.hkcd/posts/2966706433454938'

    captured_params = get_url(url, proxy, user_agent)
    print(captured_params)

 

# posted @ 2020-08-21 20:46  小鲨鱼~  阅读(1427)  评论(0)  编辑  收藏  举报