通过 pyppeteer 库获取请求携带的相关参数
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# --author: Baozi
"""Capture the query parameters carried by a Facebook ajax request.

A browser is driven through pyppeteer, the URL of the first request that
targets ``https://www.facebook.com/ajax/bz`` and carries ``__dyn`` is
recorded, and selected query parameters are extracted from that URL.
"""
import asyncio
import logging
import re
import time

from pyppeteer import launch

logger = logging.getLogger(__name__)

# URL of the captured ajax request; written by intercept_response().
url_params = ''
doc_id = ''

# Query parameters copied out of the captured URL, in output order.
_CAPTURED_PARAM_NAMES = (
    '__dyn',
    '__csr',
    '__req',
    '__beoa',
    '__pc',
    'dpr',
    '__ccg',
    '__rev',
    '__s',
    '__hsi',
    '__comet_req',
    'lsd',
    'jazoest',
    '__spin_r',
    '__spin_b',
    '__spin_t',
)


async def intercept_response(res):
    """Record the URL of the first matching Facebook ajax request.

    The object passed in only needs a ``url`` attribute. The URL is stored
    in the module-level ``url_params`` (and printed) when it contains both
    ``__dyn`` and ``https://www.facebook.com/ajax/bz`` and nothing has been
    stored yet; later matches are ignored.
    """
    global url_params
    if url_params:
        return
    if '__dyn' in res.url and 'https://www.facebook.com/ajax/bz' in res.url:
        url_params = res.url
        print(url_params)


async def request_check(req):
    """Request filter: abort image/media/eventsource/websocket requests.

    Every other request is continued unchanged.

    NOTE(review): nothing in this file registers this handler; it
    presumably also needs request interception enabled on the page before
    ``abort``/``continue_`` are usable -- confirm against pyppeteer docs.
    """
    if req.resourceType in ['image', 'media', 'eventsource', 'websocket']:
        await req.abort()
    else:
        await req.continue_()


async def main(url, proxy, ua):
    """Open ``url`` in a browser and wait for the ajax request to show up.

    Args:
        url: Page to load.
        proxy: Value passed to Chromium's ``--proxy-server`` switch.
        ua: User-Agent string set on the page.

    The page is loaded once; while no request URL has been captured it is
    reloaded up to three more times, clicking the comment link each time.
    The browser is always closed, even when navigation or the click fails.
    """
    # Launch the pyppeteer-driven browser (non-headless).
    browser = await launch({
        'headless': False,
        'args': ['--proxy-server={}'.format(proxy), '--disable-infobars'],
    })
    try:
        # asyncio.sleep instead of time.sleep: a blocking sleep would stall
        # the event loop and keep the request listener from running.
        await asyncio.sleep(10)
        page = await browser.newPage()
        page.on('request', intercept_response)
        # Set the User-Agent request header.
        await page.setUserAgent(ua)
        await page.goto(url, {'timeout': 1000 * 20})
        for _ in range(3):
            if url_params:
                break
            await asyncio.sleep(10)
            await page.goto(url, {'timeout': 1000 * 20})
            comment_click = await page.xpath(
                '//form[@rel="async"]//div[@class="_4vn1"]/span[@class="_4vn2"]/a'
            )
            # The link may be missing (layout change, login wall); skip the
            # click and let the next attempt retry instead of raising.
            if comment_click:
                await comment_click[0].click()
            await asyncio.sleep(2.5)
    finally:
        await browser.close()


def _extract_params(captured_url):
    """Build the parameter dict from the captured request URL.

    Values are returned exactly as they appear in the URL (not decoded).

    Raises:
        IndexError: if no URL was captured or a parameter is missing.
    """
    if not captured_url:
        raise IndexError('no matching Facebook ajax request was captured')
    # The trailing '&' lets the last parameter match the same pattern.
    query = captured_url + '&'
    params = {'__user': '0', '__a': '1'}
    for name in _CAPTURED_PARAM_NAMES:
        # Anchor on '?'/'&' so a name cannot match inside a longer name.
        found = re.search(r'(?:^|[?&])' + re.escape(name) + r'=(.*?)&', query)
        if found is None:
            raise IndexError(
                'parameter {!r} not found in captured request URL'.format(name)
            )
        params[name] = found.group(1)
    return params


def get_url(url, proxy, user_agent):
    """Return the ajax request parameters captured while loading ``url``.

    Args:
        url: Facebook page to load.
        proxy: Proxy server address handed to the browser.
        user_agent: User-Agent string for the page.

    Returns:
        dict with ``__user`` ('0'), ``__a`` ('1') and one entry for each of
        the parameter names extracted from the captured request URL.

    Raises:
        IndexError: if no request was captured or a parameter is missing.
    """
    global url_params
    # Start from a clean slate so a previous call's URL is never reused.
    url_params = ''
    try:
        asyncio.get_event_loop().run_until_complete(main(url, proxy, user_agent))
    except Exception:
        # Best effort: the URL may already have been captured before the
        # failure, so log the error and still try to extract parameters.
        logger.exception('pyppeteer session for %s failed', url)
    return _extract_params(url_params)


if __name__ == '__main__':
    url = 'https://www.facebook.com/news.hkcd/posts/2966706433454938'
    proxy = 'http://172.16.7.14:13512'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3965.0 Safari/537.36'
    print(get_url(url, proxy, user_agent))
做一枚奔跑的老少年!