selenium + 抓包
携程列表页数据动态加载生成,接口参数加密,使用selenium + 抓包解决
初始化selenium, options参数加上,desired_capabilities参数无法使用
def dirver_init(proxy=None):
    """Create a Chrome WebDriver configured for scraping with log capture.

    The browser is started with the ``enable-automation`` switch excluded,
    notification pop-ups disabled through the profile preferences, a fixed
    desktop user agent, and ChromeDriver performance logging switched on so
    that ``driver.get_log('performance')`` can be read afterwards.

    Args:
        proxy: Optional ``host:port`` of an HTTP proxy. When given, Chrome is
            started with ``--proxy-server=http://<proxy>``. ``None`` (the
            default) starts Chrome without a proxy.

    Returns:
        The started ``webdriver.Chrome`` instance; the caller must ``quit()``
        it when done.
    """
    options = webdriver.ChromeOptions()

    if proxy:
        options.add_argument('--proxy-server=http://' + proxy)

    # Original author's note: excluding this switch makes the page-visible
    # "webdriver" property look normal, which stricter anti-bot sites check.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    # Disable browser notification pop-ups.
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2
        }
    }
    options.add_experimental_option('prefs', prefs)

    # Present a regular desktop Chrome user agent.
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"')

    # Request the performance log through the options object instead of a
    # separate desired_capabilities dict, so every setting above is actually
    # applied to the browser that gets launched.
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    driver = webdriver.Chrome(options=options)
    return driver
动态加载:反复将页面滚动到当前窗口最下方,直到页面高度不再变化
def dynamicLoading(driver, *, pause=3, max_scrolls=None):
    """Scroll the page to the bottom repeatedly until it stops growing.

    After every scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``. It stops when the height equals the
    previously recorded one, or when ``max_scrolls`` scrolls have been done.

    Args:
        driver: Object exposing ``execute_script(script)``, e.g. a Selenium
            WebDriver.
        pause: Seconds to wait after each scroll before measuring again.
        max_scrolls: Upper bound on the number of scrolls. ``None`` (the
            default) keeps scrolling until the height stops changing, which
            never terminates on a page that grows without end.

    Returns:
        None.
    """
    # History of the page height observed after each scroll.
    all_window_height = [
        driver.execute_script("return document.body.scrollHeight;")
    ]
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        print(all_window_height)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        scrolls += 1
        time.sleep(pause)
        check_height = driver.execute_script("return document.body.scrollHeight;")
        # An unchanged height means no further content was appended.
        if check_height == all_window_height[-1]:
            break
        all_window_height.append(check_height)
判断酒店数是否>10,以决定是否需要动态加载,并获取接口的响应内容
def web_selen(driver, url):
    """Open a hotel list page and print the captured search-API responses.

    Steps:
      1. Load ``url`` and read the hotel count from the ``filter-title``
         heading of the rendered page.
      2. If more than 10 hotels are reported, call ``dynamicLoading`` to
         scroll the page so the remaining results are requested.
      3. Read the browser performance log and, for every logged request
         whose URL contains ``HotelSearch?testab``, print the URL, the
         request id and the body returned by ``Network.getResponseBody``.

    The driver is always shut down before returning, including when one of
    the steps raises.

    Args:
        driver: A started Selenium Chrome driver with performance logging
            enabled.
        url: Address of the list page to open.

    Raises:
        IndexError: If the heading or the hotel-count pattern is not found.
        ValueError: If the text captured as the hotel count is not an integer.
    """
    # Local import keeps this block self-contained within the file.
    from selenium.common.exceptions import WebDriverException

    try:
        driver.get(url)
        time.sleep(3)

        # Number of hotels, taken from the heading text of the rendered page.
        page = etree.HTML(driver.page_source)
        nums_content = page.xpath(
            "//div[contains(@class, 'filter-title')]/h3/text()")[0]
        print(nums_content)
        nums = int(re.findall(r'找到(.*)家酒店', nums_content)[0])
        print(nums)

        if nums > 10:
            # Scroll so the rest of the list is loaded dynamically.
            dynamicLoading(driver)
        time.sleep(10)

        # Walk the performance log looking for the search-API requests.
        for entry in driver.get_log('performance'):
            params = json.loads(entry['message'])['message']['params']
            # .get() avoids an error for log events that carry no request.
            request = params.get('request')
            if request is None:
                continue
            request_url = request.get('url') or ''
            if 'HotelSearch?testab' not in request_url:
                continue

            print(request_url)
            print(params['requestId'])
            try:
                content = driver.execute_cdp_cmd(
                    'Network.getResponseBody',
                    {'requestId': params['requestId']})
            except WebDriverException as exc:
                # Best effort: a body may be unavailable for some requests;
                # report it and carry on with the next log entry.
                print('response body unavailable:', exc)
            else:
                print(content)
            print('-------------')
    finally:
        # quit() closes every window and ends the driver session.
        driver.quit()