playwright网络爬虫实战案例分享
之前用 Selenium 做过网络爬虫,这次改用 Playwright。这里只做了一个简单的示例:登录之后监听页面的网络响应,拿到数据接口返回的 response:
import logging

from playwright.sync_api import Playwright, sync_playwright, expect

# Set up logging. The LogRecord attribute names are `asctime` and `levelname`;
# the previous `pastime` / `level_name` made every emitted record fail to format.
logging.basicConfig(
    format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO)

# URL the site requests its loan data from; responses from it are printed.
LOAN_API_URL = 'https://merchant.finance.cainiao.com/record/api/getLoan.json'

LOGIN_URL = (
    "https://cnlogin.cainiao.com/login?appKey=12497914&istb=true"
    "&redirectURL=https%3A%2F%2Fmerchant.finance.cainiao.com%2Ffunds%2Fwaterlevel"
    "%2FCustProdWaterLevelReportConfig.htm"
    "&showae=true&showin=false&showdd=false&tbUserName=&sub=&type=ML&domain=&lang="
    "&isNewLogin=&targetAccount=&meta=&isEnterprise=&isQTb=false&isCnSSOTb=false"
)

# A page that can display data. Per the original author's note, the account used
# is a personal Taobao account rather than an official merchant account, so this
# page was simply one that showed data for a quick experiment.
LOAN_RECORD_URL = (
    "https://merchant.finance.cainiao.com/platform/index.htm#/iframe?iframeUrl="
    "%20https%3A%2F%2Fmerchant.finance.cainiao.com%2Frecord%2FloanRecord.htm"
)

LOGIN_FRAME = "#J_taobao iframe"
DATA_FRAME = "#J_IFRAME"

# Horizontal distance (px) the slider handle is dragged.
SLIDER_DRAG_DISTANCE = 260

BROWSER_ARGS = [
    '--disable-blink-features=AutomationControlled',
    "--disable-extensions",
    "--disable-popup-blocking",
    "--ignore-certificate-errors",
    "--disable-plugins-discovery",
    '--no-first-run',
    '--no-service-autorun',
    '--no-default-browser-check',
    '--disable-dev-shm-usage',
]


def handle_json(json):
    """Print the decoded loan-API payload and its fields of interest.

    Args:
        json: Decoded JSON body of the response. The fields ``msg``,
            ``total``, ``data`` and ``success`` are printed when it is a
            dict; a missing field prints as ``None`` instead of raising
            inside the event callback.
    """
    print('json:', json)
    if not isinstance(json, dict):
        # Nothing further to pick apart; the raw payload was printed above.
        return
    for key in ('msg', 'total', 'data', 'success'):
        print(f'{key}:', json.get(key))


def handle(request, response):
    """Event callback: print the payload of responses from the loan API.

    Args:
        request: Unused; kept so existing keyword-argument callers still work.
        response: Object exposing ``url`` and ``json()``, or ``None``, in
            which case the call does nothing.
    """
    if response is None:
        return
    # Compare without the query string so the endpoint still matches when the
    # site appends parameters to it.
    if response.url.split('?', 1)[0] != LOAN_API_URL:
        return
    try:
        payload = response.json()
    except ValueError:
        # NOTE(review): assumes a non-JSON body surfaces as a ValueError
        # (json.JSONDecodeError) -- confirm against the Playwright version used.
        logging.warning("response from %s is not valid JSON", response.url)
        return
    handle_json(payload)


def _best_effort(label, action):
    """Run one optional UI step; print and swallow any failure so later steps still run."""
    try:
        action()
    except Exception as e:  # deliberately broad: every login step is best-effort
        print(f"{label}->Exception:{e}")


def _fill_login_field(page, selector, value):
    """Click the login-frame input matching ``selector`` and fill it with ``value``."""
    field = page.frame_locator(LOGIN_FRAME).locator(selector)
    field.click()
    field.fill(value)


def _drag_slider(page):
    """Locate the slider handle and drag it to the right with the mouse."""
    handle_button = (
        page.frame_locator(LOGIN_FRAME)
        .frame_locator("#baxia-dialog-content")
        .locator("#nc_1_n1z")
    )
    box = handle_button.bounding_box()
    center_x = box['x'] + box['width'] / 2
    center_y = box['y'] + box['height'] / 2
    page.mouse.move(center_x, center_y)
    page.mouse.down()
    page.mouse.move(center_x + SLIDER_DRAG_DISTANCE, center_y)
    page.mouse.up()


def _click_login(page):
    """Click the login button inside the login frame."""
    page.frame_locator(LOGIN_FRAME).get_by_role("button", name="登录").click()


def _click_quick_enter(page):
    """Click the quick-enter button: one forced click, then a normal click."""
    button = page.frame_locator(LOGIN_FRAME).get_by_role("button", name="快速进入")
    button.click(force=True)
    button.click()


def run(playwright: Playwright, *, user_data_dir: str = "/Users/kaka/chrome2",
        username: str = '账号', password: str = '密码') -> None:
    """Log in, open the loan-record page, run a filtered query and print the API response.

    Args:
        playwright: Playwright instance obtained from ``sync_playwright()``.
        user_data_dir: Chrome profile directory for the persistent context.
        username: Value typed into the account field.
        password: Value typed into the password field.
    """
    context = playwright.chromium.launch_persistent_context(
        user_data_dir,
        channel="chrome",
        device_scale_factor=1,
        devtools=False,
        headless=False,
        args=BROWSER_ARGS,
    )
    try:
        page = context.new_page()
        page.goto(LOGIN_URL)
        # Short timeout so a login step whose element is absent fails quickly.
        page.set_default_timeout(2000)

        _best_effort("账号名/邮箱/手机号",
                     lambda: _fill_login_field(page, '#fm-login-id', username))
        _best_effort("请输入登录密码",
                     lambda: _fill_login_field(page, '#fm-login-password', password))
        _best_effort("获取拖动按钮位置并拖动", lambda: _drag_slider(page))
        _best_effort("登录按钮", lambda: _click_login(page))
        _best_effort("快速进入", lambda: _click_quick_enter(page))
        page.wait_for_timeout(1000)

        page.goto(LOAN_RECORD_URL)
        page.wait_for_timeout(1000)

        # Print matching responses from here on. The former "request" listener
        # always passed response=None, which made it a no-op, so it is dropped.
        page.on("response", lambda response: handle(request=None, response=response))

        data_frame = page.frame_locator(DATA_FRAME)
        # Open the "all loan statuses" dropdown.
        data_frame.locator("#c-select-65").click()
        # Pick the status to query.
        data_frame.locator('#c-list-90').get_by_text('待审批').click()
        # Click the search button.
        data_frame.locator('#J_SearchFormBtn').click()
        # Leave time for the query response to arrive and be printed.
        page.wait_for_timeout(3000)
    finally:
        # Always release the browser, even when a step above raised.
        context.close()


if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)