Playwright 网络爬虫实战案例分享

之前是用 Selenium 做的网络爬虫，这次改用 Playwright，不过只做了一个简单的示例：监听页面的网络响应，获取数据接口返回的 response：

import logging

from playwright.sync_api import Playwright, sync_playwright, expect

# Set up logging.
# The record attributes must be spelled ``asctime`` and ``levelname``; any
# other key makes every log call fail to format, so nothing useful is logged.
LOG_FORMAT = '%(asctime)s | %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)


def handle_json(json):
    """Print a decoded JSON payload followed by four of its top-level fields.

    Args:
        json: The decoded payload. It is indexed with the keys ``msg``,
            ``total``, ``data`` and ``success``, in that order.

    Raises:
        KeyError: If ``json`` is a dict and one of those keys is missing.
            Fields that come before the missing one have already been
            printed by then.
    """
    # Whole payload first, then each field on its own line.
    print('json：', json)
    for field in ('msg', 'total', 'data', 'success'):
        print(f'{field}：', json[field])


def handle(request, response):
    """Pass the loan-record API response on to ``handle_json``.

    Used as a shared callback for both request and response events; only
    responses whose URL equals the loan-record endpoint are processed.

    Args:
        request: The request object of a request event, or ``None``.
            It is not inspected.
        response: The response object of a response event, or ``None``.
            Must expose ``url`` and ``json()``.
    """
    # The endpoint the site itself calls to fetch its data.
    loan_api_url = 'https://merchant.finance.cainiao.com/record/api/getLoan.json'

    # Request events arrive with no response; there is nothing to look at.
    if response is None:
        return
    # Ignore every response except the exact data endpoint.
    if response.url != loan_api_url:
        return
    handle_json(response.json())


def _attempt(label, action):
    """Run one best-effort UI step, printing (not raising) any failure."""
    try:
        action()
    except Exception as e:
        # Deliberately broad: each login step is optional (e.g. the profile
        # may already be logged in), so a failure must not abort the run.
        print(f"{label}->Exception:{e}")


def _fill_field(frame, selector, value):
    """Click the input matched by ``selector`` inside ``frame`` and fill it."""
    field = frame.locator(selector)
    field.click()
    field.fill(value)


def _drag_slider(page, frame, distance=260):
    """Drag the slider handle ``distance`` pixels to the right."""
    handle_locator = frame.frame_locator("#baxia-dialog-content").locator("#nc_1_n1z")
    box = handle_locator.bounding_box()
    if box is None:
        # bounding_box() returns None when the element is not visible.
        raise RuntimeError("slider handle is not visible")
    center_x = box['x'] + box['width'] / 2
    center_y = box['y'] + box['height'] / 2
    page.mouse.move(center_x, center_y)
    page.mouse.down()
    try:
        page.mouse.move(center_x + distance, center_y)
    finally:
        # Always release the button so a failed move cannot leave it pressed.
        page.mouse.up()


def _click_quick_enter(frame):
    """Click the "快速进入" button twice: once forced, once normally."""
    button = frame.get_by_role("button", name="快速进入")
    button.click(force=True)
    button.click()


def run(playwright: Playwright) -> None:
    """Log in, open the loan-record page and print the query response.

    Launches Chrome with a persistent profile, makes a best-effort attempt at
    each login step, navigates to the loan-record page, registers the
    ``handle`` callback for network events and then triggers a filtered
    search so that the data endpoint's response is printed.

    Args:
        playwright: The object yielded by ``sync_playwright()``.

    The browser context is closed when the function finishes, whether it
    returns normally or raises.
    """
    user_data_dir = "/Users/kaka/chrome2"
    launch_args = [
        '--disable-blink-features=AutomationControlled',
        '--disable-extensions',
        '--disable-popup-blocking',
        '--ignore-certificate-errors',
        '--disable-plugins-discovery',
        '--no-first-run',
        '--no-service-autorun',
        '--no-default-browser-check',
        '--disable-dev-shm-usage',
    ]
    context = playwright.chromium.launch_persistent_context(
        user_data_dir,
        channel="chrome",
        device_scale_factor=1,
        devtools=False,
        headless=False,
        args=launch_args,
    )
    try:
        page = context.new_page()
        page.goto(
            "https://cnlogin.cainiao.com/login?appKey=12497914&istb=true&redirectURL=https%3A%2F%2Fmerchant.finance.cainiao.com%2Ffunds%2Fwaterlevel%2FCustProdWaterLevelReportConfig.htm&showae=true&showin=false&showdd=false&tbUserName=&sub=&type=ML&domain=&lang=&isNewLogin=&targetAccount=&meta=&isEnterprise=&isQTb=false&isCnSSOTb=false")
        # Short timeout so a missing login element fails fast and the step is
        # skipped instead of blocking for the default duration.
        # NOTE(review): per the Playwright docs this default also applies to
        # the later page.goto() and the unguarded clicks below -- confirm
        # that 2 s is enough for them.
        page.set_default_timeout(2000)

        # All login widgets live inside this iframe.
        login_frame = page.frame_locator("#J_taobao iframe")

        _attempt("账号名/邮箱/手机号",
                 lambda: _fill_field(login_frame, '#fm-login-id', '账号'))
        _attempt("请输入登录密码",
                 lambda: _fill_field(login_frame, '#fm-login-password', '密码'))
        # Locate the slider handle and drag it.
        _attempt("获取拖动按钮位置并拖动",
                 lambda: _drag_slider(page, login_frame))
        # Login button.
        _attempt("登录按钮",
                 lambda: login_frame.get_by_role("button", name="登录").click())
        # "Quick enter" button.
        _attempt("快速进入", lambda: _click_quick_enter(login_frame))

        page.wait_for_timeout(1000)
        # Jump to a page that actually shows data. The account used here is
        # not an official merchant account, so after logging in with a
        # personal Taobao account this page was picked for a simple trial.
        url = "https://merchant.finance.cainiao.com/platform/index.htm#/iframe?iframeUrl=%20https%3A%2F%2Fmerchant.finance.cainiao.com%2Frecord%2FloanRecord.htm"
        page.goto(url)
        page.wait_for_timeout(1000)

        # Register the callbacks that print the returned response.
        page.on("request", lambda request: handle(request=request, response=None))
        page.on("response", lambda response: handle(response=response, request=None))

        data_frame = page.frame_locator("#J_IFRAME")
        # Open the loan-status dropdown.
        data_frame.locator("#c-select-65").click()
        # Pick the status to query, here "待审批".
        data_frame.locator('#c-list-90').get_by_text('待审批').click()
        # Click the search button; this triggers the data request.
        data_frame.locator('#J_SearchFormBtn').click()

        # Give the response callback time to fire before shutting down.
        page.wait_for_timeout(3000)
    finally:
        # Closing a persistent context also closes its browser, so no
        # separate browser handle is needed.
        context.close()


# Only start the browser when executed as a script, so that importing this
# module (e.g. to reuse ``run`` or ``handle``) has no side effects.
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)