python爬数据,链接端口的那种

chrome://version/
可执行文件路径	/Applications/Google Chrome.app/Contents/MacOS/Google

Google\ Chrome --remote-debugging-port=9222 --user-data-dir="~/ChromeProfile"
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222

  然后代码:

import datetime
import time
import json

from collections.abc import Mapping
from collections.abc import MutableMapping

import MessageToSlack
import os
import pymysql.cursors
import random
import sys
import undetected_chromedriver.v2 as uc

from functools import reduce
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# 用来驱动浏览器的
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


# from decouple import config
class Cainiao_Scrape:
    need_customer_data = []
    we_need_data = []
    new_need_customer_data = []
    discount_rate_data = []
    parameter_setting_data = []
    page_count = 0

    # 初始设置函数
    def __init__(self, web_url, pro_db_host, pro_db_user, pro_db_pwd, pro_db_name, u02_db_host, u02_db_user, u02_db_pwd,
                 u02_db_name, u01_db_host, u01_db_user, u01_db_pwd, u01_db_name, cainiao_ac, cainiao_pwd, handle_force):
        self.vars = None
        self.driver = None

        self.web_url = web_url
        # production
        self.pro_db_host = pro_db_host
        self.pro_db_user = pro_db_user
        self.pro_db_pwd = pro_db_pwd
        self.pro_db_name = pro_db_name

        # u02
        self.u02_db_host = u02_db_host
        self.u02_db_user = u02_db_user
        self.u02_db_pwd = u02_db_pwd
        self.u02_db_name = u02_db_name

        # u01
        self.u01_db_host = u01_db_host
        self.u01_db_user = u01_db_user
        self.u01_db_pwd = u01_db_pwd
        self.u01_db_name = u01_db_name

        self.cainiao_ac = cainiao_ac
        self.cainiao_pwd = cainiao_pwd

        self.handle_force = handle_force

    # 浏览器配置
    def setup_method(self):
        port_number = "127.0.0.1:9222"
        try:
            check_folder = os.path.isdir ("ScreenCap")
            if not check_folder:
                os.makedirs ("ScreenCap")
            chrome_options = webdriver.ChromeOptions ()
            chrome_options.add_experimental_option (
                "debuggerAddress", port_number)
            capabilities = DesiredCapabilities.CHROME
            capabilities["goog:loggingPrefs"] = {
                'browser': 'ALL', "performance": "ALL"}  # 必须有这一句,才能在后面获取到performance
            # capabilities["goog:perfLoggingPrefs"] = {'enableNetwork': True}
            # chrome_options.add_argument ("--disable-extensions")
            # chrome_options.add_argument ("--disable-popup-blocking")
            # chrome_options.add_argument ("--ignore-certificate-errors")
            # chrome_options.add_argument ("--disable-plugins-discovery")
            # chrome_options.add_argument ('--no-first-run')
            # chrome_options.add_argument ('--no-service-autorun')
            # chrome_options.add_argument ('--no-default-browser-check')
            # chrome_options.add_argument ('no-startup-window')
            # chrome_options.add_argument ('--disable-dev-shm-usage')
            # chrome_options.add_argument('--no-sandbox')   # highest right to operate
            # --headless是不显示浏览器启动及执行过程
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')  # 不加载gpu,规避bug

            self.driver = webdriver.Chrome (options=chrome_options, desired_capabilities=capabilities)
            # self.driver = uc.Chrome (options=chrome_options, desired_capabilities=capabilities, version_main=110)
            self.vars = {}
            # 清除浏览器cookie
            self.driver.delete_all_cookies ()
            MessageToSlack.post_message_to_slack ("Start Chrome Driver")
        except Exception as r:
            self.teardown_method
            MessageToSlack.post_message_to_slack (
                "setup_method info:%s" % str (r))
            sys.exit ()

    # 菜鸟登录的函数
    def login_cainiao(self):
        try:
            MessageToSlack.post_message_to_slack ("Go to cainiao website")
            try:
                self.driver.get (self.web_url)
            except Exception as get2:
                MessageToSlack.post_message_to_slack (
                    f'get that page again: {str (get2)}')
                self.driver.get (self.web_url)
            link = self.web_url
            time.sleep (random.randint (4, 6))

            # try:
            #     # 打开本地的浏览器的时候,先跳过登录这一块然后再测试
            #     self.driver.switch_to.frame (0)
            #     time.sleep (random.randint (4, 6))
            #     # 点击输入账户
            #     self.driver.find_element (
            #         By.XPATH, "//input[@id='fm-login-id']").click ()
            #     time.sleep (2)
            #     self.driver.find_element (
            #         By.XPATH, "//input[@id='fm-login-id']").send_keys (self.cainiao_ac)
            #     # 点击输入密码
            #     self.driver.find_element (
            #         By.XPATH, "//input[@id='fm-login-password']").click ()
            #     time.sleep (2)
            #     self.driver.find_element (
            #         By.XPATH, "//input[@id='fm-login-password']").send_keys (self.cainiao_pwd)
            #     # 调取滑块验证部分
            #     self.slide_auth ()
            #     # 点击登陆按钮
            #     self.driver.find_element (
            #         By.XPATH, "//button[@type='submit']").click ()
            #     MessageToSlack.post_message_to_slack ('pressed login button')
            #     # 如果有快速进入的验证框,点击快速验证
            #     self.driver.switch_to.frame (0)
            #     self.driver.find_element (
            #         By.XPATH, "//button[@type='submit']").click ()
            #     MessageToSlack.post_message_to_slack (
            #         'pressed quick entry button')
            # except Exception as gin1:
            #     MessageToSlack.post_message_to_slack (
            #         f'login error :{str (gin1)}')
            #     pass
            # time.sleep (random.randint (4, 6))

            try:
                self.driver.get (self.web_url)
            except Exception as get1:
                MessageToSlack.post_message_to_slack (
                    'get that page for the second time:%s' % str (get1))
                self.driver.get (self.web_url)
            # time.sleep (random.randint (4, 6))
            MessageToSlack.post_message_to_slack ("Get Data Page")

            # 获取cai_niao_watermark_dailylogs表中stock_central_amt>0并且日期範圍等于今天的customer_name数据
            self.get_db_customer_data ()
            # 查询符合时间段内的全部公司列表
            self.query_all_customer ()
            # 将菜鸟返回的数据中的approveNo赋予cai_niao_watermark_dailylogs表中符合条件的数据
            self.dic_logic_handle ()
            # 判断是否从头开始跑还是从上次中断时开始跑
            if int (self.handle_force) == 0:
                # 获取cai_niao_company_discountrate_dailylogs今日是否更新数据
                self.get_db_discountrate_data ()
            else:
                self.new_need_customer_data = self.need_customer_data

            # 处理符合条件的数据中的跳转参数
            self.goto_parameter_setting ()
            # 处理数据保存
            self.save_to_db ()
            MessageToSlack.post_message_to_slack ("Data Scrape finished")
        except Exception as tc:
            self.driver.get_screenshot_as_file (
                "./ScreenCap/" + datetime.datetime.now ().strftime ("%Y%m%d_%H%M%p") + ".png")  # 异常时捕获页面截图
            MessageToSlack.post_message_to_slack (
                "def login_cainiao error:%s" % str (tc))
            self.teardown_method  # 调用关闭浏览器函数
            sys.exit ()  # 退出程序机制 通过引发SystemExit异常来退出Python程序

    # 获取拖动按钮位置并拖动
    def slide_auth(self):
        try:
            time.sleep (random.randint (4, 6))
            # check have slide verify elements or not
            if (self.driver.find_element (By.ID, "baxia-password")).is_displayed ():
                self.driver.switch_to.frame ('baxia-dialog-content')
                element = self.driver.find_element (By.ID, "nc_1_n1z")
                slide_bar = self.driver.find_element (By.ID, "nc_1__scale_text")
                action = ActionChains (self.driver)
                action.move_to_element (element).pause (1).click_and_hold (element).pause (1).move_by_offset (
                    slide_bar.size['width'] - element.size['width'], 0).pause (1).release ()  # mobile slider
                action.perform ()
                MessageToSlack.post_message_to_slack (
                    "Slide the verification bar")
                self.driver.switch_to.parent_frame ()
                time.sleep (2)
                if (self.driver.find_element (By.ID, "baxia-password")).is_displayed ():
                    counter1 = 0
                    while (self.driver.find_element (By.ID, "baxia-password")).is_displayed () and counter1 < 6:
                        self.driver.switch_to.frame ('baxia-dialog-content')
                        wrapper = self.driver.find_element (By.ID, "nocaptcha")
                        wrapper.click ()
                        time.sleep (random.randint (2, 3))
                        element = self.driver.find_element (By.ID, "nc_1_n1z")
                        slide_bar = self.driver.find_element (
                            By.ID, "nc_1__scale_text")
                        action = ActionChains (self.driver)
                        action.move_to_element (element).pause (1).click_and_hold (element).pause (1).move_by_offset (
                            slide_bar.size['width'] - element.size['width'], 0).release ()  # mobile slider
                        action.perform ()
                        MessageToSlack.post_message_to_slack (
                            "Slide the verification bar")
                        self.driver.switch_to.parent_frame ()
                        time.sleep (random.randint (2, 3))
                        counter1 += 1
                    counter1 = 0

                if (self.driver.find_element (By.ID, "baxia-password")).is_displayed ():
                    raise Exception (
                        "Still cannot  pass the slide certification after 5 attempt")
        except Exception as s:
            self.driver.get_screenshot_as_file (
                "./ScreenCap/" + datetime.datetime.now ().strftime ("%Y%m%d_%H%M%p") + ".png")
            MessageToSlack.post_message_to_slack (
                "def slide_auth error:%s" % str (s))
            self.teardown_method
            sys.exit ()

    # 查询符合时间段内的全部公司列表(日期範圍由2018-01-01的今天到今天)
    def query_all_customer(self):
        if (self.driver.find_element (By.XPATH, '//form[@id="J_Form"]')).is_displayed ():
            # 查询搜索框存在
            MessageToSlack.post_message_to_slack (
                'customer name search form exists')
            # 日期选择范围(之前是2年前的今天,后改为2018-01-01)
            # start_year = int(time.strftime('%Y', time.localtime(time.time()))) - 2
            # start_month = int(time.strftime('%m', time.localtime(time.time())))
            # start_day = int(time.strftime('%d', time.localtime(time.time())))

            start_year = 2018
            start_month = 1
            start_day = 1
            # 年月日的XPATH
            current_selection_year_start = '.c-cal-year[data-year="%s"]' % start_year
            current_selection_month_start = '.c-cal-month[data-month="%s"]' % start_month
            current_selection_day_start = '.c-cal-day[data-day="%s"]' % start_day
            # 点击日期选择器的DIV
            self.driver.find_element (
                By.XPATH, "//div[@id='c-calendar-select-66']").click ()
            time.sleep (1)
            # 点击日期选择器的年份
            self.driver.find_element (
                By.XPATH, '//*[@id="c-calendar-87"]/div/div[1]/div[1]').click ()
            time.sleep (1)
            self.driver.find_element (
                By.CSS_SELECTOR, current_selection_year_start).click ()
            # 点击日期选择器的月份
            self.driver.find_element (
                By.XPATH, '//*[@id="c-calendar-87"]/div/div[1]/div[2]').click ()
            time.sleep (1)
            self.driver.find_element (
                By.CSS_SELECTOR, current_selection_month_start).click ()
            # 点击日期选择器的日期
            self.driver.find_element (
                By.CSS_SELECTOR, current_selection_day_start).click ()
            # 点击确定
            self.driver.find_element (
                By.XPATH, '//*[@id="c-calendar-81"]/div[3]/button').click ()
            # 点击查询
            self.driver.find_element (By.XPATH, "//div[@id='J_search']").click ()
            time.sleep (random.randint (2, 4))
            # 获取总共的条数
            paging_total_num = int (self.driver.find_element (
                By.CSS_SELECTOR, '#J_Paging > div.paging-total-num > span').text)
            # 如果总条数大于10,设置每页显示条数为30
            if paging_total_num > 10:
                self.driver.get_log ("performance")
                self.multi_page_handle ()
            else:
                """
                获取第一页的performance_log,并分析存入数据库
                get_log方法用于获取给定日志类型的日 
                get log first time, after get once log will be deleted
                """
                first_page_log = self.driver.get_log ("performance")
                self.log_analysis_customer_query (first_page_log, 1)
        else:
            # 查询搜索框不存在时返回信息
            raise Exception (
                "cannot find this search form in customer list page.  Please check is login problem or other")
        # MessageToSlack.post_message_to_slack(
        #     'from customer_query log,we can update customer list is:' + str(self.parameter_setting_data))

    # 客户列表日志分析函数
    def log_analysis_customer_query(self, performance_log_customer_query, page):
        db_customer_data = self.need_customer_data
        we_need_data = self.we_need_data
        for row in performance_log_customer_query:
            log_json_customer_query = json.loads (row['message'])
            log_customer_query = log_json_customer_query['message']
            method_customer_query = str (log_customer_query['method'])
            if method_customer_query != 'Network.responseReceived':
                continue
            url_customer_query = str (
                log_customer_query['params']['response']['url'])
            if url_customer_query != 'https://merchant.finance.cainiao.com/funds/signApproveDetail/signApproveDetailData.do':
                continue
            requestId_customer_query = str (
                log_customer_query['params']['requestId'])
            try:
                customer_query_page_data = \
                    self.driver.execute_cdp_cmd ('Network.getResponseBody',
                                                 {'requestId': requestId_customer_query})[
                        'body']
                cqd = json.loads (customer_query_page_data)
                cqd['data']['page'] = page
                cqd['data']['page_total'] = len (cqd['data']['resultList'])
                # 存储数据到cai_niao_company_discountrate_snapshots到各个数据库
                self.save_to_discountrate_snapshots (cqd, page)
                total_customer_query = cqd['data']['total']
                if total_customer_query > 0:
                    # 调取跳转参数设置页面的函数
                    resultList = cqd['data']['resultList']
                    for item in resultList:
                        approveNo = str (item['approveNo'])
                        customer_name = str (item['companyName'])
                        item_status = str (item['status'])
                        if item_status == 'UN_SIGN':
                            continue
                        if item_status == 'FUND_APPROVE_PASS':
                            if customer_name not in we_need_data:
                                continue
                            else:
                                self.parameter_setting_data.append ({
                                    'approveNo': approveNo,
                                    'customer_name': customer_name
                                })
                else:
                    MessageToSlack.post_message_to_slack ('No data was found')
            except Exception as customer_query_e:
                MessageToSlack.post_message_to_slack (
                    'def log_analysis_customer_query error:' + str (customer_query_e))
                continue
            break

    # 多页面处理
    def multi_page_handle(self):
        # 点击每页【】条的DIV框
        self.driver.find_element (
            By.XPATH, '//*[@id="c-select-226"]').click ()
        time.sleep (1)
        # 点击页数列表的最后的一个li
        self.driver.find_element (
            By.XPATH, '//ul[@id="c-list-251"]/li[last()]/span').click ()
        time.sleep (random.randint (3, 4))
        first_page_log = self.driver.get_log ("performance")
        self.log_analysis_customer_query (first_page_log, 1)
        # 获取页面显示页数,如果页面显示页数大于1,for循环跳转页面,并获取相应的performance_log分析存入数据库
        paging_total_page = int (self.driver.find_element (
            By.CSS_SELECTOR, '#J_Paging > div.paging-total-page > span').text)
        if paging_total_page > 1:
            initial_number = 1
            while initial_number < paging_total_page:
                initial_number += 1
                # 判断点击跳转下一页的btn是否可点击
                btn_right = self.driver.find_element (
                    By.CSS_SELECTOR,
                    '#J_Paging > button.btn.btn-default.btn-sm.paging-next-btn.c-icon.c-icon-arrow-right')
                if btn_right.is_enabled ():
                    btn_right.click ()
                    time.sleep (random.randint (2, 4))
                    # 获取页面performance_log,并分析存入数据库
                    current_page_log = self.driver.get_log ("performance")
                    self.log_analysis_customer_query (
                        current_page_log, initial_number)
                else:
                    break

    # 多个数据库执行数据存储到cai_niao_company_discountrate_snapshots
    def save_to_discountrate_snapshots(self, data, page):
        # save to dbs
        self.save_discountrate_snapshots (
            data, page, self.pro_db_host, self.pro_db_user, self.pro_db_pwd, self.pro_db_name)
        # self.save_discountrate_snapshots (
        #     data, page, self.u02_db_host, self.u02_db_user, self.u02_db_pwd, self.u02_db_name)
        # self.save_discountrate_snapshots (
        #     data, page, self.u01_db_host, self.u01_db_user, self.u01_db_pwd, self.u01_db_name)

    # 存储数据到cai_niao_company_discountrate_snapshots
    def save_discountrate_snapshots(self, data, page, db_host, db_user, db_pwd, db_name):
        record_exist = 0
        page = int (page)
        if len (data) > 0:
            try:
                connection = pymysql.connect (
                    host=db_host, user=db_user, password=db_pwd, database=db_name,
                    cursorclass=pymysql.cursors.DictCursor)
                with connection:
                    raw_data = json.dumps (data)
                    download_date = datetime.datetime.now ().strftime ('%Y-%m-%d')
                    created_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S')
                    updated_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S')
                    with connection.cursor () as cursor:
                        check_sql = f"SELECT EXISTS ( SELECT `id` FROM `cai_niao_company_discountrate_snapshots` WHERE `download_date` = '{download_date}' AND `page` = {page})"
                        cursor.execute (check_sql)
                        result = cursor.fetchone ()
                        if (list (result.values ()))[0] == 1:
                            record_exist = 1

                        if record_exist == 0:
                            insert_sql = f"Insert INTO `cai_niao_company_discountrate_snapshots` (`download_date`,`created_at`,`raw_data`,`page`) VALUES ('{download_date}', '{created_at}', '{raw_data}', {page})"
                            cursor.execute (insert_sql)
                            # MessageToSlack.post_message_to_slack(
                            #     f"Not found today 'cai_niao_company_discountrate_snapshots' {download_date}-page{page} record, insert that record")
                            # MessageToSlack.post_message_to_slack(insert_sql)
                        else:
                            update_sql = f"UPDATE `cai_niao_company_discountrate_snapshots` SET `updated_at` = '{updated_at}',`raw_data` = '{raw_data}' WHERE `download_date` = '{download_date}' AND `page` = {page}"
                            cursor.execute (update_sql)
                            # MessageToSlack.post_message_to_slack(
                            #     f"Found today 'cai_niao_company_discountrate_snapshots' {download_date}-page{page} record, update that record")
                            # MessageToSlack.post_message_to_slack(update_sql)
                    connection.commit ()
            except Exception as ds:
                MessageToSlack.post_message_to_slack (
                    "save_discountrate_snapshots error:%s" % str (ds))
                connection.close ()
            MessageToSlack.post_message_to_slack (
                f"save data to {db_host} {db_name} cai_niao_company_discountrate_snapshots success")
        else:
            MessageToSlack.post_message_to_slack (
                'No data need to store to cai_niao_company_discountrate_snapshots as no data grabbed')

    # 获取cai_niao_watermark_dailylogs表中stock_central_amt>0并且日期範圍等于今天的customer_name数据
    def get_db_customer_data(self):
        record_exist = 0
        db_host = self.pro_db_host
        db_user = self.pro_db_user
        db_pwd = self.pro_db_pwd
        db_name = self.pro_db_name

        # start_year = int(time.strftime('%Y', time.localtime(time.time()))) - 2
        # start_month = time.strftime('%m', time.localtime(time.time()))
        # start_day = time.strftime('%d', time.localtime(time.time()))
        # start_date = f"{start_year}-{start_month}-{start_day}"
        end_date = time.strftime ('%Y-%m-%d', time.localtime (time.time ()))
        try:
            connection = pymysql.connect (host=db_host, user=db_user, password=db_pwd, database=db_name,
                                          cursorclass=pymysql.cursors.DictCursor)
            MessageToSlack.post_message_to_slack (
                "The database is connected to query data to get cai_niao_watermark_dailylogs customer info")
            with connection:
                with connection.cursor () as cursor:
                    # check_sql = "SELECT EXISTS ( SELECT `customer_name` FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `created_at` BETWEEN %s AND %s)"
                    check_sql = "SELECT EXISTS ( SELECT `customer_name` FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `download_date` = %s)"
                    cursor.execute (check_sql, end_date)
                    result = cursor.fetchone ()
                    if (list (result.values ()))[0] == 1:
                        record_exist = 1
                    if record_exist == 1:
                        # select_sql = "SELECT customer_id,customer_name FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `created_at` BETWEEN %s AND %s"
                        select_sql = "SELECT customer_id,customer_name FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `download_date` = %s"
                        cursor.execute (select_sql, end_date)
                        dbs_response = cursor.fetchall ()
                        all_data = list_dict_duplicate_removal (
                            dbs_response)
                        self.need_customer_data = all_data
                        for wn in all_data:
                            self.we_need_data.append (wn['customer_name'])
                connection.commit ()
        except Exception as ds:
            MessageToSlack.post_message_to_slack (
                "def get_db_customer_data error:%s" % str (ds))
            connection.close ()
        # MessageToSlack.post_message_to_slack(
        #     "today we need to update customer list is:%s" % str(self.we_need_data))

    # 将菜鸟返回的数据中的approveNo赋予cai_niao_watermark_dailylogs表中符合条件的数据
    def dic_logic_handle(self):
        need_customer_data = self.need_customer_data
        parameter_setting_data = self.parameter_setting_data
        for item_p in parameter_setting_data:
            companyName = str (item_p['customer_name'])
            for item_c in need_customer_data:
                customer_name = str (item_c['customer_name'])
                if companyName == customer_name:
                    item_c['approveNo'] = item_p['approveNo']
                else:
                    continue
        self.need_customer_data = need_customer_data

    # 获取cai_niao_company_discountrate_dailylogs今日是否更新数据
    def get_db_discountrate_data(self):
        record_exist = 0
        db_host = self.pro_db_host
        db_user = self.pro_db_user
        db_pwd = self.pro_db_pwd
        db_name = self.pro_db_name
        parameter_setting_data = self.need_customer_data
        customer_id = []
        for i in parameter_setting_data:
            insert_str = "'" + i["customer_id"] + "'"
            customer_id.append (insert_str)
        customer_id = ",".join (customer_id)  # 将列表转为以逗号分隔的字符串
        download_date = datetime.datetime.now ().strftime ('%Y-%m-%d')
        try:
            connection = pymysql.connect (host=db_host, user=db_user, password=db_pwd, database=db_name,
                                          cursorclass=pymysql.cursors.DictCursor)
            MessageToSlack.post_message_to_slack (
                "The database is connected to query data cai_niao_company_discountrate_dailylogs update or not")
            with connection:
                with connection.cursor () as cursor:
                    check_sql = f"SELECT EXISTS ( SELECT `customer_id` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = '{download_date}' AND `customer_id` IN ({customer_id}))"
                    cursor.execute (check_sql)

                    result = cursor.fetchone ()
                    if (list (result.values ()))[0] == 1:
                        record_exist = 1

                    if record_exist == 1:
                        select_sql = f"SELECT `customer_id` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = '{download_date}' AND `customer_id` IN ({customer_id})"
                        cursor.execute (select_sql)
                        dbs_response = cursor.fetchall ()
                        query_customer_data = list_dict_duplicate_removal (
                            dbs_response)

                        exist_data = []
                        for qq in query_customer_data:
                            exist_data.append (qq['customer_id'])

                        for pp in parameter_setting_data:
                            pp_customer_id = pp['customer_id']
                            if pp_customer_id not in exist_data:
                                self.new_need_customer_data.append (pp)
                    else:
                        self.new_need_customer_data = parameter_setting_data
                connection.commit ()
        except Exception as ds:
            MessageToSlack.post_message_to_slack (
                "def get_db_discountrate_data:%s" % str (ds))
            connection.close ()

    # 处理符合条件的数据中的跳转参数
    def goto_parameter_setting(self):
        parameter_setting_data = self.new_need_customer_data
        # MessageToSlack.post_message_to_slack(
        #         "when the self.handle_force equal %s,the get_db_discountrate_data:%s" % (self.handle_force,str(parameter_setting_data)))
        if len (parameter_setting_data) == 0:
            MessageToSlack.post_message_to_slack (
                'Customer array without qualified jump parameters')
        for ii in parameter_setting_data:
            if 'approveNo' in ii:
                approveNo = ii['approveNo']
                customer_id = ii['customer_id']
                customer_name = ii['customer_name']
                parameter_setting_url = 'https://merchant.finance.cainiao.com/funds/signApproveDetail/loanParameterSettings.htm?approveNo=' + approveNo
                try:
                    self.driver.get (parameter_setting_url)
                except Exception as get_parameter_setting:
                    MessageToSlack.post_message_to_slack (
                        str (get_parameter_setting))
                    MessageToSlack.post_message_to_slack (
                        'get parameter setting page again')
                    self.driver.get (parameter_setting_url)
                # print('跳转到参数设置的页面标题为:' + self.driver.title)
                time.sleep (random.randint (4, 6))
                # 调取performance log分析函数
                performance_log_parameter_setting = self.driver.get_log (
                    "performance")
                self.log_analysis_parameter_setting (
                    performance_log_parameter_setting, customer_name, customer_id)
            else:
                MessageToSlack.post_message_to_slack (
                    "%s can't scrap" % str (ii))
                continue

    # 参数设置日志分析函数
    def log_analysis_parameter_setting(self, performance_log_parameter_setting, customer_name, customer_id):
        for row in performance_log_parameter_setting:
            log_json_parameter_setting = json.loads (row['message'])
            log_parameter_setting = log_json_parameter_setting['message']
            if str (log_parameter_setting['method']) != 'Network.responseReceived':
                continue
            url_parameter_setting = log_parameter_setting['params']['response']['url']
            if str (url_parameter_setting) != 'https://merchant.finance.cainiao.com/funds/signApproveDetail/pledgeRateData.do':
                continue
            requestId_parameter_setting = log_parameter_setting['params']['requestId']
            try:
                parameter_setting_page_data = self.driver.execute_cdp_cmd ('Network.getResponseBody', {
                    'requestId': requestId_parameter_setting})['body']
                psd = json.loads (parameter_setting_page_data)
                if str (psd['success']) == 'false':
                    continue
                total_parameter_setting = psd['data']['resultList']
                if len (total_parameter_setting) > 0:
                    resultList = psd['data']['resultList']
                    for item in resultList:
                        item['customer_id'] = customer_id
                        item['customer_name'] = customer_name
                        self.discount_rate_data.append (item)
                else:
                    MessageToSlack.post_message_to_slack (
                        'No data for this company %s was found' % customer_name)
            except Exception as parameter_setting_e:
                MessageToSlack.post_message_to_slack (
                    'Problem after press the search button! ' + str (parameter_setting_e))

    # 多个数据库执行数据存储cai_niao_company_discountrate_dailylogs
    def save_to_db(self):
        # save to dbs
        self.db_save_discount_rate (
            self.pro_db_host, self.pro_db_user, self.pro_db_pwd, self.pro_db_name)
        # self.db_save_discount_rate (
        #     self.u02_db_host, self.u02_db_user, self.u02_db_pwd, self.u02_db_name)
        # self.db_save_discount_rate (
        #     self.u01_db_host, self.u01_db_user, self.u01_db_pwd, self.u01_db_name)

    # 存储数据到cai_niao_company_discountrate_dailylogs
    def db_save_discount_rate(self, db_host, db_user, db_pwd, db_name):
        discount_rate_data = self.discount_rate_data
        # MessageToSlack.post_message_to_slack(
        #     'save to cai_niao_company_discountrate_dailylogs table data is:' + str(discount_rate_data))
        if len (discount_rate_data) > 0:
            try:
                connection = pymysql.connect (
                    host=db_host, user=db_user, password=db_pwd, database=db_name,
                    cursorclass=pymysql.cursors.DictCursor)
                MessageToSlack.post_message_to_slack (
                    "connect with db successfully")
                with connection:
                    for data in discount_rate_data:
                        download_date = datetime.datetime.now ().strftime ('%Y-%m-%d')
                        created_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S')
                        updated_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S')
                        customer_id = data['customer_id']
                        customer_name = data['customer_name']
                        all_assets_value = data['allAssetsValue']
                        assets_receivable_value = data['assetsReceivableValue']
                        central_good_value = data['centralGoodValue']
                        central_pledge_rate = int (
                            data['centralPledgeRate'])  # int
                        currency = data['currency']
                        discount_assets_receivable_value = data['discountAssetsReceivableValue']
                        goods_owner_name = data['goodsOwnerName']
                        goods_value = data['goodsValue']
                        head_trip_pledge_rate = int (
                            data['headTripPledgeRate'])  # int
                        head_trip_value = data['headTripValue']
                        pledge_goods_value = data['pledgeGoodsValue']
                        # int
                        pledge_rate = int (data['pledgeRate'])
                        receivable_pledge_rate = int (
                            data['receivablePledgeRate'])  # int
                        store_name = data['storeName']
                        store_t_b_id = data['storeTBId']
                        record_exist = 0
                        with connection.cursor () as cursor:
                            check_sql = "SELECT EXISTS ( SELECT `download_date` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = %s and `customer_id` = %s and `store_t_b_id` = %s)"
                            cursor.execute (
                                check_sql, (download_date, customer_id, store_t_b_id))
                            result = cursor.fetchone ()
                            if (list (result.values ()))[0] == 1:
                                record_exist = 1

                        if record_exist == 0:
                            # MessageToSlack.post_message_to_slack(
                            #     f"Not found today {customer_name} store name is {store_name}  record, insert that record")
                            # 执行插入数据
                            # 新增语句
                            insert_sql = f"Insert INTO `cai_niao_company_discountrate_dailylogs` (`download_date`,`created_at`,`customer_id`,`customer_name`,`all_assets_value`,`assets_receivable_value`,`central_good_value`,`central_pledge_rate`,`currency`,`discount_assets_receivable_value`,`goods_owner_name`,`goods_value`,`head_trip_pledge_rate`,`head_trip_value`,`pledge_goods_value`,`pledge_rate`,`receivable_pledge_rate`,`store_name`,`store_t_b_id`) VALUES ('{download_date}', '{created_at}', '{customer_id}', '{customer_name}', '{all_assets_value}', '{assets_receivable_value}', '{central_good_value}', {central_pledge_rate}, '{currency}', '{discount_assets_receivable_value}', '{goods_owner_name}', '{goods_value}', {head_trip_pledge_rate}, '{head_trip_value}', '{pledge_goods_value}', {pledge_rate}, {receivable_pledge_rate}, '{store_name}', '{store_t_b_id}')"
                            # MessageToSlack.post_message_to_slack("insert_sql: %s" % insert_sql)
                            with connection.cursor () as cursor:
                                cursor.execute (insert_sql)
                        elif record_exist == 1:
                            # MessageToSlack.post_message_to_slack(
                            #     f"Today {customer_name} record is existed, update that record")
                            # 执行更新数据
                            # 更新语句
                            update_sql = f"UPDATE `cai_niao_company_discountrate_dailylogs` SET `updated_at` = '{updated_at}',`all_assets_value` = '{all_assets_value}',`assets_receivable_value` = '{assets_receivable_value}',`central_good_value` = '{central_good_value}',`central_pledge_rate` = {central_pledge_rate},`currency` = '{currency}',`discount_assets_receivable_value` = '{discount_assets_receivable_value}',`goods_owner_name` = '{goods_owner_name}',`goods_value` = '{goods_value}',`head_trip_pledge_rate` = {head_trip_pledge_rate},`head_trip_value` = '{head_trip_value}',`pledge_goods_value` = '{pledge_goods_value}',`pledge_rate` = {pledge_rate},`receivable_pledge_rate` = {receivable_pledge_rate},`store_name` = '{store_name}',`store_t_b_id` = '{store_t_b_id}' WHERE `download_date` = '{download_date}' and `customer_id` = '{customer_id}' and `store_t_b_id` = '{store_t_b_id}'"
                            # MessageToSlack.post_message_to_slack("update_sql: %s" % update_sql)
                            with connection.cursor () as cursor:
                                cursor.execute (update_sql)
                        connection.commit ()
                MessageToSlack.post_message_to_slack (
                    f"save data to {db_host} {db_name} cai_niao_company_discountrate_dailylogs success")
            except Exception as ds:
                MessageToSlack.post_message_to_slack (
                    "db_save_discount_rate error:%s" % str (ds))
                connection.close ()
        else:
            MessageToSlack.post_message_to_slack (
                'No data need to store as no data grabbed')
        MessageToSlack.post_message_to_slack ("Bye")

    # 关闭所有的浏览器窗口
    def teardown_method(self):
        # self.driver.quit()
        MessageToSlack.post_message_to_slack (
            "close page title: %s" % self.driver.title)
        MessageToSlack.post_message_to_slack ("Browser Close")


# 列表里的字典元素去重复


def list_dict_duplicate_removal(data_list):
    def run_function(x, y): return x if y in x else x + [y]

    return reduce (run_function, [[], ] + data_list)


if __name__ == '__main__':
    force = 0
    if len (sys.argv) > 1:
        force = sys.argv[1]
    # print(force)
    scrape_cainiao = Cainiao_Scrape (
        'https://merchant.finance.cainiao.com/funds/signApproveDetail/SignApproveDetailManage.htm',
        '数据库主机',
        '账户',
        '密码',
        '数据库名称',

        '数据库主机',
        '账户',
        '密码',
        '数据库名称',

        '数据库主机',
        '账户',
        '密码',
        '数据库名称',

        '网站登录账号',
        '网站登录密码',

        0
    )
    # 浏览器配置函数
    scrape_cainiao.setup_method ()
    # 链接菜鸟后台网址,并scrape相关数据
    scrape_cainiao.login_cainiao ()

 

posted @ 2023-02-27 16:04  爱家家的卡卡  阅读(63)  评论(0编辑  收藏  举报