python爬数据,链接端口的那种
chrome://version/ 可执行文件路径 /Applications/Google Chrome.app/Contents/MacOS/Google Google\ Chrome --remote-debugging-port=9222 --user-data-dir="~/ChromeProfile" /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
然后代码:
import datetime import time import json from collections.abc import Mapping from collections.abc import MutableMapping import MessageToSlack import os import pymysql.cursors import random import sys import undetected_chromedriver.v2 as uc from functools import reduce from selenium.webdriver.common.by import By from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.desired_capabilities import DesiredCapabilities # 用来驱动浏览器的 from selenium import webdriver from selenium.webdriver.chrome.options import Options # from decouple import config class Cainiao_Scrape: need_customer_data = [] we_need_data = [] new_need_customer_data = [] discount_rate_data = [] parameter_setting_data = [] page_count = 0 # 初始设置函数 def __init__(self, web_url, pro_db_host, pro_db_user, pro_db_pwd, pro_db_name, u02_db_host, u02_db_user, u02_db_pwd, u02_db_name, u01_db_host, u01_db_user, u01_db_pwd, u01_db_name, cainiao_ac, cainiao_pwd, handle_force): self.vars = None self.driver = None self.web_url = web_url # production self.pro_db_host = pro_db_host self.pro_db_user = pro_db_user self.pro_db_pwd = pro_db_pwd self.pro_db_name = pro_db_name # u02 self.u02_db_host = u02_db_host self.u02_db_user = u02_db_user self.u02_db_pwd = u02_db_pwd self.u02_db_name = u02_db_name # u01 self.u01_db_host = u01_db_host self.u01_db_user = u01_db_user self.u01_db_pwd = u01_db_pwd self.u01_db_name = u01_db_name self.cainiao_ac = cainiao_ac self.cainiao_pwd = cainiao_pwd self.handle_force = handle_force # 浏览器配置 def setup_method(self): port_number = "127.0.0.1:9222" try: check_folder = os.path.isdir ("ScreenCap") if not check_folder: os.makedirs ("ScreenCap") chrome_options = webdriver.ChromeOptions () chrome_options.add_experimental_option ( "debuggerAddress", port_number) capabilities = DesiredCapabilities.CHROME capabilities["goog:loggingPrefs"] = { 'browser': 'ALL', "performance": "ALL"} # 必须有这一句,才能在后面获取到performance # capabilities["goog:perfLoggingPrefs"] = {'enableNetwork': True} # chrome_options.add_argument ("--disable-extensions") # chrome_options.add_argument ("--disable-popup-blocking") # chrome_options.add_argument ("--ignore-certificate-errors") # chrome_options.add_argument ("--disable-plugins-discovery") # chrome_options.add_argument ('--no-first-run') # chrome_options.add_argument ('--no-service-autorun') # chrome_options.add_argument ('--no-default-browser-check') # chrome_options.add_argument ('no-startup-window') # chrome_options.add_argument ('--disable-dev-shm-usage') # chrome_options.add_argument('--no-sandbox') # highest right to operate # --headless是不显示浏览器启动及执行过程 # chrome_options.add_argument('--headless') # chrome_options.add_argument('--disable-gpu') # 不加载gpu,规避bug self.driver = webdriver.Chrome (options=chrome_options, desired_capabilities=capabilities) # self.driver = uc.Chrome (options=chrome_options, desired_capabilities=capabilities, version_main=110) self.vars = {} # 清除浏览器cookie self.driver.delete_all_cookies () MessageToSlack.post_message_to_slack ("Start Chrome Driver") except Exception as r: self.teardown_method MessageToSlack.post_message_to_slack ( "setup_method info:%s" % str (r)) sys.exit () # 菜鸟登录的函数 def login_cainiao(self): try: MessageToSlack.post_message_to_slack ("Go to cainiao website") try: self.driver.get (self.web_url) except Exception as get2: MessageToSlack.post_message_to_slack ( f'get that page again: {str (get2)}') self.driver.get (self.web_url) link = self.web_url time.sleep (random.randint (4, 6)) # try: # # 打开本地的浏览器的时候,先跳过登录这一块然后再测试 # self.driver.switch_to.frame (0) # time.sleep (random.randint (4, 6)) # # 点击输入账户 # self.driver.find_element ( # By.XPATH, "//input[@id='fm-login-id']").click () # time.sleep (2) # self.driver.find_element ( # By.XPATH, "//input[@id='fm-login-id']").send_keys (self.cainiao_ac) # # 点击输入密码 # self.driver.find_element ( # By.XPATH, "//input[@id='fm-login-password']").click () # time.sleep (2) # self.driver.find_element ( # By.XPATH, "//input[@id='fm-login-password']").send_keys (self.cainiao_pwd) # # 调取滑块验证部分 # self.slide_auth () # # 点击登陆按钮 # self.driver.find_element ( # By.XPATH, "//button[@type='submit']").click () # MessageToSlack.post_message_to_slack ('pressed login button') # # 如果有快速进入的验证框,点击快速验证 # self.driver.switch_to.frame (0) # self.driver.find_element ( # By.XPATH, "//button[@type='submit']").click () # MessageToSlack.post_message_to_slack ( # 'pressed quick entry button') # except Exception as gin1: # MessageToSlack.post_message_to_slack ( # f'login error :{str (gin1)}') # pass # time.sleep (random.randint (4, 6)) try: self.driver.get (self.web_url) except Exception as get1: MessageToSlack.post_message_to_slack ( 'get that page for the second time:%s' % str (get1)) self.driver.get (self.web_url) # time.sleep (random.randint (4, 6)) MessageToSlack.post_message_to_slack ("Get Data Page") # 获取cai_niao_watermark_dailylogs表中stock_central_amt>0并且日期範圍等于今天的customer_name数据 self.get_db_customer_data () # 查询符合时间段内的全部公司列表 self.query_all_customer () # 将菜鸟返回的数据中的approveNo赋予cai_niao_watermark_dailylogs表中符合条件的数据 self.dic_logic_handle () # 判断是否从头开始跑还是从上次中断时开始跑 if int (self.handle_force) == 0: # 获取cai_niao_company_discountrate_dailylogs今日是否更新数据 self.get_db_discountrate_data () else: self.new_need_customer_data = self.need_customer_data # 处理符合条件的数据中的跳转参数 self.goto_parameter_setting () # 处理数据保存 self.save_to_db () MessageToSlack.post_message_to_slack ("Data Scrape finished") except Exception as tc: self.driver.get_screenshot_as_file ( "./ScreenCap/" + datetime.datetime.now ().strftime ("%Y%m%d_%H%M%p") + ".png") # 异常时捕获页面截图 MessageToSlack.post_message_to_slack ( "def login_cainiao error:%s" % str (tc)) self.teardown_method # 调用关闭浏览器函数 sys.exit () # 退出程序机制 通过引发SystemExit异常来退出Python程序 # 获取拖动按钮位置并拖动 def slide_auth(self): try: time.sleep (random.randint (4, 6)) # check have slide verify elements or not if (self.driver.find_element (By.ID, "baxia-password")).is_displayed (): self.driver.switch_to.frame ('baxia-dialog-content') element = self.driver.find_element (By.ID, "nc_1_n1z") slide_bar = self.driver.find_element (By.ID, "nc_1__scale_text") action = ActionChains (self.driver) action.move_to_element (element).pause (1).click_and_hold (element).pause (1).move_by_offset ( slide_bar.size['width'] - element.size['width'], 0).pause (1).release () # mobile slider action.perform () MessageToSlack.post_message_to_slack ( "Slide the verification bar") self.driver.switch_to.parent_frame () time.sleep (2) if (self.driver.find_element (By.ID, "baxia-password")).is_displayed (): counter1 = 0 while (self.driver.find_element (By.ID, "baxia-password")).is_displayed () and counter1 < 6: self.driver.switch_to.frame ('baxia-dialog-content') wrapper = self.driver.find_element (By.ID, "nocaptcha") wrapper.click () time.sleep (random.randint (2, 3)) element = self.driver.find_element (By.ID, "nc_1_n1z") slide_bar = self.driver.find_element ( By.ID, "nc_1__scale_text") action = ActionChains (self.driver) action.move_to_element (element).pause (1).click_and_hold (element).pause (1).move_by_offset ( slide_bar.size['width'] - element.size['width'], 0).release () # mobile slider action.perform () MessageToSlack.post_message_to_slack ( "Slide the verification bar") self.driver.switch_to.parent_frame () time.sleep (random.randint (2, 3)) counter1 += 1 counter1 = 0 if (self.driver.find_element (By.ID, "baxia-password")).is_displayed (): raise Exception ( "Still cannot pass the slide certification after 5 attempt") except Exception as s: self.driver.get_screenshot_as_file ( "./ScreenCap/" + datetime.datetime.now ().strftime ("%Y%m%d_%H%M%p") + ".png") MessageToSlack.post_message_to_slack ( "def slide_auth error:%s" % str (s)) self.teardown_method sys.exit () # 查询符合时间段内的全部公司列表(日期範圍由2018-01-01的今天到今天) def query_all_customer(self): if (self.driver.find_element (By.XPATH, '//form[@id="J_Form"]')).is_displayed (): # 查询搜索框存在 MessageToSlack.post_message_to_slack ( 'customer name search form exists') # 日期选择范围(之前是2年前的今天,后改为2018-01-01) # start_year = int(time.strftime('%Y', time.localtime(time.time()))) - 2 # start_month = int(time.strftime('%m', time.localtime(time.time()))) # start_day = int(time.strftime('%d', time.localtime(time.time()))) start_year = 2018 start_month = 1 start_day = 1 # 年月日的XPATH current_selection_year_start = '.c-cal-year[data-year="%s"]' % start_year current_selection_month_start = '.c-cal-month[data-month="%s"]' % start_month current_selection_day_start = '.c-cal-day[data-day="%s"]' % start_day # 点击日期选择器的DIV self.driver.find_element ( By.XPATH, "//div[@id='c-calendar-select-66']").click () time.sleep (1) # 点击日期选择器的年份 self.driver.find_element ( By.XPATH, '//*[@id="c-calendar-87"]/div/div[1]/div[1]').click () time.sleep (1) self.driver.find_element ( By.CSS_SELECTOR, current_selection_year_start).click () # 点击日期选择器的月份 self.driver.find_element ( By.XPATH, '//*[@id="c-calendar-87"]/div/div[1]/div[2]').click () time.sleep (1) self.driver.find_element ( By.CSS_SELECTOR, current_selection_month_start).click () # 点击日期选择器的日期 self.driver.find_element ( By.CSS_SELECTOR, current_selection_day_start).click () # 点击确定 self.driver.find_element ( By.XPATH, '//*[@id="c-calendar-81"]/div[3]/button').click () # 点击查询 self.driver.find_element (By.XPATH, "//div[@id='J_search']").click () time.sleep (random.randint (2, 4)) # 获取总共的条数 paging_total_num = int (self.driver.find_element ( By.CSS_SELECTOR, '#J_Paging > div.paging-total-num > span').text) # 如果总条数大于10,设置每页显示条数为30 if paging_total_num > 10: self.driver.get_log ("performance") self.multi_page_handle () else: """ 获取第一页的performance_log,并分析存入数据库 get_log方法用于获取给定日志类型的日 get log first time, after get once log will be deleted """ first_page_log = self.driver.get_log ("performance") self.log_analysis_customer_query (first_page_log, 1) else: # 查询搜索框不存在时返回信息 raise Exception ( "cannot find this search form in customer list page. Please check is login problem or other") # MessageToSlack.post_message_to_slack( # 'from customer_query log,we can update customer list is:' + str(self.parameter_setting_data)) # 客户列表日志分析函数 def log_analysis_customer_query(self, performance_log_customer_query, page): db_customer_data = self.need_customer_data we_need_data = self.we_need_data for row in performance_log_customer_query: log_json_customer_query = json.loads (row['message']) log_customer_query = log_json_customer_query['message'] method_customer_query = str (log_customer_query['method']) if method_customer_query != 'Network.responseReceived': continue url_customer_query = str ( log_customer_query['params']['response']['url']) if url_customer_query != 'https://merchant.finance.cainiao.com/funds/signApproveDetail/signApproveDetailData.do': continue requestId_customer_query = str ( log_customer_query['params']['requestId']) try: customer_query_page_data = \ self.driver.execute_cdp_cmd ('Network.getResponseBody', {'requestId': requestId_customer_query})[ 'body'] cqd = json.loads (customer_query_page_data) cqd['data']['page'] = page cqd['data']['page_total'] = len (cqd['data']['resultList']) # 存储数据到cai_niao_company_discountrate_snapshots到各个数据库 self.save_to_discountrate_snapshots (cqd, page) total_customer_query = cqd['data']['total'] if total_customer_query > 0: # 调取跳转参数设置页面的函数 resultList = cqd['data']['resultList'] for item in resultList: approveNo = str (item['approveNo']) customer_name = str (item['companyName']) item_status = str (item['status']) if item_status == 'UN_SIGN': continue if item_status == 'FUND_APPROVE_PASS': if customer_name not in we_need_data: continue else: self.parameter_setting_data.append ({ 'approveNo': approveNo, 'customer_name': customer_name }) else: MessageToSlack.post_message_to_slack ('No data was found') except Exception as customer_query_e: MessageToSlack.post_message_to_slack ( 'def log_analysis_customer_query error:' + str (customer_query_e)) continue break # 多页面处理 def multi_page_handle(self): # 点击每页【】条的DIV框 self.driver.find_element ( By.XPATH, '//*[@id="c-select-226"]').click () time.sleep (1) # 点击页数列表的最后的一个li self.driver.find_element ( By.XPATH, '//ul[@id="c-list-251"]/li[last()]/span').click () time.sleep (random.randint (3, 4)) first_page_log = self.driver.get_log ("performance") self.log_analysis_customer_query (first_page_log, 1) # 获取页面显示页数,如果页面显示页数大于1,for循环跳转页面,并获取相应的performance_log分析存入数据库 paging_total_page = int (self.driver.find_element ( By.CSS_SELECTOR, '#J_Paging > div.paging-total-page > span').text) if paging_total_page > 1: initial_number = 1 while initial_number < paging_total_page: initial_number += 1 # 判断点击跳转下一页的btn是否可点击 btn_right = self.driver.find_element ( By.CSS_SELECTOR, '#J_Paging > button.btn.btn-default.btn-sm.paging-next-btn.c-icon.c-icon-arrow-right') if btn_right.is_enabled (): btn_right.click () time.sleep (random.randint (2, 4)) # 获取页面performance_log,并分析存入数据库 current_page_log = self.driver.get_log ("performance") self.log_analysis_customer_query ( current_page_log, initial_number) else: break # 多个数据库执行数据存储到cai_niao_company_discountrate_snapshots def save_to_discountrate_snapshots(self, data, page): # save to dbs self.save_discountrate_snapshots ( data, page, self.pro_db_host, self.pro_db_user, self.pro_db_pwd, self.pro_db_name) # self.save_discountrate_snapshots ( # data, page, self.u02_db_host, self.u02_db_user, self.u02_db_pwd, self.u02_db_name) # self.save_discountrate_snapshots ( # data, page, self.u01_db_host, self.u01_db_user, self.u01_db_pwd, self.u01_db_name) # 存储数据到cai_niao_company_discountrate_snapshots def save_discountrate_snapshots(self, data, page, db_host, db_user, db_pwd, db_name): record_exist = 0 page = int (page) if len (data) > 0: try: connection = pymysql.connect ( host=db_host, user=db_user, password=db_pwd, database=db_name, cursorclass=pymysql.cursors.DictCursor) with connection: raw_data = json.dumps (data) download_date = datetime.datetime.now ().strftime ('%Y-%m-%d') created_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S') updated_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S') with connection.cursor () as cursor: check_sql = f"SELECT EXISTS ( SELECT `id` FROM `cai_niao_company_discountrate_snapshots` WHERE `download_date` = '{download_date}' AND `page` = {page})" cursor.execute (check_sql) result = cursor.fetchone () if (list (result.values ()))[0] == 1: record_exist = 1 if record_exist == 0: insert_sql = f"Insert INTO `cai_niao_company_discountrate_snapshots` (`download_date`,`created_at`,`raw_data`,`page`) VALUES ('{download_date}', '{created_at}', '{raw_data}', {page})" cursor.execute (insert_sql) # MessageToSlack.post_message_to_slack( # f"Not found today 'cai_niao_company_discountrate_snapshots' {download_date}-page{page} record, insert that record") # MessageToSlack.post_message_to_slack(insert_sql) else: update_sql = f"UPDATE `cai_niao_company_discountrate_snapshots` SET `updated_at` = '{updated_at}',`raw_data` = '{raw_data}' WHERE `download_date` = '{download_date}' AND `page` = {page}" cursor.execute (update_sql) # MessageToSlack.post_message_to_slack( # f"Found today 'cai_niao_company_discountrate_snapshots' {download_date}-page{page} record, update that record") # MessageToSlack.post_message_to_slack(update_sql) connection.commit () except Exception as ds: MessageToSlack.post_message_to_slack ( "save_discountrate_snapshots error:%s" % str (ds)) connection.close () MessageToSlack.post_message_to_slack ( f"save data to {db_host} {db_name} cai_niao_company_discountrate_snapshots success") else: MessageToSlack.post_message_to_slack ( 'No data need to store to cai_niao_company_discountrate_snapshots as no data grabbed') # 获取cai_niao_watermark_dailylogs表中stock_central_amt>0并且日期範圍等于今天的customer_name数据 def get_db_customer_data(self): record_exist = 0 db_host = self.pro_db_host db_user = self.pro_db_user db_pwd = self.pro_db_pwd db_name = self.pro_db_name # start_year = int(time.strftime('%Y', time.localtime(time.time()))) - 2 # start_month = time.strftime('%m', time.localtime(time.time())) # start_day = time.strftime('%d', time.localtime(time.time())) # start_date = f"{start_year}-{start_month}-{start_day}" end_date = time.strftime ('%Y-%m-%d', time.localtime (time.time ())) try: connection = pymysql.connect (host=db_host, user=db_user, password=db_pwd, database=db_name, cursorclass=pymysql.cursors.DictCursor) MessageToSlack.post_message_to_slack ( "The database is connected to query data to get cai_niao_watermark_dailylogs customer info") with connection: with connection.cursor () as cursor: # check_sql = "SELECT EXISTS ( SELECT `customer_name` FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `created_at` BETWEEN %s AND %s)" check_sql = "SELECT EXISTS ( SELECT `customer_name` FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `download_date` = %s)" cursor.execute (check_sql, end_date) result = cursor.fetchone () if (list (result.values ()))[0] == 1: record_exist = 1 if record_exist == 1: # select_sql = "SELECT customer_id,customer_name FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `created_at` BETWEEN %s AND %s" select_sql = "SELECT customer_id,customer_name FROM `cai_niao_watermark_dailylogs` WHERE `stock_central_amt` > 0 AND `download_date` = %s" cursor.execute (select_sql, end_date) dbs_response = cursor.fetchall () all_data = list_dict_duplicate_removal ( dbs_response) self.need_customer_data = all_data for wn in all_data: self.we_need_data.append (wn['customer_name']) connection.commit () except Exception as ds: MessageToSlack.post_message_to_slack ( "def get_db_customer_data error:%s" % str (ds)) connection.close () # MessageToSlack.post_message_to_slack( # "today we need to update customer list is:%s" % str(self.we_need_data)) # 将菜鸟返回的数据中的approveNo赋予cai_niao_watermark_dailylogs表中符合条件的数据 def dic_logic_handle(self): need_customer_data = self.need_customer_data parameter_setting_data = self.parameter_setting_data for item_p in parameter_setting_data: companyName = str (item_p['customer_name']) for item_c in need_customer_data: customer_name = str (item_c['customer_name']) if companyName == customer_name: item_c['approveNo'] = item_p['approveNo'] else: continue self.need_customer_data = need_customer_data # 获取cai_niao_company_discountrate_dailylogs今日是否更新数据 def get_db_discountrate_data(self): record_exist = 0 db_host = self.pro_db_host db_user = self.pro_db_user db_pwd = self.pro_db_pwd db_name = self.pro_db_name parameter_setting_data = self.need_customer_data customer_id = [] for i in parameter_setting_data: insert_str = "'" + i["customer_id"] + "'" customer_id.append (insert_str) customer_id = ",".join (customer_id) # 将列表转为以逗号分隔的字符串 download_date = datetime.datetime.now ().strftime ('%Y-%m-%d') try: connection = pymysql.connect (host=db_host, user=db_user, password=db_pwd, database=db_name, cursorclass=pymysql.cursors.DictCursor) MessageToSlack.post_message_to_slack ( "The database is connected to query data cai_niao_company_discountrate_dailylogs update or not") with connection: with connection.cursor () as cursor: check_sql = f"SELECT EXISTS ( SELECT `customer_id` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = '{download_date}' AND `customer_id` IN ({customer_id}))" cursor.execute (check_sql) result = cursor.fetchone () if (list (result.values ()))[0] == 1: record_exist = 1 if record_exist == 1: select_sql = f"SELECT `customer_id` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = '{download_date}' AND `customer_id` IN ({customer_id})" cursor.execute (select_sql) dbs_response = cursor.fetchall () query_customer_data = list_dict_duplicate_removal ( dbs_response) exist_data = [] for qq in query_customer_data: exist_data.append (qq['customer_id']) for pp in parameter_setting_data: pp_customer_id = pp['customer_id'] if pp_customer_id not in exist_data: self.new_need_customer_data.append (pp) else: self.new_need_customer_data = parameter_setting_data connection.commit () except Exception as ds: MessageToSlack.post_message_to_slack ( "def get_db_discountrate_data:%s" % str (ds)) connection.close () # 处理符合条件的数据中的跳转参数 def goto_parameter_setting(self): parameter_setting_data = self.new_need_customer_data # MessageToSlack.post_message_to_slack( # "when the self.handle_force equal %s,the get_db_discountrate_data:%s" % (self.handle_force,str(parameter_setting_data))) if len (parameter_setting_data) == 0: MessageToSlack.post_message_to_slack ( 'Customer array without qualified jump parameters') for ii in parameter_setting_data: if 'approveNo' in ii: approveNo = ii['approveNo'] customer_id = ii['customer_id'] customer_name = ii['customer_name'] parameter_setting_url = 'https://merchant.finance.cainiao.com/funds/signApproveDetail/loanParameterSettings.htm?approveNo=' + approveNo try: self.driver.get (parameter_setting_url) except Exception as get_parameter_setting: MessageToSlack.post_message_to_slack ( str (get_parameter_setting)) MessageToSlack.post_message_to_slack ( 'get parameter setting page again') self.driver.get (parameter_setting_url) # print('跳转到参数设置的页面标题为:' + self.driver.title) time.sleep (random.randint (4, 6)) # 调取performance log分析函数 performance_log_parameter_setting = self.driver.get_log ( "performance") self.log_analysis_parameter_setting ( performance_log_parameter_setting, customer_name, customer_id) else: MessageToSlack.post_message_to_slack ( "%s can't scrap" % str (ii)) continue # 参数设置日志分析函数 def log_analysis_parameter_setting(self, performance_log_parameter_setting, customer_name, customer_id): for row in performance_log_parameter_setting: log_json_parameter_setting = json.loads (row['message']) log_parameter_setting = log_json_parameter_setting['message'] if str (log_parameter_setting['method']) != 'Network.responseReceived': continue url_parameter_setting = log_parameter_setting['params']['response']['url'] if str (url_parameter_setting) != 'https://merchant.finance.cainiao.com/funds/signApproveDetail/pledgeRateData.do': continue requestId_parameter_setting = log_parameter_setting['params']['requestId'] try: parameter_setting_page_data = self.driver.execute_cdp_cmd ('Network.getResponseBody', { 'requestId': requestId_parameter_setting})['body'] psd = json.loads (parameter_setting_page_data) if str (psd['success']) == 'false': continue total_parameter_setting = psd['data']['resultList'] if len (total_parameter_setting) > 0: resultList = psd['data']['resultList'] for item in resultList: item['customer_id'] = customer_id item['customer_name'] = customer_name self.discount_rate_data.append (item) else: MessageToSlack.post_message_to_slack ( 'No data for this company %s was found' % customer_name) except Exception as parameter_setting_e: MessageToSlack.post_message_to_slack ( 'Problem after press the search button! ' + str (parameter_setting_e)) # 多个数据库执行数据存储cai_niao_company_discountrate_dailylogs def save_to_db(self): # save to dbs self.db_save_discount_rate ( self.pro_db_host, self.pro_db_user, self.pro_db_pwd, self.pro_db_name) # self.db_save_discount_rate ( # self.u02_db_host, self.u02_db_user, self.u02_db_pwd, self.u02_db_name) # self.db_save_discount_rate ( # self.u01_db_host, self.u01_db_user, self.u01_db_pwd, self.u01_db_name) # 存储数据到cai_niao_company_discountrate_dailylogs def db_save_discount_rate(self, db_host, db_user, db_pwd, db_name): discount_rate_data = self.discount_rate_data # MessageToSlack.post_message_to_slack( # 'save to cai_niao_company_discountrate_dailylogs table data is:' + str(discount_rate_data)) if len (discount_rate_data) > 0: try: connection = pymysql.connect ( host=db_host, user=db_user, password=db_pwd, database=db_name, cursorclass=pymysql.cursors.DictCursor) MessageToSlack.post_message_to_slack ( "connect with db successfully") with connection: for data in discount_rate_data: download_date = datetime.datetime.now ().strftime ('%Y-%m-%d') created_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S') updated_at = datetime.datetime.now ().strftime ('%Y-%m-%d %H:%M:%S') customer_id = data['customer_id'] customer_name = data['customer_name'] all_assets_value = data['allAssetsValue'] assets_receivable_value = data['assetsReceivableValue'] central_good_value = data['centralGoodValue'] central_pledge_rate = int ( data['centralPledgeRate']) # int currency = data['currency'] discount_assets_receivable_value = data['discountAssetsReceivableValue'] goods_owner_name = data['goodsOwnerName'] goods_value = data['goodsValue'] head_trip_pledge_rate = int ( data['headTripPledgeRate']) # int head_trip_value = data['headTripValue'] pledge_goods_value = data['pledgeGoodsValue'] # int pledge_rate = int (data['pledgeRate']) receivable_pledge_rate = int ( data['receivablePledgeRate']) # int store_name = data['storeName'] store_t_b_id = data['storeTBId'] record_exist = 0 with connection.cursor () as cursor: check_sql = "SELECT EXISTS ( SELECT `download_date` FROM `cai_niao_company_discountrate_dailylogs` WHERE `download_date` = %s and `customer_id` = %s and `store_t_b_id` = %s)" cursor.execute ( check_sql, (download_date, customer_id, store_t_b_id)) result = cursor.fetchone () if (list (result.values ()))[0] == 1: record_exist = 1 if record_exist == 0: # MessageToSlack.post_message_to_slack( # f"Not found today {customer_name} store name is {store_name} record, insert that record") # 执行插入数据 # 新增语句 insert_sql = f"Insert INTO `cai_niao_company_discountrate_dailylogs` (`download_date`,`created_at`,`customer_id`,`customer_name`,`all_assets_value`,`assets_receivable_value`,`central_good_value`,`central_pledge_rate`,`currency`,`discount_assets_receivable_value`,`goods_owner_name`,`goods_value`,`head_trip_pledge_rate`,`head_trip_value`,`pledge_goods_value`,`pledge_rate`,`receivable_pledge_rate`,`store_name`,`store_t_b_id`) VALUES ('{download_date}', '{created_at}', '{customer_id}', '{customer_name}', '{all_assets_value}', '{assets_receivable_value}', '{central_good_value}', {central_pledge_rate}, '{currency}', '{discount_assets_receivable_value}', '{goods_owner_name}', '{goods_value}', {head_trip_pledge_rate}, '{head_trip_value}', '{pledge_goods_value}', {pledge_rate}, {receivable_pledge_rate}, '{store_name}', '{store_t_b_id}')" # MessageToSlack.post_message_to_slack("insert_sql: %s" % insert_sql) with connection.cursor () as cursor: cursor.execute (insert_sql) elif record_exist == 1: # MessageToSlack.post_message_to_slack( # f"Today {customer_name} record is existed, update that record") # 执行更新数据 # 更新语句 update_sql = f"UPDATE `cai_niao_company_discountrate_dailylogs` SET `updated_at` = '{updated_at}',`all_assets_value` = '{all_assets_value}',`assets_receivable_value` = '{assets_receivable_value}',`central_good_value` = '{central_good_value}',`central_pledge_rate` = {central_pledge_rate},`currency` = '{currency}',`discount_assets_receivable_value` = '{discount_assets_receivable_value}',`goods_owner_name` = '{goods_owner_name}',`goods_value` = '{goods_value}',`head_trip_pledge_rate` = {head_trip_pledge_rate},`head_trip_value` = '{head_trip_value}',`pledge_goods_value` = '{pledge_goods_value}',`pledge_rate` = {pledge_rate},`receivable_pledge_rate` = {receivable_pledge_rate},`store_name` = '{store_name}',`store_t_b_id` = '{store_t_b_id}' WHERE `download_date` = '{download_date}' and `customer_id` = '{customer_id}' and `store_t_b_id` = '{store_t_b_id}'" # MessageToSlack.post_message_to_slack("update_sql: %s" % update_sql) with connection.cursor () as cursor: cursor.execute (update_sql) connection.commit () MessageToSlack.post_message_to_slack ( f"save data to {db_host} {db_name} cai_niao_company_discountrate_dailylogs success") except Exception as ds: MessageToSlack.post_message_to_slack ( "db_save_discount_rate error:%s" % str (ds)) connection.close () else: MessageToSlack.post_message_to_slack ( 'No data need to store as no data grabbed') MessageToSlack.post_message_to_slack ("Bye") # 关闭所有的浏览器窗口 def teardown_method(self): # self.driver.quit() MessageToSlack.post_message_to_slack ( "close page title: %s" % self.driver.title) MessageToSlack.post_message_to_slack ("Browser Close") # 列表里的字典元素去重复 def list_dict_duplicate_removal(data_list): def run_function(x, y): return x if y in x else x + [y] return reduce (run_function, [[], ] + data_list) if __name__ == '__main__': force = 0 if len (sys.argv) > 1: force = sys.argv[1] # print(force) scrape_cainiao = Cainiao_Scrape ( 'https://merchant.finance.cainiao.com/funds/signApproveDetail/SignApproveDetailManage.htm', '数据库主机', '账户', '密码', '数据库名称', '数据库主机', '账户', '密码', '数据库名称', '数据库主机', '账户', '密码', '数据库名称', '网站登录账号', '网站登录密码', 0 ) # 浏览器配置函数 scrape_cainiao.setup_method () # 链接菜鸟后台网址,并scrape相关数据 scrape_cainiao.login_cainiao ()