# -*- coding: utf-8 -*-
"""Browser-driven scraper for a paginated listing page.

``INTERFACING`` wraps an undetected-chromedriver session and exposes small
navigation helpers; ``IndiaRegistrationRequester`` plugs it into the
project's ``Requester`` interface.
"""
import json
import os
import sys
import random
import pandas as pd
import time
import urllib3
from pyvirtualdisplay import Display

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

ABS_PATH = os.path.abspath(os.path.join(os.path.abspath(__file__), ".."))
sys.path.append(ABS_PATH)
TEST_PATH = os.path.join(ABS_PATH, 'test')
chrome_driver_path = os.getenv('executable_path')

from core.library.utils import LogManager
from core.library.common.storage_manager.blob import AzureBlob
from core.library.modules.requester import Requester
from core.library.utils import utils

CONTAINER_NAME = "regulation"

# NOTE(review): the header below looks like a second module pasted into this
# file. Its statements are kept in their original order because the project
# imports further down rely on the sys.path entries added here, and ABS_PATH
# is deliberately left pointing at the value assigned last.
import sys
import os
import re
import random
import time
import undetected_chromedriver as uc
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

ABS_PATH = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", ".."))
DATASET_PATH = os.path.abspath(os.path.join(os.path.abspath(__file__), ".."))
sys.path.append(ABS_PATH)
sys.path.append(DATASET_PATH)
chrome_driver_path = os.getenv('executable_path')

from core.library.utils import LogManager, utils
from helper_class import *


class INTERFACING():
    """Thin wrapper around an undetected-chromedriver browser session.

    The browser is started lazily by ``get_selenium_response`` (or explicitly
    through ``get_driver``) and must be released with ``close_driver``.
    """

    def __init__(self):
        self.driver_initialized = False
        # Placeholder until get_driver() installs a real webdriver instance.
        self.driver = ''
        self.MAX_TRIALS = 2
        self.chrome_version = utils.get_google_chrome_version()

    def make_soup(self):
        """Return the current page source parsed with the ``lxml`` parser."""
        return BeautifulSoup(self.driver.page_source, 'lxml')  # etree.HTML()

    def current_url(self):
        """Return the URL the browser is currently on."""
        return self.driver.current_url

    def get_driver(self):
        """Start a Chrome session and store it on ``self.driver``.

        The driver binary is taken from the ``executable_path`` environment
        variable (read at import time into ``chrome_driver_path``).
        """
        uc.TARGET_VERSION = self.chrome_version
        chrome_options = uc.ChromeOptions()
        # chrome_options.add_argument("--headless")
        # The original value was "1920.,1080" (stray dot), which is not a
        # valid "width,height" pair.
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-popup-blocking")
        chrome_options.add_argument("--profile-directory=Default")
        chrome_options.add_argument("--ignore-certificate-errors")
        chrome_options.add_argument("--disable-plugins-discovery")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--no-first-run")
        chrome_options.add_argument("--no-service-autorun")
        chrome_options.add_argument("--no-default-browser-check")
        chrome_options.add_argument("--password-store=basic")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument('--disable-application-cache')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument("--disable-setuid-sandbox")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        )
        self.driver = uc.Chrome(
            options=chrome_options,
            version_main=self.chrome_version,
            driver_executable_path=chrome_driver_path,
        )
        # Give the freshly launched browser time to settle before first use.
        time.sleep(10)
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; safe to call repeatedly."""
        if not self.driver_initialized:
            return
        try:
            self.driver.quit()
        finally:
            # Force a fresh session on the next get_selenium_response() call
            # instead of reusing a dead driver.
            self.driver_initialized = False

    def get_selenium_response(self, url, max_pages=107):
        """Load ``url`` and collect one soup per result page.

        The page-size dropdown is switched to the option with value "100",
        then the "next" link is clicked repeatedly.

        Args:
            url: Address of the listing page.
            max_pages: Upper bound on the number of pages collected,
                counting the first one. Defaults to the previously
                hard-coded limit of 107.

        Returns:
            A list of ``BeautifulSoup`` objects in page order. Paging stops
            early when the "next" link cannot be found or clicked.
        """
        soup_list = []
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)

        # Locate the page-size dropdown ...
        select_ele = Select(
            self.driver.find_element(By.XPATH, "//*[@id='example_length']/label/select")
        )
        # ... and switch it to 100 entries per page.
        select_ele.select_by_value("100")
        time.sleep(5)
        soup_list.append(self.make_soup())

        pages_seen = 1
        while pages_seen < max_pages:
            try:
                next_link = self.driver.find_element(By.XPATH, '//*[@id="example_next"]/a')
                next_link.click()
            except WebDriverException:
                # No usable "next" link: treat it as the end of the listing.
                print('已到达最后一页,共爬取{}页'.format(pages_seen))
                break
            time.sleep(3)
            pages_seen += 1
            soup_list.append(self.make_soup())
        return soup_list

    def get_page_source(self):
        """Return the raw HTML of the current page."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element at ``xpath`` and pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element at ``xpath``, type ``value``, pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element at ``xpath``."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the current window (the session itself stays alive)."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the window that currently has focus."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def switch_to_window(self, handle):
        """Focus the window identified by ``handle``."""
        self.driver.switch_to.window(handle)

    def swtich_to_window(self, handle):
        """Misspelled original name, kept so existing callers keep working."""
        self.switch_to_window(handle)


class IndiaRegistrationRequester(Requester):
    """Requester that scrapes the listing page through ``INTERFACING``."""

    def request(self):
        """Run the scrape for ``self.url``.

        NOTE(review): ``self.url`` is not set in this file; presumably it is
        provided by ``Requester`` - confirm against the base class.
        """
        # with Display(visible=False, size=(1920, 1080)) as display:
        self.news_request(self.url)

    def news_request(self, url):
        """Fetch every page of the news-update listing at ``url``.

        The collected soups are printed and ``self.content`` is reset to an
        empty DataFrame; nothing is returned.

        Args:
            url: Address of the news-update listing page.
        """
        soup_list = self.get_response(url)
        # NOTE(review): debug output; this dumps every collected page.
        print(soup_list)
        self.content = pd.DataFrame()

    def get_response(self, link):
        """Scrape ``link`` in a fresh browser and return the list of soups.

        The browser is always shut down afterwards so no Chrome process is
        left behind, even when scraping fails.
        """
        browser = INTERFACING()
        try:
            return browser.get_selenium_response(link)
        finally:
            browser.close_driver()