import os
import ssl
import sys
import time

import pymysql
import undetected_chromedriver as uc
from selenium import webdriver

# Add the directory two levels above this file's directory to sys.path so
# that ``spider_setting`` can be imported (presumably the project root --
# confirm against the repository layout).
path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(path)
from spider_setting import MYSQL_HOST, MYSQL_POST, MYSQL_PASSWORD, MYSQL_USER


class Papunika(object):
    """Save pages of papunika.com listed in the ``papunika_url`` table as HTML.

    Constructing an instance opens the MySQL connection and immediately runs
    :meth:`main`, which drives a Chrome browser through every eligible URL
    and writes the cleaned page source to disk.
    """

    # Only URLs containing this prefix (and ending in "/") are crawled.
    BASE_URL = "https://papunika.com/"

    # Root folder of the saved pages; ``html_en`` / ``html_cn`` live below it.
    HTML_ROOT = "E:/07-shunwangwork/33-游戏运营/papunika/html"

    # JavaScript executed on each page before it is saved: removes the nodes
    # matched by the selector list (cookie box, menu items, code blocks, ...).
    _CLEANUP_SCRIPT = (
        "document.querySelectorAll('.nk-gap-2, .code-block,#BorlabsCookieBoxWrap,"
        "#BorlabsCookieBox, #menu-item-9627, #menu-item-10519, "
        "#borlabs-cookie-js-after').forEach(node=>node.remove())"
    )

    def __init__(self):
        # The connection and cursor stay open for the lifetime of the instance.
        self.db = pymysql.connect(
            host=MYSQL_HOST,
            port=MYSQL_POST,
            database="cloud_joy_monitoring",
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            charset='utf8',
            autocommit=True,
        )
        self.cursor = self.db.cursor()
        self.main()

    def _build_options(self):
        """Return the ChromeOptions used for the crawl."""
        # Browser options
        chrome_options = webdriver.ChromeOptions()
        # Headless (no UI) mode, currently disabled:
        # chrome_options.add_argument('--headless')
        # Works around the "DevToolsActivePort file doesn't exist" error
        chrome_options.add_argument('--no-sandbox')
        # Google's documentation mentions adding this flag to avoid a bug
        chrome_options.add_argument('--disable-gpu')
        # Set the browser language to zh-CN
        chrome_options.add_argument('--lang=zh-CN')
        # chrome_options.add_argument('disable-cache')
        chrome_options.add_argument('--disable-javascript')
        chrome_options.add_argument('--disable-java')
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,
                'javascript': 2,  # 2 means "disabled"
            }
        }
        chrome_options.add_experimental_option('prefs', prefs)
        # Hide scrollbars, to cope with some special pages
        chrome_options.add_argument('--hide-scrollbars')
        chrome_options.add_argument("--proxy-server=192.168.104.134:7890")
        # chrome_options.add_argument('--user-data-dir=C:/Users/cf.yu/AppData/Local/Google/Chrome/User Data')
        # chrome_options.add_argument('--profile-directory=Default')
        # Do not load images
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        # Start with a maximized window
        chrome_options.add_argument('--start-maximized')
        return chrome_options

    def _url_to_key(self, url):
        """Turn a page URL into the relative file key used for saving.

        The base URL is removed and a single trailing "/" is dropped if one
        is present; an empty result (the site root) maps to ``"index"``.
        """
        key = url.replace(self.BASE_URL, "")
        # Strip the slash only when it is actually there: the URL reported
        # by the browser after navigation is not guaranteed to end in "/".
        if key.endswith("/"):
            key = key[:-1]
        return key or "index"

    def main(self):
        """Crawl every eligible URL from ``papunika_url`` and save its HTML.

        A row is processed when its URL contains ``BASE_URL``, ends in "/",
        and no English HTML file exists for it yet. Failures while writing
        a file are printed and the crawl continues with the next row. The
        browser is always shut down when the method exits.
        """
        chrome_options = self._build_options()

        # NOTE(review): this disables HTTPS certificate verification for the
        # whole process. Together with TARGET_VERSION it looks like setup for
        # the commented-out ``uc.Chrome`` driver below -- confirm whether it
        # is still needed.
        ssl._create_default_https_context = ssl._create_unverified_context
        uc.TARGET_VERSION = 101
        # driver = uc.Chrome(options=chrome_options)
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(self.BASE_URL)
            self.cursor.execute('select id, url from papunika_url order by id')
            for data in self.cursor.fetchall():
                url = data[1]
                # Skip NULL/empty URLs and anything that is not a site page.
                if not url or self.BASE_URL not in url or not url.endswith("/"):
                    continue
                print(data)

                key = self._url_to_key(url)
                if os.path.exists(self.save_path(key, False)):
                    continue  # already saved

                driver.get(url)
                time.sleep(1)
                # Work in the most recently opened window.
                handles = driver.window_handles
                driver.switch_to.window(handles[-1])
                time.sleep(1)
                # driver.execute_script("var leafArr = $('.leaflet-tooltip'); leafArr.each(function(){$(this).attr('name',$(this).text())})")
                driver.execute_script(self._CLEANUP_SCRIPT)

                # Re-derive the key from the URL the browser actually ended
                # up on, which may differ from the requested one.
                url = driver.current_url
                key = self._url_to_key(url)
                print("js执行完成:{}".format(url))
                time.sleep(1)

                page = driver.page_source
                save_path = self.save_path(key, False)
                print(url, save_path, handles)
                try:
                    self.save_file(save_path, page, os.path.dirname(save_path))
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print("错误:{}".format(e))
                    continue
        finally:
            # quit() ends the whole browser session and the driver process;
            # close() would only close the current window.
            driver.quit()

    def save_path(self, number, status):
        """Return the output file path for page key ``number``.

        :param number: page key (relative path without extension).
        :param status: truthy selects the ``html_cn`` folder, falsy ``html_en``.
        """
        folder = "html_cn" if status else "html_en"
        return "{}/{}/{}.html".format(self.HTML_ROOT, folder, number)

    def save_file(self, file_name, page, path):
        """Write ``page`` to ``file_name`` preceded by an HTML doctype line.

        :param file_name: full path of the file to write (UTF-8).
        :param page: page source to store.
        :param path: directory of ``file_name``; created if missing.
        """
        os.makedirs(path, exist_ok=True)
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write("<!DOCTYPE html>\n")
            f.write(page)


if __name__ == "__main__":
    Papunika()

# Inert string literal holding an earlier, never-executed attempt at removing
# the cookie box from the page source.
'''
html = etree.HTML(page)
for i in range(5):
    content = html.xpath('//*[@id="BorlabsCookieBox"]')
    if content:
        data = etree.tostring(content[0], encoding="utf-8").decode("utf-8")
        for i in re.findall(r'(<.*?>)', data):
            page = page.replace(i, "")
        print(data)
    text = html.xpath('//*[@id="BorlabsCookieBox"]//text')
    if text:
        for i in text:
            page = page.replace(i, "")
        print(data)
'''