selenium-爬取小说 (Scraping a web novel with Selenium)
"""Scrape a web novel chapter-by-chapter with Selenium.

Reads the chapter count from the novel's index page at
https://www.xxbiquge.com/0_807/, then starts at the first chapter and
follows the "next chapter" link, appending each chapter's title and body
text to 雪中悍刀行.txt.
"""
import requests  # unused at runtime; kept to preserve the file's imports
from bs4 import BeautifulSoup
import sys  # unused at runtime; kept to preserve the file's imports
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time

# selenium [sɪˈliniəm]: the chemical element (Se) the library is named after.


# Module-level browser session shared by every function below.
browser = webdriver.Chrome()
# Explicit wait with a 10-second timeout for dynamic page content.
wait = WebDriverWait(browser, 10)


def get_total_page():
    """Return the number of chapters listed on the novel's index page.

    Each chapter link on the index page sits in its own <dd> element,
    so the chapter count is simply the number of <dd> tags.
    """
    url = 'https://www.xxbiquge.com/0_807/'
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    return len(soup.find_all('dd'))


def index_page(i):
    """Load chapter *i*, save its text, and click through to the next one.

    :param i: 1-based chapter number. Only the first chapter is navigated
        to directly; subsequent chapters are reached by the "next chapter"
        click performed at the end of the previous call.
    """
    if i == 1:
        # URL of the novel's first chapter.
        browser.get("https://www.xxbiquge.com/0_807/4055527.html")
    # Wait until the chapter body (#content) has been loaded.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content')))
    # Parse and persist this chapter's title and text.
    get_info()
    # The third <a> inside div.bottem2 is the "next chapter" link.
    # (Full path: //*[@id="wrapper"]/div[4]/div/div[6]/a[3])
    next_p = browser.find_elements(By.XPATH, '//div[@class="bottem2"]/a')[2]
    # Brief 1-second pause so we don't hammer the server.
    time.sleep(1)
    next_p.click()


def get_info():
    """Extract the current chapter's title and body and append them to the
    output file 雪中悍刀行.txt.

    Title selector: #wrapper > div.content_read > div > div.bookname > h1
    """
    # Selenium 4 removed the find_element_by_* helpers; use the
    # find_element(By..., ...) form already used elsewhere in this file.
    name = browser.find_element(
        By.CSS_SELECTOR,
        '#wrapper > div.content_read > div > div.bookname > h1').text
    print(name)
    # The chapter body lives in the element with id="content".
    content = browser.find_element(By.ID, 'content').text
    print(content)
    # Append "<title>\n<body>\n\n" to the output file.
    with open('雪中悍刀行.txt', 'a', encoding="utf-8") as f:
        f.write('\n'.join([name, content]))
        f.write('\n\n')


def main():
    """Walk every chapter of the novel, saving each one to disk."""
    total_page = get_total_page()
    print(total_page)
    try:
        for i in range(1, total_page + 1):
            index_page(i)
    finally:
        # Always release the ChromeDriver session, even on error.
        browser.quit()


if __name__ == '__main__':
    main()