from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time

# Selenium is pronounced [sɪˈliniəm]; the name comes from the chemical element selenium.


browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
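# Optional sketch (assumption: a recent Chrome build that accepts the
# "--headless=new" flag): the scrape can run without a visible browser window
# by constructing the driver with options instead, e.g.
#
#     options = webdriver.ChromeOptions()
#     options.add_argument('--headless=new')
#     browser = webdriver.Chrome(options=options)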
def get_total_page():
    """
    Count the chapters listed on the novel's table-of-contents page.
    :return: total number of chapters
    """
    url = 'https://www.xxbiquge.com/0_807/'
    browser.get(url)
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    # Every chapter link on the table of contents sits inside a <dd> element
    dd = soup.find_all('dd')
    # browser.close()
    pages = len(dd)
    return pages
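# A minimal alternative sketch (assumption: the chapter list is still rendered
# as <dd> elements): the same count can be taken directly from Selenium,
# without BeautifulSoup, e.g.
#
#     pages = len(browser.find_elements(By.TAG_NAME, 'dd'))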
def index_page(i):
    """
    Load one chapter of the novel in the browser.
    :param i: chapter number
    """
    if i == 1:
        # URL of the novel's first chapter
        url = "https://www.xxbiquge.com/0_807/4055527.html"
        browser.get(url)
    # Wait until the #content node has been rendered
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content')))
    # Parse the current page with get_info()
    get_info()
    # Locate the "next chapter" link: the 3rd <a> under div.bottem2
    # (absolute XPath: //*[@id="wrapper"]/div[4]/div/div[6]/a[3])
    next_p = browser.find_elements(By.XPATH, '//div[@class="bottem2"]/a')[2]
    # Pause for one second before clicking
    time.sleep(1)
    # Click through to the next chapter
    next_p.click()
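# Hedged variant (assumption: the "next chapter" link is always the third <a>
# inside div.bottem2): waiting for the link to become clickable is more robust
# than a fixed sleep, e.g.
#
#     next_p = wait.until(EC.element_to_be_clickable(
#         (By.XPATH, '(//div[@class="bottem2"]/a)[3]')))
#     next_p.click()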
def main():
    """
    Walk through every chapter of the novel.
    """
    total_page = get_total_page()
    print(total_page)
    try:
        for i in range(1, total_page + 1):
            index_page(i)
    finally:
        # Release the browser even if a chapter fails to load
        browser.quit()
def get_info():
    """
    Extract the current chapter's title and body text.
    CSS path to the title: #wrapper > div.content_read > div > div.bookname > h1
    """
    # Chapter title
    name = browser.find_element(By.CSS_SELECTOR, '#wrapper > div.content_read > div > div.bookname > h1').text
    print(name)
    # Chapter body text
    content = browser.find_element(By.ID, 'content').text
    print(content)
    # Append the chapter title and body to a txt file
    with open('雪中悍刀行.txt', 'a', encoding="utf-8") as f:
        # Join the title and body into a single newline-separated string
        f.write('\n'.join([name, content]))
        # Leave a blank line between chapters
        f.write('\n\n')
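# Design note: reopening the file in append mode for every chapter is simple
# and crash-safe (each chapter is flushed when the `with` block exits), at the
# cost of one open/close per chapter. A sketch of the alternative, where
# main() owns a single handle (the extra parameter is hypothetical):
#
#     with open('雪中悍刀行.txt', 'a', encoding='utf-8') as f:
#         for i in range(1, total_page + 1):
#             index_page(i, f)  # hypothetical: pass the handle down to get_info()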
if __name__ == '__main__':
    main()