增补博客 第十三篇 python大作业小说阅读器(2)爬取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import re
from lxml import etree
import random
import requests
from 可视化查询书籍.查询书籍信息 import searchbook
 
# Proxy IP pool: candidate proxy addresses that get_random_proxy() picks from.
# NOTE(review): the single entry carries no port number — confirm the proxy
# really listens on the scheme's default port.
proxy_list = ["192.168.72.241"]
 
def get_random_proxy():
    """Return one entry of the module-level ``proxy_list``, chosen at random."""
    chosen_proxy = random.choice(proxy_list)
    return chosen_proxy
 
def search(user_leixing, user_title, user_author):
    """Download every chapter of the matching novel into separate text files.

    The book is looked up with ``searchbook``. Element 0 of its result names
    the target directory (under the ``小说`` folder one level above the
    working directory) and element 2 is appended to the site root to form the
    index-page URL. If the directory already exists nothing is downloaded;
    otherwise the index page is fetched, the chapter links are extracted, and
    each chapter is written to ``<directory>/<chapter title>.txt`` as UTF-8.

    Args:
        user_leixing: Passed through to ``searchbook`` (presumably the
            genre/category of the book — confirm against ``searchbook``).
        user_title: Passed through to ``searchbook``.
        user_author: Passed through to ``searchbook``.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.HTTPError: If the index page or a chapter page answers with
            an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # One lookup is enough: the original queried searchbook twice with the
    # same arguments and used the two results interchangeably.
    book = searchbook(user_leixing, user_title, user_author)

    directory = "..\\小说\\" + book[0]

    # Guard clause: an existing directory means the book was already fetched.
    if os.path.exists(directory):
        print("目录已存在")
        return

    print("目录不存在")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # NOTE(review): requests selects a proxy by URL scheme, and every URL
    # requested below is https, so this "http"-only mapping is not applied to
    # them. Left unchanged to preserve current behavior — confirm the intent.
    proxy = {"http": "http://" + get_random_proxy()}

    # Without a timeout a stalled connection would block the download forever.
    timeout_seconds = 10

    index_url = 'https://www.bqg88.cc/' + book[2]
    response = requests.get(
        index_url, headers=headers, proxies=proxy, timeout=timeout_seconds
    )
    response.raise_for_status()

    # Raw string: "\s" in a normal string literal is an invalid escape.
    info_list = re.findall(
        r'<dd><a href\s*=\s*"(.*?)">(.*?)</a></dd>', response.text
    )

    # Create the directory only after the index page was fetched successfully.
    os.makedirs(directory, exist_ok=True)

    # Characters that are illegal in Windows file names or would act as path
    # separators; chapter titles come straight from the web page.
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')

    for href, title in info_list:
        url = 'https://www.bqg88.cc/' + href
        response = requests.get(
            url, headers=headers, proxies=proxy, timeout=timeout_seconds
        )
        response.raise_for_status()

        print("正在下载" + title)

        html_data = etree.HTML(response.text)
        text_list = html_data.xpath('//div[@id="chaptercontent"]/text()')
        text = ''.join(text_list)

        # Chapter heading followed by the body with the site's footer removed.
        book_text = '\n\n' + title + '\n\n'
        book_text += text.replace('请收藏本站:https://www.bqg88.cc。笔趣阁手机版:https://m.bqg88.cc', '')

        # Save each chapter to a separate text file, using a sanitized title
        # so that open() cannot fail on (or escape through) special characters.
        safe_title = unsafe_chars.sub('_', title).strip()
        chapter_filename = os.path.join(directory, safe_title + '.txt')
        with open(chapter_filename, "w", encoding='utf-8') as file:
            file.write(book_text)

    print("下载完成!")

  

posted @   财神给你送元宝  阅读(89)  评论(0)  编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示