# 增补博客 第十三篇 python大作业小说阅读器(2)爬取
import os
import random
import re

import requests
from lxml import etree

from 可视化查询书籍.查询书籍信息 import searchbook

# Proxy IP pool.
# NOTE(review): this entry has no port ("host:port" expected by requests),
# so routing through it will fail to connect — confirm the intended value.
proxy_list = ["192.168.72.241"]


def get_random_proxy():
    """Return a randomly chosen proxy address from the pool."""
    return random.choice(proxy_list)


def search(user_leixing, user_title, user_author):
    """Download every chapter of the book matched by ``searchbook``.

    Looks the book up via ``searchbook(user_leixing, user_title, user_author)``,
    then — if the per-book output directory does not already exist — fetches the
    chapter index from bqg88.cc through a random proxy and saves each chapter
    as an individual UTF-8 ``.txt`` file inside that directory.

    Args:
        user_leixing: Book category/type passed through to ``searchbook``.
        user_title:   Book title passed through to ``searchbook``.
        user_author:  Book author passed through to ``searchbook``.

    Raises:
        requests.HTTPError: If any index or chapter request returns a bad status.
    """
    # Single lookup (the original called searchbook twice with the same args).
    # NOTE(review): assumed layout book[0]=name, book[2]=index path — matches
    # how the original used book_info[0] and book[2]; confirm with searchbook.
    book = searchbook(user_leixing, user_title, user_author)

    directory = "..\\小说\\" + book[0]
    if not os.path.exists(directory):
        # Directory is missing: report it, then download everything into it.
        print("目录不存在")

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        index_url = 'https://www.bqg88.cc/' + book[2]

        # Route all requests through one randomly picked proxy.
        proxy = {"http": "http://" + get_random_proxy()}
        response = requests.get(index_url, headers=headers, proxies=proxy)
        response.raise_for_status()

        # Raw string so \s is a regex escape, not a (deprecated) string escape.
        # Captures (chapter href, chapter title) pairs from the index page.
        info_list = re.findall(r'<dd><a href\s*=\s*"(.*?)">(.*?)</a></dd>', response.text)

        # We already know the directory does not exist; exist_ok guards a race.
        os.makedirs(directory, exist_ok=True)

        for href, title in info_list:
            url = 'https://www.bqg88.cc/' + href
            response = requests.get(url, headers=headers, proxies=proxy)
            response.raise_for_status()
            print("正在下载" + title)

            # Extract the chapter body text from the page.
            html_data = etree.HTML(response.text)
            text_list = html_data.xpath('//div[@id="chaptercontent"]/text()')
            text = ''.join(text_list)

            # Prepend the chapter title and strip the site's self-promotion line.
            book_text = '\n\n' + title + '\n\n'
            book_text += text.replace('请收藏本站:https://www.bqg88.cc。笔趣阁手机版:https://m.bqg88.cc', '')

            # Save each chapter to its own text file.
            chapter_filename = os.path.join(directory, title + '.txt')
            with open(chapter_filename, "w", encoding='utf-8') as file:
                file.write(book_text)

        print("下载完成!")
    else:
        print("目录已存在")