Scraping Chapters from a Certain Web-Novel Site
1. Approach
Analyze the page structure (browser developer tools, F12) >>> determine the CSS selectors and extract the content >>> download and save.
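For example, once the chapter list has been located with F12, pulling it out takes only a few lines with BeautifulSoup's CSS selectors. A minimal sketch (the `.chapter a` selector and the URL match the target site used in the full script below; other sites will need different selectors):

```python
import requests
from bs4 import BeautifulSoup

# Fetch the chapter index page and list every chapter link found by the selector.
html = requests.get('https://m.00ksw.com/html/62/62119/').text
soup = BeautifulSoup(html, 'lxml')
for link in soup.select('.chapter a'):  # selector determined via F12
    print(link.text, link.get('href'))
```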
2. Code Implementation
```python
# -*- coding:UTF-8 -*-
import os
import time
from concurrent.futures import ThreadPoolExecutor

import requests
import urllib3
from bs4 import BeautifulSoup

# Suppress the warnings triggered by verify=False requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

file_path_base = 'G:/novel/lingdian'  # download directory
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/77.0.3865.35 Safari/537.36 ",
           'Referer': 'http://www.00kxs.com'}
chapter_content_array = []  # [chapter_index, chapter_content] pairs filled by worker threads


# Get the novel name from the overview page
def get_novel_name(novel_total_url):
    response = requests.get(url=novel_total_url, headers=headers, verify=False)
    response.encoding = 'gbk'  # the site serves GBK, not UTF-8
    bf = BeautifulSoup(response.text, 'lxml')
    title_content = bf.select('title')[0].text
    title_content = title_content.split('_')[0]
    # Strip the boilerplate starting at '完整' ("complete edition") from the title
    if '完整' in title_content:
        title_content = title_content[:title_content.index('完整')]
    # Append a date suffix
    date_suffix = time.strftime('%Y%m%d', time.localtime(time.time()))
    return title_content + '-' + date_suffix


# Get the chapter names and URLs from the overview page
def get_name2urls(novel_total_url):
    response = requests.get(url=novel_total_url, headers=headers, verify=False)
    response.encoding = 'gbk'
    bf = BeautifulSoup(response.text, 'lxml')
    name2urls = []
    url_prefix = novel_total_url[:novel_total_url.index('/html')]
    chapter_array = bf.select('.gochapter a')
    if len(chapter_array) > 0:
        # The overview page links to a separate "all chapters" page
        all_url = novel_total_url + chapter_array[0].get('href')
        response = requests.get(url=all_url, headers=headers, verify=False)
        response.encoding = 'gbk'
        bf = BeautifulSoup(response.text, 'lxml')
        temp_index = 1
        for tag_content in bf.select('.list_xm a'):
            # Drop '?' and the '(求收藏推荐)' ("please bookmark/recommend") suffix from chapter names
            name2urls.append([tag_content.text.replace('?', '').replace('(求收藏推荐)', ''),
                              url_prefix + tag_content.get('href'), temp_index])
            temp_index += 1
    else:
        # The list starts with "latest chapters"; skip ahead to the first real chapter
        start_flag = False
        temp_index = 1
        for tag_content in bf.select('.chapter a'):
            if not start_flag and ('第一章' in str(tag_content) or '第1章' in str(tag_content)):
                start_flag = True  # found "Chapter 1"; collect from here on
            if start_flag:
                name2urls.append([tag_content.text.replace('?', '').replace('(求收藏推荐)', ''),
                                  url_prefix + tag_content.get('href'), temp_index])
                temp_index += 1
    return name2urls


# Fetch one chapter's content
def get_chapter_content(chapter_name2url):
    chapter_name = chapter_name2url[0]
    chapter_url = chapter_name2url[1]
    chapter_index = chapter_name2url[2]
    chapter_content = '【' + chapter_name + '】' + '\n'
    try:
        response = requests.get(url=chapter_url, headers=headers, verify=False)
        response.encoding = 'gbk'
        bf = BeautifulSoup(response.text, 'lxml')
        print('Extracting chapter [' + str(chapter_index) + '] [' + chapter_name + '] >>>')
        # The body lives under '#novelcontent p' on some pages and '#content p' on others
        paragraphs = bf.select('#novelcontent p') or bf.select('#content p')
        for tag_content in paragraphs:
            if '------' in tag_content.text:  # skip separator/ad lines
                continue
            chapter_content += tag_content.text + '\n'
        chapter_content_array.append([chapter_index, chapter_content])
    except Exception as error:
        print('[' + chapter_name + '] fetch failed -> ' + str(error))
    return [chapter_index, chapter_content]


# Write (append) content to the local file
def download(category_name, file_name, chapter_name, chapter_content):
    if not os.path.exists(file_path_base):
        os.makedirs(file_path_base)
    dir_name = file_path_base + '/' + category_name
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    try:
        path = dir_name + '/' + file_name + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(chapter_content)
    except Exception as error:
        print('[' + chapter_name + '] write failed -> ' + str(error))


def start_crawl(novel_total_url):
    total_start = time.perf_counter()
    # 1. Get the novel name from the overview page
    novel_name = get_novel_name(novel_total_url)
    # 2. Get the chapter names and URLs from the overview page
    name2urls = get_name2urls(novel_total_url)
    # 3. Fetch chapter contents with a thread pool
    thread_pool = ThreadPoolExecutor(max_workers=10)
    for name2url in name2urls:
        thread_pool.submit(get_chapter_content, name2url)
    thread_pool.shutdown()  # blocks until all submitted jobs finish
    # 4. Sort the results collected by the threads, join them, and write once
    chapter_content_array.sort(key=lambda item: item[0])
    chapter_content_total = ''
    for chapter_content in chapter_content_array:
        chapter_content_total += chapter_content[1]
    total_expend = time.perf_counter() - total_start
    print('All chapters extracted, total time: %.1fs' % total_expend)
    write_start = time.perf_counter()
    download('', novel_name, novel_name, chapter_content_total)
    write_expend = time.perf_counter() - write_start
    print('Write complete, total time: %.1fs' % write_expend)


if __name__ == '__main__':
    novel_home_url = 'https://m.00ksw.com/html/62/62119/'  # 斗罗
    # novel_home_url = 'http://www.00kxs.com/html/3/3235/'  # 透视
    # novel_home_url = 'http://www.00kxs.com/html/56/56824/'  # 斗破
    start_crawl(novel_home_url)
```
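One design note on the threading: the workers above hand results back through the shared `chapter_content_array` list, which is safe in CPython only because `list.append` is atomic under the GIL. A cleaner variant (a sketch reusing `get_chapter_content` from above, not the article's original code) collects each chapter from the `Future` objects that `submit()` returns, so no shared state is needed:

```python
from concurrent.futures import ThreadPoolExecutor

def crawl_sorted(name2urls):
    # Submit one job per chapter and keep the Future handles.
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = [pool.submit(get_chapter_content, n2u) for n2u in name2urls]
    # Each future yields [chapter_index, chapter_content]; sort by index and join.
    results = sorted(future.result() for future in futures)
    return ''.join(content for _, content in results)
```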
3. Results
4. That's a wrap~