Python爬取小说
运行结果:
代码:
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver


class NovelSpider:
    """Scrape the chapter index at ``start_url`` and append every chapter to one text file.

    Workflow: ``get_novel`` reads the index page and collects the chapter
    links, ``download_novel`` appends one chapter to the output file, and
    ``parse_novel`` renders a chapter page in PhantomJS and extracts its text.

    The class-level constants below hold the values that were previously
    hard-coded inside the methods; override them on a subclass or instance
    to run the spider in a different environment.
    """

    # Site root that the (relative) chapter links are appended to.
    BASE_URL = 'https://www.biqukan.com'
    # Number of leading <a> tags in the chapter list that are dropped.
    # NOTE(review): presumably these are "latest chapter" shortcut links that
    # repeat entries further down the list -- confirm against the page layout.
    SKIPPED_LINKS = 12
    # Seconds to wait for the index page before giving up instead of hanging.
    REQUEST_TIMEOUT = 30
    # Location of the PhantomJS binary used to render chapter pages.
    PHANTOMJS_PATH = r'F:\Spider\novelSpider\phantomjs.exe'
    # Directory and file name of the single output file all chapters go into.
    SAVE_DIR = r"F:\Spider\novelSpider"
    NOVEL_FILENAME = '凡人修仙传仙界篇.txt'

    def __init__(self):
        self.start_url = 'https://www.biqukan.com/1_1680/'

    def get_novel(self) -> None:
        """Fetch the chapter index and download every distinct chapter in page order.

        Raises:
            requests.RequestException: the index page could not be fetched
                (network error, timeout, or HTTP error status).
            ValueError: the index page has no element with class ``listmain``.
        """
        response = requests.get(self.start_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        div_chapter = soup.find(class_="listmain")
        if div_chapter is None:
            raise ValueError(
                f'no element with class "listmain" found at {self.start_url}'
            )
        links = div_chapter.find_all('a')[self.SKIPPED_LINKS:]

        # Deduplicate up front so the progress total matches the number of
        # chapters that will really be downloaded.
        chapters = []
        seen = set()
        for link in links:
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            chapter = {'name': link.get_text(), 'value': self.BASE_URL + href}
            key = (chapter['name'], chapter['value'])
            if key in seen:
                continue
            seen.add(key)
            chapters.append(chapter)

        chapter_num = len(chapters)
        print('《凡人修仙传仙界篇》开始下载:')
        for count, chapter in enumerate(chapters, start=1):
            self.download_novel(chapter)
            # Report only after the chapter has actually been written.
            print(f"已下载:{count}/{chapter_num}")

    def parse_novel(self, url: str) -> str:
        """Render the chapter page at *url* in PhantomJS and return its text.

        Args:
            url: absolute URL of one chapter page.

        Returns:
            The text content of the element with class ``showtxt``.

        Raises:
            ValueError: the rendered page has no element with class ``showtxt``.
        """
        # NOTE(review): the PhantomJS driver is deprecated in Selenium 3.x and
        # no longer shipped with Selenium 4 -- confirm the installed version.
        browser = webdriver.PhantomJS(executable_path=self.PHANTOMJS_PATH)
        try:
            browser.get(url)
            page_source = browser.page_source
        finally:
            # One browser process is started per chapter; always shut it down
            # so a long download does not pile up orphaned PhantomJS processes.
            browser.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        find_txt = soup.find(class_='showtxt')
        if find_txt is None:
            raise ValueError(f'no element with class "showtxt" found at {url}')
        return find_txt.get_text()

    def download_novel(self, data: dict) -> None:
        """Append one chapter (title, body, separator) to the output file.

        Args:
            data: mapping with the chapter title under ``'name'`` and the
                chapter URL under ``'value'``.
        """
        filename = data['name']
        txt = self.parse_novel(data['value'])

        # Creates missing parent directories too and is a no-op when the
        # directory already exists.
        os.makedirs(self.SAVE_DIR, exist_ok=True)
        target = os.path.join(self.SAVE_DIR, self.NOVEL_FILENAME)

        # Append mode: every chapter is added to the same file.
        with open(target, 'a', encoding='utf-8') as f:
            f.write(f'{filename}\n\n')
            f.write(txt)
            f.write('\n======\n\n')


if __name__ == '__main__':
    ns = NovelSpider()
    ns.get_novel()