[Python] Scraping Web Novels with a Crawler
1. Source Code

The script searches xsbiquge.com for a novel by title, collects the chapter links from its table of contents, and writes every chapter into a single text file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : HtmlParser.py
# @Author  : 赵路仓
# @Date    : 2020/3/27
# @Desc    : Scrape a web novel from xsbiquge.com into one text file
# @Contact : 398333404@qq.com

import requests
from bs4 import BeautifulSoup

# Request headers: a desktop Chrome User-Agent so the site serves normal pages
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
url = 'https://www.xsbiquge.com/74_74627/'
count = 0  # chapters downloaded so far


def menu(name):
    """Look up the novel's table of contents and write one chapter URL per line to test.txt."""
    url = search(name)
    r = requests.get(url, headers=head)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    # The chapter links are the <a> tags inside the page's <dl> listing
    chapter = soup.find("dl")
    with open("test.txt", "w", encoding="utf-8") as f:
        for a in chapter.find_all("a"):
            f.write("https://www.xsbiquge.com" + a.attrs['href'] + "\n")


def single_page(url):
    """Download one chapter page and return its title plus body text."""
    global count
    r = requests.get(url, headers=head)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    h1 = soup.find("h1")
    content = soup.find("div", {"id": "content"})
    # Strip the wrapper div and turn runs of <br/> into paragraph breaks
    text = str(h1.string) + "\n" + str(content) \
        .replace('<div id="content">', " ") \
        .replace("<br/><br/><br/><br/>", "\n\n") \
        .replace("<br/><br/>", "\n\n") \
        .replace("</div>", "") + "\n"
    count += 1
    print("Fetched chapter %d" % count)
    return text


def content(path, name):
    """Build the chapter list, then append every chapter to <path><name>.txt."""
    menu(name)
    with open(path + name + ".txt", "w", encoding="utf-8") as fh:
        fh.write(name + "\n\n")
    with open(path + name + ".txt", "a+", encoding="utf-8") as fw, \
            open("test.txt", "r", encoding="utf-8") as f:
        for line in f:
            fw.write(single_page(line.rstrip("\n")))


def search(name):
    """Search the site by title; return the URL of the exact match, or None."""
    searchurl = "https://www.xsbiquge.com/search.php?keyword=" + name
    r = requests.get(searchurl, headers=head)
    soup = BeautifulSoup(r.text, "html.parser")
    for t in soup.find_all("a", {"cpos": "title"}):
        print(t.attrs['title'])
        print(t.attrs['href'])
        if t.attrs['title'] == name:
            return t.attrs['href']


if __name__ == "__main__":
    # print(single_page("https://www.xsbiquge.com/74_74627/3845841.html"))
    # menu("E:/test.txt", url)
    # content("E:/", "万古大帝")
    # search("斗破苍穹")
    content("E:/", "斗破苍穹")
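As written, the script fires one request per chapter back to back and will block indefinitely if the server stops responding. A minimal hardening sketch is shown below; the helper name polite_get, the retry count, and the 10-second timeout are my own choices, not part of the original script:

import time

import requests


def polite_get(url, headers, retries=3, delay=1.0):
    # Try the request a few times, pausing between attempts,
    # and give the server a timeout instead of blocking forever.
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.encoding = r.apparent_encoding
            return r
        except requests.RequestException:
            time.sleep(delay)
    raise RuntimeError("failed to fetch %s after %d attempts" % (url, retries))

single_page() could then call polite_get(url, head) in place of requests.get(url, headers=head), and a short time.sleep() between iterations of the loop in content() would space out the per-chapter requests so the site isn't hammered.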
Scraping result: the script prints one progress line per chapter and writes the complete novel to E:/斗破苍穹.txt.