python3 + BeautifulSoup 4.6: scraping novels from a website (part 3), page analysis and BeautifulSoup parsing
What this part covers: crawling all the novels on the site and saving them locally.
Target site: www.cuiweijuxs.com
Analyzing the pages shows the job takes four steps: from the home page, enter a category and open its paginated list, open every link on each list page, open each book's page, and open each chapter's content.
The implementation therefore follows the four steps below (a quick BeautifulSoup check of these anchor points appears right after the list):
1. Enter the category page, www.cuiweijuxs.com/jingpinxiaoshuo/
Find the maximum page number:
<a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>
Then loop over every page:
href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
2. Find all book links on the current list page and open each one; the elements below can be used to locate them:
div id="newscontent"
div class="l"
<span class="s2">
<a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">标题</a>
3. Open each book link and find the chapter list; the elements below can be used to locate it:
<div id="list">
<dd>
<a href="/4_4508/528170.html">第一章</a>
</dd>
</div>
4. Open each chapter link and read its content:
<div id="content">
chapter text
</div>
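Before building the class, the anchor points above can be verified offline; the sketch below feeds the fragments from steps 1 to 4 (condensed, with placeholder text) straight into BeautifulSoup using the built-in html.parser:

from bs4 import BeautifulSoup

# step 1: the <a class="last"> link gives the highest page number
page_bar = BeautifulSoup(
    '<a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>',
    'html.parser')
print(int(page_bar.find('a', 'last').string))  # 122

# step 2: book links sit under div#newscontent > div.l > span.s2
book_row = BeautifulSoup(
    '<div id="newscontent"><div class="l"><span class="s2">'
    '<a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">title</a>'
    '</span></div></div>', 'html.parser')
s2 = book_row.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'}).find('span', {'class': 's2'})
for a in s2.find_all('a'):
    print(a.get('href'), a.get_text())

# step 3: chapters are the <a> tags under div#list, with site-relative hrefs
chapter_list = BeautifulSoup(
    '<div id="list"><dd><a href="/4_4508/528170.html">Chapter 1</a></dd></div>',
    'html.parser')
for a in chapter_list.find('div', {'id': 'list'}).find_all('a'):
    print('http://www.cuiweijuxs.com' + a.get('href'), a.get_text())

# step 4: the chapter body is everything inside div#content
body = BeautifulSoup('<div id="content">chapter text</div>', 'html.parser')
print(body.find('div', {'id': 'content'}).get_text())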
Step 1: create the class, initialize its parameters, and factor out fetching a BeautifulSoup-parsed page.
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape the site with BeautifulSoup
'''

class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'
        self.head = {}
        # set the User-Agent header
        self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # fetch a page and return it parsed by BeautifulSoup
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup
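getSoup parses with html5lib, which copes better with loosely structured markup, but it is a separate install (pip install html5lib); html.parser ships with Python. If you are unsure whether html5lib is available, a small optional variant (not the version used in the class above) could fall back to the built-in parser:

from bs4 import BeautifulSoup

def parse_html(html):
    # prefer the more tolerant html5lib parser; bs4 raises FeatureNotFound
    # (a ValueError subclass) when a requested parser is not installed
    try:
        return BeautifulSoup(html, 'html5lib')
    except Exception:
        return BeautifulSoup(html, 'html.parser')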
Step 2: enter the category page, find the maximum page number, and loop over every list page.
# read the update list: find the last page number and open every list page
def readPageOne(self):
    soup = self.getSoup(self.one_page_url)
    last = soup.find("a", "last")
    itemSize = int(last.string)
    page_url = str(self.two_page_url)

    for item in range(itemSize):
        print(item)
        new_page_url = page_url.replace("?", str(item + 1))
        self.readPageTwo(new_page_url)
# end readPageOne
getSoup returns the parsed HTML; find locates the a tag whose class is "last", and its text gives the maximum page number.
The loop then opens the list pages, starting from page 1.
Step 3: read the book links on each list page.
# read the book links on one list page
def readPageTwo(self, page_url):
    soup = self.getSoup(page_url)
    con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
    a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
    print(a_list)
    for a_href in a_list:
        href = a_href.get('href')
        folder_name = a_href.get_text()
        print('a_href', href, '---folder_name', folder_name)
        path = self.folder_path + folder_name
        self.createFolder(path)
        self.readPageThree(href, path)
    # end for
# end readPageTwo
Find the div whose id is newscontent, then the div with class "l" inside it, then the first span with class "s2", and loop over the a tags inside that span.
Each link's text ( a_href.get_text() ) becomes the folder name.
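The same navigation can also be written with CSS selectors; this is an equivalent sketch (the fragment is a made-up stand-in shaped like the step 2 snippet), where select_one matches the first span.s2 just as find_all(...)[0] does:

from bs4 import BeautifulSoup

html = '''
<div id="newscontent"><div class="l">
  <span class="s2"><a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">title</a></span>
</div></div>
'''
soup = BeautifulSoup(html, 'html.parser')

# CSS equivalent of find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
#                       .find_all('span', {'class': 's2'})[0]
s2 = soup.select_one('#newscontent div.l span.s2')
for a in s2.find_all('a'):
    print(a.get('href'), a.get_text())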
Step 4: open the book page, loop over the chapter links, and build each chapter's file name.
# open a book page and walk its chapter list
def readPageThree(self, page_url, path):
    soup = self.getSoup(page_url)
    print('readPageThree--', page_url)
    a_list = soup.find('div', {'id': 'list'}).find_all('a')
    idx = 0
    for a_href in a_list:
        idx = idx + 1
        # chapter hrefs are site-relative, so prefix the site root
        href = self.index_page_url + a_href.get('href')
        txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
        print('a_href', href, '---path', txt_name)
        isExists = os.path.exists(txt_name)
        if isExists:
            print(txt_name, '已存在')
        else:
            self.readPageFour(href, txt_name)
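One practical caveat: chapter titles sometimes contain characters that are not legal in file names (for example / : ? or *), which would make the file write fail. The class does not guard against this; a hypothetical sanitize_name helper, shown only as an optional sketch, could be applied to a_href.get_text() before building txt_name:

import re

def sanitize_name(name):
    # replace characters that are illegal in Windows/Unix file names with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# e.g. sanitize_name('Chapter 1: part 1/2?') -> 'Chapter 1_ part 1_2_'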
Step 5: open each chapter link, read everything inside the div with id=content, and write it to a file.
# read one chapter's content and write it out
def readPageFour(self, page_url, path):
    soup = self.getSoup(page_url)
    con_div = soup.find('div', {'id': 'content'})
    # get_text() drops the <br/> tags, so pass '\n' as the separator to keep
    # line breaks, then normalize non-breaking spaces
    content = con_div.get_text('\n').replace('\xa0', ' ')
    self.writeTxt(path, content)
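readPageFour relies on get_text to flatten the content div. get_text() already strips tags such as <br/>, so replacing '<br/>' in its output has no effect; passing '\n' as the separator keeps one line per text fragment, and the non-breaking spaces (&nbsp;, i.e. '\xa0') can then be normalized. A small offline check with a made-up fragment:

from bs4 import BeautifulSoup

html = '<div id="content">line one<br/>line two<br/>&nbsp;&nbsp;line three</div>'
soup = BeautifulSoup(html, 'html.parser')
con_div = soup.find('div', {'id': 'content'})

# '\n' between text fragments preserves the line breaks the <br/> tags implied
content = con_div.get_text('\n').replace('\xa0', ' ')
print(content)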
The complete implementation:
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape the site with BeautifulSoup
'''

class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'
        self.head = {}
        # set the User-Agent header
        self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # fetch a page and return it parsed by BeautifulSoup
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup

    # read the update list: find the last page number and open every list page
    def readPageOne(self):
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", "last")
        itemSize = int(last.string)
        page_url = str(self.two_page_url)

        for item in range(itemSize):
            print(item)
            new_page_url = page_url.replace("?", str(item + 1))
            self.readPageTwo(new_page_url)
    # end readPageOne

    # read the book links on one list page
    def readPageTwo(self, page_url):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
        print(a_list)
        for a_href in a_list:
            href = a_href.get('href')
            folder_name = a_href.get_text()
            print('a_href', href, '---folder_name', folder_name)
            path = self.folder_path + folder_name
            self.createFolder(path)
            self.readPageThree(href, path)
        # end for
    # end readPageTwo

    # open a book page and walk its chapter list
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0
        for a_href in a_list:
            idx = idx + 1
            # chapter hrefs are site-relative, so prefix the site root
            href = self.index_page_url + a_href.get('href')
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            isExists = os.path.exists(txt_name)
            if isExists:
                print(txt_name, '已存在')
            else:
                self.readPageFour(href, txt_name)

    # read one chapter's content and write it out
    def readPageFour(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        # get_text() drops the <br/> tags, so pass '\n' as the separator to keep
        # line breaks, then normalize non-breaking spaces
        content = con_div.get_text('\n').replace('\xa0', ' ')
        self.writeTxt(path, content)

    # unused helper (never called)
    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text('\n').replace('\xa0', ' ')

    def createFolder(self, path):
        path = path.strip()
        # strip a trailing backslash
        path = path.rstrip("\\")
        isExists = os.path.exists(path)
        # create the folder only if it does not exist yet
        if not isExists:
            os.makedirs(path)
            print(path + ' create')
        else:
            print(path + ' 目录已存在')
    # end createFolder

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, '已存在')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    def run(self):
        try:
            self.readPageOne()
        except BaseException as error:
            print('error--', error)


Capture().run()
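Since run ends up requesting every list page, every book page, and every chapter page back to back, it may be worth pausing between requests so the site is not hammered. A minimal optional sketch (the one-second delay is an arbitrary choice, not something this article prescribes) that mirrors getSoup:

import time
from urllib import request
from bs4 import BeautifulSoup

def get_soup_politely(query_url, headers, delay_seconds=1.0):
    # same fetch as Capture.getSoup, with a fixed pause before each request
    time.sleep(delay_seconds)
    req = request.Request(query_url, headers=headers)
    html = request.urlopen(req).read()
    return BeautifulSoup(html, 'html5lib')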
Author: 妖生
My WeChat official account: 姚毛毛的博客
Linux tools site: https://www.linuxido.com
Copyright of this article is shared by the author and 博客园 (cnblogs). Reposting is welcome, but unless the author agrees otherwise you must keep this notice and place a clearly visible link to the original on the reposted page; otherwise the author reserves the right to pursue legal liability.