Scraping novels from a website with Python 3 + BeautifulSoup 4.6 (Part 4): multi-threaded scraping
In the previous article the crawler used a two-level directory: a root folder "小说" (novels), a second level named after each work, and the novel files inside that.
This installment reworks part of the code so that the layout becomes root folder -> author folder -> work folder -> chapter .txt files.
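The resulting on-disk layout looks roughly like this (author and title names are placeholders; '绯色/' is the folder_path used in the full code below):

绯色/
└── <author>/
    └── <work title>/
        ├── 1_<chapter 1>.txt
        ├── 2_<chapter 2>.txt
        └── ...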
That isn't the real point of this installment, though. The point is that while scraping with this crawler, the program kept getting interrupted by dropped packets and other network problems.
My first idea was to poll the site's status in a loop and re-issue the request, but that didn't seem to help much. Then I came across multithreading in 虫师's book on Selenium and gave it a try here. It turned out to be fast. Cool!
The code below is largely excerpted from 虫师's Selenium2 book.
Importing the threading module
import threading
Method call: threading.Thread(target=music, args=('arg 1 for music', 'arg 2 for music'))
from time import sleep, ctime
import threading


def music(func, loop):
    for i in range(loop):
        print('music', func, ctime())
        sleep(2)


def movie(func, loop):
    for i in range(loop):
        print('movie', func, ctime())
        sleep(4)


# Serial version: music() finishes completely before movie() starts.
def testOne():
    music('简单的歌', 2)
    movie('两杆大烟枪', 2)
    print('all end', ctime())
# Threaded version: all three tasks run concurrently.
def testTwo():
    threads = []
    t1 = threading.Thread(target=music, args=('喜欢的人', 2))
    threads.append(t1)
    t2 = threading.Thread(target=movie, args=('搏击俱乐部', 2))
    threads.append(t2)
    t3 = threading.Thread(target=music, args=('喜欢的人2', 2))
    threads.append(t3)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('all end', ctime())
if __name__ == '__main__':
    testOne()
    # testTwo()
    # testThree()
    # threadsRun()
The t.join() calls make the main thread wait for each worker thread to finish, which guarantees that the 'all end' line is printed last.
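As a quick standalone sketch of what join() buys you (my example, not from the book):

from time import sleep, ctime
import threading


def task(name, secs):
    sleep(secs)
    print('done', name, ctime())


t = threading.Thread(target=task, args=('demo', 2))
t.start()
t.join()  # comment this out and 'all end' can print before 'done demo'
print('all end', ctime())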
Creating a thread-management class
Inherit from Thread right in the class definition: class MyThread(threading.Thread)
class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)
self: the class instance, passed implicitly
func: the function to run in the thread
args: the arguments to pass to func
name: the function's name, i.e. func.__name__
Complete code:
class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


def super_play(file_, time):
    for i in range(3):
        print('play', file_, ctime())
        sleep(time)


def testThree():
    threads = []
    lists = {'气球.mp3': 3, '电影.rmvb': 4, 'last.avg': 2}
    # One MyThread per file, each running super_play with its own delay
    for file_, time_ in lists.items():
        t = MyThread(super_play, (file_, time_), super_play.__name__)
        threads.append(t)

    files = range(len(lists))

    for f in files:
        threads[f].start()
    for f in files:
        threads[f].join()

    print('all end', ctime())
Reworking the novel crawler
That's it for multithreading. So how do we wire it into the novel-scraping class we already wrote? It's simple.
First, rework pageOne:
def readPageOneByThread(self, page, time_):
    page_url = str(self.two_page_url)
    new_page_url = page_url.replace("?", page)
    print('第', page, '页---', new_page_url)
    path = self.folder_path
    self.readPageTwo(new_page_url, path)
    sleep(time_)
# end readPageOneByThread ---------------------------------------
In the __init__ method, self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"; the '?' is a placeholder for the page number.
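To make the replace("?", page) substitution concrete, a quick illustration:

two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
print(two_page_url.replace("?", "3"))
# prints: http://www.cuiweijuxs.com/jingpinxiaoshuo/5_3.html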
Next, write the method that creates and runs the threads:
def threadsRun(self):
    # self.readPageOne(122)
    for i in range(1, 123):
        page = str(i)
        t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
        # t = threading.Thread(target=self.testRun, args=(str(i)))
        self.threads.append(t)
    for t in self.threads:
        t.start()
    for t in self.threads:
        t.join()
    print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)
I took a shortcut here and hard-coded the total number of pages; you could instead reuse the original pageOne approach of reading the "last" pager link to get the page count.
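A sketch of that alternative, lifted from the readPageOne logic in the full code below; it assumes the listing page still exposes an <a class="last"> element whose text is the page count:

def getPageCount(self):
    # Read the category listing page and take the total page count
    # from the "last" pager link, exactly as readPageOne does.
    soup = self.getSoup(self.one_page_url)
    last = soup.find("a", 'last')
    return int(last.string)

threadsRun could then loop over range(1, self.getPageCount() + 1) instead of the hard-coded 123.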
Here is the complete code:
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
from time import sleep, ctime
import os
import threading
import re
import random

'''
Scrape web pages with BeautifulSoup
version: 0.5 (links are now cached locally)
author: yaowei
date: 2018-03-23
'''


class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '绯色/'
        self.href_list = []
        self.head = {}
        self.threads = []
        # Set the User-Agent header
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # end __init__ ---------------------------------------

    # Build a BeautifulSoup object for the given URL
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        return soup
        # soup = BeautifulSoup(html, 'html5lib')

    # Read the category page and open each pagination link
    def readPageOne(self, count, time_):

        print('count=====', count)

        # Total number of pages
        if count:
            item_size = count
        else:
            # Read the listing page and take the count from the "last" link
            soup = self.getSoup(self.one_page_url)
            last = soup.find("a", 'last')
            item_size = int(last.string)

        print('item_size=====', item_size)
        page_url = str(self.two_page_url)

        # Open each pagination link in turn and read that page
        for item in range(item_size):
            page = str(item + 1)
            new_page_url = page_url.replace("?", page)
            print('第', page, '页---', new_page_url)
            path = self.folder_path
            self.readPageTwo(new_page_url, path)

            sleep(time_)
    # end readPageOne ---------------------------------------

    # Same as readPageOne, but handles a single page so each thread gets its own
    def readPageOneByThread(self, page, time_):
        page_url = str(self.two_page_url)
        new_page_url = page_url.replace("?", page)
        print('第', page, '页---', new_page_url)
        path = self.folder_path
        self.readPageTwo(new_page_url, path)
        sleep(time_)
    # end readPageOneByThread ---------------------------------------

    # Read one pagination page
    def readPageTwo(self, page_url, path):
        soup = self.getSoup(page_url)
        # first div[id="newscontent"]->div[class="l"]
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        # first div[id="newscontent"]->div[class="l"]->all span[class="s2"]
        span_list = con_div.find_all('span', {'class': 's2'})

        # Walk the spans
        for span in span_list:
            # Find span[class="s5"] under the parent node; the author becomes the folder name
            author = span.parent.find('span', {'class': 's5'}).get_text()

            # span[class="s2"]->a
            a_href = span.find('a')
            href = a_href.get('href')  # link to a single work
            folder_name = a_href.get_text()  # title of the work
            print('a_href', href, '---folder_name', folder_name)
            new_path = path + '/' + author + '/' + folder_name
            self.createFolder(new_path)  # create the folder

            self.readPageThree(href, new_path)  # read the single work

            # t = threading.Thread(target=self.readPageThree, args={href, new_path})
            # self.threads.append(t)
        # end for

    # end readPageTwo ---------------------------------------

    # Open a work's page and walk its chapters
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)  # the work's page
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0  # chapter index
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            file_path = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('file_a_href', href, '---file_path', file_path)

            '''
            new_path = self.isTxt(file_path)
            if new_path:
                print(new_path)
                file_object = open('网页链接//hrefs.txt', 'w', encoding='utf-8')
                file_object.write(href+','+new_path)
                file_object.close()
            '''
            self.readPageFour(href, file_path)

            # self.href_list.append({'href': href, 'file_path': file_path})

            # multithreaded variant
            # t = threading.Thread(target=self.readPageFour, args={href, file_path})
            # t.start()
            # t.join(15)

    # end readPageThree ---------------------------------------

    # Read one chapter's content and write it to disk
    def readPageFour(self, page_url, path):
        new_path = self.isTxt(path)  # '' if the file already exists, else a sanitized file name
        if new_path:
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})  # the text body
            # '\xa0' is the non-breaking space; the original literal was garbled
            # in extraction, so this is the assumed intent
            content = con_div.get_text().replace('<br/>', '\n').replace('\xa0', ' ')
            # content = content.replace('&','').replace('amp;','').replace('rdquo;','').replace('ldquo;','')
            # content = content.rstrip("& amp;rdquo;amp;& amp;ldquo;")

            self.writeTxt(new_path, content)  # write the file

    # end readPageFour ---------------------------------------

    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('\xa0', ' ')

    def createFolder(self, path):
        path = path.strip()
        # Strip a trailing backslash
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        is_exists = os.path.exists(new_path)
        # Create the folder if it does not exist
        if not is_exists:
            os.makedirs(new_path)
            print('目录:', new_path + ' create')
        else:
            print(new_path + ' 目录已存在')

    # end createFolder ---------------------------------------

    def isTxt(self, path):
        path = path.strip()
        # Strip a trailing backslash
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        isExists = os.path.exists(new_path)
        if isExists:
            print(new_path, '已存在')
            return ''
        else:
            return new_path

    # end isTxt ---------------------------------------

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, '已存在')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    # end writeTxt ------------------------------------------

    def run(self):
        try:
            # count=0 makes readPageOne read the page count from the site
            self.readPageOne(0, 2)
        except BaseException as error:
            print('error--', error)

    def runTest(self):
        try:
            page_url = 'http://www.cuiweijuxs.com/4_4508/'
            path = '小说/runTest'
            self.readPageThree(page_url, path)
        except BaseException as error:
            print('error--', error)

    def testRun(self, num, time_):
        for i in range(3):
            print('num=', num, ctime())
            sleep(time_)

    def threadsRun(self):

        # self.readPageOne(122)

        for i in range(1, 123):
            page = str(i)
            t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
            # t = threading.Thread(target=self.testRun, args=(str(i)))
            self.threads.append(t)

        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()

        print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


Capture().threadsRun()
Author: 妖生
My WeChat official account: 姚毛毛的博客
Linux tools site: https://www.linuxido.com
Copyright of this article is shared by the author and 博客园 (cnblogs). Reposting is welcome, but without the author's permission this notice must be kept and a prominent link to the original given on the page; otherwise the author reserves the right to pursue legal liability.