23hh Novel Site — Crawler 1.0 (Python)
Changes in this version:
1. decode('gbk') was changed to decode('gbk', 'replace'), so bytes that cannot be decoded are substituted with the Unicode replacement character instead of aborting the whole page with a UnicodeDecodeError.
2. The crawl was changed from single-threaded to multi-threaded, which makes processing noticeably faster.
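As a quick illustration of point 1, here is a minimal standalone sketch (not part of the crawler; the byte string is made-up sample data) showing how a GBK byte string with an invalid byte fails under strict decoding but survives with the 'replace' error handler:

# -*- coding:utf-8 -*-
# Hypothetical sample: valid GBK for "中文" followed by a byte that no GBK
# sequence can start with.
raw = '\xd6\xd0\xce\xc4\xff'

try:
    raw.decode('gbk')                 # strict decoding raises on the bad byte
except UnicodeDecodeError, e:
    print 'strict decode failed:', e

# With 'replace', the undecodable byte becomes U+FFFD and the rest of the
# text is kept, so one broken character no longer loses a whole chapter.
print raw.decode('gbk', 'replace').encode('utf-8')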
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# --------------------------------------------
# Program: 23hh novel-site crawler
# Version: 0.2.2
# Author:  Silence
# Date:    2014-04-08
# Features: 1. Given a table-of-contents page, grab every chapter of the novel
#              and save them as a single file.
#           2. Given the page of the chapter currently being read, grab that
#              chapter and every chapter after it.
#           3. Retry mechanism for pages that fail.
# ---------------------------------------------
import threading
import urllib2
import re
import os
from Queue import Queue, Empty


class Spider_Thread(threading.Thread):
    """Crawling every page in a single thread is slow, so the work is split
    across several worker threads; each task is passed in through the queue."""

    def __init__(self, t_name, queue):
        threading.Thread.__init__(self, name=t_name)
        self.data = queue
        self.errorInfo = {}
        self.novel = None

    def run(self):
        while True:
            try:
                # Non-blocking get: when the queue is empty the worker exits,
                # which avoids the race between qsize() and a blocking get().
                pageInfo = self.data.get_nowait()
            except Empty:
                break
            print '线程%s正在爬第%d个页面' % (self.getName(), pageInfo.keys()[0])
            try:
                self.novel = Novel_Tool(pageInfo.values()[0]['pageurl'], 'N')
                decodePageResp = self.novel.getDecodePage(pageInfo.values()[0]['pageurl'])
                pageContent = self.novel.getPageContent(decodePageResp)
                self.novel.writeToFile(pageContent, pageInfo.values()[0]['pagename'])
            except Exception, e:
                print '爬第%d个页面时出错了' % pageInfo.keys()[0]
                self.errorInfo[pageInfo.keys()[0]] = pageInfo.values()

        if len(self.errorInfo) > 0:
            print '出错的页面信息有:\n', self.errorInfo
        self.novel = None


# Helper class that provides the shared crawling utilities; it is not meant
# to be run as the entry point on its own.
class Novel_Tool():

    def __init__(self, weburl, saveAsOne):
        self.url = weburl
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        self.saveAsOne = saveAsOne
        self.pagesInfo = {}
        self.errorPage = {}

    # Work out the page's declared encoding; many novel sites use gbk.
    # Some sites declare a charset that differs from the real encoding;
    # that case is ignored for now.
    def getPageType(self, content):
        pattern = re.compile('charset=.*?"')
        pagetype = pattern.search(content).group()
        pagetype = pagetype[8:len(pagetype) - 1]
        return pagetype

    def start(self):
        if self.url.find('html') > 0:
            self.spiderPagesFromCurrent()
        else:
            pageInfos = self.getAllUrlsAndNames()
            self.spiderAllPagesFromOne(pageInfos)
        self.doStat()

    def doStat(self):
        print '本次共尝试爬章节 %d,其中爬成功章节数 %d' % (len(self.pagesInfo), len(self.pagesInfo) - len(self.errorPage))
        print '失败的章节信息为:', self.errorPage

    def retryErrorPage(self, errorPages):
        print '准备重试错误页面中....'
        self.spiderAllPagesFromOne(errorPages)

    # Start from the current chapter page and follow the "next page" link
    # until the index page is reached.
    def spiderPagesFromCurrent(self):
        pageurl = self.url
        index = 1
        while pageurl.find('index.html') == -1:
            try:
                decodePageResp = self.getDecodePage(pageurl)
                pageContent = self.getPageContent(decodePageResp)

                self.writeToFile(pageContent, self.getPageTitle(decodePageResp))
                pageurl = self.getNextPage(decodePageResp)
            except Exception, e:
                print '爬第%d个页面时出错了' % index
                self.errorPage[index] = pageurl
            finally:
                index = index + 1

    # Walk through every chapter in turn and crawl it.
    def spiderAllPagesFromOne(self, pageInfos):
        for index, pageInfo in pageInfos.items():
            print '正在爬第%d个页面……' % index
            try:
                decodePageResp = self.getDecodePage(pageInfo['pageurl'])
                pageContent = self.getPageContent(decodePageResp)
                self.writeToFile(pageContent, pageInfo['pagename'])
            except Exception, e:
                print '爬第%d个页面时出错了' % index
                self.errorPage[index] = pageInfo['pageurl']

    # Extract the chapter title from the page body.
    def getPageTitle(self, content):
        charToTitleRex = re.compile('h1>(.|\s)*?</h1')
        pageTitle = charToTitleRex.search(content).group()
        pageTitle = pageTitle[3:len(pageTitle) - 4]
        return pageTitle

    def writeToFile(self, content, filename):
        if os.path.exists(os.getcwd() + '/Novels'):
            if not os.path.isdir(os.getcwd() + '/Novels'):
                # A plain file named Novels is in the way: back it up first.
                os.rename('Novels', 'Novels.bak')
                os.mkdir(os.getcwd() + '/Novels')
        else:
            os.mkdir(os.getcwd() + '/Novels')

        if self.saveAsOne == 'N':
            ofile = open(os.getcwd() + '/Novels/' + filename, 'w')
        else:
            ofile = open(os.getcwd() + '/Novels/novel.txt', 'a')

        try:
            ofile.write(content)
        except Exception, e:
            print '存储网页', filename, '出错!'
        finally:
            ofile.close()

    # Download a page and re-encode it as utf-8, using the declared charset.
    def getDecodePage(self, pageurl):
        req = urllib2.Request(
            url=pageurl,
            headers=self.headers
        )
        pageResponse = urllib2.urlopen(req).read()
        decodePageResp = pageResponse.decode(self.getPageType(pageResponse), 'replace').encode('utf-8')
        return decodePageResp

    # Extract the chapter text.
    def getPageContent(self, decodePageResp):
        contentPattern = re.compile('(<dd id="contents">)((.|\s)*?)(</dd>)')
        content = contentPattern.search(decodePageResp).group(2)
        content = self.replaceWebTag(content)
        return content

    # Work out the URL of the next page.
    def getNextPage(self, content):
        # First locate the footer links block.
        footlinkRex = re.compile('(footlink">)(.*?)</dd>')
        foot = footlinkRex.search(content).group(2)
        pattern = re.compile(r'(返回目录.*?(<a.*?">下一页))')
        m = pattern.search(foot).groups()
        nextUrl = m[len(m) - 1][9:m[len(m) - 1].find('">')]

        return self.url[0:self.url.rfind('/') + 1] + nextUrl

    def getAllUrlsAndNames(self):
        # Request the table-of-contents page and collect every chapter name and link.
        req = urllib2.Request(
            url=self.url,
            headers=self.headers
        )
        myResponse = urllib2.urlopen(req).read()
        decodeResp = myResponse.decode(self.getPageType(myResponse), 'replace').encode('utf-8')

        print '正在分析目录页面,请稍后…………'
        pageRex = re.compile('<a href=".*?</td>')  # matches each chapter link block
        pageUrlRex = re.compile('".*?"')           # extracts the chapter url
        pageNameRex = re.compile('>.*?<')          # extracts the chapter name

        pages = pageRex.findall(decodeResp)
        index = 1
        for page in pages:
            pageurl = pageUrlRex.search(page).group()
            pageurl = pageurl[1:len(pageurl) - 1]
            pageurl = self.url + pageurl

            pagename = pageNameRex.search(page).group()
            pagename = pagename[1:len(pagename) - 1]

            self.pagesInfo[index] = {
                'pagename': pagename,
                'pageurl': pageurl
            }
            index = index + 1
        print '目录页分析完成!该小说共有%d个章节' % len(self.pagesInfo)
        return self.pagesInfo

    def getNovelName(self, content):
        titleRex = re.compile('<h1>.*?</h1>')
        title = titleRex.search(content).group()
        return title[4:len(title) - 5]

    # Strip the HTML leftovers in the chapter text: drop &nbsp; entities and
    # turn the <br> variants into newlines.
    def replaceWebTag(self, content):
        charToNoneRex = re.compile(r'&nbsp;')
        charToNewLineRex = re.compile("<br />|<br>|<br/>")

        content = charToNoneRex.sub("", content)
        content = charToNewLineRex.sub("\n", content)
        return content


if __name__ == '__main__':
    print u"""
# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# 程序:【23hh小说网】爬虫                                                        #
# 版本:1.0                                                                       #
# 作者:Silence                                                                   #
# 日期:2014-04-08                                                                #
# 操作:启动后输入要爬的小说目录页地址,就可以自动爬了                            #
# 功能:1. 提供一个目录页,把目录页中所有的目录章节都抓出来(默认是:23hh的争霸天下); #
#       2. 提供一个正在看的目录页,把这个章节及以后的所有章节都抓取下来,         #
#          分章节保存在启动脚本目录下的Novels目录下;                             #
#          如果该目录下已经有一个Novels,则把这个Novels改名为Novels.bak           #
# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #"""

    novelUrl = raw_input('请输入要爬的小说地址(默认是:23hh的争霸天下章节目录)\n')
    if novelUrl == '':
        novelUrl = 'http://www.23hh.com/book/43/43957/'

    saveAsOne = raw_input('是否保存为一个文件?是为Y,否为N\n')
    if saveAsOne not in ['Y', 'N']:
        saveAsOne = 'N'

    Novel = Novel_Tool(novelUrl, saveAsOne)

    # A table-of-contents URL: put every chapter on a queue and let four
    # worker threads crawl them in parallel.
    if not novelUrl.find('html') > 0:
        queue = Queue()
        pageInfos = Novel.getAllUrlsAndNames()
        for key, value in pageInfos.items():
            queue.put({key: value})

        thread1 = Spider_Thread('thread1', queue)
        thread2 = Spider_Thread('thread2', queue)
        thread3 = Spider_Thread('thread3', queue)
        thread4 = Spider_Thread('thread4', queue)
        thread1.start()
        thread2.start()
        thread3.start()
        thread4.start()
    # A single chapter URL: crawl sequentially from that chapter onwards.
    else:
        Novel.start()
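A possible refinement, not part of the original script: the __main__ block starts the four workers and then falls through, so each worker prints its own error summary whenever it happens to finish. A minimal sketch of how the thread1..thread4 block could instead collect the workers in a list, wait for them with join(), and report all failed chapters in one place (the threads list and failed dict are names introduced here for illustration):

# Hypothetical replacement for the thread1..thread4 block above.
threads = [Spider_Thread('thread%d' % i, queue) for i in range(1, 5)]
for t in threads:
    t.start()
for t in threads:
    t.join()                       # block until every worker has drained the queue

failed = {}
for t in threads:
    failed.update(t.errorInfo)     # errorInfo is filled in Spider_Thread.run()
if failed:
    print 'Chapters that failed to download:', failed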