# Python 百度贴吧爬虫 (Baidu Tieba crawler)
# -*- coding: utf-8 -*-
"""Baidu Tieba thread scraper.

Downloads one page of a thread and prints every post on it as plain text.
Written for Python 2 (urllib2); the import fallbacks and print() calls below
also let the module run unchanged on Python 3.
"""
import re
import time  # not used in this module; kept from the original import list
import urllib  # not used in this module; kept from the original import list

try:  # Python 2 module names
    import urllib2
except ImportError:  # Python 3: Request/urlopen/URLError live in urllib.request
    import urllib.request as urllib2

try:  # not used in this module; kept from the original import list
    import thread
except ImportError:
    import _thread as thread


class BDTB:
    """Fetches and prints the posts of a single Baidu Tieba thread.

    Attributes:
        baseUrl: Thread URL without any query string.
        seeLz: Query-string prefix of the form '?see_lz=<value>'.
            NOTE(review): presumably Tieba's "original poster only" switch;
            confirm which values the site accepts.
        Tool: Tool instance used to turn post HTML into plain text.
    """

    # Compiled once at class creation instead of on every call.
    _TITLE_RE = re.compile(
        'core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
    _POST_RE = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
    # Printed after the floor number as a separator line.
    _FLOOR_SUFFIX = u'楼--------------------------------------------\n'

    def __init__(self, baseurl, seeLz):
        """Store the thread URL and the value sent as the see_lz parameter."""
        self.baseUrl = baseurl
        self.seeLz = '?see_lz=' + str(seeLz)
        self.Tool = Tool()

    def getPage(self, pageNum):
        """Download page `pageNum` of the thread.

        Returns:
            The response body as a native str, or None when the request
            fails with urllib2.URLError (the failure is printed).
        """
        url = self.baseUrl + self.seeLz + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                body = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError as e:
            # e.reason may be an exception object rather than text, so it
            # must be converted before concatenating.
            print("链接网络失败" + str(getattr(e, "reason", e)))
            return None
        # Python 3 returns bytes, but the str regexes in this module need
        # text. On Python 2 the body is already a str and is left untouched.
        # NOTE(review): assumes the page is UTF-8 encoded; confirm.
        if not isinstance(body, str):
            body = body.decode("utf-8", "replace")
        return body

    def getTitle(self):
        """Print and return the thread title taken from page 1.

        Returns:
            The title text, or None when the page could not be downloaded
            or no title element was found.
        """
        html = self.getPage(1)
        if html is None:
            return None
        match = self._TITLE_RE.search(html)
        if match is None:
            return None
        title = match.group(1)
        print(title)
        return title

    def getContent(self, page):
        """Print every post found in `page`, each preceded by its floor number.

        Args:
            page: HTML of one thread page as returned by getPage(); None
                (a failed download) prints nothing.
        """
        if page is None:
            return
        for floor, post in enumerate(self._POST_RE.findall(page), 1):
            print(u"%d %s" % (floor, self._FLOOR_SUFFIX))
            print(self.Tool.replace(post))


class Tool:
    """Converts the HTML of one post into plain text."""

    # Remove <img> tags, runs of ASCII spaces and &nbsp; entities.
    # The third alternative used to be a bare space, which the space run
    # already covers; the entity is what the original comment listed.
    # Note that this strips every ASCII space, including those between words.
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # Remove hyperlink tags (the link text is kept).
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that are turned into a newline.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells are turned into a tab.
    replaceTD = re.compile('<td>')
    # Single or double <br> is turned into one newline.
    replaceBR = re.compile('<br><br>|<br>')
    # Any remaining tag is dropped.
    removeExtraTag = re.compile('<.*?>')
    # Collapse consecutive newlines into one.
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        """Return `x` with its HTML markup converted to plain text.

        The substitutions run in a fixed order: specific tags are rewritten
        first, then every leftover tag is removed, then blank lines are
        collapsed and surrounding whitespace is stripped.
        """
        substitutions = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeNoneLine, "\n"),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        # strip() drops leading and trailing whitespace left by the rewrites.
        return x.strip()
# Script entry: print the posts on page 2 of the sample thread.
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL, 2)
page = bdtb.getPage(2)
# getPage returns None when the download fails; only parse a page that was
# actually fetched, otherwise the regex search would raise TypeError.
if page is not None:
    bdtb.getContent(page)