02爬虫 爬取百度贴吧帖子
# -*- coding: utf-8 -*-
# Crawler that fetches one page of a Baidu Tieba thread and prints the text
# of every post ("floor") found on it.
#
# The code runs on both Python 2 (urllib2) and Python 3 (urllib.request).
import re
import urllib

try:
    import urllib2
    _URLError = urllib2.URLError
except ImportError:
    # Python 3 split urllib2 into urllib.request and urllib.error; alias the
    # request module so the rest of the file can keep using the old name.
    import urllib.error
    import urllib.request as urllib2
    _URLError = urllib.error.URLError


class Tool:
    """Turn the HTML fragment of a single post into plain text."""

    # <img> tags and runs of exactly seven spaces are removed.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are removed (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that imply a line break become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # Paragraph openings become a newline followed by an indent.
    replacePara = re.compile(r'<p.*?>')
    # Single or doubled <br> becomes one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left after the steps above is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags stripped or converted to whitespace.

        The substitutions run in a fixed order: the specific tags must be
        handled before ``removeExtraTag`` wipes every remaining tag.
        Leading and trailing whitespace is stripped from the result.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        # NOTE(review): the original comment describes a two-space indent but
        # the literal carries one space; kept as found -- confirm intent.
        x = self.replacePara.sub("\n ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


class BDTB:
    """Downloader/parser for one Baidu Tieba thread."""

    # Patterns are compiled once at class level instead of on every call.
    _TITLE_RE = re.compile(r'<h3 .*?title="(.*?)" style=.*?</h3>', re.S)
    _PAGE_NUM_RE = re.compile(r'<li .*?max-page="(.*?)".*?/>', re.S)
    _CONTENT_RE = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ):
        """Remember the thread URL and build the ``see_lz`` query prefix.

        baseUrl -- thread URL without a query string.
        seeLZ   -- value placed in the ``see_lz`` query parameter.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Open page ``pageNum`` of the thread.

        Returns the open response object, or None when the request fails
        with a URLError (the error reason is printed when available).
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            return urllib2.urlopen(urllib2.Request(url))
        except _URLError as e:
            if hasattr(e, "reason"):
                print(u"error %s" % (e.reason,))
            return None

    def _readPage(self, page):
        """Read ``page``, decode it as UTF-8 and close it.

        Returns None when ``page`` is None, so callers can pass the result
        of a failed ``getPage`` straight through.
        """
        if page is None:
            return None
        try:
            return page.read().decode('utf-8')
        finally:
            # Release the connection; objects without close() are tolerated.
            close = getattr(page, 'close', None)
            if close is not None:
                close()

    def _searchFirstPage(self, pattern):
        """Return the stripped first group of ``pattern`` on page 1, or None."""
        html = self._readPage(self.getPage(1))
        if html is None:
            return None
        result = pattern.search(html)
        if not result:
            return None
        print(result.group(1))
        return result.group(1).strip()

    def getTitle(self):
        """Return the thread title from page 1, or None if it is not found."""
        return self._searchFirstPage(self._TITLE_RE)

    def getPageNum(self):
        """Return the ``max-page`` value from page 1 as a string, or None."""
        return self._searchFirstPage(self._PAGE_NUM_RE)

    def getContent(self, page):
        """Print every post found in ``page`` and return the cleaned texts.

        page -- object with a ``read()`` method returning UTF-8 bytes (for
                example the result of ``getPage``), or None.

        Each post is printed after a numbered banner line. The return value
        is the list of cleaned post texts in page order; it is empty when
        ``page`` is None or no post matches.
        """
        html = self._readPage(page)
        if html is None:
            return []
        contents = []
        for floor, item in enumerate(self._CONTENT_RE.findall(html), 1):
            text = self.tool.replace(item)
            print(u"%d %s" % (floor, u'楼---------------------------------------\n'))
            print(text)
            contents.append(text)
        return contents


# Only hit the network when executed as a script, not when imported.
if __name__ == "__main__":
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURL, 1)
    bdtb.getContent(bdtb.getPage(1))
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步