02爬虫 爬取百度贴吧帖子

复制代码
 1 #encoding:utf-8
 2 # -*- coding: utf-8 -*-
 3 # coding:utf-8
 4 import urllib
 5 import urllib2
 6 import re
 7 #处理页面标签类
 8 
 9 class Tool:
10     #去除img标签,7位长空格
11     removeImg = re.compile('<img.*?>| {7}|')
12     #删除超链接标签
13     removeAddr = re.compile('<a.*?>|</a>')
14     #把换行的标签换为\n
15     replaceLine = re.compile('<tr>|<div>|</div>|</p>')
16     #将表格制表<td>替换为\t
17     replaceTD= re.compile('<td>')
18     #把段落开头换为\n加空两格
19     replacePara = re.compile('<p.*?>')
20     #将换行符或双换行符替换为\n
21     replaceBR = re.compile('<br><br>|<br>')
22     #将其余标签剔除
23     removeExtraTag = re.compile('<.*?>')
24     def replace(self,x):
25         x = re.sub(self.removeImg,"",x)
26         x = re.sub(self.removeAddr,"",x)
27         x = re.sub(self.replaceLine,"\n",x)
28         x = re.sub(self.replaceTD,"\t",x)
29         x = re.sub(self.replacePara,"\n    ",x)
30         x = re.sub(self.replaceBR,"\n",x)
31         x = re.sub(self.removeExtraTag,"",x)
32         #strip()将前后多余内容删除
33         return x.strip()
34 
class BDTB:
    """Fetch a Baidu Tieba thread and extract its title, page count and posts.

    Failed downloads are reported by ``getPage`` returning ``None``; the
    other methods treat that as "no data" instead of raising.
    """

    # Captures the title attribute of the thread's <h3> heading.
    _titlePattern = re.compile('<h3 .*?title="(.*?)" style=.*?</h3>', re.S)
    # Captures the max-page attribute of an <li> element.
    _pageNumPattern = re.compile('<li .*?max-page="(.*?)".*?/>', re.S)
    # Captures the inner HTML of each post content <div>.
    _contentPattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ):
        """Store the thread URL and the ``see_lz`` query value.

        Args:
            baseUrl: URL of the thread, without a query string.
            seeLZ: Value for the ``see_lz`` query parameter (presumably 1
                limits the thread to the original poster -- confirm).
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Open page ``pageNum`` of the thread.

        Returns:
            The open response object, or ``None`` if the request failed
            with ``urllib2.URLError``.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            return urllib2.urlopen(urllib2.Request(url))
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print("error %s" % (e.reason,))
            return None

    def _readPage(self, page):
        """Return the UTF-8 decoded body of ``page``, or None if it is None."""
        if page is None:
            return None
        try:
            return page.read().decode('utf-8')
        finally:
            # The body is fully consumed here, so release the connection.
            page.close()

    def getTitle(self):
        """Return the thread title from page 1, or ``None`` if unavailable."""
        html = self._readPage(self.getPage(1))
        if html is None:
            return None
        result = self._titlePattern.search(html)
        if result is None:
            return None
        print(result.group(1))
        return result.group(1).strip()

    def getPageNum(self):
        """Return the page count from page 1 as a string, or ``None``."""
        html = self._readPage(self.getPage(1))
        if html is None:
            return None
        result = self._pageNumPattern.search(html)
        if result is None:
            return None
        print(result.group(1))  # debug output
        return result.group(1).strip()

    def getContent(self, page):
        """Print every post found on ``page``, numbered from floor 1.

        Args:
            page: Response object as returned by ``getPage`` (anything with
                ``read()`` and ``close()``), or ``None``.

        Returns:
            The cleaned post texts in page order; an empty list when
            ``page`` is ``None`` or contains no posts.
        """
        html = self._readPage(page)
        if html is None:
            return []
        contents = []
        for floor, item in enumerate(self._contentPattern.findall(html), 1):
            text = self.tool.replace(item)
            print(u'%d %s' % (floor, u'楼---------------------------------------\n'))
            print(text)
            contents.append(text)
        return contents
# Demo run: download page 1 of the thread below (see_lz=1) and print
# every post found on it.
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseUrl=baseURL, seeLZ=1)
bdtb.getContent(page=bdtb.getPage(pageNum=1))
复制代码

 

posted @   miao_a_miao  阅读(192)  评论(0)  编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示