帖子爬取

 1 # -*- coding: utf-8 -*-  
 2 import string  
 3 import urllib2  
 4 import re
 5 from BeautifulSoup import BeautifulSoup
 6 
 7 class Baidu_Spider:  
 8     def __init__(self,url):
 9         self.myUrl=url;
10          
11     # 初始化加载页面并将其转码储存  
12     def baidu_tieba(self,page):
13         self.myUrl=self.myUrl+str(page)
14         print self.myUrl
15         # 读取页面的原始信息并将其从gbk转码  
16         myPage = urllib2.urlopen(self.myUrl).read().decode("gbk")
17         soup = BeautifulSoup(myPage)
18         thread_list = soup.findAll("div",attrs={"class":'t_con clearfix'})
19         for record in thread_list:
20             #print record
21             author = self.find_author(record)
22             #print author
23             hot = self.find_hot(record)
24             #print hot
25             title = self.find_title(record)
26             #print title
27             content = self.find_content(record)
28             #print content
29             url = self.find_url(record)
30             #print url
31             if url !="":
32                 self.save_data(url,title,content,author,hot)
33     
34     # 用来寻找该帖的相关信息 
35     def find_url(self,record):
36         ahref=record.findAll("a",attrs={"class":'j_th_tit'})
37         if len(ahref)>0:
38             url='http://tieba.baidu.com'+ahref[0]['href'] 
39         else:
40             url=""
41 
42         return url 
43     def find_author(self,record):
44         author1=record.find("span",attrs={"class":'tb_icon_author '})
45         author2=author1.find("a",attrs={"class":'j_user_card'})
46         if author2 is not None:
47             author=author2.next
48         else:
49             author=author1.text
50         return author
51     def find_hot(self,record):
52         hot=record.find("div",attrs={"class":'threadlist_rep_num'}).text
53         return hot
54     def find_title(self,record):
55         tt=record.findAll("a",attrs={"class":'j_th_tit'})
56         if len(tt)>0:
57             title=tt[0]['title']
58         else:
59             title=''
60         return title 
61     def find_content(self,record):
62         content=record.find("div",attrs={"class":'threadlist_abs threadlist_abs_onlyline'})
63         if content is not None:
64             content=content.next.text
65         else:
66             content=''
67         return content
68     
69        
70     # 用来存储楼主发布的内容  
71     def save_data(self,url,title,content,author,hot):  
72         data=url+"\007"+hot+"\007"+author+"\007"+title+"\007"+content+"\n"
73         data=data.encode('utf-8')
74         #print data
75         f = open('spider'+'.txt','a')  
76         f.write(data)  
77         f.close()
78 
# -------- Program entry point --------
# Crawls listing pages indefinitely, advancing the offset by 50 each round.
# The loop has no exit condition: it only ends on an uncaught error or when
# the process is interrupted.
bdurl = 'http://tieba.baidu.com/f?kw=%C9%CF%BA%A3%BD%BB%CD%A8%B4%F3%D1%A7&tp=0&pn='
page = 0
print(u'已经启动百度贴吧爬虫')
while True:
    # A fresh spider is built for every page; baidu_tieba appends the offset
    # to the URL the spider was constructed with.
    mySpider = Baidu_Spider(bdurl)
    mySpider.baidu_tieba(page)
    page += 50
View Code

 

posted @ 2013-12-26 16:23  liutoutou  阅读(140)  评论(0编辑  收藏  举报