# 帖子爬取 — Baidu Tieba post crawler
1 # -*- coding: utf-8 -*- 2 import string 3 import urllib2 4 import re 5 from BeautifulSoup import BeautifulSoup 6 7 class Baidu_Spider: 8 def __init__(self,url): 9 self.myUrl=url; 10 11 # 初始化加载页面并将其转码储存 12 def baidu_tieba(self,page): 13 self.myUrl=self.myUrl+str(page) 14 print self.myUrl 15 # 读取页面的原始信息并将其从gbk转码 16 myPage = urllib2.urlopen(self.myUrl).read().decode("gbk") 17 soup = BeautifulSoup(myPage) 18 thread_list = soup.findAll("div",attrs={"class":'t_con clearfix'}) 19 for record in thread_list: 20 #print record 21 author = self.find_author(record) 22 #print author 23 hot = self.find_hot(record) 24 #print hot 25 title = self.find_title(record) 26 #print title 27 content = self.find_content(record) 28 #print content 29 url = self.find_url(record) 30 #print url 31 if url !="": 32 self.save_data(url,title,content,author,hot) 33 34 # 用来寻找该帖的相关信息 35 def find_url(self,record): 36 ahref=record.findAll("a",attrs={"class":'j_th_tit'}) 37 if len(ahref)>0: 38 url='http://tieba.baidu.com'+ahref[0]['href'] 39 else: 40 url="" 41 42 return url 43 def find_author(self,record): 44 author1=record.find("span",attrs={"class":'tb_icon_author '}) 45 author2=author1.find("a",attrs={"class":'j_user_card'}) 46 if author2 is not None: 47 author=author2.next 48 else: 49 author=author1.text 50 return author 51 def find_hot(self,record): 52 hot=record.find("div",attrs={"class":'threadlist_rep_num'}).text 53 return hot 54 def find_title(self,record): 55 tt=record.findAll("a",attrs={"class":'j_th_tit'}) 56 if len(tt)>0: 57 title=tt[0]['title'] 58 else: 59 title='' 60 return title 61 def find_content(self,record): 62 content=record.find("div",attrs={"class":'threadlist_abs threadlist_abs_onlyline'}) 63 if content is not None: 64 content=content.next.text 65 else: 66 content='' 67 return content 68 69 70 # 用来存储楼主发布的内容 71 def save_data(self,url,title,content,author,hot): 72 data=url+"\007"+hot+"\007"+author+"\007"+title+"\007"+content+"\n" 73 data=data.encode('utf-8') 74 #print data 75 f = 
open('spider'+'.txt','a') 76 f.write(data) 77 f.close() 78 79 #-------- 程序入口处 ------------------ 80 page=0 81 print u'已经启动百度贴吧爬虫' 82 while True: 83 bdurl ='http://tieba.baidu.com/f?kw=%C9%CF%BA%A3%BD%BB%CD%A8%B4%F3%D1%A7&tp=0&pn=' 84 mySpider = Baidu_Spider(bdurl) 85 mySpider.baidu_tieba(page) 86 page=page+50