百度贴吧

参考:http://cuiqingcai.com/993.html

贴吧地址:https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1

# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
 
#百度贴吧:纯原创我心中的NBA2014-2015赛季现役50大


#实现功能:
#1.对百度贴吧的任意帖子进行抓取

#2.指定是否只抓取楼主发帖内容

#3.将抓取到的内容分析并保存到文件


 
# https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1
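# Breakdown of the URL above:
# https:// means the resource is transferred over the HTTPS protocol
# tieba.baidu.com is a second-level domain of Baidu that points to the Baidu Tieba servers.
# /p/3138733512 is a resource on that server, i.e. the locator of this thread
# see_lz and pn are the URL's two query parameters: see_lz=1 means "show only the original poster's posts", pn is the page number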
 
 
class BDTB:
 
    #初始化方法,传入url,看是否只看楼主的参数
    def __init__(self, baseUrl, seeLz):
        self.baseurl = baseUrl
        self.seelz = '?see_lz=' + str(seeLz)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        #初始化headers
        self.headers = { 'User-Agent' : self.user_agent }
        #初始化tool类工具
        self.tool = Tool()
        
        
 
 
    #方法getPage,获取该页帖子代码的源码
    def getPage(self, pageNum):
        try:
            url = self.baseurl + self.seelz + '&pn=' + str(pageNum)
            request = urllib2.Request(url,headers = self.headers)
            response = urllib2.urlopen(request)
            pageCode = response.read().decode('utf-8')
            return pageCode
 
        except urllib2.URLError, e:
            if hasattr(e,"reason"):
                print u"百度贴吧链接失败,错误原因是:",e.reason
                return None
 
 
    #获取帖子标题(使用正则表达式)
    def getTitle(self):
        page = self.getPage(1)
        if not page:
            print "页面加载失败..."
            return None
        pattern = re.compile('<h3.*?class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            print result.group(1).strip()
        else:
            print "None"
     
    #获取帖子总页码
    def getPageNum(self):
        page = self.getPage(1)
        if not page:
            print "页面加载失败..."
            return None
        pattern = re.compile('<span class="red">(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            print result.group(1).strip()
        else:
            print "None"
     
    #获取正文信息,最后并写入文件,注意字符的转换
    #在此处说明一下:之前使用pageCode = response.read().decode('utf-8')请求到的网页内容用decode转换,这里的text是string类型用encode转换
    def getContent(self):
        page = self.getPage(1)
        if not page:
            print "页面加载失败..."
            return None
        pattern = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)
        results = re.findall(pattern, page)
        #定义floor变量打印重新楼层
        floor = 1
        file_object = open('bdtb.txt','w')
        try:
            for result in results:
                #print floor,u"楼------------------------------------------------------------------------\
                #------------------------------------------------------------\n"
                #print self.tool.replace(result)
                #floor += 1
                text = str(floor) + u"楼--------------------------------------------------------------------------------\n"\
                                    + self.tool.replace(result) + '\n'
                file_object.write(text.encode('utf-8') + '\n')
                floor += 1
        finally:
            file_object.close()
 


#处理页面标签类
class Tool(object):
    """Convert the HTML of a Tieba post body into plain text."""

    # <img> tags and runs of exactly seven spaces are dropped.
    # (The original pattern ended with a stray "|", an empty alternative that
    # matched the empty string at every position.)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing hyperlink tags are dropped; the link text is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a row/block/paragraph become a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell start becomes a tab.
    replaceTD = re.compile(r'<td>')
    # A paragraph start becomes a newline plus a four-space indent.
    replacePara = re.compile(r'<p.*?>')
    # A single or doubled <br> becomes one newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tags are left after the steps above are removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML tags removed or turned into whitespace.

        The substitutions run in a fixed order: specific tags are handled
        first, and the catch-all tag remover runs last so it only sees what
        the earlier steps left behind. Leading and trailing whitespace is
        stripped from the result.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n    ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


 
# Thread to scrape.
baseURL = 'https://tieba.baidu.com/p/3138733512'
# seeLz=1 asks Tieba for the original poster's posts only.
bdtb = BDTB(baseUrl=baseURL, seeLz=1)
# Run the content dump for page 1 of the thread.
bdtb.getContent()

  

  

  

posted @ 2017-06-21 19:08  大都比2号  阅读(5771)  评论(0)  编辑  收藏  举报