百度贴吧
参考:http://cuiqingcai.com/993.html
贴吧地址:https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1
# -*- coding:utf-8 -*-
# Baidu Tieba scraper, written against the thread
# "纯原创我心中的NBA2014-2015赛季现役50大".
#
# Features:
#   1. Fetch any Baidu Tieba thread.
#   2. Optionally fetch only the posts written by the thread starter.
#   3. Clean the fetched content and save it to a file.
#
# Anatomy of https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1
#   https://          transfer protocol
#   tieba.baidu.com   Baidu sub-domain that points at the Tieba servers
#   /p/3138733512     resource locator of this particular thread
#   see_lz, pn        query parameters: "thread starter only" (1 = on)
#                     and the page number
import io
import re
import urllib

try:  # Python 2
    import urllib2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3
    from urllib.error import URLError
    from urllib.request import Request, urlopen


class BDTB(object):
    """Fetch a Baidu Tieba thread and extract pieces of its first page."""

    # Patterns are compiled once; re.S lets ".*?" span line breaks in the HTML.
    titlePattern = re.compile(
        '<h3.*?class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',
        re.S)
    pageNumPattern = re.compile('<span class="red">(.*?)</span>', re.S)
    contentPattern = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLz):
        """Store the thread URL and the "thread starter only" flag.

        baseUrl -- thread address without a query string.
        seeLz   -- value for the see_lz query parameter (1 = starter only).
        """
        self.baseurl = baseUrl
        self.seelz = '?see_lz=' + str(seeLz)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        # Request headers sent with every page fetch.
        self.headers = {'User-Agent': self.user_agent}
        # Helper that strips HTML tags from post bodies.
        self.tool = Tool()

    def getPage(self, pageNum):
        """Return the HTML of page *pageNum* decoded as UTF-8.

        On urllib's URLError the reason (when available) is printed and
        None is returned.
        """
        url = self.baseurl + self.seelz + '&pn=' + str(pageNum)
        try:
            request = Request(url, headers=self.headers)
            response = urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except URLError as e:
            if hasattr(e, "reason"):
                print(u"百度贴吧链接失败,错误原因是: %s" % (e.reason,))
            return None

    def _searchFirstPage(self, pattern):
        """Print and return the first group of *pattern* found on page 1.

        Returns None when the page cannot be loaded or nothing matches.
        """
        page = self.getPage(1)
        if not page:
            print("页面加载失败...")
            return None
        match = pattern.search(page)
        if not match:
            print("None")
            return None
        value = match.group(1).strip()
        print(value)
        return value

    def getTitle(self):
        """Print and return the thread title, or None if unavailable."""
        return self._searchFirstPage(self.titlePattern)

    def getPageNum(self):
        """Print and return the text of the first red <span>, or None.

        NOTE(review): the original comment calls this the thread's total
        page count; the returned value is the raw string from the page.
        """
        return self._searchFirstPage(self.pageNumPattern)

    def getContent(self):
        """Write the cleaned posts of page 1 to 'bdtb.txt' (UTF-8).

        Each post is preceded by a "<n>楼----" separator line, numbered
        from 1. Only the first page of the thread is fetched. Returns None.
        """
        page = self.getPage(1)
        if not page:
            print("页面加载失败...")
            return None
        results = self.contentPattern.findall(page)
        # The page is already decoded text, so encoding happens once, at the
        # file boundary.
        with io.open('bdtb.txt', 'w', encoding='utf-8') as file_object:
            for floor, result in enumerate(results, 1):
                text = str(floor) + u"楼--------------------------------------------------------------------------------\n" \
                    + self.tool.replace(result) + u'\n'
                file_object.write(text + u'\n')
        return None


class Tool(object):
    """Convert the HTML of a post body into plain text."""

    # <img> tags and runs of seven spaces are dropped.
    removeImg = re.compile('<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text is kept.
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that end a line become "\n".
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile('<td>')
    # Paragraph openers become a newline followed by indentation.
    replacePara = re.compile('<p.*?>')
    # Single or double line breaks become one "\n".
    replaceBR = re.compile('<br><br>|<br>')
    # Whatever tags remain are removed.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with HTML tags removed or replaced by whitespace.

        The substitutions run in a fixed order (specific tags first, the
        catch-all tag remover last) and the result is stripped of leading
        and trailing whitespace.
        """
        # NOTE(review): the original comment mentions a two-space indent
        # after "\n" for paragraphs; the replacement below has one - confirm.
        substitutions = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        return x.strip()


baseURL = 'https://tieba.baidu.com/p/3138733512'

if __name__ == '__main__':
    # Only hit the network when run as a script, not when imported.
    bdtb = BDTB(baseURL, 1)
    bdtb.getContent()