Python爬虫(二)爬百度贴吧楼主发言
爬取电影吧一个帖子里的所有楼主发言:
# Originally a Python 2 script (urllib2, print statements); ported to Python 3:
# urllib.request, print(), explicit bytes->str decoding, context-managed file I/O.
import re
import urllib.request


class Baidu_Spider:
    """Crawl a Baidu Tieba thread and save every post written by the
    thread starter (楼主) to ``tieba_lz.txt``."""

    # Captures the inner HTML of each post body: <div id="post_content...">...</div>
    feature_pattern = re.compile(r'id="post_content.*?>\s+(.*?)</div>', re.S)
    # (entity, replacement) pairs decoded in post text.
    # NOTE(review): the published listing showed these pairs already
    # entity-decoded (degenerate no-op pairs, invalid syntax); the evident
    # intent is decoding &#39; / &quot; — confirm against the original post.
    replaceList = [('&#39;', '\''), ('&quot;', '"')]

    def __init__(self, url):
        # ?see_lz=1 restricts the thread view to the original poster's posts.
        self.url = url + '?see_lz=1'

    def crawl_tieba_lz(self):
        """Entry point: fetch page 1, print the title, then download all pages."""
        begin_page = self._fetch(self.url)
        self.print_page_title(begin_page)
        count = self.get_page_count(begin_page)
        self.handle_data(count)

    def _fetch(self, url):
        # Tieba serves UTF-8; ignore undecodable bytes rather than crash mid-crawl.
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode('utf-8', errors='ignore')

    def handle_data(self, count):
        """Download pages 1..count and write the cleaned posts to tieba_lz.txt.

        A count of 0 (pager not found) produces an empty file rather than an error.
        """
        with open('tieba_lz.txt', 'w', encoding='utf-8') as f:
            for page_no in range(1, count + 1):
                url = self.url + '&pn=' + str(page_no)
                hint = '第' + str(page_no) + '页'
                print('正在下载%s: %s' % (hint, url))
                page = self._fetch(url)
                features = self.feature_pattern.findall(page)
                print(hint + '下载完成')
                print('共有%d条记录' % len(features))
                f.write(hint + ':\n')
                for feature in features:
                    feature = self.handle_record(feature)
                    print(feature)
                    f.write(feature + '\n\n')
        print('done')

    def handle_record(self, record):
        """Strip HTML from one post: <br>/</br> become newlines, all other
        tags are dropped, then HTML entities from ``replaceList`` are decoded."""
        record = re.sub(r'(<|</)br>', '\n', record)
        record = re.sub(r'<.*?>', '', record)
        for entity, char in self.replaceList:
            record = record.replace(entity, char)
        return record

    def get_page_count(self, page):
        """Return the total page count parsed from the pager span, or 0 if absent."""
        result = re.search(r'class="red">(\d+?)</span>', page, re.S)
        if result:
            count = int(result.group(1))
            print('一共%d页' % count)
        else:
            count = 0
            print('无法获取页数')
        return count

    def print_page_title(self, page):
        """Print the thread title (first <h1>) or a notice when it is missing."""
        result = re.search(r'<h1.*?>(.*?)</h1>', page, re.S)
        if result:
            title = result.group(1)
            print('标题: %s' % title)
        else:
            print('无法获取标题')


# Guard the crawl so importing this module performs no network I/O.
if __name__ == '__main__':
    spider = Baidu_Spider('http://tieba.baidu.com/p/4082863285')
    spider.crawl_tieba_lz()