Python爬虫(一)爬百度贴吧
简单的GET请求:
# python2 import urllib2 response = urllib2.urlopen('http://www.baidu.com') html = response.read() print html req = urllib2.Request('http://www.baidu.com') req.add_header('User-Agent', 'Chrome') response = urllib2.urlopen(req) print 'response headers:' print response.info()
爬百度贴吧一个帖子:
# python2 # -*- coding: utf-8 -*- import urllib2 import string def crawl_tieba(base_url, begin_page, end_page): for i in range(begin_page, end_page + 1): print '正在下载第' + str(i) + '个网页...' url = base_url + '?pn=' + str(i) m = urllib2.urlopen(url).read() file_name = string.zfill(i, 5) + '.html' f = open(file_name, 'w+') f.write(m) f.close() print 'done' crawl_tieba('http://tieba.baidu.com/p/4999189637', 1, 10)
WARNING:
如果源码中包含中文等非 ASCII 字符,却没有第二行的编码声明注释,就会报错:"SyntaxError: Non-ASCII character '\xe6'"。
爬糗百的帖子:
# python2 # -*- coding: utf-8 -*- import urllib2 import string import re def crawl_qiubai(base_url, begin_page, end_page): for i in range(begin_page, end_page + 1): url = base_url + str(i) user_agent = 'chrome' headers = {'User-Agent': user_agent} print '正在下载网页' + str(i) + '...' req = urllib2.Request(url, headers=headers) response = urllib2.urlopen(req).read() pattern = re.compile(r'<div.*?class="content">.*?<span>(.*?)</span>', re.S) features = re.findall(pattern, response) file_name = 'qiubai_' + string.zfill(i, 5) + '.txt' f = open(file_name, 'w+') for index in range(len(features)): feature = features[index].replace('<br/>', '\n') f.write('第' + str(index + 1) + '条:\n') f.write(feature + '\n\n') f.close() print '网页' + str(i) + '下载完成' print 'done' crawl_qiubai('http://www.qiushibaike.com/hot/page/', 1, 10)
参考资料:
Python爬虫入门教程
谈谈 Python 中的连接符(+、+=)