Python Crawler Notes (2): Crawling an Entire Tieba Forum and Saving the Pages
This time I'll walk through an example that crawls the pages of a Baidu Tieba forum and saves each one to a file.
import urllib2
import urllib
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

# Build a Request with a browser User-Agent and return the raw page HTML
def get_response(url, filename):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    }
    print 'Downloading {}'.format(filename)
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()

# Save the fetched page to a file
def write_html(html, filename):
    print 'Saving {}'.format(filename)
    with open(filename, 'w') as f:
        f.write(html)
    print '-' * 20

# Build the URL for each page and fetch it through get_response
def tiebaSpider(url, start_page, end_page):
    for page in range(start_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50 threads
        filename = 'no{}page'.format(page)
        fullurl = url + '&pn={}'.format(pn)
        html = get_response(fullurl, filename)
        write_html(html, filename)

if __name__ == '__main__':
    kw = raw_input('Input the forum name: ')
    # First page to crawl
    start_page = int(raw_input('Input the begin page: '))
    # Last page to crawl
    end_page = int(raw_input('Input the end page: '))
    # Base Tieba URL; urlencode escapes the forum name (kw) safely
    base_url = 'http://tieba.baidu.com/f?'
    key = urllib.urlencode({'kw': kw})
    url = base_url + key
    tiebaSpider(url, start_page, end_page)
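To make the pagination concrete: Tieba's pn parameter is an offset rather than a page number, so page n maps to pn = (n - 1) * 50. If you enter python as the forum name and crawl pages 1 through 3, urlencode turns {'kw': 'python'} into kw=python, and the spider fetches http://tieba.baidu.com/f?kw=python&pn=0, then &pn=50, then &pn=100, saving the results as no1page, no2page, and no3page.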
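Note that the script above is Python 2 only: urllib2, raw_input, and the print statement are all gone in Python 3. As a rough Python 3 sketch of the same spider (my adaptation, not part of the original post), the standard-library equivalents are urllib.request and urllib.parse, and writing the response bytes with mode 'wb' replaces the setdefaultencoding hack:

# Python 3 sketch of the same Tieba spider (an adaptation, not the
# original post's code). urllib2 became urllib.request/urllib.error,
# urllib.urlencode became urllib.parse.urlencode, raw_input became input.
import urllib.request
import urllib.parse

def get_response(url, filename):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    }
    print('Downloading {}'.format(filename))
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read()  # bytes

def write_html(html, filename):
    print('Saving {}'.format(filename))
    with open(filename, 'wb') as f:  # write the raw bytes unchanged
        f.write(html)
    print('-' * 20)

def tiebaSpider(url, start_page, end_page):
    for page in range(start_page, end_page + 1):
        pn = (page - 1) * 50
        filename = 'no{}page'.format(page)
        fullurl = url + '&pn={}'.format(pn)
        write_html(get_response(fullurl, filename), filename)

if __name__ == '__main__':
    kw = input('Input the forum name: ')
    start_page = int(input('Input the begin page: '))
    end_page = int(input('Input the end page: '))
    url = 'http://tieba.baidu.com/f?' + urllib.parse.urlencode({'kw': kw})
    tiebaSpider(url, start_page, end_page)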
