贴吧实战整理
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Baidu Tieba spider: download and save the result pages for a forum keyword.

Ported from Python 2 (urllib2 / print statements / raw_input) to Python 3;
function names, parameters, and all user-facing strings are unchanged.
"""

import urllib.parse
import urllib.request


def loadPage(url, filename):
    """
    Send a request for *url* and return the server's response body.

    url: the URL to crawl
    filename: display name, used only for the progress message
    returns: raw response body as bytes
    """
    print("正在下载 " + filename)
    # A desktop-browser User-Agent keeps the server from rejecting the bot.
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}

    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection (the original leaked it).
    with urllib.request.urlopen(request) as response:
        return response.read()


def writePage(html, filename):
    """
    Write downloaded page content to a local file.

    html: response body (bytes) as returned by loadPage
    filename: path of the file to create/overwrite
    """
    print("正在保存 " + filename)
    # Binary mode: the urlopen response is bytes, so store it unmodified
    # (the original opened in text mode "w", which breaks on Python 3).
    with open(filename, "wb") as f:
        f.write(html)
    print("-" * 30)


def tiebaSpider(url, beginPage, endPage):
    """
    Spider scheduler: build each page's URL, then download and save it.

    url: leading part of the tieba URL (already carries the kw query)
    beginPage: first page number, 1-based, inclusive
    endPage: last page number, inclusive
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates with pn = (page - 1) * 50 (50 posts per page).
        pn = (page - 1) * 50
        filename = "第" + str(page) + "页.html"
        fullurl = url + "&pn=" + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
    print("谢谢使用")


if __name__ == "__main__":
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)