贴吧实战整理

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 import urllib
 5 import urllib2
 6 
 7 def loadPage(url, filename):
 8     """
 9         作用:根据url发送请求,获取服务器响应文件
10         url: 需要爬取的url地址
11         filename : 处理的文件名
12     """
13     print "正在下载 " + filename
14     headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
15 
16     request = urllib2.Request(url, headers = headers)
17     return urllib2.urlopen(request).read()
18 
19 def writePage(html, filename):
20     """
21         作用:将html内容写入到本地
22         html:服务器相应文件内容
23     """
24     print "正在保存 " + filename
25     # 文件写入
26     with open(filename, "w") as f:
27         f.write(html)
28     print "-" * 30
29 
30 def tiebaSpider(url, beginPage, endPage):
31     """
32         作用:贴吧爬虫调度器,负责组合处理每个页面的url
33         url : 贴吧url的前部分
34         beginPage : 起始页
35         endPage : 结束页
36     """
37     for page in range(beginPage, endPage + 1):
38         pn = (page - 1) * 50
39         filename = "" + str(page) + "页.html"
40         fullurl = url + "&pn=" + str(pn)
41         #print fullurl
42         html = loadPage(fullurl, filename)
43         #print html
44         writePage(html, filename)
45         print "谢谢使用"
46 
if __name__ == "__main__":
    # Interactive entry point: ask for the forum name and page range,
    # then hand the assembled search url to the scheduler.
    kw = raw_input("请输入需要爬取的贴吧名:")
    beginPage = int(raw_input("请输入起始页:"))
    endPage = int(raw_input("请输入结束页:"))

    base = "http://tieba.baidu.com/f?"
    query = urllib.urlencode({"kw": kw})
    tiebaSpider(base + query, beginPage, endPage)

 

posted @ 2017-11-21 09:00  不可叽叽歪歪  阅读(383)  评论(0编辑  收藏  举报