An XPath Spider: Downloading Images from Baidu Tieba

Now let's build a simple spider with XPath: we will crawl all the threads in a chosen Tieba forum and download to local disk every image posted by the author of each floor.
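
Before diving into the full script, here is a minimal, self-contained sketch of the two XPath queries it relies on. The HTML fragment below is made up purely for illustration, but the class names match the ones the script targets on Tieba's thread-list and thread pages:

    from lxml import etree

    # A made-up HTML fragment, just to exercise the spider's two XPath queries
    snippet = """
    <div class="threadlist_lz clearfix"><div><a href="/p/123456">a thread title</a></div></div>
    <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/pic.jpg"/>
    """
    dom = etree.HTML(snippet)
    print(dom.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href'))  # ['/p/123456']
    print(dom.xpath('//img[@class="BDE_Image"]/@src'))                      # ['http://imgsrc.baidu.com/forum/pic.jpg']

The full Python 2 script follows.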

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-

    import urllib
    import urllib2
    from lxml import etree

    def loadPage(url):
        """
        Purpose: send a request to url and fetch the server's response.
        url: the URL to crawl
        """
        #headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
        request = urllib2.Request(url)
        html = urllib2.urlopen(request).read()
        # Parse the HTML document into a DOM tree
        content = etree.HTML(html)
        # Collect the relative links of all threads on this list page
        link_list = content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        for link in link_list:
            # Build the full URL of each thread
            fulllink = "http://tieba.baidu.com" + link
            loadImage(fulllink)

    # Extract every image link inside a thread
    def loadImage(link):
        headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        request = urllib2.Request(link, headers = headers)
        html = urllib2.urlopen(request).read()
        # Parse the thread page
        content = etree.HTML(html)
        # Collect the image links posted by each floor's author
        link_list = content.xpath('//img[@class="BDE_Image"]/@src')
        # Download each image
        for link in link_list:
            writeImage(link)

    def writeImage(link):
        """
        Purpose: write the image data to a local file.
        link: the image URL
        """
        headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        request = urllib2.Request(link, headers = headers)
        # Raw image bytes
        image = urllib2.urlopen(request).read()
        # Use the last 10 characters of the URL as the filename
        filename = link[-10:]
        # Write the bytes to a file on local disk
        with open(filename, "wb") as f:
            f.write(image)
        print "Downloaded " + filename

    def tiebaSpider(url, beginPage, endPage):
        """
        Purpose: spider scheduler; builds and dispatches the URL of each list page.
        url : the fixed front part of the Tieba URL
        beginPage : first page
        endPage : last page
        """
        for page in range(beginPage, endPage + 1):
            pn = (page - 1) * 50
            fullurl = url + "&pn=" + str(pn)
            loadPage(fullurl)
        # Runs once, after every page has been crawled
        print "All done, thanks for using the spider"

    if __name__ == "__main__":
        kw = raw_input("Enter the name of the Tieba forum to crawl: ")
        beginPage = int(raw_input("Enter the first page: "))
        endPage = int(raw_input("Enter the last page: "))

        url = "http://tieba.baidu.com/f?"
        key = urllib.urlencode({"kw": kw})
        fullurl = url + key
        tiebaSpider(fullurl, beginPage, endPage)
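
A note on the scheduler's arithmetic: a Tieba list page shows 50 threads, and the pn query parameter is a thread offset rather than a page number, so page n maps to pn = (n - 1) * 50. A quick sanity check of that mapping:

    # Sanity check of tiebaSpider's page-to-pn mapping (50 threads per list page)
    for page in range(1, 4):
        print("page %d -> pn=%d" % (page, (page - 1) * 50))
    # page 1 -> pn=0
    # page 2 -> pn=50
    # page 3 -> pn=100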

The same spider in Python 3, using requests in place of urllib2 and urllib.parse for building the query string:

    import requests
    from lxml import etree
    from urllib import parse

    def loadPage(url):
        """
        Purpose: send a request to url and fetch the server's response.
        url: the URL to crawl
        """
        response = requests.get(url)
        html = response.content
        # Parse the HTML document into a DOM tree
        content = etree.HTML(html)
        # Collect the relative links of all threads on this list page
        link_list = content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        for link in link_list:
            # Build the full URL of each thread
            fulllink = "http://tieba.baidu.com" + link
            loadImage(fulllink)

    # Extract every image link inside a thread
    def loadImage(link):
        headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        response = requests.get(link, headers = headers)
        html = response.content
        # Parse the thread page
        content = etree.HTML(html)
        # Collect the image links posted by each floor's author
        link_list = content.xpath('//img[@class="BDE_Image"]/@src')
        # Download each image
        for link in link_list:
            writeImage(link)

    def writeImage(link):
        """
        Purpose: write the image data to a local file.
        link: the image URL
        """
        headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        response = requests.get(link, headers = headers)
        # Raw image bytes
        image = response.content
        # Use the last 10 characters of the URL as the filename
        filename = link[-10:]
        # Write the bytes to a file on local disk
        with open(filename, "wb") as f:
            f.write(image)
        print("Downloaded " + filename)

    def tiebaSpider(url, beginPage, endPage):
        """
        Purpose: spider scheduler; builds and dispatches the URL of each list page.
        url : the fixed front part of the Tieba URL
        beginPage : first page
        endPage : last page
        """
        for page in range(beginPage, endPage + 1):
            pn = (page - 1) * 50
            fullurl = url + "&pn=" + str(pn)
            loadPage(fullurl)
        # Runs once, after every page has been crawled
        print("All done, thanks for using the spider")

    if __name__ == "__main__":
        kw = input("Enter the name of the Tieba forum to crawl: ")
        beginPage = int(input("Enter the first page: "))
        endPage = int(input("Enter the last page: "))
        url = "http://tieba.baidu.com/f?"
        key = parse.urlencode({"kw": kw})
        fullurl = url + key
        tiebaSpider(fullurl, beginPage, endPage)
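
One caveat that applies to both versions: writeImage names each file with the last 10 characters of its URL (link[-10:]), so two images whose URLs happen to share a suffix silently overwrite each other. Below is a minimal sketch of a collision-free alternative; the helper name safe_filename is my own and not part of the original script:

    import hashlib

    def safe_filename(link):
        """Derive a collision-free filename by hashing the full image URL."""
        tail = link.rsplit("/", 1)[-1]
        # Keep the URL's extension when it has one, default to jpg otherwise
        ext = tail.rsplit(".", 1)[-1] if "." in tail else "jpg"
        return hashlib.md5(link.encode("utf-8")).hexdigest() + "." + ext

    # Prints a 32-hex-digit name ending in .jpg
    print(safe_filename("http://imgsrc.baidu.com/forum/pic/item/abcdef.jpg"))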
