使用XPath的爬虫,百度贴吧图片下载
现在我们用XPath来做一个简单的爬虫,我们尝试爬取某个贴吧里的所有帖子,并将这些帖子里每个楼层发布的图片下载到本地。
1 #!/usr/bin/env python 2 # -*- coding:utf-8 -*- 3 4 import urllib 5 import urllib2 6 from lxml import etree 7 8 def loadPage(url): 9 """ 10 作用:根据url发送请求,获取服务器响应文件 11 url: 需要爬取的url地址 12 """ 13 #print url 14 #headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"} 15 16 request = urllib2.Request(url) 17 html = urllib2.urlopen(request).read() 18 # 解析HTML文档为HTML DOM模型 19 content = etree.HTML(html) 20 #print content 21 # 返回所有匹配成功的列表集合 22 link_list = content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href') 23 for link in link_list: 24 fulllink = "http://tieba.baidu.com" + link 25 # 组合为每个帖子的链接 26 #print link 27 loadImage(fulllink) 28 29 # 取出每个帖子里的每个图片连接 30 def loadImage(link): 31 headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} 32 request = urllib2.Request(link, headers = headers) 33 html = urllib2.urlopen(request).read() 34 # 解析 35 content = etree.HTML(html) 36 # 取出帖子里每层层主发送的图片连接集合 37 link_list = content.xpath('//img[@class="BDE_Image"]/@src') 38 # 取出每个图片的连接 39 for link in link_list: 40 #print link 41 writeImage(link) 42 43 def writeImage(link): 44 """ 45 作用:将html内容写入到本地 46 link:图片连接 47 """ 48 #print "正在保存 " + filename 49 headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} 50 # 文件写入 51 request = urllib2.Request(link, headers = headers) 52 # 图片原始数据 53 image = urllib2.urlopen(request).read() 54 # 取出连接后10位做为文件名 55 filename = link[-10:] 56 # 写入到本地磁盘文件内 57 with open(filename, "wb") as f: 58 f.write(image) 59 print "已经成功下载 "+ filename 60 61 def tiebaSpider(url, beginPage, endPage): 62 """ 63 作用:贴吧爬虫调度器,负责组合处理每个页面的url 64 url : 贴吧url的前部分 65 beginPage : 起始页 66 endPage : 结束页 67 """ 68 for page in range(beginPage, endPage + 1): 69 pn = (page - 1) * 50 70 fullurl = url + "&pn=" + str(pn) 71 #print 
fullurl 72 loadPage(fullurl) 73 #print html 74 75 print "谢谢使用" 76 77 if __name__ == "__main__": 78 kw = raw_input("请输入需要爬取的贴吧名:") 79 beginPage = int(raw_input("请输入起始页:")) 80 endPage = int(raw_input("请输入结束页:")) 81 82 url = "http://tieba.baidu.com/f?" 83 key = urllib.urlencode({"kw": kw}) 84 fullurl = url + key 85 tiebaSpider(fullurl, beginPage, endPage)
在 Python3 中使用 XPath 的爬虫,百度贴吧图片下载:
"""XPath-based spider: download the images posted in every thread of a
Baidu Tieba forum (Python 3 / requests version)."""

import requests
from lxml import etree
from urllib import parse

# Browser-like User-Agent shared by every request.  The original code only
# sent it for thread/image fetches; the list-page fetch went out with the
# default requests agent, which Baidu may reject -- use it consistently.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}


def loadPage(url):
    """Fetch one forum list page and crawl every thread linked from it.

    url: the list-page URL to fetch.
    """
    # Fix: send the same browser headers as the other fetches (the original
    # issued this GET without any headers).
    response = requests.get(url, headers=HEADERS)
    html = response.content
    # Parse the HTML document into a DOM tree.
    content = etree.HTML(html)
    # Relative hrefs of every thread listed on this page.
    link_list = content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    for link in link_list:
        # Build the absolute URL of the thread.
        fulllink = "http://tieba.baidu.com" + link
        loadImage(fulllink)


def loadImage(link):
    """Fetch one thread page and download every image posted by the
    thread starter.

    link: absolute URL of the thread.
    """
    response = requests.get(link, headers=HEADERS)
    html = response.content
    content = etree.HTML(html)
    # Image URLs posted by the thread starter on each floor.
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for link in link_list:
        writeImage(link)


def writeImage(link):
    """Download one image and write it to the current directory.

    link: URL of the image.
    """
    response = requests.get(link, headers=HEADERS)
    # Raw image bytes.
    image = response.content
    # Use the last 10 characters of the URL as the file name.
    # NOTE(review): different images whose URLs share a 10-char suffix
    # would overwrite each other -- confirm this is acceptable.
    filename = link[-10:]
    with open(filename, "wb") as f:
        f.write(image)
    print("已经成功下载 " + filename)


def tiebaSpider(url, beginPage, endPage):
    """Scheduler: build the URL of each list page and crawl it.

    url       : forum URL prefix (already containing the kw query).
    beginPage : first page number, 1-based.
    endPage   : last page number, inclusive.
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates 50 threads per page via the pn parameter.
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)

    print("谢谢使用")


if __name__ == "__main__":
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    # Percent-encode the forum name into the kw query parameter.
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)