xpath库学习
xpath解析是我们在爬虫中最常用也是最通用的一种数据解析方式。
环境安装
pip install lxml
解析原理
- 使用通用爬虫爬取网页数据
- 实例化etree对象,且将页面数据加载到该对象中
- 使用xpath函数结合xpath表达式进行标签定位和指定数据提取
实例化etree对象
- 1.将本地的html文档中的源码数据加载到etree对象中: etree.parse(filePath) - 2.将从互联网上获取的源码数据加载到该对象中: etree.HTML(page_text)
xpath表达式
- xpath表达式: - xpath('xpath表达式') - /:表示的是从根节点开始定位。表示的是一个层级。 - //:表示的是多个层级。可以表示从任意位置开始定位。 - 属性定位://div[@class='song'] tag[@attrName="attrValue"] - 索引定位://div[@class="song"]/p[3] 索引是从1开始的。 - 取文本: - /text() 获取的是标签中直系的文本内容 - //text() 获取的是标签中所有的文本内容(包括非直系子标签中的文本) - 取属性: /@attrName ==>img/@src
xpath使用案例
属性定位: #找到class属性值为song的div标签 //div[@class="song"] 层级&索引定位: #找到class属性值为tang的div的直系子标签ul下的第二个子标签li下的直系子标签a //div[@class="tang"]/ul/li[2]/a 逻辑运算: #找到href属性值为空且class属性值为du的a标签 //a[@href="" and @class="du"] 模糊匹配: //div[contains(@class, "ng")] //div[starts-with(@class, "ta")] 取文本: # /表示获取某个标签下的文本内容 # //表示获取某个标签下的文本内容和所有子标签下的文本内容 //div[@class="song"]/p[1]/text() //div[@class="tang"]//text() 取属性: //div[@class="tang"]//li[2]/a/@href
爬虫分析案例
解析58二手房的相关数据
import requests
from lxml import etree

# Goal: scrape the listing titles from the 58.com second-hand housing page
# and save them, one per line, to 58.txt.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }

    # Fetch the page source. The timeout keeps the script from hanging
    # forever if the server never answers.
    url = 'https://bj.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers, timeout=10).text

    # Parse the page; li_list holds one <li> element per listing.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

    # The context manager closes 58.txt even if an error occurs part-way;
    # the file was previously opened and never closed.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            # Local parsing: "./" makes the expression relative to the
            # <li> currently being processed.
            titles = li.xpath('./div[2]/h2/a/text()')
            if not titles:
                # Skip an <li> that has no matching title node instead of
                # raising IndexError on the empty result list.
                continue
            title = titles[0]
            print(title)
            fp.write(title + '\n')
解析下载图片数据:
# Goal: parse and download the images listed at http://pic.netbian.com/4kbeijing/
import requests
from lxml import etree
import os

if __name__ == "__main__":
    url = 'http://pic.netbian.com/4kbeijing/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # The timeout keeps the script from hanging if the server never answers.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Alternative: set the response encoding by hand before reading .text
    # response.encoding = 'utf-8'
    page_text = response.text

    # Parse the page: we need each image's src and alt attribute.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    # Create the output folder. exist_ok=True replaces the separate
    # "exists?" check, so there is no gap between checking and creating.
    out_dir = 'picLibs'
    os.makedirs(out_dir, exist_ok=True)

    for li in li_list:
        src_list = li.xpath('./a/img/@src')
        alt_list = li.xpath('./a/img/@alt')
        if not src_list or not alt_list:
            # Skip an <li> without an image instead of raising IndexError.
            continue

        img_src = 'http://pic.netbian.com' + src_list[0]
        img_name = alt_list[0] + '.jpg'
        # General-purpose fix for garbled Chinese text:
        #   encode('iso-8859-1') turns the mis-decoded str back into bytes,
        #   decode('gbk') then reads those bytes as GBK.
        # NOTE(review): this assumes the page is GBK-encoded and that
        # response.text was decoded as ISO-8859-1 - confirm against the site.
        img_name = img_name.encode('iso-8859-1').decode('gbk')

        # Request the image bytes and persist them to disk.
        img_data = requests.get(url=img_src, headers=headers, timeout=10).content
        img_path = os.path.join(out_dir, img_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!!!')
解析出所有城市名称
# Goal: extract every city name from https://www.aqistudy.cn/historydata/
import requests
from lxml import etree

if __name__ == "__main__":
    # Approach 1 (kept for reference): query the hot cities and the full
    # city list separately, then merge the two results.
    # headers = {
    #     'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    # }
    # url = 'https://www.aqistudy.cn/historydata/'
    # page_text = requests.get(url=url,headers=headers).text
    #
    # tree = etree.HTML(page_text)
    # host_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    # all_city_names = []
    # # names of the hot cities
    # for li in host_li_list:
    #     hot_city_name = li.xpath('./a/text()')[0]
    #     all_city_names.append(hot_city_name)
    #
    # # names of all cities
    # city_names_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    # for li in city_names_list:
    #     city_name = li.xpath('./a/text()')[0]
    #     all_city_names.append(city_name)
    #
    # print(all_city_names,len(all_city_names))

    # Approach 2: a single XPath union ("|") selects both groups of <a> tags.
    url = 'https://www.aqistudy.cn/historydata/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    tree = etree.HTML(response.text)

    # Element hierarchy of the <a> tags being matched:
    #   //div[@class="bottom"]/ul/li/a         hot cities
    #   //div[@class="bottom"]/ul/div[2]/li/a  all cities
    anchors = tree.xpath(
        '//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')

    # The first text node of each anchor is the city name.
    all_city_names = [anchor.xpath('./text()')[0] for anchor in anchors]
    print(all_city_names, len(all_city_names))
爬取站长素材中免费简历模板
# Goal: download the free resume templates listed at
# http://sc.chinaz.com/jianli/free.html
import requests
import os
from urllib.parse import urljoin
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'

    # Create the output folder. exist_ok=True replaces the separate
    # "exists?" check, so there is no gap between checking and creating.
    out_dir = 'jianlitemplates'
    os.makedirs(out_dir, exist_ok=True)

    for page in range(1, 4):
        # Page 1 uses the base URL; pages 2-3 use the free_<n>.html pattern.
        if page > 1:
            url = 'http://sc.chinaz.com/jianli/free_%s.html' % page
        page_text = requests.get(url=url, headers=headers, timeout=10).text
        tree = etree.HTML(page_text)

        # Read the link and the title from the SAME <a> element. Pairing two
        # independently queried lists by index breaks as soon as one <a>
        # lacks an <img>.
        for anchor in tree.xpath('//div[@class="box col3 ws_block"]/a'):
            hrefs = anchor.xpath('./@href')   # link to the template page
            alts = anchor.xpath('./img/@alt')  # template title
            if not hrefs or not alts:
                continue

            # Repair the garbled Chinese title.
            # NOTE(review): assumes the page is UTF-8 and that .text was
            # decoded as ISO-8859-1 - confirm against the site.
            title = alts[0].encode('iso-8859-1').decode('utf-8')

            # Resolve relative / protocol-relative links against the list
            # page; absolute links pass through urljoin unchanged.
            detail_url = urljoin(url, hrefs[0])

            # Fetch the template's detail page and find its download link.
            con_text = requests.get(url=detail_url, headers=headers, timeout=10).text
            con_tree = etree.HTML(con_text)
            download_links = con_tree.xpath(
                '//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            if not download_links:
                # Report and skip instead of raising IndexError.
                print(title, '未找到下载链接,已跳过')
                continue
            con_href = urljoin(detail_url, download_links[0])
            print(con_href, title)

            # Request the archive and persist it to disk.
            wrd_path = os.path.join(out_dir, title + '.rar')
            wrd_data = requests.get(url=con_href, headers=headers, timeout=10).content
            with open(wrd_path, 'wb') as fp:
                fp.write(wrd_data)
            print(title, '下载成功!!!')