Spider基础02总结【数据的提取,Json内置函数的用法,Xpath的使用】
数据分类
非结构化的数据:html等
处理方法:正则表达式、xpath
结构化数据:json,xml等
处理方法:转化为python数据类型
数据提取之JSON
由于把json数据转化为python内建数据类型很简单,所以爬虫中,如果我们能够找到返回json数据的URL,就会尽量使用这种URL
JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式,它使得人们很容易的进行阅读和编写。同时也方便了机器进行解析和生成。适用于进行数据交互的场景,比如网站前台与后台之间的数据交互。
哪里能找到返回json的url呢?
- 使用chrome切换到手机页面
- 抓包手机app的软件
json模块转换dict和json串的方法
具有read()或者write()方法的对象就是类文件对象,例如 f = open("a.txt", "r") 中的 f 就是类文件对象
import json
from pprint import pprint

# Sample JSON response used by the demo below.
# It is a raw string so that the JSON escapes (\uXXXX and \/) reach
# json.loads() untouched; in a normal string literal "\/" is an invalid
# Python escape sequence and triggers a warning on recent interpreters.
json_temp = r"""
{"count": 8, "start": 0, "subject_collection_items": [{"info": "\u5434\u4eac\/\u5434\u4eac\/\u5f17\u5170\u514b\u00b7\u683c\u91cc\u7f57\/\u5434\u521a\/\u52a8\u4f5c\/2017-07-27(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "07.27", "rating": {"count": 310987, "max": 10, "value": 7.5}, "description": "", "title": "\u6218\u72fc2", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/26363254\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2485983612.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 750, "shape": "rectangle", "height": 1134}, "uri": "douban:\/\/douban.com\/movie\/26363254", "actions": [], "label": null, "subtype": "", "directors": ["\u5434\u4eac"], "actors": ["\u5434\u4eac", "\u5f17\u5170\u514b\u00b7\u683c\u91cc\u7f57", "\u5434\u521a"], "date": null, "reviewer_name": "", "type": "movie", "id": "26363254"}, {"info": "\u8c22\u4e1c\u71ca\/\u5ed6\u51e1\/\u674e\u6613\u5cf0\/\u4e07\u831c\/\u52a8\u4f5c\/\u72af\u7f6a\/\u60ac\u7591\/2017-08-11(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.11", "rating": {"count": 38253, "max": 10, "value": 5.6}, "description": "", "title": "\u5fc3\u7406\u7f6a", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/26698000\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2492869971.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 5953, "shape": "rectangle", "height": 8315}, "uri": "douban:\/\/douban.com\/movie\/26698000", "actions": [], "label": null, "subtype": "", "directors": ["\u8c22\u4e1c\u71ca"], "actors": ["\u5ed6\u51e1", "\u674e\u6613\u5cf0", "\u4e07\u831c"], "date": null, "reviewer_name": "", "type": "movie", "id": "26698000"}, {"info":
"\u51af\u5fb7\u4f26\/\u5218\u5fb7\u534e\/\u8212\u6dc7\/\u5f20\u9759\u521d\/\u52a8\u4f5c\/\u5192\u9669\/2017-08-11(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.11", "rating": {"count": 13996, "max": 10, "value": 5.8}, "description": "", "title": "\u4fa0\u76d7\u8054\u76df", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/25858758\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2494018217.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 1000, "shape": "rectangle", "height": 1418}, "uri": "douban:\/\/douban.com\/movie\/25858758", "actions": [], "label": null, "subtype": "", "directors": ["\u51af\u5fb7\u4f26"], "actors": ["\u5218\u5fb7\u534e", "\u8212\u6dc7", "\u5f20\u9759\u521d"], "date": null, "reviewer_name": "", "type": "movie", "id": "25858758"}, {"info": "\u53f6\u4f1f\u4fe1\/\u53e4\u5929\u4e50\/\u6258\u5c3c\u00b7\u8d3e\/\u5434\u6a3e\/\u5267\u60c5\/\u52a8\u4f5c\/2017-08-17(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.17", "rating": null, "description": "", "title": "\u6740\u7834\u72fc\u00b7\u8d2a\u72fc", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/26826398\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2494948513.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 682, "shape": "rectangle", "height": 960}, "uri": "douban:\/\/douban.com\/movie\/26826398", "actions": [], "label": null, "subtype": "", "directors": ["\u53f6\u4f1f\u4fe1"], "actors": ["\u53e4\u5929\u4e50", "\u6258\u5c3c\u00b7\u8d3e", "\u5434\u6a3e"], "date": null, "reviewer_name": "", "type": "movie", "id": "26826398"}, {"info": "\u6768\u78ca\/\u738b\u5927\u9646\/\u5f20\u5929\u7231\/\u4efb\u8fbe\u534e\/\u559c\u5267\/\u52a8\u4f5c\/\u7231\u60c5\/2017-08-11(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.11", "rating": {"count": 7339,
"max": 10, "value": 4.9}, "description": "", "title": "\u9c9b\u73e0\u4f20", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/25857966\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2493908018.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 1145, "shape": "rectangle", "height": 1600}, "uri": "douban:\/\/douban.com\/movie\/25857966", "actions": [], "label": null, "subtype": "", "directors": ["\u6768\u78ca"], "actors": ["\u738b\u5927\u9646", "\u5f20\u5929\u7231", "\u4efb\u8fbe\u534e"], "date": null, "reviewer_name": "", "type": "movie", "id": "25857966"}, {"info": "\u8fde\u5955\u7426\/\u90ed\u5bcc\u57ce\/\u738b\u5343\u6e90\/\u5218\u6d9b\/\u52a8\u4f5c\/\u72af\u7f6a\/2017-08-17(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.17", "rating": null, "description": "", "title": "\u7834\u00b7\u5c40", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/26760160\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2494818916.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 1071, "shape": "rectangle", "height": 1500}, "uri": "douban:\/\/douban.com\/movie\/26760160", "actions": [], "label": null, "subtype": "", "directors": ["\u8fde\u5955\u7426"], "actors": ["\u90ed\u5bcc\u57ce", "\u738b\u5343\u6e90", "\u5218\u6d9b"], "date": null, "reviewer_name": "", "type": "movie", "id": "26760160"}, {"info": "\u7406\u67e5\u5fb7\u00b7\u6234\u5c14\/\u6210\u9f99\/\u7f57\u4f2f\u7279\u00b7\u96f7\u5fb7\u798f\/\u7eaa\u5f55\u7247\/2017-08-11(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.11", "rating": {"count": 3387, "max": 10, "value": 8.3}, "description": "", "title": "\u5730\u7403\uff1a\u795e\u5947\u7684\u4e00\u5929", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/26647876\/", "price": null, "cover": {"url":
"https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2493261459.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 4488, "shape": "rectangle", "height": 6260}, "uri": "douban:\/\/douban.com\/movie\/26647876", "actions": [], "label": null, "subtype": "", "directors": ["\u7406\u67e5\u5fb7\u00b7\u6234\u5c14"], "actors": ["\u6210\u9f99", "\u7f57\u4f2f\u7279\u00b7\u96f7\u5fb7\u798f"], "date": null, "reviewer_name": "", "type": "movie", "id": "26647876"}, {"info": "\u90ed\u67ef\/\u7eaa\u5f55\u7247\/2017-08-14(\u4e2d\u56fd\u5927\u9646)", "original_price": null, "release_date": "08.14", "rating": {"count": 6771, "max": 10, "value": 8.9}, "description": "", "title": "\u4e8c\u5341\u4e8c", "url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/movie\/26430107\/", "price": null, "cover": {"url": "https:\/\/qnmob2.doubanio.com\/view\/movie_poster_cover\/lpst\/public\/p2457609817.jpg?imageView2\/0\/q\/80\/w\/9999\/h\/400\/format\/jpg", "width": 959, "shape": "rectangle", "height": 1343}, "uri": "douban:\/\/douban.com\/movie\/26430107", "actions": [], "label": null, "subtype": "", "directors": ["\u90ed\u67ef"], "actors": [], "date": null, "reviewer_name": "", "type": "movie", "id": "26430107"}], "total": 44, "subject_collection": {"subject_count": 44, "description": "", "url": "https:\/\/m.douban.com\/app_topic\/movie_showing", "uri": "douban:\/\/douban.com\/subject_collection\/movie_showing", "cover_url": "", "name": "\u5f71\u9662\u70ed\u6620", "id": "movie_showing", "display": {"layout": "grid"}, "sharing_url": "https:\/\/www.douban.com\/doubanapp\/dispatch?uri=\/subject_collection\/movie_showing\/"}}
"""

# json.loads(): JSON text -> Python dict.
temp_dict = json.loads(json_temp)
pprint(temp_dict)  # pretty-printed for readability

# json.dumps(): Python dict -> JSON text.
temp_json = json.dumps(temp_dict)
print(temp_json)

# json.dump(): write the dict to a JSON file.
# ensure_ascii=False emits the Chinese characters as-is, so the file must be
# opened with an explicit UTF-8 encoding instead of the platform default.
with open('data_json.json', 'w', encoding='utf-8') as f:
    json.dump(temp_dict, f, indent=2, ensure_ascii=False)

# The same write without a with-block: the handle has to be closed by hand,
# so close it in `finally` to avoid leaking it if json.dump() raises.
f = open('data_json.json', 'w', encoding='utf-8')
try:
    json.dump(temp_dict, f, ensure_ascii=False, indent=2)
finally:
    f.close()

# json.load(): read the JSON file back into a Python object.
with open('data_json.json', 'r', encoding='utf-8') as f:
    temp_dict = json.load(f)
print(type(temp_dict))
数据提取之JSON与JsonPATH
JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式,它使得人们很容易的进行阅读和编写。同时也方便了机器进行解析和生成。适用于进行数据交互的场景,比如网站前台与后台之间的数据交互。
JSON和XML的比较可谓不相上下。
Python 2.7中自带了JSON模块,直接import json就可以使用了。
官方文档:http://docs.python.org/library/json.html
Json在线解析网站:http://www.json.cn/#
JSON
json简单说就是javascript中的对象和数组,通过对象和数组这两种结构可以表示各种复杂的结构
对象:对象在js中表示为{ }括起来的内容,数据结构为 { key:value, key:value, ... }的键值对的结构,在面向对象的语言中,key为对象的属性,value为对应的属性值,所以很容易理解,取值方法为 对象.key 获取属性值,这个属性值的类型可以是数字、字符串、数组、对象这几种。
数组:数组在js中是中括号[ ]括起来的内容,数据结构为 ["Python", "javascript", "C++", ...],取值方式和所有语言中一样,使用索引获取,字段值的类型可以是 数字、字符串、数组、对象几种。
import json
json模块提供了四个功能:dumps、dump、loads、load,用于字符串 和 python数据类型间进行转换。
1. json.loads()
把Json格式字符串解码转换成Python对象 从json到python的类型转化对照如下:
# json_loads.py
# Decode JSON text into Python objects with json.loads().
import json

strList = '[1, 2, 3, 4]'
strDict = '{"city": "北京", "name": "大猫"}'

# A JSON array decodes to a Python list.
parsed_list = json.loads(strList)
print(parsed_list)  # [1, 2, 3, 4]

# A JSON object decodes to a dict whose strings are Unicode text.
parsed_dict = json.loads(strDict)
print(parsed_dict)  # {'city': '北京', 'name': '大猫'}
2. json.dumps()
实现python类型转化为json字符串,返回一个str对象 把一个Python对象编码转换成Json字符串
从python原始类型向json类型的转化对照如下:
# json_dumps.py
# Encode Python objects as JSON text with json.dumps().
import json

import chardet

listStr = [1, 2, 3, 4]
tupleStr = (1, 2, 3, 4)
# The name matches the expected outputs shown in the comments below.
dictStr = {"city": "北京", "name": "大刘"}

print(json.dumps(listStr))   # [1, 2, 3, 4]
print(json.dumps(tupleStr))  # [1, 2, 3, 4]  (a tuple is written as a JSON array)

# NOTE: json.dumps() escapes every non-ASCII character by default.
# Pass ensure_ascii=False to keep the characters as-is.
escaped = json.dumps(dictStr)
print(escaped)  # {"city": "\u5317\u4eac", "name": "\u5927\u5218"}

readable = json.dumps(dictStr, ensure_ascii=False)
print(readable)  # {"city": "北京", "name": "大刘"}

# chardet.detect() inspects bytes and returns a dict; its "confidence" entry
# is the detection confidence. The text is encoded first because detect()
# works on bytes, not str.
print(chardet.detect(escaped.encode("utf-8")))   # expected to report 'ascii'
print(chardet.detect(readable.encode("utf-8")))  # expected to report 'utf-8'
chardet是一个非常优秀的编码识别模块,可通过pip安装
3. json.dump()
将Python内置类型序列化为json对象后写入文件
# json_dump.py
# Serialize Python objects straight into files with json.dump().
import json

listStr = [{"city": "北京"}, {"name": "大刘"}]
# The with-block closes (and therefore flushes) the file. UTF-8 is explicit
# because ensure_ascii=False writes the Chinese characters unescaped.
with open("listStr.json", "w", encoding="utf-8") as f:
    json.dump(listStr, f, ensure_ascii=False)

dictStr = {"city": "北京", "name": "大刘"}
with open("dictStr.json", "w", encoding="utf-8") as f:
    json.dump(dictStr, f, ensure_ascii=False)
4. json.load()
读取文件中json形式的字符串元素 转化成python类型
# json_load.py
# Read JSON files back into Python objects with json.load().
# Expects listStr.json and dictStr.json to exist in the working directory.
import json

# The with-block closes the file; UTF-8 is explicit so decoding does not
# depend on the platform default encoding.
with open("listStr.json", encoding="utf-8") as f:
    strList = json.load(f)
print(strList)  # [{'city': '北京'}, {'name': '大刘'}]

with open("dictStr.json", encoding="utf-8") as f:
    strDict = json.load(f)
print(strDict)  # {'city': '北京', 'name': '大刘'}
XPATH和LXML类库
学习XPATH和LXML类库:
- lxml是一款高性能的 Python HTML/XML 解析器,我们可以利用XPath,来快速的定位特定元素以及获取节点信息
什么是XPATH:
- XPath (XML Path Language) 是一门在 HTML\XML 文档中查找信息的语言, 可用来在 HTML\XML 文档中对元素和属性进行遍历。
W3School官方文档:http://www.w3school.com.cn/xpath/index.asp
XML的节点关系
节点的概念:每个XML的标签我们都称之为节点
XML的节点关系
节点选择语法
XPath 使用路径表达式来选取 XML 文档中的节点或者节点集。这些路径表达式和我们在常规的电脑文件系统中看到的表达式非常相似。
使用chrome插件选择标签时,被选中的标签会添加属性class="xh-highlight"
节点选择语法
查找某个特定的节点或者包含某个指定的值的节点
节点选择语法
# div的id包含i的标签
//div[contains(@id, 'i')]
# a标签的内容为'下一页'的标签
//a[text()='下一页']
选择未知节点
XPath的运算符
节点选择语法
选取若干路径
xpath的更多语法: https://msdn.microsoft.com/zh-cn/library/ms256039(v=vs.80).aspx
lxml python 官方文档:http://lxml.de/index.html
# Install first: pip install lxml
from lxml import etree

# NOTE: xpath() always returns a list. Queries that select tags yield Element
# objects; queries that select attributes or text yield strings.
#
# The last <li> below is deliberately left without its closing tag, and two
# of the <a> tags have no href. This note lives outside the literal on
# purpose: inside it, the note became page text and its own closing tag
# closed the <li>.
text = (
    ' <div> <ul>'
    ' <li class="item-0"><a href="link1.html">first item</a></li>'
    ' <li class="item-1"><a>second item</a></li>'
    ' <li class="item-inactive"><a href="link3.html">third item</a></li>'
    ' <li class="item-1"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a >fifth item</a>'
    ' </ul> </div> '
)

html = etree.HTML(text)  # parse the string into an Element tree
print(html)

# tostring() returns bytes, so decode before printing (pretty_print=True can
# be added for formatted output). lxml repairs the markup while parsing: it
# closes the open <li> and wraps the fragment in <html> and <body>.
result = etree.tostring(html).decode()
print(result)
print("*"*88)

# href of the <a> inside every <li class="item-1">, returned as a list of
# the matched values.
temp_href = html.xpath('//li[@class="item-1"]/a/@href')
print(temp_href)
print("&"*22)
temp_text = html.xpath('//li[@class="item-1"]/a/text()')
print(temp_text)
print("----")

# Pitfall demo: pairing two independent result lists by position. One <a>
# has no href, so temp_href is shorter than temp_text and the pairs drift
# out of alignment. enumerate() keeps the position correct even when two
# hrefs are identical.
for position, href in enumerate(temp_href):
    temp_dict = dict(
        href=href,
        title=temp_text[position],
    )
    print(temp_dict)
print('*'*88)

li_list = html.xpath('//li[@class="item-0"]')
# Reliable pairing: run relative queries on each <li>, so href and title
# always come from the same element. A missing value becomes None.
for li in li_list:
    hrefs = li.xpath('./a/@href')
    titles = li.xpath('./a/text()')
    temp = {
        'href': hrefs[0] if hrefs else None,
        'title': titles[0] if titles else None,
    }
    print(temp)
初步使用
我们利用它来解析 HTML 代码,简单示例:
# lxml_test.py
# Parse an HTML string with lxml's etree module.
from lxml import etree

# The last <li> is deliberately left without its closing tag. This note is
# kept outside the literal: inside it, the note became page text and its
# own closing tag closed the <li>.
text = (
    ' <div> <ul>'
    ' <li class="item-0"><a href="link1.html">first item</a></li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-inactive"><a href="link3.html">third item</a></li>'
    ' <li class="item-1"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a>'
    ' </ul> </div> '
)

# etree.HTML() parses the string into an HTML document tree.
html = etree.HTML(text)

# tostring() serializes the tree to bytes; decode so print() shows the markup
# itself rather than a bytes repr.
result = etree.tostring(html).decode()
print(result)
输出结果:
<html><body> <div> <ul> <li class="item-0"><a href="link1.html">first item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-inactive"><a href="link3.html">third item</a></li> <li class="item-1"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </body></html>
lxml 可以自动修正 html 代码,例子里不仅补全了 li 标签,还添加了 body,html 标签。
文件读取:
除了直接读取字符串,lxml还支持从文件里读取内容。我们新建一个hello.html文件:
<!-- hello.html --> <div> <ul> <li class="item-0"><a href="link1.html">first item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>
再利用 etree.parse() 方法来读取文件。
# lxml_parse.py
# Parse an HTML file from disk with lxml.
from lxml import etree

# Read the external file hello.html. etree.parse() uses the XML parser unless
# told otherwise; passing HTMLParser gives the same lenient HTML handling as
# etree.HTML(), including the added <html>/<body> wrapper.
html = etree.parse('./hello.html', etree.HTMLParser())

# tostring() returns bytes; decode so the pretty-printed line breaks are
# actually rendered by print().
result = etree.tostring(html, pretty_print=True)
print(result.decode())
输出结果与之前相同:
<html><body> <div> <ul> <li class="item-0"><a href="link1.html">first item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-inactive"><a href="link3.html">third item</a></li> <li class="item-1"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </body></html>
XPath实例测试
1. 获取所有的 <li>
标签
# xpath_li.py
# Select every <li> element in hello.html.
from lxml import etree

html = etree.parse('hello.html')
print(type(html))  # the type returned by etree.parse()

result = html.xpath('//li')
print(result)  # the list of matched <li> elements
print(len(result))
print(type(result))
print(type(result[0]))
输出结果:
<type 'lxml.etree._ElementTree'> [<Element li at 0x1014e0e18>, <Element li at 0x1014e0ef0>, <Element li at 0x1014e0f38>, <Element li at 0x1014e0f80>, <Element li at 0x1014e0fc8>] 5 <type 'list'> <type 'lxml.etree._Element'>
2. 继续获取<li>
标签的所有 class
属性
# xpath_li.py
# Collect the class attribute of every <li> element.
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//li/@class')
print(result)
运行结果
['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
3. 继续获取<li>
标签下 href
为 link1.html
的 <a>
标签
# xpath_li.py
# Select the <a> directly under an <li> whose href is link1.html.
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//li/a[@href="link1.html"]')
print(result)
运行结果
[<Element a at 0x10ffaae18>]
4. 获取<li>
标签下的所有 <span>
标签
# xpath_li.py
# Select every <span> nested anywhere inside an <li>.
from lxml import etree

html = etree.parse('hello.html')

# '//li/span' would be wrong here: a single slash selects direct children
# only, and <span> is not a direct child of <li>. The double slash matches
# descendants at any depth.
result = html.xpath('//li//span')
print(result)
运行结果
[<Element span at 0x10d698e18>]
5. 获取 <li>
标签下的<a>
标签里的所有 class
# xpath_li.py
# Collect every class attribute found on, or below, the <a> inside an <li>.
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//li/a//@class')
print(result)
运行结果
['bold']
6. 获取最后一个 <li>
的 <a>
的 href
# xpath_li.py
# Read the href of the <a> inside the last <li>.
from lxml import etree

html = etree.parse('hello.html')
# The predicate [last()] selects the last element among its siblings.
result = html.xpath('//li[last()]/a/@href')
print(result)
运行结果
['link5.html']
7. 获取倒数第二个元素的内容
# xpath_li.py
# Read the text of the <a> inside the second-to-last <li>.
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//li[last()-1]/a')
# The .text attribute of an element holds its text content.
print(result[0].text)
运行结果
fourth item
8. 获取 class
值为 bold
的标签名
# xpath_li.py
# Find the tag name of the element whose class is "bold".
from lxml import etree

html = etree.parse('hello.html')
result = html.xpath('//*[@class="bold"]')
# The .tag attribute of an element holds its tag name.
print(result[0].tag)
运行结果
span