xpath、BeautifulSoup、pyquery、jsonpath 解析HTML与JSON串
XPath与jsonpath
import json
from lxml import etree
from jsonpath import jsonpath


def json_test():
    """Demonstrate stdlib json round-trips: string <-> dict <-> file."""
    str1 = '{"name":"埃里克森"}'
    # Parse the JSON string into a Python dict.
    js_obj = json.loads(str1)
    print(type(js_obj))

    # Serialize the dict back to a JSON string; ensure_ascii=False keeps
    # non-ASCII characters readable instead of \uXXXX escapes.
    json_str = json.dumps(js_obj, ensure_ascii=False)
    print(type(json_str))

    # Write the JSON object to a file. A context manager guarantees the
    # handle is closed (the original passed open() inline and leaked it).
    with open('test.txt', 'w', encoding='utf-8') as f:
        json.dump(js_obj, f, ensure_ascii=False)

    # Read the JSON object back from the file (same leak fix as above).
    with open('test.txt', 'r', encoding='utf-8') as f:
        jso = json.load(f)
    print(type(jso))


def jsonpath_test():
    """Demonstrate JsonPath queries against a sample document.

    JsonPath is to JSON what XPath is to XML. Operator correspondence:

      XPath | JsonPath         | meaning
      ------+------------------+------------------------------------------------
      /     | $                | document root
      .     | @                | current element
      /     | . or []          | child element
      ..    | N/A              | parent element (not supported by JsonPath)
      //    | ..               | recursive descent over all descendants
      *     | *                | wildcard, matches child elements
      @     | N/A              | attribute access (not supported by JsonPath)
      []    | []               | subscript; XPath is 1-based, JsonPath 0-based
      |     | [,]              | union: joins several results into one array,
            |                  | selectable by index or by name
      N/A   | [start:end:step] | slicing
      []    | ?()              | filter expression
      N/A   | ()               | script expression via the underlying engine
            |                  | (not supported by XPath)
      ()    | N/A              | grouping (not supported by JsonPath)
    """
    # XPath usage for comparison (needs an HTTP response, so left commented):
    # e = etree.HTML(response.text)
    # names = e.xpath(r"//div[@class='all-book-list']/div/ul/li/div/h2/a/text()")
    str1 = '''
    {
        "store": {
            "book": [{
                "category": "reference",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "price": 8.95
            }, {
                "category": "fiction",
                "author": "Evelyn Waugh",
                "title": "Sword of Honour",
                "price": 12.99
            }, {
                "category": "fiction",
                "author": "Herman Melville",
                "title": "Moby Dick",
                "isbn": "0-553-21311-3",
                "price": 8.99
            }, {
                "category": "fiction",
                "author": "J. R. R. Tolkien",
                "title": "The Lord of the Rings",
                "isbn": "0-395-19395-8",
                "price": 22.99
            }
            ],
            "bicycle": {
                "color": "red",
                "price": 19.95
            }
        }
    }
    '''
    js_obj = json.loads(str1)
    res1 = jsonpath(js_obj, '$.store.book[*].author')   # every book's author (XPath: /store/book/author)
    res2 = jsonpath(js_obj, '$..author')                # every author node (XPath: //author)
    res3 = jsonpath(js_obj, '$.store.*')                # all children of store: the book array and bicycle (XPath: /store)
    res4 = jsonpath(js_obj, '$.store..price')           # every price under store (XPath: /store//price)
    res5 = jsonpath(js_obj, '$..book[2]')               # the third book (XPath: //book[2])
    res6 = jsonpath(js_obj, '$..book[(@.length-1)]')    # the last book (XPath: //book[last()])
    res7 = jsonpath(js_obj, '$..book[-1:]')             # also the last book, via slicing
    res8 = jsonpath(js_obj, '$..book[0,1]')             # first two books, via union
    res9 = jsonpath(js_obj, '$..book[:2]')              # first two books, via slice (XPath: //book[position()<3])
    res10 = jsonpath(js_obj, '$..book[?(@.price<10)]')  # books cheaper than 10 (XPath: //book[price<10])
    res11 = jsonpath(js_obj, '$..*')                    # recursively match every node (XPath: //*)
    print(res11)


if __name__ == '__main__':
    jsonpath_test()
BeautifulSoup
from bs4 import BeautifulSoup, Comment

# BeautifulSoup quick demo; see the official docs for full details.
str1 = '''
<title id='title1'>世界和平</title>
<div class='info' float='left'>Welcome to My Space</div>
<div class='info' float='right'>
    <span>Good good study</span>
    <a href='www.baidu.com'>baidu</a>
    <strong><!--这是一段注释-->test</strong>
</div>
'''

soup = BeautifulSoup(str1, 'lxml')
# Dotted access returns the FIRST matching tag.
print(soup.title)
print(soup.div)

# Attribute access on a tag.
print(soup.div.attrs)
print(soup.div.get('class'))
print(soup.div['float'])
print(soup.a['href'])

# .string vs .text extraction.
print(soup.div.string)
print(soup.div.text)

# Comments are a NavigableString subclass; detect them by type.
print(type(soup.strong.string))
if isinstance(soup.strong.string, Comment):
    print(soup.strong.string)
    print(soup.strong.prettify())  # fixed: was misspelled 'pretify()'
else:
    print(soup.strong.text)

print('===========find_all()===========')
print(soup.find_all('title'))
print(soup.find_all(id='title1'))  # fixed: was id='title', which matches nothing
print(soup.find_all(class_='info'))
print(soup.find_all(attrs={'float': 'left'}))


print('===========css()===========')
print(soup.select('title'))
# print(soup.select(id='title'))     raises: select() takes a CSS selector string
print(soup.select('#title1'))
# print(soup.select(class_='info'))  raises: same reason
print(soup.select('.info')[0])
print(soup.select('div > span'))
print(soup.select('div')[1].select('a'))
PyQuery
from pyquery import PyQuery


def main():
    """Walk through the core PyQuery selection / traversal API."""
    # A PyQuery document can be built from an HTML string, a file, or a URL:
    # d = PyQuery('<html><title>test</title><head></head><body></body></html>')
    # d = PyQuery(filename='path_to_html_file')
    # d = PyQuery(url='https://www.baidu.com/')

    # html() / text() return the inner HTML block or its plain text.
    d = PyQuery('<html><head><title>test</title></head><head></head><body></body></html>')
    d('head').html()  # -> <title>test</title>
    d('head').text()  # -> test

    # Select elements by tag name.
    d = PyQuery('<html><title>test</title><head></head><body><p>1</p><p>2</p></body></html>')
    print(d('p'))         # -> <p>1</p><p>2</p>
    print(d('p').html())  # -> 1  (html() only returns the FIRST element's content)
    # eq(index) picks one element out of a match set by index.
    print(d('p').eq(1).html())  # -> 2

    # filter() narrows a selection by id or class.
    d = PyQuery(r'<html><title>test</title><head></head><body><p id="p_id">1</p><p class="p_class">2</p></body></html>')
    print(d('p').filter('#p_id'))     # -> <p id="p_id">1</p>
    print(d('p').filter('.p_class'))  # -> <p class="p_class">2</p>

    # find() searches nested descendants.
    d = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2</p></div>')
    print(d('div').find('p'))        # -> <p id="p_id">1</p><p class="p_class">2</p>
    print(d('div').find('p').eq(0))  # -> <p id="p_id">1</p>

    # Select directly by id / class.
    d = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2</p></div>')
    print(d('#p_id').html())    # -> 1
    print(d('.p_class').html()) # -> 2

    # Read an attribute value.
    d = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2</p></div>')
    print(d('p').eq(0).attr('id'))
    # Overwrite an attribute value.
    d('p').eq(1).attr('class', 'p_class test')
    print(d('div').html())
    # add_class() appends a CSS class to an element.
    d('p').eq(0).add_class('p_class_id')
    print(d('div').html())
    # has_class(classname) tests whether an element carries a class.
    print(d('p').eq(1).has_class('p_class'))

    # children() returns direct children only.
    d = PyQuery(r'<body><div><p id="p_id">1</p><p class="p_class">2<span>tspan</span></p></div></body>')
    print(d('div').children())
    print(d('div').children('#p_id'))
    print(d('div').children('span'))  # empty: children() does not descend multiple levels
    # parent() walks up one level.
    print(d('#p_id').parent().eq(0))

    # clone() copies a node.
    # empty() removes a node's content.

    # next_all() returns every following sibling.
    d = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2<span>tspan</span></p></div>')
    print(d('div').children('#p_id').next_all())
    print(d('div').children('p:first').next_all())
    print(d('div').children('p:last').next_all())
    # not_() returns the elements that do NOT match the selector.
    print(d('p').not_('#p_id'))


if __name__ == '__main__':
    main()