爬虫_解析_JsonPath

1.Json Path介绍

看它的名字你就能知道,这Json Path和JSON文档有关系,正如XPath之于XML文档一样,JsonPath为Json文档提供了解析能力,通过使用JsonPath,你可以方便的查找节点、获取想要的数据,JsonPath是Json版的XPath。

注意:Python 的 jsonpath 模块只能解析本地 JSON 文件,如果数据来自网络请求,需要先将其保存到本地文件再解析。

jsonPath的安装及使用方式:

pip安装

  pip install jsonpath

jsonpath的使用:

  obj=json.load(open('json文件','r',encoding='utf-8'))

  ret=jsonpath.jsonpath(obj,'jsonpath语法')

2.Json Path语法

JsonPath的语法相对简单,它采用开发语言友好的表达式形式,如果你了解类C语言,对JsonPath就不会感到不适应。

3.案例

json格式:

{
    "store": {
        "book": [{
                "category": "reference",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "price": 8.95
            }, {
                "category": "fiction",
                "author": "Evelyn Waugh",
                "title": "Sword of Honour",
                "price": 12.99
            }, {
                "category": "fiction",
                "author": "Herman Melville",
                "title": "Moby Dick",
                "isbn": "0-553-21311-3",
                "price": 8.99
            }, {
                "category": "fiction",
                "author": "J. R. R. Tolkien",
                "title": "The Lord of the Rings",
                "isbn": "0-395-19395-8",
                "price": 22.99
            }
        ],
        "bicycle": {
            "color": "red",
            "price": 19.95
        }
    }
}
import jsonpath
import json

# Parse the local JSON document once; every jsonpath query below runs
# against this in-memory object.
# Use a context manager so the file handle is closed deterministically
# (the original bare open() leaked the handle).
with open('jsonpath.json', 'r', encoding='utf-8') as f:
    obj = json.load(f)

# Authors of all books in the store
# author_list = jsonpath.jsonpath(obj,'$.store.book[*].author')
# print(author_list)

# All authors anywhere in the document (recursive descent)
# author_list = jsonpath.jsonpath(obj,'$..author')
# print(author_list)

# All direct children of "store"
# tag_list = jsonpath.jsonpath(obj,'$.store.*')
# print(tag_list)

# Price of everything under "store"
# price_list = jsonpath.jsonpath(obj,'$.store..price')
# print(price_list)

# The third book (0-based index 2)
# book_list = jsonpath.jsonpath(obj,'$.store.book[2]')
# book_list = jsonpath.jsonpath(obj,'$..book[2]')
# print(book_list)

# The last book (script expression on the array length)
# book = jsonpath.jsonpath(obj,'$..book[(@.length-1)]')
# print(book)

# The first two books, by index list
# book_list = jsonpath.jsonpath(obj,'$..book[0,1]')
# ...or by slice
# book_list = jsonpath.jsonpath(obj,'$..book[:2]')
# print(book_list)

# All books that have an "isbn" field.
# A filter expression needs a ? before the () condition.
# book_list = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]')
# print(book_list)

# Books that cost more than 10
book_list = jsonpath.jsonpath(obj,'$..book[?(@.price>10)]')
print(book_list)

 

爬取“淘票票”网站的城市信息

# Crawl the list of all city names from the Taopiaopiao (Taobao Movies) site.
import urllib.request
import json
import jsonpath

url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1653455592636_97&jsoncallback=jsonp98&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'
headers = {
    # HTTP/2 pseudo-headers are rejected by urllib; they must stay commented out.
    # ':authority': 'dianying.taobao.com',
    # ':method': 'GET',
    # ':path': '/cityAction.json?activityId&_ksTS=1653455592636_97&jsoncallback=jsonp98&action=cityAction&n_s=new&event_submit_doGetAllRegion=true',
    # ':scheme':'https',
    'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    # accept-encoding must stay commented out, otherwise the response body
    # arrives compressed and decode('utf-8') would fail.
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': 'cna=SVJuGpUs7F8CAW8oHmD3KofM; t=1f612311ec0dc84f7fc6e413c667b160; tracknick=zkr123521; enc=I1w7N4hOVwMsfY3KW7jdT4%2BaThe78%2ByyPWmuAZ6NnGgMXJo3IFsw2uGNcRQCK3MHIvf0bze%2FTxYXlqJihMGUJQ%3D%3D; thw=cn; sgcookie=E100lVfe%2FS34cg1ChYfLrxpN8VrXAPn%2BlVjheL9QoGbFTfXQPxOEZaA%2BRkik%2F2%2BzCRGdiU9B5NmSdxKTeRILWPL9A0BwjPY8SdpKsjdezMuXs%2BKOy750gwPXMHJWmWgIvquU; _cc_=URm48syIZQ%3D%3D; cookie2=1881fc2910ae36205519dacd7949d705; v=0; _tb_token_=f85488eed3158; xlly_s=1; tfstk=cJ35BJ1FWTXS-2-FzbO28iz12IUhZm5_O3woNKNW2QLNIYG5iu_aCjJrx9UU9S1..; l=eBxKITZugtZgHmnFBOfwhurza77OMIRAguPzaNbMiOCP_5565GBCW6fIruTBCnGVhsZXJ37GxW68BeYBqC2sjqj4axom4RDmn; isg=BN7eYVUsMITeO2cCLrNqVfYHL3Qgn6IZ8nd8fohnNyEcq36F8S72KUFFo7enk5ox',
    'referer': 'https://dianying.taobao.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# The endpoint returns JSONP: jsonp98({...}); strip the callback wrapper,
# keeping only the JSON between the first '(' and the next ')'.
content = content.split('(')[1].split(')')[0]
# Save the cleaned JSON locally before parsing — jsonpath works on local
# files. (In the original, this write had been collapsed onto a comment
# line, so 'taopiaopiao.json' never existed when json.load tried to read it.)
with open('taopiaopiao.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
with open('taopiaopiao.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)
# Every "regionName" value anywhere in the document is a city name.
city_list = jsonpath.jsonpath(obj, '$..regionName')
print(city_list)

 

 

 

参考:https://blog.csdn.net/qq_36595013/article/details/109455924

posted @ 2022-05-25 13:10  创客未来  阅读(147)  评论(0)  编辑  收藏  举报