爬虫数据存储为 JSON 格式
data 为你要保存的数据:
# Save scraped data to a JSON file.
def save_data(data):
    """Serialize *data* to ``name.json`` in UTF-8.

    ``ensure_ascii=False`` keeps Chinese text readable in the output
    file instead of escaping it to ``\\uXXXX`` sequences, which is the
    point of opening the file with ``encoding='utf-8'`` in the first place.
    """
    with open('name.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
案例:
声明:该案例仅用于学习,请勿商用或用于非法用途,否则造成的一切后果与本人无关!
import json
import requests
from fake_useragent import UserAgent
from lxml import etree
# Download a page and hand back a parsed lxml tree.
def get_html(url):
    """Fetch *url* with a random Chrome User-Agent and return the
    document parsed by ``lxml.etree.HTML`` for XPath queries."""
    ua_headers = {'User-Agent': UserAgent().chrome}
    resp = requests.get(url=url, headers=ua_headers)
    # Force UTF-8 so Chinese text decodes correctly regardless of the
    # server-declared charset.
    resp.encoding = 'utf-8'
    return etree.HTML(resp.text)
# Extract the news link / title / summary from each entry on a listing page.
def get_detail(response):
    """Pull news items out of a parsed listing page.

    ``response`` is the lxml tree returned by ``get_html``.
    Side effects: appends one dict per news entry to the module-level
    ``data_list`` and prints progress to stdout.
    """
    detail_list = response.xpath('//div[@class="content-left mb16"]/div')
    print(detail_list)
    for da in detail_list:
        href = da.xpath('./a/@href')
        title = da.xpath('./em/a/text()')
        content = da.xpath('./p/text()')
        # Guard against malformed entries: the original indexed [0]
        # unconditionally and raised IndexError on any empty result.
        if not (href and title and content):
            continue
        print(href[0], title[0], content[0])
        data_list.append({
            'title': title[0],
            'href': href[0],
            'content': content[0],
        })
    print(data_list)
    print('------------------------')
# Persist the scraped items to a JSON file.
# NOTE(review): "wirte_json" is a typo for "write_json"; the name is kept
# because the __main__ block calls it under this spelling.
def wirte_json(data):
    """Dump *data* to ``detail.json`` in UTF-8.

    ``ensure_ascii=False`` writes Chinese characters literally instead of
    as ``\\uXXXX`` escapes, matching the file's UTF-8 encoding.
    """
    with open('detail.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
# Main driver: scrape listing pages 1 through 10.
def main():
    """Fetch pages 1-10 of the news listing and extract their items."""
    for i in range(1, 11):
        # BUG FIX: the original passed the constant 1 to .format(),
        # scraping the same first page ten times; use the loop variable
        # so each iteration requests the next page.
        url = 'http://www.szhk.com/news/newlist/news/28148690427226187/{}.htm'.format(i)
        html_data = get_html(url)
        get_detail(html_data)
if __name__ == '__main__':
    # Accumulator that get_detail() appends into (module-level on purpose).
    data_list = []
    main()
    # Persist everything that was scraped as JSON.
    wirte_json(data_list)