Welcome :)

爬虫数据存储为json格式

data为你的数据

def save_data(data):
    """Serialize *data* (any JSON-serializable object) to 'name.json' as UTF-8.

    ensure_ascii=False keeps non-ASCII text (e.g. Chinese) human-readable
    instead of \\uXXXX escapes — otherwise the utf-8 file encoding is moot.
    """
    with open('name.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)

案例:

声明:该案例仅用于学习,请勿商用或用于非法用途,造成的一切后果与本人无关!

import json

import requests
from fake_useragent import UserAgent
from lxml import etree


# 获取到结构化的页面
def get_html(url):
    """Fetch *url* with a random Chrome User-Agent and return the parsed
    lxml element tree of the response body."""
    resp = requests.get(url, headers={'User-Agent': UserAgent().chrome})
    # Force utf-8 so Chinese text decodes correctly regardless of the
    # charset the server advertises.
    resp.encoding = 'utf-8'
    return etree.HTML(resp.text)


# 获取页面中要提取的数据
def get_detail(response):
    detail_list = response.xpath('//div[@class="content-left mb16"]/div')
    print(detail_list)
    for da in detail_list:
        # 新闻的链接
        href = da.xpath('./a/@href')[0]
        # 新闻的标题
        title = da.xpath('./em/a/text()')[0]
        # 新闻的概述
        content = da.xpath('./p/text()')[0]
        print(href, title, content)
        items = {}
        items['title'] = title
        items['href'] = href
        items['content'] = content
        data_list.append(items)
        print(data_list)
        print('------------------------')


# 将获取到的数据存储为json
def wirte_json(data):
    with open('detail.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)


# 主运行函数
def main():
    for i in range(1,11):
        html_data = get_html('http://www.szhk.com/news/newlist/news/28148690427226187/{}.htm'.format(1))
        get_detail(html_data)


if __name__ == '__main__':
    # Shared accumulator that get_detail() appends scraped items into.
    data_list = []
    main()
    # Persist the collected data in JSON format
    wirte_json(data_list)


posted @ 2020-02-18 09:07  水一RAR  阅读(1058)  评论(0编辑  收藏  举报