Python爬虫爬取疫情数据
Python爬虫爬取疫情数据
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/8/12 12:06 上午
# @Author : Helius
# @File : 04-corona_virus.py
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
class CoronaVirusSpider(object):
    """Crawl COVID-19 statistics from the DXY page and store them as JSON.

    The home page embeds its data as JSON arrays inside ``<script>`` tags
    that are looked up by element id. Each entry of such an array carries a
    ``statisticsData`` URL pointing at that region's day-by-day history.
    Results are written under the ``data/`` directory.
    """

    # Seconds to wait for an HTTP response, so that a stalled connection
    # cannot hang the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        :param url: URL to request
        :return: response body as ``str``
        :raises requests.RequestException: on connection errors, timeouts
            or non-2xx HTTP status codes
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail here with a clear HTTP error rather than later while trying
        # to parse an error page as data.
        response.raise_for_status()
        return response.content.decode('utf-8')

    @staticmethod
    def _extract_json_array(text):
        """Return the parsed JSON array embedded in ``text``.

        The match is greedy: it runs from the first ``[`` to the last ``]``
        found on the same line.

        :raises ValueError: if ``text`` contains no bracketed array
        """
        match = re.search(r'\[.+\]', text)
        if match is None:
            raise ValueError('no JSON array found in script text')
        return json.loads(match.group())

    def parse_home_page(self, home_page, tag_id):
        """Parse the home page and return the data embedded in one tag.

        :param home_page: HTML source of the home page
        :param tag_id: ``id`` attribute of the element holding the data
        :return: the decoded JSON array as Python objects
        :raises ValueError: if the element is missing, has no text, or its
            text contains no JSON array
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id=tag_id)
        text = script.string if script is not None else None
        if text is None:
            raise ValueError('no text found in element with id %r' % (tag_id,))
        return self._extract_json_array(text)

    def save(self, data, path):
        """Serialize ``data`` as JSON into the file at ``path``.

        Missing parent directories are created. The file is written as
        UTF-8 because ``ensure_ascii=False`` emits non-ASCII characters
        verbatim, which would otherwise depend on the platform's default
        encoding.

        :param data: JSON-serializable object
        :param path: destination file path
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    @staticmethod
    def _load(path):
        """Read and decode the UTF-8 JSON file at ``path``."""
        with open(path, encoding='utf-8') as fp:
            return json.load(fp)

    def crawl_last_day_corona_virus(self):
        """Collect the most recent day's data for every country.

        Writes ``data/last_day_corona_virus.json``.
        """
        home_page = self.get_content_from_url(self.home_url)
        last_day_corona_virus = self.parse_home_page(
            home_page, 'getListByCountryTypeService2true')
        self.save(last_day_corona_virus, 'data/last_day_corona_virus.json')

    def crawl_corona_virus(self):
        """Collect every country's data since January 23.

        Reads ``data/last_day_corona_virus.json`` (produced by
        :meth:`crawl_last_day_corona_virus`) and writes
        ``data/corona_virus.json``.
        """
        last_day_corona_virus = self._load('data/last_day_corona_virus.json')
        corona_virus = self.corona_virus_data(
            last_day_corona_virus, desc='采集1月23日以来各国疫情信息')
        self.save(corona_virus, 'data/corona_virus.json')

    def crawl_last_day_corona_virus_of_china(self):
        """Collect the most recent day's data for every province of China.

        Writes ``data/last_day_corona_virus_of_china.json``.
        """
        home_page = self.get_content_from_url(self.home_url)
        last_day_corona_virus_of_china = self.parse_home_page(
            home_page, tag_id='getAreaStat')
        self.save(last_day_corona_virus_of_china,
                  'data/last_day_corona_virus_of_china.json')

    def crawl_corona_virus_of_china(self):
        """Collect every Chinese province's data since January 22.

        Reads ``data/last_day_corona_virus_of_china.json`` (produced by
        :meth:`crawl_last_day_corona_virus_of_china`) and writes
        ``data/corona_virus_of_china.json``.
        """
        last_day_corona_virus = self._load(
            'data/last_day_corona_virus_of_china.json')
        corona_virus = self.corona_virus_data(
            last_day_corona_virus, '采集1月22日以来我国各省疫情信息')
        self.save(corona_virus, 'data/corona_virus_of_china.json')

    def corona_virus_data(self, last_day_corona_virus, desc):
        """Download the day-by-day history of every listed region.

        :param last_day_corona_virus: iterable of region dicts, each with a
            ``statisticsData`` URL and a ``provinceName``; a
            ``countryShortCode`` is optional
        :param desc: label shown on the progress bar
        :return: flat list of per-day records, each tagged with the
            ``provinceName`` (and ``countryShortCode`` when the region has
            a non-empty one) of the region it belongs to
        """
        corona_virus = []
        for country in tqdm(last_day_corona_virus, desc=desc):
            statistics_data_url = country['statisticsData']
            statistics_data_json_str = self.get_content_from_url(
                statistics_data_url)
            statistics_data = json.loads(statistics_data_json_str)['data']
            # The per-day records do not say which region they describe, so
            # copy the identifying fields over from the summary entry.
            for one_day in statistics_data:
                one_day['provinceName'] = country['provinceName']
                if country.get('countryShortCode'):
                    one_day['countryShortCode'] = country['countryShortCode']
            corona_virus.extend(statistics_data)
        return corona_virus

    def run(self):
        """Run the currently enabled crawl steps.

        Only the per-province history step is enabled. It reads
        ``data/last_day_corona_virus_of_china.json``, so the corresponding
        "last day" step must have been run at least once beforehand.
        """
        # self.crawl_last_day_corona_virus()
        # self.crawl_corona_virus()
        # self.crawl_last_day_corona_virus_of_china()
        self.crawl_corona_virus_of_china()
if __name__ == '__main__':
    # Script entry point: build a spider and run its enabled crawl steps.
    CoronaVirusSpider().run()
🔥
小结:整体比较简单,就当复习一下啦。
你所看得到的天才不过是在你看不到的时候还在努力罢了!