Python爬取全国疫情数据并生成疫情地图
# 【完工状态】Python疫情数据分析 ## 介绍 Python爬虫抓取数据,制作疫情地图和词云,项目开源 爬取目标网站: https://voice.baidu.com/act/newpneumonia/newpneumonia/ ## 软件架构 环境准备:Python3 PyCharm 主要模块:requests pyecharts openpyxl wordcloud ## 使用说明 ### 获取数据(写入excel) **导入需要的模块** ``` import requests from lxml import etree import json import re import openpyxl ``` **创建一个类** ``` class Get_data(): ``` **获取数据** ``` def get_data(self): # 目标url url = "https://voice.baidu.com/act/newpneumonia/newpneumonia/" # 伪装请求头 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/80.0.3987.149 Safari/537.36 ' } # 发出get请求 response = requests.get(url,headers=headers) # 将请求的结果写入文件,便于分析 with open('html.txt', 'w') as file: file.write(response.text) def get_time(self): with open('html.txt','r') as file: text = file.read() # 获取更新时间 time_in = re.findall('"mapLastUpdatedTime":"(.*?)"',text)[0] time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"',text)[0] print('国内疫情更新时间为 '+time_in) print('国外疫情更新时间为 '+time_out) return time_in,time_out ``` **解析数据** ``` def parse_data(self): with open('html.txt','r') as file: text = file.read() # 生成HTML对象 html = etree.HTML(text) # 解析数据 result = html.xpath('//script[@type="application/json"]/text()') # print(type(result)) result = result[0] # print(type(result)) result = json.loads(result) # print(type(result)) result = json.dumps(result['component'][0]['caseList']) # print(result) # print(type(result)) with open('data.json','w') as file: file.write(result) print('数据已写入json文件...') response = requests.get("https://voice.baidu.com/act/newpneumonia/newpneumonia/") # 将请求的结果写入文件,便于分析 with open('html.txt', 'w') as file: file.write(response.text) # 获取时间 time_in = re.findall('"mapLastUpdatedTime":"(.*?)"', response.text)[0] time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"', response.text)[0] print(time_in) print(time_out) # 生成HTML对象 html = etree.HTML(response.text) # 解析数据 result = html.xpath('//script[@type="application/json"]/text()') 
print(type(result)) result = result[0] print(type(result)) result = json.loads(result) print(type(result)) # 以每个省的数据为一个字典 data_in = result['component'][0]['caseList'] for each in data_in: print(each) print("\n" + '*' * 20) data_out = result['component'][0]['globalList'] for each in data_out: print(each) print("\n" + '*' * 20) ''' area --> 大多为省份 city --> 城市 confirmed --> 累计 died --> 死亡 crued --> 治愈 relativeTime --> confirmedRelative --> 累计的增量 curedRelative --> 治愈的增量 curConfirm --> 现有确诊 curConfirmRelative --> 现有确诊的增量 diedRelative --> 死亡的增量 ''' # 规律----遍历列表的每一项,可以发现,每一项(type:字典)均代表一个省份等区域,这个字典的前11项是该省份的疫情数据, # 当key = 'subList'时,其结果为只有一项的列表,提取出列表的第一项,得到一系列的字典,字典中包含该城市的疫情数据. ``` **将数据写入excel文件** ``` # 将得到的数据写入excel文件 # 创建一个工作簿 wb = openpyxl.Workbook() # 创建工作表,每一个工作表代表一个area ws_in = wb.active ws_in.title = "国内疫情" ws_in.append(['省份', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量', '死亡增量', '治愈增量', '现有确诊增量']) for each in data_in: temp_list = [each['area'], each['confirmed'], each['died'], each['crued'], each['curConfirm'], each['confirmedRelative'], each['diedRelative'], each['curedRelative'], each['curConfirmRelative']] for i in range(len(temp_list)): if temp_list[i] == '': temp_list[i] = '0' ws_in.append(temp_list) # 获取国外疫情数据 for each in data_out: print(each) print("\n" + '*' * 20) sheet_title = each['area'] # 创建一个新的工作表 ws_out = wb.create_sheet(sheet_title) ws_out.append(['国家', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量']) for country in each['subList']: list_temp = [country['country'], country['confirmed'], country['died'], country['crued'], country['curConfirm'], country['confirmedRelative']] for i in range(len(list_temp)): if list_temp[i] == '': list_temp[i] = '0' ws_out.append(list_temp) # 保存excel文件 wb.save('./data.xlsx') ```  **完整代码** ``` import requests from lxml import etree import json import re import openpyxl class Get_data(): def get_data(self): # 目标url url = "https://voice.baidu.com/act/newpneumonia/newpneumonia/" # 伪装请求头 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 
(KHTML, like Gecko) ' 'Chrome/80.0.3987.149 Safari/537.36 ' } # 发出get请求 response = requests.get(url,headers=headers) # 将请求的结果写入文件,便于分析 with open('html.txt', 'w') as file: file.write(response.text) def get_time(self): with open('html.txt','r') as file: text = file.read() # 获取更新时间 time_in = re.findall('"mapLastUpdatedTime":"(.*?)"',text)[0] time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"',text)[0] print('国内疫情更新时间为 '+time_in) print('国外疫情更新时间为 '+time_out) return time_in,time_out def parse_data(self): with open('html.txt','r') as file: text = file.read() # 生成HTML对象 html = etree.HTML(text) # 解析数据 result = html.xpath('//script[@type="application/json"]/text()') # print(type(result)) result = result[0] # print(type(result)) result = json.loads(result) # print(type(result)) result = json.dumps(result['component'][0]['caseList']) # print(result) # print(type(result)) with open('data.json','w') as file: file.write(result) print('数据已写入json文件...') response = requests.get("https://voice.baidu.com/act/newpneumonia/newpneumonia/") # 将请求的结果写入文件,便于分析 with open('html.txt', 'w') as file: file.write(response.text) # 获取时间 time_in = re.findall('"mapLastUpdatedTime":"(.*?)"', response.text)[0] time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"', response.text)[0] print(time_in) print(time_out) # 生成HTML对象 html = etree.HTML(response.text) # 解析数据 result = html.xpath('//script[@type="application/json"]/text()') print(type(result)) result = result[0] print(type(result)) result = json.loads(result) print(type(result)) # 以每个省的数据为一个字典 data_in = result['component'][0]['caseList'] for each in data_in: print(each) print("\n" + '*' * 20) data_out = result['component'][0]['globalList'] for each in data_out: print(each) print("\n" + '*' * 20) ''' area --> 大多为省份 city --> 城市 confirmed --> 累计 died --> 死亡 crued --> 治愈 relativeTime --> confirmedRelative --> 累计的增量 curedRelative --> 治愈的增量 curConfirm --> 现有确诊 curConfirmRelative --> 现有确诊的增量 diedRelative --> 死亡的增量 ''' # 
规律----遍历列表的每一项,可以发现,每一项(type:字典)均代表一个省份等区域,这个字典的前11项是该省份的疫情数据, # 当key = 'subList'时,其结果为只有一项的列表,提取出列表的第一项,得到一系列的字典,字典中包含该城市的疫情数据. # 将得到的数据写入excel文件 # 创建一个工作簿 wb = openpyxl.Workbook() # 创建工作表,每一个工作表代表一个area ws_in = wb.active ws_in.title = "国内疫情" ws_in.append(['省份', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量', '死亡增量', '治愈增量', '现有确诊增量']) for each in data_in: temp_list = [each['area'], each['confirmed'], each['died'], each['crued'], each['curConfirm'], each['confirmedRelative'], each['diedRelative'], each['curedRelative'], each['curConfirmRelative']] for i in range(len(temp_list)): if temp_list[i] == '': temp_list[i] = '0' ws_in.append(temp_list) # 获取国外疫情数据 for each in data_out: print(each) print("\n" + '*' * 20) sheet_title = each['area'] # 创建一个新的工作表 ws_out = wb.create_sheet(sheet_title) ws_out.append(['国家', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量']) for country in each['subList']: list_temp = [country['country'], country['confirmed'], country['died'], country['crued'], country['curConfirm'], country['confirmedRelative']] for i in range(len(list_temp)): if list_temp[i] == '': list_temp[i] = '0' ws_out.append(list_temp) # 保存excel文件 wb.save('./data.xlsx') ``` ### 制作疫情词云 **导入所需模块** ``` import openpyxl from wordcloud import WordCloud ``` **读取excel中的数据** ``` # 读取数据 wb = openpyxl.load_workbook('data.xlsx') # 获取工作表 ws = wb['国内疫情'] frequency_in = {} for row in ws.values: if row[0] == '省份': pass else: frequency_in[row[0]] = float(row[1]) frequency_out = {} sheet_name = wb.sheetnames for each in sheet_name: if "洲" in each: ws = wb[each] for row in ws.values: if row[0] == '国家': pass else: frequency_out[row[0]] = float(row[1]) ``` **生成词云图片** ``` def generate_pic(frequency,name): wordcloud = WordCloud(font_path="C:/Windows/Fonts/SIMLI.TTF", background_color="white", width=1920, height=1080) # 根据确诊病例的数目生成词云 wordcloud.generate_from_frequencies(frequency) # 保存词云 wordcloud.to_file('%s.png'%(name)) ``` **调用函数** ``` generate_pic(frequency_in,'国内疫情情况词云图') generate_pic(frequency_out,'世界疫情词云图') ```   
**完整代码** ``` import openpyxl from wordcloud import WordCloud from pyecharts import options as opts from pyecharts.charts import WordCloud # 与即时显示图片相关的模块 ''' import matplotlib.pyplot as plt # 绘制图像的模块 import numpy as np from PIL import Image ''' # 读取数据 wb = openpyxl.load_workbook('data.xlsx') sheet_names = wb.sheetnames frequency_out = {} for each in sheet_names: if '洲' in each: ws = wb[each] for row in ws.values: if row[1] == "累计确诊": pass else: frequency_out[row[0]] = float(row[1]) else: pass # 以省份的确诊病例总数代表其出现的频率 frequency_in = {} ws = wb['国内疫情'] for row in ws.values: if row[1] == "累计确诊": pass else: frequency_in[row[0]] = float(row[1]) def generate_pic(frequency,name): # 这里可以事先准备一张图片,可以用作背景 # background_image = np.array(Image.open('pic.jpg')) wordcloud = WordCloud(font_path="C:/Windows/Fonts/SIMLI.TTF", background_color = "red", # mask=background_image, width=1920, height=1080 ) # 按照确诊病例数目生成词云 wordcloud.generate_from_frequencies(frequency) wordcloud.to_file('%s.png'%name) # 展示图片 # plt.imshow(wordcloud, interpolation="bilinear") # plt.axis("off") # plt.show() # 调用函数 generate_pic(frequency_in,'国内疫情') generate_pic(frequency_out,'国外疫情') ``` ### 绘制地图 **map_draw.py 文件源码** ``` from pyecharts import options as opts from pyecharts.charts import Map import os class Draw_map(): # relativeTime为发布的时间,传入时间戳字符串 # def get_time(self): # relativeTime = int(relativeTime) # return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(relativeTime)) def __init__(self): if not os.path.exists('./map/china'): os.makedirs('./map/china') def get_colour(self,a,b,c): result = '#' + ''.join(map((lambda x: "%02x" % x), (a,b,c))) return result.upper() ''' 参数说明——area:地级市 variate:对应的疫情数据 province:省份(不含省字) ''' def to_map_city(self,area, variate,province,update_time): pieces = [ {"max": 99999999, "min": 10000, "label": "≥10000", "color": self.get_colour(102, 2, 8)}, {"max": 9999, "min": 1000, "label": "1000-9999", "color": self.get_colour(140, 13, 13)}, {"max": 999, "min": 500, "label": "500-999", 
"color": self.get_colour(204, 41, 41)}, {"max": 499, "min": 100, "label": "100-499", "color": self.get_colour(255, 123, 105)}, {"max": 99, "min": 50, "label": "50-99", "color": self.get_colour(255, 170, 133)}, {"max": 49, "min": 10, "label": "10-49", "color": self.get_colour(255,202,179)}, {"max": 9, "min": 1, "label": "1-9", "color": self.get_colour(255,228,217)}, {"max": 0, "min": 0, "label": "0", "color": self.get_colour(255,255,255)}, ] c = ( # 设置地图大小 Map(init_opts=opts.InitOpts(width = '1000px', height='880px')) .add("累计确诊人数", [list(z) for z in zip(area, variate)], province, is_map_symbol_show=False) # 设置全局变量 is_piecewise设置数据是否连续,split_number设置为分段数,pices可自定义数据分段 # is_show设置是否显示图例 .set_global_opts( title_opts=opts.TitleOpts(title="%s地区疫情地图分布"%(province), subtitle = '截止%s %s省疫情分布情况'%(update_time,province), pos_left = "center", pos_top = "10px"), legend_opts=opts.LegendOpts(is_show = False), visualmap_opts=opts.VisualMapOpts(max_=200,is_piecewise=True, pieces=pieces, ), ) .render("./map/china/{}疫情地图.html".format(province)) ) def to_map_china(self,area, variate,update_time): pieces = [{"max": 999999, "min": 1001, "label": ">10000", "color": "#8A0808"}, {"max": 9999, "min": 1000, "label": "1000-9999", "color": "#B40404"}, {"max": 999, "min": 100, "label": "100-999", "color": "#DF0101"}, {"max": 99, "min": 10, "label": "10-99", "color": "#F78181"}, {"max": 9, "min": 1, "label": "1-9", "color": "#F5A9A9"}, {"max": 0, "min": 0, "label": "0", "color": "#FFFFFF"}, ] c = ( # 设置地图大小 Map(init_opts=opts.InitOpts(width='1000px', height='880px')) .add("累计确诊人数", [list(z) for z in zip(area, variate)], "china", is_map_symbol_show=False) .set_global_opts( title_opts=opts.TitleOpts(title="中国疫情地图分布", subtitle='截止%s 中国疫情分布情况'%(update_time), pos_left="center", pos_top="10px"), legend_opts=opts.LegendOpts(is_show=False), visualmap_opts=opts.VisualMapOpts(max_=200, is_piecewise=True, pieces=pieces, ), ) .render("./map/中国疫情地图.html") ) ``` **get_data.py 文件源码** ``` import requests from 
lxml import etree import json import re import openpyxl class Get_data(): def get_data(self): # 目标url url = "https://voice.baidu.com/act/newpneumonia/newpneumonia/" # 伪装请求头 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/80.0.3987.149 Safari/537.36 ' } # 发出get请求 response = requests.get(url,headers=headers) # 将请求的结果写入文件,便于分析 with open('html.txt', 'w') as file: file.write(response.text) def get_time(self): with open('html.txt','r') as file: text = file.read() # 获取更新时间 time_in = re.findall('"mapLastUpdatedTime":"(.*?)"',text)[0] time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"',text)[0] print('国内疫情更新时间为 '+time_in) print('国外疫情更新时间为 '+time_out) return time_in,time_out def parse_data(self): with open('html.txt','r') as file: text = file.read() # 生成HTML对象 html = etree.HTML(text) # 解析数据 result = html.xpath('//script[@type="application/json"]/text()') # print(type(result)) result = result[0] # print(type(result)) result = json.loads(result) # print(type(result)) result = json.dumps(result['component'][0]['caseList']) # print(result) # print(type(result)) with open('data.json','w') as file: file.write(result) print('数据已写入json文件...') response = requests.get("https://voice.baidu.com/act/newpneumonia/newpneumonia/") # 将请求的结果写入文件,便于分析 with open('html.txt', 'w') as file: file.write(response.text) # 获取时间 time_in = re.findall('"mapLastUpdatedTime":"(.*?)"', response.text)[0] time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"', response.text)[0] print(time_in) print(time_out) # 生成HTML对象 html = etree.HTML(response.text) # 解析数据 result = html.xpath('//script[@type="application/json"]/text()') print(type(result)) result = result[0] print(type(result)) result = json.loads(result) print(type(result)) # 以每个省的数据为一个字典 data_in = result['component'][0]['caseList'] for each in data_in: print(each) print("\n" + '*' * 20) data_out = result['component'][0]['globalList'] for each in data_out: print(each) print("\n" + 
'*' * 20) ''' area --> 大多为省份 city --> 城市 confirmed --> 累计 died --> 死亡 crued --> 治愈 relativeTime --> confirmedRelative --> 累计的增量 curedRelative --> 治愈的增量 curConfirm --> 现有确诊 curConfirmRelative --> 现有确诊的增量 diedRelative --> 死亡的增量 ''' # 规律----遍历列表的每一项,可以发现,每一项(type:字典)均代表一个省份等区域,这个字典的前11项是该省份的疫情数据, # 当key = 'subList'时,其结果为只有一项的列表,提取出列表的第一项,得到一系列的字典,字典中包含该城市的疫情数据. # 将得到的数据写入excel文件 # 创建一个工作簿 wb = openpyxl.Workbook() # 创建工作表,每一个工作表代表一个area ws_in = wb.active ws_in.title = "国内疫情" ws_in.append(['省份', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量', '死亡增量', '治愈增量', '现有确诊增量']) for each in data_in: temp_list = [each['area'], each['confirmed'], each['died'], each['crued'], each['curConfirm'], each['confirmedRelative'], each['diedRelative'], each['curedRelative'], each['curConfirmRelative']] for i in range(len(temp_list)): if temp_list[i] == '': temp_list[i] = '0' ws_in.append(temp_list) # 获取国外疫情数据 for each in data_out: print(each) print("\n" + '*' * 20) sheet_title = each['area'] # 创建一个新的工作表 ws_out = wb.create_sheet(sheet_title) ws_out.append(['国家', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量']) for country in each['subList']: list_temp = [country['country'], country['confirmed'], country['died'], country['crued'], country['curConfirm'], country['confirmedRelative']] for i in range(len(list_temp)): if list_temp[i] == '': list_temp[i] = '0' ws_out.append(list_temp) # 保存excel文件 wb.save('./data.xlsx') ``` **execution.py 文件源码** ``` import map_draw import json map = map_draw.Draw_map() # 格式 # map.to_map_china(['湖北'],['99999'],'1584201600') # map.to_map_city(['荆门市'],['99999'],'湖北','1584201600') # 获取数据 with open('data.json', 'r') as file: data = file.read() data = json.loads(data) # 中国疫情地图 def china_map(update_time): area = [] confirmed = [] for each in data: print(each) area.append(each['area']) confirmed.append(each['confirmed']) map.to_map_china(area,confirmed,update_time) # 23个省、5个自治区、4个直辖市、2个特别行政区 香港、澳门和台湾的subList为空列表,未有详情数据 # 省、直辖市疫情地图 def province_map(update_time): for each in data: city = [] 
confirmeds = [] province = each['area'] for each_city in each['subList']: city.append(each_city['city']+"市") confirmeds.append(each_city['confirmed']) map.to_map_city(city,confirmeds,province,update_time) if province == '上海' or '北京' or '天津' or '重庆': for each_city in each['subList']: city.append(each_city['city']) confirmeds.append(each_city['confirmed']) map.to_map_city(city,confirmeds,province,update_time) ``` **main.py 文件源码** ``` from get_data import Get_data data = Get_data() data.get_data() time_in,time_out = data.get_time() data.parse_data() import execution execution.china_map(time_in) execution.province_map(time_in) ``` 将以上四个文件保存在同一目录下,直接执行 main.py 即可运行。 注意: execution.py 中的条件 `province == '上海' or '北京' or '天津' or '重庆'` 恒为真(非空字符串为真值),因此每个省份都会执行第二个循环; 若只想对直辖市执行该分支,应写成 `province in ('上海', '北京', '天津', '重庆')`。 