python爬取全国疫情数据并生成疫情地图

# 【完工状态】Python疫情数据分析##介绍

Python爬虫抓取数据,制作疫情地图和词云,项目开源

爬取目标网站:
https://voice.baidu.com/act/newpneumonia/newpneumonia/

## 软件架构

环境准备:Python3 Pycharm 

主要模块:requests lxml pyecharts openpyxl wordcloud


## 使用说明

### 获取数据(写入excel)

 **导入需要的模块** 
```
import requests
from lxml import etree
import json
import re
import openpyxl
```

 **创建一个类** 
```
class Get_data():
```

 **获取数据** 
```
    def get_data(self):
        """Download the Baidu epidemic page and cache the raw HTML in html.txt."""
        # Target URL of the data page.
        url = "https://voice.baidu.com/act/newpneumonia/newpneumonia/"

        # Disguise the request as a regular browser so it is not rejected.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }

        # Issue the GET request.
        response = requests.get(url,headers=headers)

        # Cache the response body so later steps can parse it offline.
        with open('html.txt', 'w') as file:
            file.write(response.text)

    def get_time(self):
        """Return (domestic, foreign) last-updated timestamps parsed from html.txt.

        BUG FIX: the printed labels were garbled ('郭内毅擎'/'郭外毅擎'); corrected
        to '国内疫情'/'国外疫情' to match the final source below.
        """
        with open('html.txt','r') as file:
            text = file.read()
        # Pull both timestamps out of the JSON embedded in the page.
        time_in = re.findall('"mapLastUpdatedTime":"(.*?)"',text)[0]
        time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"',text)[0]
        print('国内疫情更新时间为 '+time_in)
        print('国外疫情更新时间为 '+time_out)
        return time_in,time_out
```

 **解析数据** 
```
    def parse_data(self):
        """Dump the domestic case list to data.json, then re-fetch the page
        and print the parsed domestic/foreign data.

        BUG FIX: the garbled field-reference block ('值域', '确镇', '毅擎') is
        restored to match the corrected final source below.
        """
        with open('html.txt','r') as file:
            text = file.read()
        # Build an lxml document from the cached page.
        html = etree.HTML(text)
        # The page embeds its payload as JSON in a <script type="application/json"> tag.
        result = html.xpath('//script[@type="application/json"]/text()')
        result = result[0]
        result = json.loads(result)
        # Keep only the per-province case list and serialise it back to JSON.
        result = json.dumps(result['component'][0]['caseList'])
        with open('data.json','w') as file:
            file.write(result)
            print('数据已写入json文件...')

        # NOTE(review): the page is fetched a second time here (without the
        # browser headers used in get_data) instead of reusing html.txt.
        response = requests.get("https://voice.baidu.com/act/newpneumonia/newpneumonia/")
        with open('html.txt', 'w') as file:
            file.write(response.text)

        # Last-updated timestamps for domestic / foreign data.
        time_in = re.findall('"mapLastUpdatedTime":"(.*?)"', response.text)[0]
        time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"', response.text)[0]
        print(time_in)
        print(time_out)

        # Parse the fresh page the same way.
        html = etree.HTML(response.text)
        result = html.xpath('//script[@type="application/json"]/text()')
        print(type(result))
        result = result[0]
        print(type(result))
        result = json.loads(result)
        print(type(result))
        # One dict per province-level region.
        data_in = result['component'][0]['caseList']
        for each in data_in:
            print(each)
            print("\n" + '*' * 20)

        data_out = result['component'][0]['globalList']
        for each in data_out:
            print(each)
            print("\n" + '*' * 20)

        '''
        Field reference (keys of each case dict):
        area --> mostly a province name
        city --> city name
        confirmed --> cumulative confirmed
        died --> deaths
        crued --> cured
        relativeTime -->
        confirmedRelative --> increment of cumulative confirmed
        curedRelative --> increment of cured
        curConfirm --> currently confirmed
        curConfirmRelative --> increment of currently confirmed
        diedRelative --> increment of deaths
        '''

        # Each caseList item (a dict) describes one province-level region; its
        # 'subList' key holds per-city dicts with the same epidemic fields.

```

 **将数据写入excel文件** 

```
        # Write the collected data to an Excel workbook.
        # BUG FIX: garbled headers ('国内毅擎', '丝网', '郭家') corrected to
        # '国内疫情', '死亡', '国家' to match the final source below.
        wb = openpyxl.Workbook()
        # The active sheet holds the domestic data; one extra sheet per area.
        ws_in = wb.active
        ws_in.title = "国内疫情"
        ws_in.append(['省份', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量', '死亡增量', '治愈增量', '现有确诊增量'])
        for each in data_in:
            temp_list = [each['area'], each['confirmed'], each['died'], each['crued'], each['curConfirm'],
                         each['confirmedRelative'], each['diedRelative'], each['curedRelative'],
                         each['curConfirmRelative']]
            for i in range(len(temp_list)):
                if temp_list[i] == '':
                    temp_list[i] = '0'  # normalise missing values
            ws_in.append(temp_list)

        # Foreign data: one worksheet per area (continent).
        for each in data_out:
            print(each)
            print("\n" + '*' * 20)
            sheet_title = each['area']
            ws_out = wb.create_sheet(sheet_title)
            ws_out.append(['国家', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量'])
            for country in each['subList']:
                list_temp = [country['country'], country['confirmed'], country['died'], country['crued'],
                             country['curConfirm'], country['confirmedRelative']]
                for i in range(len(list_temp)):
                    if list_temp[i] == '':
                        list_temp[i] = '0'
                ws_out.append(list_temp)

        # BUG FIX: save once after all sheets are written; the original saved
        # inside the loop, rewriting the file for every foreign area.
        wb.save('./data.xlsx')

```

![数据分析](https://images.gitee.com/uploads/images/2020/0710/095057_367e8dd9_5618404.jpeg "数据分析.jpg")


 **完整代码** 
```
import requests
from lxml import etree
import json
import re
import openpyxl


class Get_data():
    """Crawl Baidu's epidemic page, report its update times, and export the
    case data to data.json and data.xlsx.
    """

    def get_data(self):
        """Download the epidemic page and cache the raw HTML in html.txt."""
        # Target URL of the data page.
        url = "https://voice.baidu.com/act/newpneumonia/newpneumonia/"

        # Disguise the request as a regular browser so it is not rejected.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }

        response = requests.get(url,headers=headers)

        # Cache the response body so later steps can parse it offline.
        with open('html.txt', 'w') as file:
            file.write(response.text)

    def get_time(self):
        """Return (domestic, foreign) last-updated timestamps from html.txt."""
        with open('html.txt','r') as file:
            text = file.read()
        # Pull both timestamps out of the JSON embedded in the page.
        time_in = re.findall('"mapLastUpdatedTime":"(.*?)"',text)[0]
        time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"',text)[0]
        print('国内疫情更新时间为 '+time_in)
        print('国外疫情更新时间为 '+time_out)
        return time_in,time_out

    def parse_data(self):
        """Dump the domestic case list to data.json, then re-fetch the page
        and write domestic and foreign data to data.xlsx.
        """
        with open('html.txt','r') as file:
            text = file.read()
        html = etree.HTML(text)
        # The page embeds its payload as JSON in a <script type="application/json"> tag.
        result = html.xpath('//script[@type="application/json"]/text()')
        result = result[0]
        result = json.loads(result)
        result = json.dumps(result['component'][0]['caseList'])
        with open('data.json','w') as file:
            file.write(result)
            print('数据已写入json文件...')

        # NOTE(review): the page is fetched a second time here (without the
        # browser headers) instead of reusing html.txt; kept as-is.
        response = requests.get("https://voice.baidu.com/act/newpneumonia/newpneumonia/")
        with open('html.txt', 'w') as file:
            file.write(response.text)

        # Last-updated timestamps for domestic / foreign data.
        time_in = re.findall('"mapLastUpdatedTime":"(.*?)"', response.text)[0]
        time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"', response.text)[0]
        print(time_in)
        print(time_out)

        html = etree.HTML(response.text)
        result = html.xpath('//script[@type="application/json"]/text()')
        print(type(result))
        result = result[0]
        print(type(result))
        result = json.loads(result)
        print(type(result))
        # One dict per province-level region.
        data_in = result['component'][0]['caseList']
        for each in data_in:
            print(each)
            print("\n" + '*' * 20)

        data_out = result['component'][0]['globalList']
        for each in data_out:
            print(each)
            print("\n" + '*' * 20)

        '''
        Field reference (keys of each case dict):
        area --> mostly a province name
        city --> city name
        confirmed --> cumulative confirmed
        died --> deaths
        crued --> cured
        relativeTime -->
        confirmedRelative --> increment of cumulative confirmed
        curedRelative --> increment of cured
        curConfirm --> currently confirmed
        curConfirmRelative --> increment of currently confirmed
        diedRelative --> increment of deaths
        '''

        # Each caseList item describes one province-level region; its
        # 'subList' key holds per-city dicts with the same fields.

        # Write everything to an Excel workbook: the active sheet holds the
        # domestic data, plus one sheet per foreign area.
        wb = openpyxl.Workbook()
        ws_in = wb.active
        ws_in.title = "国内疫情"
        ws_in.append(['省份', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量', '死亡增量', '治愈增量', '现有确诊增量'])
        for each in data_in:
            temp_list = [each['area'], each['confirmed'], each['died'], each['crued'], each['curConfirm'],
                         each['confirmedRelative'], each['diedRelative'], each['curedRelative'],
                         each['curConfirmRelative']]
            for i in range(len(temp_list)):
                if temp_list[i] == '':
                    temp_list[i] = '0'  # normalise missing values
            ws_in.append(temp_list)

        # Foreign data: one worksheet per area (continent).
        for each in data_out:
            print(each)
            print("\n" + '*' * 20)
            sheet_title = each['area']
            ws_out = wb.create_sheet(sheet_title)
            ws_out.append(['国家', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量'])
            for country in each['subList']:
                list_temp = [country['country'], country['confirmed'], country['died'], country['crued'],
                             country['curConfirm'], country['confirmedRelative']]
                for i in range(len(list_temp)):
                    if list_temp[i] == '':
                        list_temp[i] = '0'
                ws_out.append(list_temp)

        # BUG FIX: save once after all sheets are written; the original saved
        # inside the loop, rewriting the file for every foreign area.
        wb.save('./data.xlsx')

```


### 制作疫情词云


 **导入所需模块 ** 

```
import openpyxl
from wordcloud import WordCloud
```

 **读取excel中的数据** 

```
# Load the workbook produced by the crawler.
wb = openpyxl.load_workbook('data.xlsx')
# BUG FIX: the sheet is created as '国内疫情'; the original looked up the
# garbled name '国内义擎' and raised KeyError.
ws = wb['国内疫情']
# Map each province to its cumulative confirmed count (word-cloud weight).
frequency_in = {}
for row in ws.values:
    if row[0] == '省份':
        pass  # skip the header row
    else:
        frequency_in[row[0]] = float(row[1])

# Foreign data: every sheet except the domestic one holds one continent.
# BUG FIX: the original condition was `if "" in each:`, which is true for
# every sheet name, so the domestic header row ('省份', '累计确诊', ...)
# reached float('累计确诊') and crashed.
frequency_out = {}
sheet_name = wb.sheetnames
for each in sheet_name:
    if each != '国内疫情':
        ws = wb[each]
        for row in ws.values:
            if row[0] == '国家':
                pass  # skip the header row
            else:
                frequency_out[row[0]] = float(row[1])
```


 **生成词云图片** 


```
def generate_pic(frequency,name):
    """Render *frequency* (word -> weight) as a word cloud and save it
    to '<name>.png'."""
    cloud = WordCloud(font_path="C:/Windows/Fonts/SIMLI.TTF",
                      background_color="white",
                      width=1920, height=1080)
    # Word sizes are driven by the confirmed-case counts.
    cloud.generate_from_frequencies(frequency)
    # Write the rendered image next to the script.
    cloud.to_file('%s.png'%(name))
```

 **调用函数** 

```
# BUG FIX: output file names used the garbled '义擎' instead of '疫情'.
generate_pic(frequency_in,'国内疫情情况词云图')
generate_pic(frequency_out,'世界疫情词云图')
```

![国内疫情](https://images.gitee.com/uploads/images/2020/0710/094810_7e9553ad_5618404.png "国内疫情词云图.png")
![国外疫情](https://images.gitee.com/uploads/images/2020/0710/094851_46ec1d8a_5618404.png "国外疫情词云图.png")


 **完整代码** 

```
import openpyxl
from wordcloud import WordCloud
# BUG FIX: the original also did `from pyecharts.charts import WordCloud`,
# which shadowed wordcloud.WordCloud with an incompatible pyecharts chart
# class and broke the WordCloud(font_path=...) call below.

# Modules for displaying the image immediately (optional):
'''
import matplotlib.pyplot as plt   # plotting module
import numpy as np
from PIL import Image
'''

# Load the workbook produced by the crawler.
wb = openpyxl.load_workbook('data.xlsx')
sheet_names = wb.sheetnames

# Foreign data: every sheet except the domestic one holds one continent.
# BUG FIX: the original condition was `if '' in each:`, which is true for
# every sheet name and mixed the domestic provinces into the world cloud.
frequency_out = {}
for each in sheet_names:
    if each != '国内疫情':
        ws = wb[each]
        for row in ws.values:
            if row[1] == "累计确诊":
                pass  # skip the header row
            else:
                frequency_out[row[0]] = float(row[1])


# Use each province's cumulative confirmed count as its word frequency.
frequency_in = {}
ws = wb['国内疫情']
for row in ws.values:
    if row[1] == "累计确诊":
        pass  # skip the header row
    else:
        frequency_in[row[0]] = float(row[1])

def generate_pic(frequency,name):
    """Render *frequency* (word -> weight) as a word cloud saved to '<name>.png'."""
    # A picture can be prepared beforehand and used as the mask:
    # background_image = np.array(Image.open('pic.jpg'))
    wordcloud = WordCloud(font_path="C:/Windows/Fonts/SIMLI.TTF",
                          background_color = "red",
                          # mask=background_image,
                          width=1920, height=1080
                          )
    # Word sizes are driven by the confirmed-case counts.
    wordcloud.generate_from_frequencies(frequency)
    wordcloud.to_file('%s.png'%name)

    # Show the picture on screen:
    # plt.imshow(wordcloud, interpolation="bilinear")
    # plt.axis("off")
    # plt.show()

# Render both clouds.
generate_pic(frequency_in,'国内疫情')
generate_pic(frequency_out,'国外疫情')
```


### 绘制地图

 **map_draw.py 文件源码** 


```
from pyecharts import options as opts
from pyecharts.charts import Map
import os



class Draw_map():
    """Render pyecharts choropleth maps for China and individual provinces."""

    def __init__(self):
        # Make sure the output directory for the province maps exists.
        if not os.path.exists('./map/china'):
            os.makedirs('./map/china')

    def get_colour(self,a,b,c):
        """Return the RGB triple (a, b, c) as an upper-case '#RRGGBB' string."""
        result = '#' + ''.join(map((lambda x: "%02x" % x), (a,b,c)))
        return result.upper()

    def to_map_city(self,area, variate,province,update_time):
        """Render the epidemic map of one province.

        area: list of city names; variate: matching case counts;
        province: province name (without the trailing 省);
        update_time: timestamp shown in the subtitle.
        """
        # Piecewise colour legend for cumulative confirmed counts.
        pieces = [
            {"max": 99999999, "min": 10000, "label": "≥10000", "color": self.get_colour(102, 2, 8)},
            {"max": 9999, "min": 1000, "label": "1000-9999", "color": self.get_colour(140, 13, 13)},
            {"max": 999, "min": 500, "label": "500-999", "color": self.get_colour(204, 41, 41)},
            {"max": 499, "min": 100, "label": "100-499", "color": self.get_colour(255, 123, 105)},
            {"max": 99, "min": 50, "label": "50-99", "color": self.get_colour(255, 170, 133)},
            {"max": 49, "min": 10, "label": "10-49", "color": self.get_colour(255,202,179)},
            {"max": 9, "min": 1, "label": "1-9", "color": self.get_colour(255,228,217)},
            {"max": 0, "min": 0, "label": "0", "color": self.get_colour(255,255,255)},
              ]

        (
            # Set the map size.
            Map(init_opts=opts.InitOpts(width = '1000px', height='880px'))
            .add("累计确诊人数", [list(z) for z in zip(area, variate)], province, is_map_symbol_show=False)
            # is_piecewise renders discrete buckets; the legend itself is hidden.
            .set_global_opts(
                title_opts=opts.TitleOpts(title="%s地区疫情地图分布"%(province), subtitle = '截止%s  %s省疫情分布情况'%(update_time,province), pos_left = "center", pos_top = "10px"),
                legend_opts=opts.LegendOpts(is_show = False),
                visualmap_opts=opts.VisualMapOpts(max_=200,is_piecewise=True,
                                                  pieces=pieces,
                                                  ),
            )
            .render("./map/china/{}疫情地图.html".format(province))
        )

    def to_map_china(self,area, variate,update_time):
        """Render the country-wide epidemic map; arguments mirror to_map_city."""
        # BUG FIX: the top bucket previously had min=1001, so counts in
        # 1001-9999 fell into the '>10000' colour and overlapped the
        # 1000-9999 bucket; the threshold is now 10000.
        pieces = [{"max": 999999, "min": 10000, "label": ">10000", "color": "#8A0808"},
                  {"max": 9999, "min": 1000, "label": "1000-9999", "color": "#B40404"},
                  {"max": 999, "min": 100, "label": "100-999", "color": "#DF0101"},
                  {"max": 99, "min": 10, "label": "10-99", "color": "#F78181"},
                  {"max": 9, "min": 1, "label": "1-9", "color": "#F5A9A9"},
                  {"max": 0, "min": 0, "label": "0", "color": "#FFFFFF"},
                  ]

        (
            # Set the map size.
            Map(init_opts=opts.InitOpts(width='1000px', height='880px'))
                .add("累计确诊人数", [list(z) for z in zip(area, variate)], "china", is_map_symbol_show=False)
                .set_global_opts(
                title_opts=opts.TitleOpts(title="中国疫情地图分布", subtitle='截止%s 中国疫情分布情况'%(update_time), pos_left="center", pos_top="10px"),
                legend_opts=opts.LegendOpts(is_show=False),
                visualmap_opts=opts.VisualMapOpts(max_=200, is_piecewise=True,
                                                  pieces=pieces,
                                                  ),
            )
                .render("./map/中国疫情地图.html")
        )
```

 **get_data.py 文件源码** 


```
import requests
from lxml import etree
import json
import re
import openpyxl


class Get_data():
    """Crawl Baidu's epidemic page, report its update times, and export the
    case data to data.json and data.xlsx.
    """

    def get_data(self):
        """Download the epidemic page and cache the raw HTML in html.txt."""
        # Target URL of the data page.
        url = "https://voice.baidu.com/act/newpneumonia/newpneumonia/"

        # Disguise the request as a regular browser so it is not rejected.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }

        response = requests.get(url,headers=headers)

        # Cache the response body so later steps can parse it offline.
        with open('html.txt', 'w') as file:
            file.write(response.text)

    def get_time(self):
        """Return (domestic, foreign) last-updated timestamps from html.txt."""
        with open('html.txt','r') as file:
            text = file.read()
        # Pull both timestamps out of the JSON embedded in the page.
        time_in = re.findall('"mapLastUpdatedTime":"(.*?)"',text)[0]
        time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"',text)[0]
        print('国内疫情更新时间为 '+time_in)
        print('国外疫情更新时间为 '+time_out)
        return time_in,time_out

    def parse_data(self):
        """Dump the domestic case list to data.json, then re-fetch the page
        and write domestic and foreign data to data.xlsx.
        """
        with open('html.txt','r') as file:
            text = file.read()
        html = etree.HTML(text)
        # The page embeds its payload as JSON in a <script type="application/json"> tag.
        result = html.xpath('//script[@type="application/json"]/text()')
        result = result[0]
        result = json.loads(result)
        result = json.dumps(result['component'][0]['caseList'])
        with open('data.json','w') as file:
            file.write(result)
            print('数据已写入json文件...')

        # NOTE(review): the page is fetched a second time here (without the
        # browser headers) instead of reusing html.txt; kept as-is.
        response = requests.get("https://voice.baidu.com/act/newpneumonia/newpneumonia/")
        with open('html.txt', 'w') as file:
            file.write(response.text)

        # Last-updated timestamps for domestic / foreign data.
        time_in = re.findall('"mapLastUpdatedTime":"(.*?)"', response.text)[0]
        time_out = re.findall('"foreignLastUpdatedTime":"(.*?)"', response.text)[0]
        print(time_in)
        print(time_out)

        html = etree.HTML(response.text)
        result = html.xpath('//script[@type="application/json"]/text()')
        print(type(result))
        result = result[0]
        print(type(result))
        result = json.loads(result)
        print(type(result))
        # One dict per province-level region.
        data_in = result['component'][0]['caseList']
        for each in data_in:
            print(each)
            print("\n" + '*' * 20)

        data_out = result['component'][0]['globalList']
        for each in data_out:
            print(each)
            print("\n" + '*' * 20)

        '''
        Field reference (keys of each case dict):
        area --> mostly a province name
        city --> city name
        confirmed --> cumulative confirmed
        died --> deaths
        crued --> cured
        relativeTime -->
        confirmedRelative --> increment of cumulative confirmed
        curedRelative --> increment of cured
        curConfirm --> currently confirmed
        curConfirmRelative --> increment of currently confirmed
        diedRelative --> increment of deaths
        '''

        # Each caseList item describes one province-level region; its
        # 'subList' key holds per-city dicts with the same fields.

        # Write everything to an Excel workbook: the active sheet holds the
        # domestic data, plus one sheet per foreign area.
        wb = openpyxl.Workbook()
        ws_in = wb.active
        ws_in.title = "国内疫情"
        ws_in.append(['省份', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量', '死亡增量', '治愈增量', '现有确诊增量'])
        for each in data_in:
            temp_list = [each['area'], each['confirmed'], each['died'], each['crued'], each['curConfirm'],
                         each['confirmedRelative'], each['diedRelative'], each['curedRelative'],
                         each['curConfirmRelative']]
            for i in range(len(temp_list)):
                if temp_list[i] == '':
                    temp_list[i] = '0'  # normalise missing values
            ws_in.append(temp_list)

        # Foreign data: one worksheet per area (continent).
        for each in data_out:
            print(each)
            print("\n" + '*' * 20)
            sheet_title = each['area']
            ws_out = wb.create_sheet(sheet_title)
            ws_out.append(['国家', '累计确诊', '死亡', '治愈', '现有确诊', '累计确诊增量'])
            for country in each['subList']:
                list_temp = [country['country'], country['confirmed'], country['died'], country['crued'],
                             country['curConfirm'], country['confirmedRelative']]
                for i in range(len(list_temp)):
                    if list_temp[i] == '':
                        list_temp[i] = '0'
                ws_out.append(list_temp)

        # BUG FIX: save once after all sheets are written; the original saved
        # inside the loop, rewriting the file for every foreign area.
        wb.save('./data.xlsx')

```


 ** execution.py 文件 源码** 

```
import map_draw
import json
# NOTE(review): `map` shadows the builtin, but china_map/province_map below
# depend on this module-level name, so it is kept.
map = map_draw.Draw_map()
# Usage examples:
# map.to_map_china(['湖北'],['99999'],'1584201600')
# map.to_map_city(['荆门市'],['99999'],'湖北','1584201600')

# Load the per-province case list written by Get_data.parse_data().
with open('data.json', 'r') as file:
    data = file.read()
    data = json.loads(data)

# 中国疫情地图
# Build the national epidemic map from the loaded case data.
def china_map(update_time):
    """Render the China-wide map using cumulative confirmed counts per area."""
    areas = []
    totals = []
    for record in data:
        print(record)
        areas.append(record['area'])
        totals.append(record['confirmed'])
    map.to_map_china(areas, totals, update_time)

# 23个省、5个自治区、4个直辖市、2个特别行政区 香港、澳门和台湾的subList为空列表,未有详情数据

# 省、直辖市疫情地图
# Render one per-city map for every province/municipality.
def province_map(update_time):
    """Render a per-city epidemic map for each region in *data*.

    BUG FIXES vs. the original:
    - `if province == '上海' or '北京' or ...` was always true (a non-empty
      string literal is truthy), so every province's cities were appended a
      second time and rendered with duplicated data.
    - the map was rendered once per city inside the loop; it is now rendered
      once per province after all its cities are collected.
    """
    for each in data:
        province = each['area']
        city = []
        confirmeds = []
        for each_city in each['subList']:
            city.append(each_city['city'])
            confirmeds.append(each_city['confirmed'])
        # Hong Kong, Macao and Taiwan have an empty subList — nothing to draw.
        if city:
            map.to_map_city(city, confirmeds, province, update_time)


```

 **main.py 文件 源码** 

```
from get_data import Get_data


# Crawl the page, record the update times, and export data.json / data.xlsx.
data = Get_data()
data.get_data()
time_in,time_out = data.get_time()
data.parse_data()

# Imported only after the data files exist: execution.py reads data.json at
# import time.
import execution
execution.china_map(time_in)
execution.province_map(time_in)
```
四个文件保存,放在同一目录下,直接执行main.py,即可成功运行

![地图](https://images.gitee.com/uploads/images/2020/0710/095159_eaf1059c_5618404.jpeg "地图.jpg")

 

posted @ 2022-04-04 20:26  山海自有归期  阅读(281)  评论(0编辑  收藏  举报