Python网络爬虫——爬取腾讯新闻国内疫情数据
一、 选题的背景
为什么要选择此选题?要达到的数据分析的预期目标是什么?(10分)
从社会、经济、技术、数据来源等方面进行描述(200字以内)
近年来,受新型冠状病毒疫情的影响,世界各地都陷入了危机。因此,我对国内现有的疫情数据进行了爬取和数据分析,目的是让大家更加直观、清晰地了解国内的疫情现状,帮助经常外出流动的人们加强防范意识,了解目前哪些区域风险等级较高,出行时注意防护。在技术和数据来源方面,数据来自腾讯新闻—抗疫频道,用requests库请求其JSON接口,从而得到疫情数据。
二、主题式网络爬虫设计方案(10分)
1.主题式网络爬虫名称
Python网络爬虫——爬取腾讯新闻国内疫情数据
2.主题式网络爬虫爬取的内容与数据特征分析
爬取现存的疫情数据并做可视化处理
3.主题式网络爬虫设计方案概述(包括实现思路与技术难点)
步骤:首先,确定此次选题的主题内容;然后,设计爬取程序,爬取现存的疫情数据,并以csv的形式储存;接着,利用pandas库进行数据分析以及清洗,再利用Matplotlib与seaborn等库进行图形与图像的绘制;最后,保存数据。
三、主题页面的结构特征分析
1.主题页面的结构与特征分析
目标网站是腾讯新闻网的实时疫情数据页面,其原理主要是通过requests库请求返回JSON的数据接口,从而得到各省、各市的疫情数据。
爬虫目标网站:
https://news.qq.com/zt2020/page/feiyan.htm
对于静态网页,我们只需要把网页地址栏中的url传到get请求中就可以轻松地获取到网页的数据。对于动态网页,抓取的关键是先分析网页数据获取和跳转的逻辑,再去写代码。在接口返回的数据中,lastUpdateTime是数据的最新更新时间;chinaTotal中是目前的确诊数、疑似数、死亡数、治愈数;chinaDayList中是1月13日至今的全国总数据;areaTree中是全国详细的数据。
我们需要的数据是areaTree部分(各省、各市的详细数据)。在浏览器开发者工具的Network面板中可以找到需要抓取的接口:https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_=1638361138568。
2.htmls页面解析
3.节点(标签)查找方法与遍历方法(必要时画出节点树结构)
四、网络爬虫程序设计
import json
import pprint
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from pyecharts import options as opts
from pyecharts.charts import Bar

# ------------------------------------------------------------------------------
# Step 1: fetch the data
# ------------------------------------------------------------------------------

# Tencent real-time epidemic JSON endpoint.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_=1638361138568'

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error into an explicit exception
# instead of a confusing JSON/KeyError failure further down.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the response is itself a JSON-encoded string, so it is
# decoded a second time.
data = json.loads(response.json()['data'])
print(data)
print(data.keys())

# Province-level records (children of the first areaTree entry).
num = data['areaTree'][0]['children']
print(len(num))
for item in num:
    print(item['name'], end=" ")  # stay on the same line
print("\n")  # finish the line of names


def sum_city_field(provinces, section, field):
    """Sum one per-city counter for every province.

    Args:
        provinces: iterable of province records, each with a 'name' and a
            'children' list of city records.
        section: key of the city sub-dict to read ('total' or 'today').
        field: counter inside that sub-dict (e.g. 'confirm', 'dead').

    Returns:
        dict mapping province name to the integer sum of
        ``city[section][field]`` over that province's cities.  A province
        without cities maps to 0.
    """
    totals = {}
    for province in provinces:
        totals.setdefault(province['name'], 0)
        for city in province['children']:
            totals[province['name']] += int(city[section][field])
    return totals


# Confirmed cases per province
total_data = sum_city_field(num, 'total', 'confirm')
print(total_data)

# Suspected cases per province
total_suspect_data = sum_city_field(num, 'total', 'suspect')
print(total_suspect_data)

# Deaths per province
total_dead_data = sum_city_field(num, 'total', 'dead')
print(total_dead_data)

# Recoveries per province
total_heal_data = sum_city_field(num, 'total', 'heal')
print(total_heal_data)

# Newly confirmed cases per province (read from the 'today' section)
total_new_data = sum_city_field(num, 'today', 'confirm')
print(total_new_data)

plt.figure(figsize=[10, 6])

# Use a font that can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Render the minus sign correctly with that font.
plt.rcParams['axes.unicode_minus'] = False
输出结果图:
# ------------------------------------------------------------------------------
# Step 2: store the data in a CSV file
# ------------------------------------------------------------------------------

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_=1638361138568'

# TLS certificate verification is left ON (the requests default).  The
# previous verify=False, together with silencing InsecureRequestWarning,
# exposed the download to man-in-the-middle tampering for no benefit.
response = requests.get(url, timeout=10)
response.raise_for_status()

# The 'data' field is a JSON-encoded string and needs a second decode.
json_data = response.json()['data']
json_data = json.loads(json_data)
china_data = json_data['areaTree'][0]['children']

# One row per province, taken from the province-level 'total' section.
data_set = [
    {
        'province': i['name'],                   # region name
        'nowConfirm': i['total']['nowConfirm'],  # nowConfirm counter
        'dead': i['total']['dead'],              # deaths
        'heal': i['total']['heal'],              # recoveries
        'deadRate': i['total']['deadRate'],      # death rate
        'healRate': i['total']['healRate'],      # recovery rate
    }
    for i in china_data
]
df = pd.DataFrame(data_set)
df.to_csv('data1.csv')
输出结果图:
# ------------------------------------------------------------------------------
# Step 3: Matplotlib bar charts for every region
# ------------------------------------------------------------------------------


def draw_bar_subplot(position, totals, ylabel, color, xlabel=None):
    """Draw one annotated bar chart in a cell of the current figure.

    Args:
        position: subplot spec passed to ``plt.subplot`` (e.g. 221).
        totals: dict mapping region name to the number to plot.
        ylabel: y-axis label.
        color: bar colour.
        xlabel: optional x-axis label; omitted when None.

    Returns:
        The Axes created for this subplot.
    """
    axes = plt.subplot(position)

    # Materialise the dict views so Matplotlib receives plain sequences.
    names = list(totals.keys())
    nums = list(totals.values())

    plt.bar(names, nums, width=0.3, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90)
    plt.xticks(names, rotation=-60, size=8)

    # Write each value just above its bar.
    for name, value in zip(names, nums):
        plt.text(name, value, value, ha='center', va='bottom', size=6)
    return axes


# 1. Confirmed cases
p1 = draw_bar_subplot(221, total_data, "确诊人数", 'green')

# 2. Newly confirmed cases
p2 = draw_bar_subplot(222, total_new_data, "新增确诊人数", 'yellow')

# 3. Deaths
p3 = draw_bar_subplot(223, total_dead_data, "死亡人数", 'blue', xlabel="地区")

# 4. Recoveries
p4 = draw_bar_subplot(224, total_heal_data, "治愈人数", 'red', xlabel="地区")

plt.show()
输出结果图:
# ------------------------------------------------------------------------------
# Step 4: Seaborn bar chart
# ------------------------------------------------------------------------------

# Read the CSV written in step 2.  The name is a plain string; the earlier
# time.strftime('data1.csv') call only worked because the name contains no
# '%' format directives.
n = 'data1.csv'
data = pd.read_csv(n)

# Set the plotting style and fonts BEFORE creating the axes: set_style
# changes rcParams, and those only affect axes created afterwards.
sns.set_style("whitegrid", {'font.sans-serif': ['simhei', 'Arial']})

# Create the figure and axes.
fig, ax = plt.subplots(1, 1)
print(data['province'])

# Draw the bars.
g = sns.barplot(x="province", y="nowConfirm", data=data, ax=ax,
                palette=sns.color_palette("hls", 8))

# Write each value above its bar; bar k sits at x == k on the categorical axis.
for i, b in enumerate(data['nowConfirm']):
    g.text(i + 0.05, b + 0.05, b, color="black", ha="center", va='bottom', size=6)

# Axes title
ax.set_title('全国疫情最新情况')

# Tick label rotation and font size
ax.tick_params(axis='x', labelsize=8, labelrotation=-60)
ax.tick_params(axis='y', labelsize=8)

plt.show()
输出结果图:
# ------------------------------------------------------------------------------
# Step 5: pyecharts bar chart
# ------------------------------------------------------------------------------

# The plotted column is 'nowConfirm', so the series name, title and y-axis
# label say "现有确诊" rather than "累计确诊"; the title also said "树状图"
# (tree diagram) for what is a bar chart.
# NOTE(review): this assumes nowConfirm means currently-confirmed cases, as
# the field name suggests — confirm against the API.
bar = (
    Bar()
    .add_xaxis(list(data['province']))
    .add_yaxis("现有确诊柱状图", list(data['nowConfirm']))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="COVID-19中国各地区现有确诊人数柱状图"),
        yaxis_opts=opts.AxisOpts(name="现有确诊病例"),
        xaxis_opts=opts.AxisOpts(name="地区"),
        datazoom_opts=opts.DataZoomOpts(type_="slider"),
    )
)

# Output file name kept unchanged so existing references to it still work.
bar.render("累计确诊柱状图.html")
输出结果图:
完整代码:
import json
import pprint
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from pyecharts import options as opts
from pyecharts.charts import Bar

# ------------------------------------------------------------------------------
# Step 1: fetch the data
# ------------------------------------------------------------------------------

# Tencent real-time epidemic JSON endpoint.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_=1638361138568'

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error into an explicit exception.
# TLS certificate verification stays at the requests default (enabled).
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the response is itself a JSON-encoded string, so it is
# decoded a second time.
data = json.loads(response.json()['data'])
print(data)
print(data.keys())

# Province-level records (children of the first areaTree entry).
num = data['areaTree'][0]['children']
print(len(num))
for item in num:
    print(item['name'], end=" ")  # stay on the same line
print("\n")  # finish the line of names


def sum_city_field(provinces, section, field):
    """Sum one per-city counter for every province.

    Args:
        provinces: iterable of province records, each with a 'name' and a
            'children' list of city records.
        section: key of the city sub-dict to read ('total' or 'today').
        field: counter inside that sub-dict (e.g. 'confirm', 'dead').

    Returns:
        dict mapping province name to the integer sum of
        ``city[section][field]`` over that province's cities.  A province
        without cities maps to 0.
    """
    totals = {}
    for province in provinces:
        totals.setdefault(province['name'], 0)
        for city in province['children']:
            totals[province['name']] += int(city[section][field])
    return totals


# Confirmed cases per province
total_data = sum_city_field(num, 'total', 'confirm')
print(total_data)

# Suspected cases per province
total_suspect_data = sum_city_field(num, 'total', 'suspect')
print(total_suspect_data)

# Deaths per province
total_dead_data = sum_city_field(num, 'total', 'dead')
print(total_dead_data)

# Recoveries per province
total_heal_data = sum_city_field(num, 'total', 'heal')
print(total_heal_data)

# Newly confirmed cases per province (read from the 'today' section)
total_new_data = sum_city_field(num, 'today', 'confirm')
print(total_new_data)

plt.figure(figsize=[10, 6])

# Use a font that can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Render the minus sign correctly with that font.
plt.rcParams['axes.unicode_minus'] = False

# ------------------------------------------------------------------------------
# Step 2: store the data in a CSV file
# ------------------------------------------------------------------------------

# Reuse the province list downloaded in step 1 instead of requesting the same
# URL a second time (the second request also used to disable TLS
# verification, which is no longer needed).
china_data = num

# One row per province, taken from the province-level 'total' section.
data_set = [
    {
        'province': i['name'],                   # region name
        'nowConfirm': i['total']['nowConfirm'],  # nowConfirm counter
        'dead': i['total']['dead'],              # deaths
        'heal': i['total']['heal'],              # recoveries
        'deadRate': i['total']['deadRate'],      # death rate
        'healRate': i['total']['healRate'],      # recovery rate
    }
    for i in china_data
]
df = pd.DataFrame(data_set)
df.to_csv('data1.csv')

# ------------------------------------------------------------------------------
# Step 3: Matplotlib bar charts for every region
# ------------------------------------------------------------------------------


def draw_bar_subplot(position, totals, ylabel, color, xlabel=None):
    """Draw one annotated bar chart in a cell of the current figure.

    Args:
        position: subplot spec passed to ``plt.subplot`` (e.g. 221).
        totals: dict mapping region name to the number to plot.
        ylabel: y-axis label.
        color: bar colour.
        xlabel: optional x-axis label; omitted when None.

    Returns:
        The Axes created for this subplot.
    """
    axes = plt.subplot(position)

    # Materialise the dict views so Matplotlib receives plain sequences.
    names = list(totals.keys())
    nums = list(totals.values())

    plt.bar(names, nums, width=0.3, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90)
    plt.xticks(names, rotation=-60, size=8)

    # Write each value just above its bar.
    for name, value in zip(names, nums):
        plt.text(name, value, value, ha='center', va='bottom', size=6)
    return axes


# 1. Confirmed cases
p1 = draw_bar_subplot(221, total_data, "确诊人数", 'green')

# 2. Newly confirmed cases
p2 = draw_bar_subplot(222, total_new_data, "新增确诊人数", 'yellow')

# 3. Deaths
p3 = draw_bar_subplot(223, total_dead_data, "死亡人数", 'blue', xlabel="地区")

# 4. Recoveries
p4 = draw_bar_subplot(224, total_heal_data, "治愈人数", 'red', xlabel="地区")

plt.show()

# ------------------------------------------------------------------------------
# Step 4: Seaborn bar chart
# ------------------------------------------------------------------------------

# Read the CSV written in step 2.  The name is a plain string; the earlier
# time.strftime('data1.csv') call only worked because the name contains no
# '%' format directives.
n = 'data1.csv'
data = pd.read_csv(n)

# Set the plotting style and fonts BEFORE creating the axes: set_style
# changes rcParams, and those only affect axes created afterwards.
sns.set_style("whitegrid", {'font.sans-serif': ['simhei', 'Arial']})

# Create the figure and axes.
fig, ax = plt.subplots(1, 1)
print(data['province'])

# Draw the bars.
g = sns.barplot(x="province", y="nowConfirm", data=data, ax=ax,
                palette=sns.color_palette("hls", 8))

# Write each value above its bar; bar k sits at x == k on the categorical axis.
for i, b in enumerate(data['nowConfirm']):
    g.text(i + 0.05, b + 0.05, b, color="black", ha="center", va='bottom', size=6)

# Axes title
ax.set_title('全国疫情最新情况')

# Tick label rotation and font size
ax.tick_params(axis='x', labelsize=8, labelrotation=-60)
ax.tick_params(axis='y', labelsize=8)

plt.show()

# ------------------------------------------------------------------------------
# Step 5: pyecharts bar chart
# ------------------------------------------------------------------------------

# The plotted column is 'nowConfirm', so the series name, title and y-axis
# label say "现有确诊" rather than "累计确诊"; the title also said "树状图"
# (tree diagram) for what is a bar chart.
# NOTE(review): this assumes nowConfirm means currently-confirmed cases, as
# the field name suggests — confirm against the API.
bar = (
    Bar()
    .add_xaxis(list(data['province']))
    .add_yaxis("现有确诊柱状图", list(data['nowConfirm']))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="COVID-19中国各地区现有确诊人数柱状图"),
        yaxis_opts=opts.AxisOpts(name="现有确诊病例"),
        xaxis_opts=opts.AxisOpts(name="地区"),
        datazoom_opts=opts.DataZoomOpts(type_="slider"),
    )
)

# Output file name kept unchanged so existing references to it still work.
bar.render("累计确诊柱状图.html")
五、总结(10分)
1.经过对主题数据的分析与可视化,可以得到哪些结论?是否达到预期的目标?
从第一组柱状图(Matplotlib)可以看出,累计确诊人数、新增确诊人数、死亡人数、治愈人数最多的省份分别为湖北、陕西、湖北、湖北。由此可见湖北曾经是我国高风险地区,而陕西目前新增人数较多;第二个柱状图(Seaborn)和第三个柱状图(pyecharts)绘制的都是各地区的nowConfirm(现有确诊)数据,可以看出现有确诊病例最多的地区是台湾和陕西。但是目前来看,除陕西以外,中国国内的疫情处在一个较为稳定的控制状态,没有太大的波动。中国是有很大优势尽快结束这次疫情的。通过此次疫情实时数据爬取项目,已经达到了预期的目标。
2.在完成此设计过程中,得到哪些收获?以及要改进的建议?
通过本次的疫情实时数据爬取及可视化的项目,我明白了在做一个项目时,不应该着急地去敲代码,而是应该先做好需求分析,了解技术难点,这样才能选择合适的技术方法和框架。