本次实验内容为爬取疫情数据,预计完成时间4小时,其中1小时用来学习相关知识,2小时配置环境,1小时编程。

执行情况:接近4小时时完成任务。遇到的最大问题是 Python 的 bs4 库安装失败;其次是对网站数据的结构解析不清楚,导致程序运行出错。

最终以第六名的成绩完成任务。

import requests
from lxml import etree
from bs4 import BeautifulSoup
import pymysql
import simplejson
import time
from _ast import If

# Module-level MySQL connection and cursor shared by the scraper class below.
# The password literal was previously closed with a typographic quote (’),
# which made the whole file a syntax error.
# NOTE(review): credentials are hard-coded here; consider reading them from
# the environment or a config file instead of committing them to source.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='mysjz', db='python')
cursor = db.cursor()

class nCoV_2019:
    """Scrape per-country epidemic statistics from the DXY page into MySQL.

    Persistence goes through the module-level ``db`` connection and
    ``cursor`` object; this class does not open its own connection.
    """

    # Leading JavaScript that precedes the first JSON object in the script tag.
    _SCRIPT_PREFIX = 'try { window.getListByCountryTypeService2true = ['
    _SCRIPT_XPATH = '//*[@id="getListByCountryTypeService2true"]//text()'
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    # Seconds to wait for the server before giving up instead of hanging.
    _REQUEST_TIMEOUT = 10
    _INSERT_SQL = (
        "insert into yiqingcc(Time,Continents,Province,Current,Confirmed,Cured,Dead)"
        "values(%s,%s,%s,%s,%s,%s,%s)"
    )
    # Record keys copied verbatim from each scraped object, in column order.
    _COPIED_KEYS = (
        'continents',
        'provinceName',
        'currentConfirmedCount',
        'confirmedCount',
        'curedCount',
        'deadCount',
    )

    def __init__(self):
        """Set the request headers and the target URL; performs no I/O."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
        }
        self.url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"

    def parse_url(self):
        """Download the page and return the raw per-record JSON fragments.

        Returns:
            A list of strings, one per record, each missing its closing
            ``}``. The first fragment still carries the JavaScript prefix;
            ``getDataList`` strips it.

        Raises:
            requests.HTTPError: if the response status is not 200.
            IndexError: if the expected script element is not in the page.
        """
        response = requests.get(
            url=self.url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
        )
        # Explicit check instead of ``assert``: asserts vanish under ``-O``.
        if response.status_code != 200:
            raise requests.HTTPError(
                "unexpected HTTP status {} for {}".format(
                    response.status_code, self.url
                ),
                response=response,
            )

        html = etree.HTML(response.content.decode())
        script_text = html.xpath(self._SCRIPT_XPATH)
        if not script_text:
            # Kept as IndexError so existing handlers still match.
            raise IndexError(
                "script element 'getListByCountryTypeService2true' not found"
            )

        # Splitting on '}' only works while records contain no nested
        # objects. The last three fragments are dropped; presumably they are
        # the array terminator and the trailing catch clause -- verify
        # against the live page if parsing starts failing.
        return script_text[0].split('}')[:-3]

    def _parse_records(self, results):
        """Turn the raw fragments into row dicts without touching the DB.

        Every row gets the ``createTime`` of the first record, formatted in
        local time.
        """
        rows = []
        created_at = None
        for position, fragment in enumerate(results):
            if position == 0:
                fragment = fragment.replace(self._SCRIPT_PREFIX, '')
            # Restore the '}' consumed by the split and drop the separating ','.
            record = simplejson.loads(fragment.lstrip(',') + '}')

            if created_at is None:
                # createTime is divided by 1000 to get seconds for localtime().
                seconds = int(record['createTime'] / 1000)
                created_at = time.strftime(
                    self._TIME_FORMAT, time.localtime(seconds)
                )

            row = {'createTime': created_at}
            for key in self._COPIED_KEYS:
                row[key] = record[key]
            rows.append(row)
        return rows

    def getDataList(self, results):
        """Parse *results*, insert one row per record, and return the rows.

        Args:
            results: fragments as produced by ``parse_url``.

        Returns:
            A list of dicts with the keys ``createTime``, ``continents``,
            ``provinceName``, ``currentConfirmedCount``, ``confirmedCount``,
            ``curedCount`` and ``deadCount``. Empty input yields ``[]``
            and does not touch the database.
        """
        data_list = self._parse_records(results)
        if not data_list:
            return data_list

        column_keys = ('createTime',) + self._COPIED_KEYS
        try:
            for row in data_list:
                cursor.execute(
                    self._INSERT_SQL, tuple(row[key] for key in column_keys)
                )
            # Commit once so a failed run does not leave a partial snapshot.
            db.commit()
        except Exception:
            db.rollback()
            raise
        return data_list

    def main(self):
        """Fetch the page and store every record it contains."""
        self.getDataList(self.parse_url())
# Run the scraper when the script is executed. The instance gets its own
# name so the class ``nCoV_2019`` is not overwritten by one of its instances.
crawler = nCoV_2019()
crawler.main()

 

数据库结果如下: