[Python]爬取新型冠状病毒2.2至今的所有数据 python 2020.2.13
爬取网址http://hu.yixue99.com/2020/kszx_0205/27792.html
代码如下:
import requests
from bs4 import BeautifulSoup

# Overview page that both summarizes the outbreak and links to every
# per-day detail page.
url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
# Minimal User-Agent header so the site does not reject the request.
kv = {'user-agent': 'Mozilla/5.0'}

# The daily detail pages mark their data cells with this exact inline style.
_TD_STYLE = ("PADDING-BOTTOM: 0px; PADDING-TOP: 0px; PADDING-LEFT: 0px; "
             "MARGIN: 0px; PADDING-RIGHT: 0px")


def _get_soup(page_url):
    """Fetch *page_url* and return it parsed as a BeautifulSoup document.

    Shared by content()/href()/content_day(); apparent_encoding is used
    because the site serves GBK-encoded pages without a charset header.
    """
    r = requests.get(page_url, headers=kv)
    r.encoding = r.apparent_encoding
    return BeautifulSoup(r.text, "html.parser")


# Scrape the overview table (one summary row per day).
def content():
    """Scrape the overview rows and append them to bingdusum.txt.

    Each logical record is 5 <span> cells; the first four get a space
    separator, the fifth completes the row, which is then written out
    with the unit suffix stripped.
    """
    soup = _get_soup(url)
    print("开始")
    num = 0
    texts = ""
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        # Strip the table-header words so only the data values remain.
        text = (str(s.string)
                .replace("时间(北京时间)", "")
                .replace("确诊", "")
                .replace("疑似", "")
                .replace("死亡", "")
                .replace("治愈", "")
                .replace("疫情详情", "")
                .replace("点击查看", ""))
        if text != "":
            num += 1
            if num % 5 != 0:
                texts += text + " "
            else:
                texts += text
                print(texts)
                wtire_content(texts.replace("例", "") + "\n")
                texts = ""


# Scrape the per-day detail-page links.
def href():
    """Collect every per-day detail link and append it to bingduhref.txt."""
    soup = _get_soup(url)
    print("开始")
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        a = s.find("a")
        if a is not None:
            link = str(a.attrs["href"])
            print(link)
            wtire_href(link + "\n")


# Scrape the per-province data of a single day.
def content_day(url):
    """Scrape one daily detail page and append its rows to bingduday.txt.

    The first matching <td> holds the date header; the rest are data
    cells grouped 5 per province row, written as "date province counts".
    """
    soup = _get_soup(url)
    print(url)
    print("开始")
    num = 0
    texts = ""
    one = 0
    # Extract the report date from the header cell.  Note: the first
    # replace pair uses fullwidth parentheses, the second halfwidth —
    # the site uses both forms on different days.
    time = (str(soup.find("td", {"style": _TD_STYLE}).string)
            .replace("各省疫情动态(截止至", "")
            .replace(" 10:00)", "")
            .replace("各省疫情动态(截止至", "")
            .replace(" 11:00)", ""))
    print(time)
    for s in soup.find_all("td", {"style": _TD_STYLE}):
        text = (str(s.string)
                .replace("确诊", "")
                .replace("疑似", "")
                .replace("死亡", "")
                .replace("治愈", "")
                .replace(" ", "")
                .replace("省份", ""))
        if one == 0:
            one += 1  # skip the date header cell handled above
        else:
            if text != "":
                num += 1
                if num % 5 != 0:
                    texts += text + " "
                else:
                    texts += text
                    print(time + texts)
                    write_content_day(time + " " + texts + "\n")
                    texts = ""


# Append one overview row.  NOTE: the "wtire" misspelling is kept
# deliberately so existing callers of this name keep working.
def wtire_content(contents):
    """Append *contents* to the overview output file."""
    with open("E:/bingducsv/bingdusum.txt", "a+", encoding="utf-8") as f:
        f.write(contents)


# Append one daily detail link (same "wtire" naming as above).
def wtire_href(contents):
    """Append *contents* to the link-list output file."""
    with open("E:/bingducsv/bingduhref.txt", "a+", encoding="utf-8") as f:
        f.write(contents)


def read():
    """Read the saved link list and scrape every daily page.

    Uses a with-block so the file handle is always closed (the original
    leaked it), and only needs read access.
    """
    with open("E:/bingducsv/bingduhref.txt", "r", encoding="utf-8") as f:
        for line in f:
            content_day(line.rstrip("\n"))


def write_content_day(contents):
    """Append *contents* to the per-day detail output file."""
    with open("E:/bingducsv/bingduday.txt", "a+", encoding="utf-8") as f:
        f.write(contents)


if __name__ == "__main__":
    content()
    href()
    read()