爬虫——国内主要城市历史天气
天气逐渐凉爽,深圳开始步入一年中天气最适宜的半年,前些天汗流浃背的时候,无比向往气候清凉的城市,于是写了个爬虫,下载了全国四百多个主要城市的历史天气,研究了一下到底哪里的天气最舒爽。主要是为了看数据,代码写的很粗糙,先贴出来,抽空再改吧。
step1:建立数据库-从天气网爬取城市数据-写入数据库
#-*- encoding:utf-8 -*- import requests import re from lxml import etree import time import csv import numpy as np import os,time import pandas as pd import xlwt from statistics import median,mean from _ast import If import pymysql #step1:建立数据库-从天气网爬取城市数据-写入数据库 class Tianqi(object): def cityurl(self): pass def tianqiparser(self,html_): root = etree.HTML(html_.content,parser = etree.HTMLParser(encoding='utf8')) tianqi_meitian_list=root.xpath('//ul[@class="thrui"]/li') #print(tianqi_meitian_list) return tianqi_meitian_list def tianqiload(self,chengshipinyin,chengshizhongwen,yuefen): header={ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", #"Connection": "keep-alive", #"Cookie":"Hm_lvt_30606b57e40fddacb2c26d2b789efbcb=1661241459,1661389919,1661573346,1661576171; Hm_lpvt_30606b57e40fddacb2c26d2b789efbcb=1661576171; ", "Host": "lishi.tianqi.com", #"Referer": https://lishi.tianqi.com/haerbin/index.html "Referer": "https://lishi.tianqi.com/chaoyang/202202.html", 'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"', #sec-ch-ua: "Chromium";v="21", " Not;A Brand";v="99" "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": '"Windows"', "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "same-origin", "Sec-Fetch-User": "?1", "Upgrade-Insecure-Requests": "1", "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", #(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36}", } meiyueurl="".join([r"https://lishi.tianqi.com/",chengshipinyin,r"/",yuefen,".html"]) #meiyueurl1="".join([r"https://lishi.tianqi.com/shanghai//",chengshipinyin,r"/","202205",".html"]) print(meiyueurl) #html1=requests.post(meiyueurl,headers=header) html=requests.post(meiyueurl,headers=header) time.sleep(2) print("响应状态码",html.status_code) #print (html.text) return(html) #html=requests.get(meiyueurl,headers=header) #yuefenlist1=['201101', '201102', '201103', '201104', '201105', '201106', '201107', '201108', '201109', '201110', '201111', '201112', '201201', '201202', '201203', '201204', '201205', '201206', '201207', '201208', '201209', '201210', '201211', '201212', '201301', '201302', '201303', '201304', '201305', '201306', '201307', '201308', '201309', '201310', '201311', '201312', '201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408', '201409', '201410', '201411', '201412', '201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603', '201604', '201605', '201606', '201607', '201608', '201609', '201610', '201611', '201612', '201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708', '201709', '201710', '201711', '201712', '201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903', '201904', '201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008', '202009', '202010', '202011', '202012', '202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204', '202205', '202206'] #yuefenlist=['201101'] yuefenlist=['201101', '201102', '201103', '201104', '201105', '201106', '201107', '201108', '201109', '201110', '201111', '201112', '201201', '201202', '201203', '201204', '201205', '201206', '201207', '201208', '201209', '201210', '201211', '201212', '201301', '201302', '201303', '201304', '201305', '201306', '201307', '201308', '201309', '201310', '201311', '201312', '201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408', '201409', '201410', '201411', '201412', '201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603', '201604', '201605', '201606', '201607', '201608', '201609', '201610', '201611', '201612', '201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708', '201709', '201710', '201711', '201712', '201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903', '201904', '201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008', '202009', '202010', '202011', '202012', '202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204', '202205', '202206'] cityliebiao=r"D:\BaiduSyncdisk\code\workspace20211212\craw\lishitianqi\城市列表.xlsx" df_city=pd.read_excel(cityliebiao,sheet_name=4) l_cityliebiao=df_city[["ID","城市拼音","城市中文名"]].values.tolist() #建立连接 conn = pymysql.connect( host='127.0.0.1', port=3306, user='root', passwd=passwd,#数据库密码 db='lishitianqi', charset='utf8') #连接数据库 def connect_mysql(conn): #判断链接是否正常 conn.ping(True) #建立操作游标 cursor=conn.cursor() #设置数据输入输出编码格式 cursor.execute('set names utf8') return cursor # 建立链接游标 cur=connect_mysql(conn) if __name__ == '__main__': tianqi=Tianqi() for citypinyin in l_cityliebiao: print(type(citypinyin[1]+citypinyin[2])) print(len(citypinyin[1]+citypinyin[2])) print(citypinyin[1]+citypinyin[2]) a=citypinyin[1]+citypinyin[2] creat_sql="CREATE TABLE IF NOT EXISTS {} (ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,城市拼音 VARCHAR(30),城市中文名 VARCHAR(30),月份 VARCHAR(30),日期 VARCHAR(30),最高气温 VARCHAR(30),最低气温 VARCHAR(30),天气 VARCHAR(30),风向 VARCHAR(30))".format(a) cur.execute(creat_sql) for yuefen in yuefenlist: try: html=tianqi.tianqiload(citypinyin[1], citypinyin[2], yuefen) tianqi_meitian_list=tianqi.tianqiparser(html) for i in tianqi_meitian_list: s=(citypinyin[1], citypinyin[2], yuefen,i.xpath("./div[1]/text()")[0],i.xpath("./div[2]/text()")[0],i.xpath("./div[3]/text()")[0],i.xpath("./div[4]/text()")[0],i.xpath("./div[5]/text()")[0]) print(s,len(s)) #insert_sql="".join(["insert into ",a," (城市拼音,城市中文名,月份,日期,最高气温,最低气温,天气,风向) values (%s)"]) #insert_sql="".join(["insert into ",a," (城市拼音,城市中文名,月份,日期,最高气温,最低气温,天气,风向) values {}"]) insert_sql="insert into {0} (城市拼音,城市中文名,月份,日期,最高气温,最低气温,天气,风向) values {1}".format(a,s) #print(insert_sql) cur.execute(insert_sql.format(s)) except: pass

下载下来的表样式如下(顺便夸一下vscode的PPZ插件,直接查看数据库文件,支持mysql,非常方便):
编辑
step2:简单分析,计算每个城市每月的历史平均高低温度,这里我们暂且认为夏天最高不超过29℃或者冬天最低不低于5℃为相对舒适的天气
#coding=utf8 from enum import unique import pymysql import pandas as pd from sqlalchemy import create_engine conn=pymysql.connect( host="127.0.0.1", port=3306, user="root", passwd=passwd,#数据库密码 db="lishitianqi", charset="utf8" ) def connect_mysql(conn): conn.ping(True) cursor=conn.cursor() cursor.execute("set names utf8") return cursor cur= connect_mysql(conn) cityliebiao=r"D:\BaiduSyncdisk\code\workspace20211212\craw\lishitianqi\城市列表.xlsx" df_city=pd.read_excel(cityliebiao,sheet_name=3) l_cityliebiao=df_city[["ID","城市拼音","城市中文名"]].values.tolist() monthlist=['201101', '201102', '201103', '201104', '201105', '201106', '201107', '201108', '201109', '201110', '201111', '201112', '201201', '201202', '201203', '201204', '201205', '201206', '201207', '201208', '201209', '201210', '201211', '201212', '201301', '201302', '201303', '201304', '201305', '201306', '201307', '201308', '201309', '201310', '201311', '201312', '201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408', '201409', '201410', '201411', '201412', '201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603', '201604', '201605', '201606', '201607', '201608', '201609', '201610', '201611', '201612', '201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708', '201709', '201710', '201711', '201712', '201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903', '201904', '201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008', '202009', '202010', '202011', '202012', '202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204', '202205', '202206'] onlymonthlist=["01","02","03","04","05","06","07","08","09","10","11","12"] city_month=[] for i in l_cityliebiao: try: city=i[1]+i[2] select_meiritianqi="select * from {}".format(city) engine=create_engine("mysql+pymysql://root:solokey@127.0.0.1:3306/lishitianqi?charset=utf8") df1=pd.read_sql(select_meiritianqi,engine) df1.drop_duplicates(subset="日期",keep="first",inplace=True) print(df1) #print (set(df1["月份"].values.tolist())) #monthlist=list(set(df1["月份"].values.tolist())) #monthlist.sort() #print (monthlist) print(len(df1)) for month in onlymonthlist: df2=df1[(df1["月份"].str)[4:]==month] #print(df2) #print(df2["最高气温"].str.split(r"℃")[0]) df2["最高气温int"]=pd.to_numeric((df2["最高气温"].str)[:-1],errors="ignore") df2["最低气温int"]=pd.to_numeric((df2["最低气温"].str)[:-1],errors="ignore") df3=df2[df2['最高气温int']<29] df4=df2[df2['最低气温int']>5] #print(df3) df5=df2[(df2['最高气温int']<29) & (df2['最低气温int']>5)] l=[city,month,df2.shape[0],df3.shape[0],df4.shape[0],df5.shape[0]] print(city,month,df2.shape[0],df3.shape[0],df4.shape[0],df5.shape[0]) city_month.append(l) except: pass df_citymonth=pd.DataFrame(city_month) df_citymonth.columns=["城市","月份","总天数","最高温度小于29的天数","最低温度大于5度的天数","最高温度小于29度且最低温度大于5度的天数"] df_citymonth.to_excel(r"D:\BaiduSyncdisk\code\workspace20211212\craw\lishitianqi\城市每月天气.xlsx")

程序结果如下:
编辑
稍微统计一下:
编辑
果然,夏天最舒适的城市,还是集中在云贵高原这块,等有钱有闲了,夏天去这些凉爽城市长住一段。