爬虫——国内主要城市历史天气

 天气逐渐凉爽,深圳开始步入一年中天气最适宜的半年,前些天汗流浃背的时候,无比向往气候清凉的城市,于是写了个爬虫,下载了全国四百多个主要城市的历史天气,研究了一下到底哪里的天气最舒爽。主要是为了看数据,代码写的很粗糙,先贴出来,抽空再改吧。

step1:建立数据库-从天气网爬取城市数据-写入数据库

#-*- encoding:utf-8 -*-
import requests
import re
from lxml import etree
import time
import csv
import numpy as np
import os,time
import pandas as pd
import xlwt
from statistics import median,mean
from _ast import If
import pymysql

#step1:建立数据库-从天气网爬取城市数据-写入数据库

class Tianqi(object):
    def cityurl(self):
        pass
        
    def tianqiparser(self,html_):
        
        root = etree.HTML(html_.content,parser = etree.HTMLParser(encoding='utf8'))
        tianqi_meitian_list=root.xpath('//ul[@class="thrui"]/li')
        #print(tianqi_meitian_list)
        return tianqi_meitian_list
    def tianqiload(self,chengshipinyin,chengshizhongwen,yuefen):
        header={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            #"Connection": "keep-alive",
            #"Cookie":"Hm_lvt_30606b57e40fddacb2c26d2b789efbcb=1661241459,1661389919,1661573346,1661576171; Hm_lpvt_30606b57e40fddacb2c26d2b789efbcb=1661576171; ",
            "Host": "lishi.tianqi.com",
            #"Referer": https://lishi.tianqi.com/haerbin/index.html
            "Referer": "https://lishi.tianqi.com/chaoyang/202202.html",
            'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
            #sec-ch-ua: "Chromium";v="21", " Not;A Brand";v="99"
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            #(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36}",
            }
        meiyueurl="".join([r"https://lishi.tianqi.com/",chengshipinyin,r"/",yuefen,".html"])
        #meiyueurl1="".join([r"https://lishi.tianqi.com/shanghai//",chengshipinyin,r"/","202205",".html"])
        print(meiyueurl)
        #html1=requests.post(meiyueurl,headers=header)
        html=requests.post(meiyueurl,headers=header)
        time.sleep(2)
        print("响应状态码",html.status_code)
        #print (html.text)
        return(html)
        #html=requests.get(meiyueurl,headers=header)
        
        
    
        
        
#yuefenlist1=['201101', '201102', '201103', '201104', '201105', '201106', '201107', '201108', '201109', '201110', '201111', '201112', '201201', '201202', '201203', '201204', '201205', '201206', '201207', '201208', '201209', '201210', '201211', '201212', '201301', '201302', '201303', '201304', '201305', '201306', '201307', '201308', '201309', '201310', '201311', '201312', '201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408', '201409', '201410', '201411', '201412', '201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603', '201604', '201605', '201606', '201607', '201608', '201609', '201610', '201611', '201612', '201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708', '201709', '201710', '201711', '201712', '201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903', '201904', '201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008', '202009', '202010', '202011', '202012', '202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204', '202205', '202206']
#yuefenlist=['201101']
yuefenlist=['201101', '201102', '201103', '201104', '201105', '201106', '201107', '201108', '201109', '201110', '201111', '201112', '201201', '201202', '201203', '201204', '201205', '201206', '201207', '201208', '201209', '201210', '201211', '201212', '201301', '201302', '201303', '201304', '201305', '201306', '201307', '201308', '201309', '201310', '201311', '201312', '201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408', '201409', '201410', '201411', '201412', '201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603', '201604', '201605', '201606', '201607', '201608', '201609', '201610', '201611', '201612', '201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708', '201709', '201710', '201711', '201712', '201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903', '201904', '201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008', '202009', '202010', '202011', '202012', '202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204', '202205', '202206']

cityliebiao=r"D:\BaiduSyncdisk\code\workspace20211212\craw\lishitianqi\城市列表.xlsx"
df_city=pd.read_excel(cityliebiao,sheet_name=4)
l_cityliebiao=df_city[["ID","城市拼音","城市中文名"]].values.tolist()

#建立连接
conn = pymysql.connect(
      host='127.0.0.1',
      port=3306,
      user='root',
      passwd=passwd,#数据库密码
      db='lishitianqi',
      charset='utf8')
#连接数据库
def connect_mysql(conn):
    #判断链接是否正常
    conn.ping(True)
    #建立操作游标
    cursor=conn.cursor()
    #设置数据输入输出编码格式
    cursor.execute('set names utf8')
    return cursor
# 建立链接游标
cur=connect_mysql(conn)


if __name__ == '__main__':
    tianqi=Tianqi()
    
    for citypinyin in l_cityliebiao:
        print(type(citypinyin[1]+citypinyin[2]))
        print(len(citypinyin[1]+citypinyin[2]))
        print(citypinyin[1]+citypinyin[2])
        a=citypinyin[1]+citypinyin[2]
        creat_sql="CREATE TABLE IF NOT EXISTS  {}  (ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,城市拼音 VARCHAR(30),城市中文名 VARCHAR(30),月份  VARCHAR(30),日期 VARCHAR(30),最高气温  VARCHAR(30),最低气温  VARCHAR(30),天气 VARCHAR(30),风向  VARCHAR(30))".format(a)
        
        cur.execute(creat_sql)
        for yuefen in yuefenlist:
            try:
                html=tianqi.tianqiload(citypinyin[1], citypinyin[2], yuefen)
                tianqi_meitian_list=tianqi.tianqiparser(html)
                for i in tianqi_meitian_list:
                    
                    s=(citypinyin[1], citypinyin[2],  yuefen,i.xpath("./div[1]/text()")[0],i.xpath("./div[2]/text()")[0],i.xpath("./div[3]/text()")[0],i.xpath("./div[4]/text()")[0],i.xpath("./div[5]/text()")[0])
                    print(s,len(s))
                    #insert_sql="".join(["insert into ",a,"  (城市拼音,城市中文名,月份,日期,最高气温,最低气温,天气,风向) values (%s)"])
                    #insert_sql="".join(["insert into ",a,"  (城市拼音,城市中文名,月份,日期,最高气温,最低气温,天气,风向) values {}"])
                    insert_sql="insert into {0} (城市拼音,城市中文名,月份,日期,最高气温,最低气温,天气,风向) values {1}".format(a,s)
                    #print(insert_sql)
                    cur.execute(insert_sql.format(s))
            except:
                pass

  

下载下来的表样式如下(顺便夸一下vscode的PPZ插件,直接查看数据库文件,支持mysql,非常方便):

编辑

 step2:简单分析,计算每个城市每月的历史平均高低温度,这里我们暂且认为夏天最高不超过29℃或者冬天最低不低于5℃为相对舒适的天气

#coding=utf8
from enum import unique
import pymysql
import pandas as pd
from sqlalchemy import create_engine



conn=pymysql.connect(
    host="127.0.0.1",
    port=3306,
    user="root",
    passwd=passwd,#数据库密码
    db="lishitianqi",
    charset="utf8"

)
def connect_mysql(conn):
    conn.ping(True)
    cursor=conn.cursor()
    cursor.execute("set names utf8")
    return cursor
cur= connect_mysql(conn)
cityliebiao=r"D:\BaiduSyncdisk\code\workspace20211212\craw\lishitianqi\城市列表.xlsx"
df_city=pd.read_excel(cityliebiao,sheet_name=3)
l_cityliebiao=df_city[["ID","城市拼音","城市中文名"]].values.tolist()

monthlist=['201101', '201102', '201103', '201104', '201105', '201106', '201107', '201108', '201109', '201110', '201111', '201112', '201201', '201202', '201203', '201204', '201205', '201206', '201207', '201208', '201209', '201210', '201211', '201212', '201301', '201302', '201303', '201304', '201305', '201306', '201307', '201308', '201309', '201310', '201311', '201312', '201401', '201402', '201403', '201404', '201405', '201406', '201407', '201408', '201409', '201410', '201411', '201412', '201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512', '201601', '201602', '201603', '201604', '201605', '201606', '201607', '201608', '201609', '201610', '201611', '201612', '201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708', '201709', '201710', '201711', '201712', '201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903', '201904', '201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008', '202009', '202010', '202011', '202012', '202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204', '202205', '202206']
onlymonthlist=["01","02","03","04","05","06","07","08","09","10","11","12"]
city_month=[]

for i in l_cityliebiao:
    try:
        city=i[1]+i[2]
        select_meiritianqi="select * from {}".format(city)
        engine=create_engine("mysql+pymysql://root:solokey@127.0.0.1:3306/lishitianqi?charset=utf8")
        df1=pd.read_sql(select_meiritianqi,engine)
        df1.drop_duplicates(subset="日期",keep="first",inplace=True)
        print(df1)
        #print (set(df1["月份"].values.tolist()))
        #monthlist=list(set(df1["月份"].values.tolist()))
        #monthlist.sort()
        #print (monthlist)
        print(len(df1))
        for month in onlymonthlist:
            df2=df1[(df1["月份"].str)[4:]==month]
            #print(df2)
            #print(df2["最高气温"].str.split(r"℃")[0])
            df2["最高气温int"]=pd.to_numeric((df2["最高气温"].str)[:-1],errors="ignore")
            df2["最低气温int"]=pd.to_numeric((df2["最低气温"].str)[:-1],errors="ignore")
            df3=df2[df2['最高气温int']<29]
            df4=df2[df2['最低气温int']>5]
            #print(df3)
            df5=df2[(df2['最高气温int']<29) & (df2['最低气温int']>5)]
            l=[city,month,df2.shape[0],df3.shape[0],df4.shape[0],df5.shape[0]]
            print(city,month,df2.shape[0],df3.shape[0],df4.shape[0],df5.shape[0])
            city_month.append(l)

    except:
        pass

df_citymonth=pd.DataFrame(city_month) 

df_citymonth.columns=["城市","月份","总天数","最高温度小于29的天数","最低温度大于5度的天数","最高温度小于29度且最低温度大于5度的天数"]
df_citymonth.to_excel(r"D:\BaiduSyncdisk\code\workspace20211212\craw\lishitianqi\城市每月天气.xlsx")

  

程序结果如下:

编辑

稍微统计一下:

编辑

果然,夏天最舒适的城市,还是集中在云贵高原这块,等有钱有闲了,夏天去这些凉爽城市长住一段。

posted @ 2022-10-27 16:07  __白衣渡江  阅读(305)  评论(0)    收藏  举报