Python之天气爬虫
代码已调试通过
# Standard-library and third-party imports.
import random
import re
import time

import pandas as pd

# Request headers sent with every download.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/63.0.3236.0 '
                  'Safari/537.36 '
}

# Monthly history endpoint; filled in with (year, month), month not zero-padded.
URL_TEMPLATE = 'http://tianqi.2345.com/t/wea_history/js/58362_%s%s.js'


def build_urls(first_year=2012, last_year=2018, last_supported_year=2016):
    """Return the list of monthly history URLs to download.

    Years after ``last_supported_year`` are not fetched by this script; a
    notice is printed once for each such year and no URL is generated for it.

    Args:
        first_year: First year to cover (inclusive).
        last_year: Last year to consider (inclusive).
        last_supported_year: Last year for which URLs are generated.

    Returns:
        A list of URL strings, twelve per supported year, in date order.
    """
    urls = []
    for year in range(first_year, last_year + 1):
        if year > last_supported_year:
            print("未获取天气数据")
            continue
        urls.extend(URL_TEMPLATE % (year, month) for month in range(1, 13))
    return urls


def _find(field, text, end=","):
    """Return every value matching ``<field>:'<value>'<end>`` in ``text``."""
    return re.findall(field + r":'(.*?)'" + re.escape(end), text)


def parse_weather(text):
    """Parse one monthly response body into a DataFrame.

    Each field is pulled out with a regular expression. The per-field lists
    have different lengths (several fields occur once per day, others once
    per response), so the frame is built with ``orient='index'`` and then
    transposed, which pads the shorter columns with missing values.

    Args:
        text: Raw response text returned by the history endpoint.

    Returns:
        A DataFrame with the columns city, ymd, high, low, tianqi, fengxiang,
        fengli, aqi, aqiInfo, aqiLevel, maxWendu, minWendu, avgbWendu and
        avgyWendu, in that order.
    """
    aqi = _find("aqi", text)
    if aqi:
        aqi_info = _find("aqiInfo", text)
        aqi_level = _find("aqiLevel", text, "}")
        # With air-quality data present, fengli and avgyWendu are followed
        # by further fields, hence a trailing comma.
        tail = ","
    else:
        # Older responses carry no air-quality data: leave those columns
        # empty. fengli and avgyWendu are then the last field of their
        # object, hence a closing brace.
        aqi_info = []
        aqi_level = []
        tail = "}"

    columns = {
        'city': _find("city", text),
        'ymd': _find("ymd", text),
        # The leading comma keeps these two patterns from also matching
        # the avgbWendu / avgyWendu fields.
        'high': _find(",bWendu", text),
        'low': _find(",yWendu", text),
        'tianqi': _find("tianqi", text),
        'fengxiang': _find("fengxiang", text),
        'fengli': _find("fengli", text, tail),
        'aqi': aqi,
        'aqiInfo': aqi_info,
        'aqiLevel': aqi_level,
        'maxWendu': _find("maxWendu", text),
        # Previously this reused the maxWendu pattern, so the minimum
        # temperature column silently duplicated the maximum.
        'minWendu': _find("minWendu", text),
        'avgbWendu': _find("avgbWendu", text),
        'avgyWendu': _find("avgyWendu", text, tail),
    }
    return pd.DataFrame.from_dict(columns, orient='index').transpose()


def fetch_month(url):
    """Download ``url`` with the module's request headers and return its text."""
    # Imported here so the URL and parsing helpers stay importable on
    # machines that do not have the HTTP client installed.
    import requests

    # The timeout prevents one stalled connection from hanging the whole run.
    return requests.get(url, headers=headers, timeout=30).text


def main():
    """Download every month, merge the results and export them to a CSV file."""
    frames = []
    for url in build_urls():
        response = fetch_month(url)
        print(response)
        frames.append(parse_weather(response))
        # Pause a random few seconds between requests.
        time.sleep(random.randint(3, 6))

    # Merge all monthly tables into one.
    weather = pd.concat(frames)

    # Export; a separate name is used so the ``time`` module is not shadowed.
    timestamp = time.strftime('%Y%m%d%H%M%S')
    weather.to_csv('weather_new' + timestamp + '.csv', index=False)


if __name__ == "__main__":
    main()
运行结果如下: