import datetime
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Select the SimHei font for sans-serif text; the plot title and legend
# labels used further down in this script are Chinese strings.
plt.rcParams.update({"font.sans-serif": ["SimHei"]})

# Step 1: read the input data from a CSV file (path is relative to the
# working directory).
file_path = r"../../机器学习数据/data_temps1.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
# 1.检查数据 - 检查数据的格式,数据的构成, 数据有没有缺失值
# print(df.head()) # 显示前五行数据
# print(df.shape) # 显示数据形状
# print(df.tail())
# print(df.describe()) # 数据描述
# print(df.info)
# print(df.isnull())
# print(df.isnull().sum())
# plt.hist(df["周"], bins=7, linewidth=0.5, edgecolor='white', align='left', alpha=0.6)
# 第二步:数据预处理
# 1.合并年月日
# df['date1'] = df["年"].map(str) + "-" + df["月"].map(str) + "-" + df["日"].map(str)
# print(df)
# 2.时间格式转换
# dates = [datetime.datetime.strptime(date, "%Y-%m-%d") for date in df["date1"]]
# 3.展示数据
# plt.figure(figsize=[12, 8])
# plt.subplot(2, 2, 1)
# plt.plot(dates, df["当天最高温度"])
# plt.title("当天最高温度")
#
# plt.subplot(2, 2, 2)
# plt.plot(dates, df["前一天最高温度"])
# plt.title("前一天最高温度")
#
# plt.subplot(2, 2, 3)
# plt.plot(dates, df["前两天最高温度"])
# plt.title("前两天最高温度")
#
# plt.subplot(2, 2, 4)
# plt.plot(dates, df["当地气象台预测值"])
# plt.title("当地气象台预测值")
#
# plt.show()
# 4. One-hot encode the data. get_dummies is applied to the whole frame;
#    the original note singles out the df["周"] column.
df = pd.get_dummies(df)

# Step 3: split into training and test sets (30% held out for testing,
# shuffled with a fixed seed).
data = train_test_split(df, test_size=0.3, shuffle=True, random_state=100)
train_data, test_data = data  # (training rows, test rows)

# The regression target is the "当天最高温度" column; every other column
# is used as a feature.
target_column = "当天最高温度"

train_label = train_data[target_column]
train_feature = train_data.drop(columns=[target_column])

test_label = test_data[target_column]
test_feature = test_data.drop(columns=[target_column])
# Step 4: modelling. Grid-search a random forest regressor over the number
# of trees, the maximum tree depth and whether bootstrap sampling is used.
n_estimators = list(range(10, 101, 10))  # 10, 20, ..., 100
max_depth = [2, 4]
bootstrap = [True, False]
param_grid = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "bootstrap": bootstrap,
}

# Seed the forest (same seed as the train/test split) so that the selected
# hyper-parameters and the reported scores are reproducible between runs;
# an unseeded forest gives different results every time the script runs.
rf = RandomForestRegressor(random_state=100)

# 5-fold cross-validation over every parameter combination; verbose=5 makes
# the search print its progress.
clf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=5)
clf.fit(train_feature, train_label)

# Report the best parameter combination found by the search.
print(clf.best_params_)
# Step 5: evaluation. Print the fitted search object's score on the
# training set and then on the test set. No `scoring` was passed to
# GridSearchCV, so this is the estimator's default score.
print(clf.score(train_feature, train_label))
print(clf.score(test_feature, test_label))

# Step 6: prediction on the held-out test features.
pre_label = clf.predict(test_feature)
# Convert the label Series to a plain array so it is plotted against its
# position (0..n-1) like the predictions, not against the shuffled index.
test_label = test_label.to_numpy()

# Step 7: visualisation. Draw predicted and true values on the same axes.
plt.plot(pre_label)
plt.plot(test_label)
plt.title("拟合图")
# The labels must be an ordered sequence matching the plot order above
# (predicted first, true second). A set literal has no guaranteed order,
# so the labels could end up attached to the wrong curves.
plt.legend(["预测曲线", "真实曲线"])
plt.show()