房价预测《基础版,测试》
# coding=utf8
"""House price prediction (basic version, test).

Pipeline: read the train/test CSV files, log-transform the target,
one-hot encode the categorical columns, mean-impute and standardize the
numeric columns, cross-validate Ridge and RandomForest over a small
hyper-parameter sweep, then average the predictions of both models.
"""
import numpy as np
import pandas as pd


def log_target(train_df):
    """Remove 'SalePrice' from *train_df* (in place) and return log1p of it.

    The raw label is not smooth, so it is "smoothed" (made closer to
    normal) before fitting; skipping this step tends to hold the score
    back. log1p, i.e. log(x + 1), stays finite at x = 0. Whatever transform
    is applied here has to be undone on the predictions in the same way:
    log1p() pairs with expm1(), log() pairs with exp(), and so on.
    """
    return np.log1p(train_df.pop('SalePrice'))


def build_feature_frames(train_df, test_df):
    """Return (train_features, test_features) as model-ready DataFrames.

    Both inputs are stacked so that the dummy columns, the imputation
    means and the scaling statistics are shared between them, then split
    back by their original index labels.

    *train_df* must no longer contain the target column, and both frames
    must contain an 'MSSubClass' column.
    """
    all_df = pd.concat((train_df, test_df), axis=0)

    # MSSubClass is a category encoded with numbers. Numbers carry an
    # ordering/magnitude that would mislead the model, so turn it into a
    # string and let it be one-hot encoded with the other categoricals.
    all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)

    # Columns that are genuinely numeric. Selecting by "is a number" rather
    # than "is not object" keeps string-typed columns out regardless of how
    # pandas stores strings.
    numeric_cols = all_df.select_dtypes(include=[np.number]).columns

    # One-hot encode every categorical column in one go.
    all_dummy_df = pd.get_dummies(all_df)

    # Fill missing values with the column mean.
    all_dummy_df = all_dummy_df.fillna(all_dummy_df.mean())

    # Standardize only the originally numeric columns; the 0/1 dummy
    # columns are left untouched. Whole-column assignment (instead of
    # .loc[:, cols] = ...) lets integer columns become float cleanly.
    numeric_part = all_dummy_df[numeric_cols]
    all_dummy_df[numeric_cols] = (
        (numeric_part - numeric_part.mean()) / numeric_part.std()
    )

    return all_dummy_df.loc[train_df.index], all_dummy_df.loc[test_df.index]


def main():
    """Run the full pipeline and return the averaged test predictions."""
    # Imported here rather than at module level so that the preprocessing
    # helpers above can be imported with only numpy/pandas installed.
    import matplotlib.pyplot as plt
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score

    # The first column (Id) is used as the index, not as a feature.
    train_df = pd.read_csv('./input/train.csv', index_col=0)
    test_df = pd.read_csv('./input/test.csv', index_col=0)

    # Raw vs. log-transformed price side by side, handy for inspection
    # (e.g. prices.hist()); not used by the model code below.
    prices = pd.DataFrame({
        'price': train_df['SalePrice'],
        'log(price + 1)': np.log1p(train_df['SalePrice']),
    })

    y_train = log_target(train_df)
    dummy_train_df, dummy_test_df = build_feature_frames(train_df, test_df)

    X_train = dummy_train_df.values
    X_test = dummy_test_df.values
    print(X_train.shape)

    def cv_rmse(model, folds):
        """Mean cross-validated RMSE of *model* on the training data."""
        neg_mse = cross_val_score(
            model, X_train, y_train, cv=folds,
            scoring='neg_mean_squared_error',
        )
        return np.mean(np.sqrt(-neg_mse))

    # Ridge: sweep the regularization strength.
    alphas = np.logspace(-3, 2, 50)
    test_scores = [cv_rmse(Ridge(alpha=alpha), 10) for alpha in alphas]
    # Separate figure per curve; otherwise the second plot is drawn on top
    # of the first and overwrites its title.
    plt.figure()
    plt.plot(alphas, test_scores)
    plt.xscale('log')  # alphas are log-spaced
    plt.title('Alpha vs CV Error')

    # Random forest: sweep the fraction of features tried per split.
    max_features = [.1, .3, .5, .7, .9, .99]
    test_scores = [
        cv_rmse(
            RandomForestRegressor(n_estimators=200, max_features=max_feat),
            5,
        )
        for max_feat in max_features
    ]
    plt.figure()
    plt.plot(max_features, test_scores)
    plt.title("Max Features vs CV Error")
    plt.show()

    # Ensemble: fit both models and average their predictions. expm1
    # undoes the log1p applied to the target.
    ridge = Ridge(alpha=15)
    rf = RandomForestRegressor(n_estimators=500, max_features=.3)
    ridge.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    y_ridge = np.expm1(ridge.predict(X_test))
    y_rf = np.expm1(rf.predict(X_test))
    y_final = (y_ridge + y_rf) / 2
    return y_final


if __name__ == "__main__":
    y_final = main()
分类:
github项目
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· winform 绘制太阳,地球,月球 运作规律
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)