import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
# --- Load the Boston housing data and draw random missing-cell coordinates ---
boston = pd.read_csv("./boston_house_prices.csv")
boston_target = pd.DataFrame(boston["MEDV"])  # target: median home value
boston_data = pd.DataFrame(boston.iloc[:, boston.columns != "MEDV"])
# Rename columns to plain integer positions so features and target share
# a uniform 0..k labeling scheme used throughout the rest of the script.
boston_target.columns = [0]
boston_data.columns = range(len(boston_data.columns))
n_samples = boston_data.shape[0]
n_feature = boston_data.shape[1]
rng = np.random.RandomState(0)  # fixed seed for reproducible missingness
missing_rate = 0.5
# Blank out 50% of all (sample, feature) cells (duplicates in the random
# draw mean the realized count of NaNs per column is slightly lower).
n_missing_sample = int(np.floor(n_samples * n_feature * missing_rate))
# Random (column, row) coordinates of the cells that will become NaN.
missing_feature = rng.randint(0, n_feature, n_missing_sample)
missing_sample = rng.randint(0, n_samples, n_missing_sample)
# --- Inject NaNs at the random coordinates, then build two baselines ---
x_missing = boston_data.copy()
x_missing = np.array(x_missing)  # fancy (row, col) assignment needs ndarray
x_missing[missing_sample, missing_feature] = np.nan
x_missing = pd.DataFrame(x_missing)

# Baseline 1: replace each NaN with its column mean.
im_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
x_missing_mean = im_mean.fit_transform(x_missing)
# Baseline 2: replace each NaN with the constant 0.
im_0 = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
x_missing_0 = im_0.fit_transform(x_missing)
# --- Regression imputation: predict each column's NaNs with a random forest
# --- trained on all the other columns plus the true target.
x_missing_reg = x_missing.copy()

# Fill columns from fewest to most missing values: when the hardest columns
# are reached, most other columns already carry (partially imputed) signal.
sortindex = np.argsort(x_missing_reg.isnull().sum(axis=0)).values
for i in sortindex:
    df = x_missing_reg
    fillc = df.iloc[:, i]  # the column being imputed this round
    # Design matrix: every OTHER feature column plus the target, side by
    # side.  NOTE: this must be axis=1 (column-wise); axis=0 would stack
    # the target underneath the features and corrupt the matrix.
    df = pd.concat([df.iloc[:, df.columns != i], boston_target], axis=1)
    # Any NaNs still present in the design matrix are temporarily zeroed.
    df_0 = SimpleImputer(missing_values=np.nan, strategy="constant",
                         fill_value=0).fit_transform(df)
    # Rows where the column is known train the forest; NaN rows are predicted.
    y_train = fillc[fillc.notna()]
    y_test = fillc[fillc.isna()]
    x_train = df_0[y_train.index, :]
    x_test = df_0[y_test.index, :]
    rfc = RandomForestRegressor(n_estimators=100)
    rfc.fit(x_train, y_train)
    Ypredict = rfc.predict(x_test)
    # Write the predictions back in place so later iterations see them.
    x_missing_reg.loc[x_missing_reg.iloc[:, i].isna(), i] = Ypredict
# --- Compare the four datasets by 10-fold cross-validated MSE and plot ---
X = [boston_data, x_missing_mean, x_missing_0, x_missing_reg]
mse = []
for x in X:
    rfc = RandomForestRegressor(random_state=0, n_estimators=100)
    # cross_val_score returns NEGATIVE MSE, so negate the mean to get MSE.
    score = cross_val_score(rfc, x, np.array(boston_target).ravel(),
                            cv=10, scoring="neg_mean_squared_error").mean()
    mse.append(score * -1)

# Labels in the SAME order as X above (full, mean, zero, regressor).
# The original listed Zero before Mean, mislabeling those two bars.
x_labels = ['Full data',
            'Mean Imputation',
            'Zero Imputation',
            'Regressor Imputation']
colors = ['r', 'g', 'b', 'orange']

plt.figure(figsize=(12, 6))
ax = plt.subplot(111)
for i in np.arange(len(mse)):
    ax.barh(i, mse[i], color=colors[i], alpha=0.6, align='center')
ax.set_title('Imputation Techniques with Boston Data')
# Zoom the x-axis around the observed MSE range so differences are visible.
ax.set_xlim(left=np.min(mse) * 0.9, right=np.max(mse) * 1.1)
ax.set_yticks(np.arange(len(mse)))
ax.set_xlabel('MSE')
ax.set_yticklabels(x_labels)
plt.show()