# 异常点分析 (Outlier analysis)
# ---------------------------------------------------------------------------
# Outlier inspection
#
# Each section scatter-plots one feature against 'Sold Price' and, where a
# mask is given, labels the selected points with their 'Id'.  The numbers
# after each section title are carried over from the original section headers
# (presumably the Ids spotted as outliers -- confirm against the plots).
# ---------------------------------------------------------------------------


def _exceeds(column, limit):
    """Return a boolean mask over ``train_df`` rows where ``column`` exceeds ``limit``.

    Values are floored to whole numbers first, which matches the ``int()``
    truncation the thresholds were originally applied with (every limit used
    here is positive, where flooring and truncation agree).  Missing values
    compare as False, so rows holding NaN are skipped instead of raising
    ``ValueError`` from ``int(nan)``.
    """
    return train_df[column].floordiv(1).gt(limit)


def _plot_against_sold_price(feature, flagged=None):
    """Scatter ``feature`` against 'Sold Price' and optionally label flagged rows.

    Args:
        feature: Name of the ``train_df`` column plotted on the y axis.
        flagged: Optional boolean mask aligned with ``train_df``; every
            selected row gets its 'Id' drawn next to its point.

    Returns:
        The two-column frame ('Sold Price', ``feature``) that was plotted.
    """
    pair_df = train_df.loc[:, ["Sold Price", feature]]
    plt.figure(figsize=(20, 6))
    sns.scatterplot(data=pair_df, x="Sold Price", y=feature)
    if flagged is not None:
        # Visit only the flagged rows rather than materialising a row Series
        # with ``iloc`` several times for every row of the training set.
        labelled = train_df.loc[flagged, ["Sold Price", feature, "Id"]]
        for sold, value, row_id in zip(
            labelled["Sold Price"], labelled[feature], labelled["Id"]
        ):
            # Shift the label 200 price units to the right of the marker.
            plt.text(sold + 200, value, row_id)
    plt.show()
    return pair_df


# Rows whose sold price is above 50M are labelled in every annotated plot.
_high_sold_price = _exceeds("Sold Price", 50000000)

# Bathrooms
all_df = _plot_against_sold_price("Bathrooms")

# Tax assessed value -- 3674 43398
all_df = _plot_against_sold_price("Tax assessed value", _high_sold_price)

# Annual tax amount -- 3674 43398
all_df = _plot_against_sold_price("Annual tax amount", _high_sold_price)

# Listed Price -- 3674 43398 44633
all_df = _plot_against_sold_price(
    "Listed Price", _high_sold_price | _exceeds("Listed Price", 100000000)
)

# Last Sold Price -- 3674 43398 44633
all_df = _plot_against_sold_price(
    "Last Sold Price", _high_sold_price | _exceeds("Last Sold Price", 28000000)
)
# Drop the rows at these index labels (presumably the outliers found during
# the inspection -- confirm), then renumber the remaining rows from zero.
train1 = train.drop(
    index=[3674, 6055, 32867, 34876, 43398, 44091, 44633]
).reset_index(drop=True)

# Separate the target column from the predictor columns.
y = train1["Sold Price"]
train_features = train1.drop(columns="Sold Price")

# Stack the training and test predictors into one frame with a fresh index.
features = pd.concat([train_features, test], ignore_index=True)

# Notebook-style display of the combined shape; has no effect in a script.
features.shape