寻找共线特征和寻找缺失值
# Find collinear features
def remove_collinear_features(x, y, threshold):
    '''
    Objective:
        Find collinear features in a dataframe, i.e. columns whose absolute
        correlation with an earlier column is greater than or equal to the
        threshold. Removing collinear features can help a model to
        generalize and improves the interpretability of the model.

    Inputs:
        x: dataframe of features; pairwise correlations come from x.corr()
        y: not used by this function; kept so existing callers keep working.
           (The original note says correlations with the Energy Star Score
           should not be removed - presumably y is that target; confirm
           against the caller.)
        threshold: any later column of a pair whose absolute correlation is
           greater than or equal to this value is marked for removal

    Output:
        set of column labels to drop. The dataframe x itself is not modified;
        dropping the returned columns is left to the caller.
    '''
    # Calculate the correlation matrix once; only its magnitude matters.
    corr_matrix = x.corr()
    labels = corr_matrix.columns
    abs_corr = abs(corr_matrix.values)

    drops = set()

    # Walk the strict upper triangle: every column is compared with ALL
    # columns before it. The previous loop bounds stopped one row short and
    # never compared a column with its immediate predecessor.
    for col_pos in range(1, len(labels)):
        for row_pos in range(col_pos):
            val = abs_corr[row_pos, col_pos]

            # If correlation reaches the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(labels[col_pos], "|", labels[row_pos], "|", round(val, 2))
                # Mark only the later column, so one feature of each
                # correlated pair is kept.
                drops.add(labels[col_pos])

    return drops
寻找缺失值
def missing_values_table(df):
    '''
    Objective:
        Summarize the missing values of a dataframe, one row per column that
        has at least one missing value.

    Inputs:
        df: dataframe to inspect (not modified)

    Output:
        dataframe indexed by the column names of df with two columns,
        'Missing Values' (count) and '% of Total Values' (percentage of rows),
        sorted by percentage in descending order and rounded to one decimal.
        A short summary is also printed.
    '''
    # Total missing values per column (computed once and reused below)
    mis_val = df.isnull().sum()

    # Percentage of missing values; for a zero-row frame this is NaN
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results and give the columns readable names
    summary = pd.concat([mis_val, mis_val_percent], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that really have missing values. Filtering on the
    # count instead of "percentage != 0" avoids reporting every column of a
    # zero-row frame, whose NaN percentages compare unequal to 0.
    summary = summary[summary['Missing Values'] > 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return summary
# Find collinear features
def remove_collinear_features(x, y, threshold):
    '''
    Objective:
        Find collinear features in a dataframe, i.e. columns whose absolute
        correlation with an earlier column is greater than or equal to the
        threshold. Removing collinear features can help a model to
        generalize and improves the interpretability of the model.

    Inputs:
        x: dataframe of features; pairwise correlations come from x.corr()
        y: not used by this function; kept so existing callers keep working.
           (The original note says correlations with the Energy Star Score
           should not be removed - presumably y is that target; confirm
           against the caller.)
        threshold: any later column of a pair whose absolute correlation is
           greater than or equal to this value is marked for removal

    Output:
        set of column labels to drop. The dataframe x itself is not modified;
        dropping the returned columns is left to the caller.
    '''
    # Calculate the correlation matrix once; only its magnitude matters.
    corr_matrix = x.corr()
    labels = corr_matrix.columns
    abs_corr = abs(corr_matrix.values)

    drops = set()

    # Walk the strict upper triangle: every column is compared with ALL
    # columns before it. The previous loop bounds stopped one row short and
    # never compared a column with its immediate predecessor.
    for col_pos in range(1, len(labels)):
        for row_pos in range(col_pos):
            val = abs_corr[row_pos, col_pos]

            # If correlation reaches the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(labels[col_pos], "|", labels[row_pos], "|", round(val, 2))
                # Mark only the later column, so one feature of each
                # correlated pair is kept.
                drops.add(labels[col_pos])

    return drops