寻找共线特征和寻找缺失值

 1 # 寻找共线特征
 2 def remove_collinear_features(x,y, threshold):
 3     '''
 4     Objective:
 5         Remove collinear features in a dataframe with a correlation coefficient
 6         greater than the threshold. Removing collinear features can help a model
 7         to generalize and improves the interpretability of the model.
 8         
 9     Inputs: 
10         threshold: any features with correlations greater than this value are removed
11     
12     Output: 
13         dataframe that contains only the non-highly-collinear features
14     '''
15     
16     # Dont want to remove correlations between Energy Star Score
17     
18     # Calculate the correlation matrix
19     corr_matrix = x.corr()
20     iters = range(len(corr_matrix.columns) - 1)
21     drop_cols = []
22 
23     # Iterate through the correlation matrix and compare correlations
24     for i in iters:
25         for j in range(i):
26             item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
27             col = item.columns
28             row = item.index
29             val = abs(item.values)
30             
31             # If correlation exceeds the threshold
32             if val >= threshold:
33                 # Print the correlated features and the correlation value
34                 print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
35                 drop_cols.append(col.values[0])
36     drops = set(drop_cols)
37     return drops
38     # Drop one of each pair of correlated columns

寻找缺失值

def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a two-line summary (total column count and number of columns
    with missing values) and returns a table with one row per column that
    has at least one missing value.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns
        ``'Missing Values'`` (count of nulls) and ``'% of Total Values'``
        (percentage of rows that are null, rounded to one decimal), sorted
        by percentage in descending order.
    """
    # Count nulls once and keep only the columns that actually have some.
    # Filtering on the count rather than on the percentage keeps a zero-row
    # frame from reporting every column as missing (0 / 0 is NaN, and
    # NaN != 0 is True).
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]

    # Percentage of rows that are null for each remaining column.
    null_percent = 100 * null_counts / len(df)

    # Build the table with its final column names directly.
    summary = pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Sort by percentage of missing values, descending.
    summary = summary.sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information.
    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {summary.shape[0]} columns that have missing values.")

    return summary

# Find collinear features
def remove_collinear_features(x, y, threshold):
    """Find features that are highly correlated with an earlier feature.

    Every pair of columns in ``x.corr()`` is inspected once (upper triangle,
    diagonal excluded). When the absolute correlation of a pair is at or
    above ``threshold``, the pair is printed and the later of the two
    columns is marked for removal.

    Args:
        x: DataFrame whose pairwise column correlations are computed with
            ``x.corr()``.
        y: Not used by this function; kept so existing callers that pass a
            target keep working.
        threshold: Absolute correlation at or above which the later column
            of a pair is marked for removal.

    Returns:
        set: Names of the columns to drop. Nothing is removed from ``x``;
        the caller drops one column of each correlated pair.
    """
    corr_matrix = x.corr()
    names = corr_matrix.columns
    drop_cols = set()

    # Walk the strict upper triangle: each column is compared with every
    # column before it, including its immediate neighbour. The earlier
    # ``range(i)`` form skipped that neighbour, so adjacent pairs were
    # never checked.
    for col_pos in range(1, len(names)):
        for row_pos in range(col_pos):
            val = abs(corr_matrix.iloc[row_pos, col_pos])

            # NaN correlations compare False here and are left alone.
            if val >= threshold:
                # Report the correlated pair and its correlation value.
                print(names[col_pos], "|", corr_matrix.index[row_pos], "|",
                      round(val, 2))
                drop_cols.add(names[col_pos])

    return drop_cols

posted @ 2022-03-06 11:13 刘老中医写代码阅读(38) 评论(0) 编辑收藏举报

刷新页面返回顶部

刘老中医

寻找共线特征和寻找缺失值

公告