寻找共线特征和寻找缺失值

 1 # 寻找共线特征
 2 def remove_collinear_features(x,y, threshold):
 3     '''
 4     Objective:
 5         Remove collinear features in a dataframe with a correlation coefficient
 6         greater than the threshold. Removing collinear features can help a model
 7         to generalize and improves the interpretability of the model.
 8         
 9     Inputs: 
10         threshold: any features with correlations greater than this value are removed
11     
12     Output: 
13         dataframe that contains only the non-highly-collinear features
14     '''
15     
16     # Dont want to remove correlations between Energy Star Score
17     
18     # Calculate the correlation matrix
19     corr_matrix = x.corr()
20     iters = range(len(corr_matrix.columns) - 1)
21     drop_cols = []
22 
23     # Iterate through the correlation matrix and compare correlations
24     for i in iters:
25         for j in range(i):
26             item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
27             col = item.columns
28             row = item.index
29             val = abs(item.values)
30             
31             # If correlation exceeds the threshold
32             if val >= threshold:
33                 # Print the correlated features and the correlation value
34                 print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
35                 drop_cols.append(col.values[0])
36     drops = set(drop_cols)
37     return drops
38     # Drop one of each pair of correlated columns

寻找缺失值

def missing_values_table(df):
    """
    Summarize the missing values of a dataframe, column by column.

    Inputs:
        df: dataframe to inspect

    Output:
        dataframe indexed by column name with two columns, 'Missing Values'
        (count of nulls) and '% of Total Values' (share of rows that are null,
        rounded to one decimal). Only columns with at least one missing value
        are listed, sorted by percentage in descending order.

    Side effect:
        prints the total number of columns and how many have missing values.
    """
    # Total missing values per column (computed once and reused below)
    mis_val = df.isnull().sum()

    # Percentage of missing values per column
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results, naming the two columns directly
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that actually have missing values. Filtering on the
    # count (not the percentage) keeps a zero-row dataframe from reporting
    # every column: its percentage is 0/0 = NaN, and NaN != 0 is True.
    has_missing = mis_val_table['Missing Values'] != 0

    # Sort the table by percentage of missing descending
    mis_val_table = mis_val_table[has_missing].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has {} columns.\n"
          "There are {} columns that have missing values.".format(
              df.shape[1], mis_val_table.shape[0]))

    # Return the dataframe with missing information
    return mis_val_table

 

# Find collinear features
def remove_collinear_features(x, y, threshold):
    """
    Objective:
        Identify collinear features in a dataframe. For every pair of columns
        whose absolute correlation coefficient is greater than or equal to the
        threshold, the later column of the pair is marked for removal.
        Removing collinear features can help a model to generalize and
        improves the interpretability of the model.

    Inputs:
        x: dataframe of features; its ``corr()`` matrix is what gets scanned
        y: unused; kept in the signature so existing callers keep working
        threshold: pairs with |correlation| >= this value count as collinear

    Output:
        set of column names to drop (one per collinear pair). ``x`` itself is
        not modified; dropping the columns is left to the caller.
    """
    # Calculate the correlation matrix
    corr_matrix = x.corr()
    columns = corr_matrix.columns
    rows = corr_matrix.index
    drops = set()

    # Walk the strict upper triangle of the correlation matrix: each column is
    # compared against every column that precedes it. The row range goes up to
    # col_pos - 1 inclusive so that directly adjacent columns are compared too.
    for col_pos in range(1, len(columns)):
        for row_pos in range(col_pos):
            val = abs(corr_matrix.iloc[row_pos, col_pos])

            # NaN correlations compare False here and are therefore skipped.
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(columns[col_pos], "|", rows[row_pos], "|", round(val, 2))
                # Drop the later column of each correlated pair
                drops.add(columns[col_pos])

    return drops

posted @ 2022-03-06 11:13  刘老中医写代码  阅读(37)  评论(0编辑  收藏  举报