Data Cleaning

1. Handling Missing Values

# get the missing data ratio
import numpy as np
missing_values_count = nfl_data.isnull().sum()  # number of missing data points per column
total_missing = missing_values_count.sum()  # total number of missing values
total_cells = np.prod(nfl_data.shape)  # total number of cells in the dataframe
percent_missing = total_missing / total_cells  # fraction of the data that is missing

# drop missing values, or fill them in (imputation)
nfl_data.dropna()  # remove all rows that contain a missing value
nfl_data.dropna(axis=1)  # remove all columns that contain a missing value
subset_nfl_data.fillna(0)  # replace all NA's with 0
subset_nfl_data.fillna(method='bfill', axis=0).fillna(0)  # replace each NA with the value directly after it in the same column, then fill any remaining NA's with 0
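To see how much data each dropping strategy throws away before committing to it, a quick check (a minimal sketch, reusing the nfl_data frame from above):

print("Rows in original dataset: %d" % nfl_data.shape[0])
print("Rows left after dropna(): %d" % nfl_data.dropna().shape[0])
print("Columns in original dataset: %d" % nfl_data.shape[1])
print("Columns left after dropna(axis=1): %d" % nfl_data.dropna(axis=1).shape[1])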

#1.Drop Columns with Missing Values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]  # get names of columns with missing values
reduced_X_train = X_train.drop(cols_with_missing, axis=1)  # drop those columns from the training data
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)  # drop the same columns from the validation data

#2.Imputation
# replace missing values with the mean value along each column
import pandas as pd
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

#3.An Extension to Imputation
X_train_plus = X_train.copy()  # make copies to avoid changing the original data
X_valid_plus = X_valid.copy()
for col in cols_with_missing:  # add new columns indicating what will be imputed
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
# then impute the same way as in method 2
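To decide which approach works best on a given dataset, it helps to score each training/validation pair with the same model. A minimal sketch, assuming a score_dataset helper like the one below and y_train / y_valid target columns (both are assumptions, not defined in these notes):

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    # fit a random forest and report validation MAE (lower is better); assumes y_train/y_valid exist
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

print("MAE (drop columns):", score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
print("MAE (imputation):", score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))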

2. Scaling and Normalization

  • in scaling, you're changing the range of your data
  • in normalization, you're changing the shape of the distribution of your data
from mlxtend.preprocessing import minmax_scaling
from scipy import stats
original_data = np.random.exponential(size=1000)  # generate 1000 data points drawn from an exponential distribution
scaled_data = minmax_scaling(original_data, columns=[0])  # min-max scale the data into the range [0, 1]
normalized_data = stats.boxcox(original_data)  # Box-Cox transform toward a Gaussian distribution; returns (transformed values, lambda)
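A quick visual check makes the difference between the two transforms obvious; a minimal sketch (note that stats.boxcox returns a (values, lambda) tuple, so the transformed data is normalized_data[0]):

import matplotlib.pyplot as plt
import seaborn as sns
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
sns.histplot(original_data, ax=axes[0]).set_title("original (exponential)")
sns.histplot(np.ravel(scaled_data), ax=axes[1]).set_title("min-max scaled")
sns.histplot(normalized_data[0], ax=axes[2]).set_title("Box-Cox normalized")
plt.show()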

3. Parsing Dates

# make Python recognize dates as composed of day, month, and year
import seaborn as sns
import datetime
date_lengths = earthquakes.Date.str.len()  # length of each date string
date_lengths.value_counts()  # count how many dates have each length
indices = np.where([date_lengths == 24])[1]  # row indices whose date string is 24 characters long (a different format)
earthquakes.loc[indices]  # inspect those rows
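The 24-character entries won't parse with the fixed pattern used below. A minimal sketch for fixing them first, assuming those entries are ISO-8601 timestamps (e.g. "1975-02-23T02:58:41.000Z"); the exact offending rows depend on the dataset:

iso_dates = pd.to_datetime(earthquakes.loc[indices, 'Date'])  # assumption: these strings are ISO-8601 timestamps
earthquakes.loc[indices, 'Date'] = iso_dates.dt.strftime("%m/%d/%Y")  # rewrite them to match the common format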
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")  # parse now that every date shares one format
day_of_month_earthquakes = earthquakes['date_parsed'].dt.day  # get the day of the month from the parsed date column
#Plot the days of the month from your earthquake dataset.
day_of_month_earthquakes = day_of_month_earthquakes.dropna()
sns.distplot(day_of_month_earthquakes, kde=False, bins=31)
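sns.distplot is deprecated in recent seaborn releases; the roughly equivalent call with the current API is:

sns.histplot(day_of_month_earthquakes, bins=31)  # histogram only, no KDE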

4. Inconsistent Data Entry

# Text pre-processing
countries = professors['Country'].unique()  # get all the unique values in the 'Country' column
# convert to lower case
professors['Country'] = professors['Country'].str.lower()
# remove leading and trailing white space
professors['Country'] = professors['Country'].str.strip()
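After these two fixes it is worth re-checking which values remain, since near-duplicates (typos, missing spaces) survive lowercasing and stripping; a quick look:

countries = professors['Country'].unique()
countries.sort()  # a sorted view makes near-duplicate spellings easy to spot
countries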

# Fuzzy matching
import fuzzywuzzy
from fuzzywuzzy import process, fuzz
def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    # get a list of unique strings
    strings = df[column].unique()
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]  # keep only matches at or above the ratio threshold
    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)  # boolean mask
    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")  # standardize every close match to "south korea"
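A quick sanity check after the replacement (the exact remaining entries depend on the dataset):

countries = professors['Country'].unique()
'south korea' in countries  # should be True, and the misspelled variants should be gone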

5. Categorical Variables

#1. Drop Categorical Variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)  # names of the categorical (object-dtype) columns
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
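Dropping the object columns gives a baseline to compare the encodings against; a sketch reusing the hypothetical score_dataset helper and y_train / y_valid targets assumed in section 1:

print("MAE (drop categorical columns):", score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))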

#2. Ordinal Encoding
#Ordinal encoding assigns each unique value to a different integer.
from sklearn.preprocessing import OrdinalEncoder
label_X_train = X_train.copy()  # make copies to avoid changing the original data
label_X_valid = X_valid.copy()
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
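Note that this assumes every category in the validation data also appears in the training data; otherwise transform() raises an error. A minimal sketch that encodes only the safely encodable columns and drops the rest (the good_label_cols / bad_label_cols names are illustrative):

# columns whose validation values are a subset of their training values can be safely ordinal-encoded
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]
# the remaining categorical columns would break transform(), so drop them instead
bad_label_cols = list(set(object_cols) - set(good_label_cols))
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])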

#3. One-Hot Encoding
#One-hot encoding creates new columns indicating the presence (or absence) of each possible value in the original data.
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data;
# handle_unknown='ignore' avoids errors when the validation data contains classes not seen in training,
# and sparse=False returns a dense numpy array instead of a sparse matrix
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
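One-hot encoding can explode the number of columns when a categorical variable has many unique values. A common refinement (a sketch; the threshold of 10 is an arbitrary choice, not part of the notes above) is to one-hot encode only the low-cardinality columns and drop the rest:

low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]  # columns worth one-hot encoding
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))  # drop these instead of encoding them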
