1. Handling Missing Values
import numpy as np
import pandas as pd
# get the missing data ratio
missing_values_count = nfl_data.isnull().sum()  # number of missing data points per column
total_cells = np.prod(nfl_data.shape)  # total number of cells in the dataframe
total_missing = missing_values_count.sum()  # total number of missing values
percent_missing = total_missing / total_cells  # ratio of missing data
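A single overall ratio hides which columns are worst; a small per-column view (my addition, using only the objects defined above):
percent_missing_by_column = missing_values_count / len(nfl_data) * 100  # percent of missing values in each column
percent_missing_by_column.sort_values(ascending=False).head(10)  # columns with the most missing data first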
# A value can be NaN because it wasn't recorded or because it doesn't exist; that distinction guides whether to drop or impute
nfl_data.dropna()  # remove all the rows that contain a missing value
nfl_data.dropna(axis=1)  # remove all the columns that contain a missing value
subset_nfl_data.fillna(0)  # replace all NA's with 0
subset_nfl_data.fillna(method='bfill', axis=0).fillna(0)  # replace each NA with the value directly after it in the same column, then fill any remaining NA's with 0
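Before dropping anything, it is worth checking how much data would disappear; a quick check on the same nfl_data frame (my addition):
columns_with_na_dropped = nfl_data.dropna(axis=1)
print("Columns in original dataset: %d" % nfl_data.shape[1])
print("Columns with na's dropped: %d" % columns_with_na_dropped.shape[1])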
# 1. Drop Columns with Missing Values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]  # get names of columns with missing values
reduced_X_train = X_train.drop(cols_with_missing, axis=1)  # drop those columns in the training data
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)  # drop the same columns in the validation data
# 2. Imputation: replace missing values with the mean value along each column
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
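SimpleImputer() fills with the column mean by default; the same scikit-learn class accepts other strategies when the mean is a poor fit:
median_imputer = SimpleImputer(strategy='median')  # impute with the column median instead
mode_imputer = SimpleImputer(strategy='most_frequent')  # also works for categorical columns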
# 3. An Extension to Imputation
# make copies to avoid changing the original data
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
for col in cols_with_missing:  # make new columns indicating which values will be imputed
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
# then impute X_train_plus and X_valid_plus the same way as in method 2
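To decide between the three approaches, a small helper that fits a model and reports mean absolute error can be reused for each pair of frames. This is a sketch assuming a regression target in y_train / y_valid; the choice of RandomForestRegressor is only an illustration:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    # fit a simple model and report MAE on the validation set
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)  # approach 1: drop columns
score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)  # approach 2: impute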
2. Scaling and Normalization
In scaling, you're changing the range of your data.
In normalization, you're changing the shape of the distribution of your data.
from mlxtend.preprocessing import minmax_scaling
from scipy import stats
original_data = np.random.exponential(size=1000)  # generate 1000 data points randomly drawn from an exponential distribution
scaled_data = minmax_scaling(original_data, columns=[0])  # min-max scale the data between 0 and 1
normalized_data = stats.boxcox(original_data)  # normalize the exponential data with Box-Cox toward a Gaussian distribution; returns (transformed data, fitted lambda)
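To see what the two transforms actually do, the three distributions can be plotted side by side; this sketch uses seaborn's histplot (the plotting choice is my addition, not part of the original notes):
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 3, figsize=(15, 3))
sns.histplot(original_data, ax=axes[0], kde=True).set_title("original (exponential)")
sns.histplot(scaled_data, ax=axes[1], kde=True).set_title("min-max scaled to [0, 1]")
sns.histplot(normalized_data[0], ax=axes[2], kde=True).set_title("Box-Cox normalized")
plt.show()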
3. Parsing Dates
# Python recognizes dates as objects composed of day, month, and year
import seaborn as sns
import datetime
date_lengths = earthquakes.Date.str.len()  # length of each date string
date_lengths.value_counts()  # count rows per length; most dates are 10 characters (mm/dd/yyyy), a few are 24
indices = np.where([date_lengths == 24])[1]  # positional indices of the rows with the longer format
earthquakes.loc[indices]  # inspect those rows; they need fixing before parsing with a single format (see the sketch below)
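A hedged way to fix those rows in place, assuming the dataframe keeps its default integer index and that the length-24 entries are ISO timestamps such as "1975-02-23T02:58:41.000Z" (as in the Kaggle earthquakes data): rewrite just their date part into mm/dd/yyyy so one format covers every row.
earthquakes.loc[indices, 'Date'] = pd.to_datetime(
    earthquakes.loc[indices, 'Date']).dt.strftime("%m/%d/%Y")  # keep only the date part, reformatted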
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")
day_of_month_earthquakes = earthquakes['date_parsed'].dt.day  # get the day of the month from the parsed date column
# plot the days of the month from the earthquake dataset
day_of_month_earthquakes = day_of_month_earthquakes.dropna()  # remove NA's so the plot doesn't error
sns.distplot(day_of_month_earthquakes, kde=False, bins=31)  # distplot is deprecated in newer seaborn; sns.histplot(day_of_month_earthquakes, bins=31) is the modern equivalent
4. Inconsistent Data Entry
#Text pre-processing
countries = professors['Country'].unique()  # get all the unique values in the 'Country' column
# convert to lower case
professors['Country'] = professors['Country'].str.lower()
# remove leading and trailing white space
professors['Country'] = professors['Country'].str.strip()
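Even after lower-casing and stripping, near-duplicate spellings can remain (the Kaggle professors data has, for example, both "southkorea" and "south korea", though that specific pair is an assumption about the dataset rather than something shown above); re-checking the sorted unique values makes them easy to spot:
countries = professors['Country'].unique()
countries.sort()  # sort alphabetically so near-duplicates sit next to each other
countries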
# Fuzzy matching
import fuzzywuzzy
from fuzzywuzzy import process

def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    # get a list of unique strings
    strings = df[column].unique()
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
    # keep only the matches with a score of at least min_ratio
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]
    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)  # boolean mask
    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")  # collapse close matches into "south korea"
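A quick sanity check after the replacement (a usage note of my own, not from the original):
professors['Country'].unique()  # "south korea" should now cover the former near-duplicate spellings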
5. Categorical Variables
#1. Drop Categorical Variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)  # names of the categorical (object-dtype) columns
drop_X_train = X_train.select_dtypes(exclude=['object'])  # keep only the numerical columns
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
# 2. Ordinal Encoding: assigns each unique value to a different integer
from sklearn.preprocessing import OrdinalEncoder
# make copies to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
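OrdinalEncoder raises an error if the validation data contains a category it never saw during fitting; a common workaround (a sketch assuming the same X_train / X_valid frames) is to encode only the columns whose validation values are a subset of the training values and drop the rest:
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]  # safe to ordinal-encode
bad_label_cols = list(set(object_cols) - set(good_label_cols))        # would break the encoder; drop these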
# 3. One-Hot Encoding: creates new columns indicating the presence (or absence) of each possible value in the original data
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # on scikit-learn >= 1.2, use sparse_output=False instead of sparse=False
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
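One-hot encoding can blow up the column count when a categorical feature has many distinct values, so it is usually applied only to low-cardinality columns; a sketch of that check (the threshold of 10 is an arbitrary illustration):
object_nunique = X_train[object_cols].nunique()  # number of unique values per categorical column
low_cardinality_cols = [col for col in object_cols
                        if X_train[col].nunique() < 10]  # good candidates for one-hot encoding
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))  # consider dropping or ordinal-encoding these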