1. Handling Missing Values
import numpy as np
import pandas as pd
# get the missing data ratio
missing_values_count = nfl_data.isnull().sum()  # number of missing data points per column
total_cells = np.prod(nfl_data.shape)  # total number of cells in the dataframe
total_missing = missing_values_count.sum()  # total number of missing values
percent_missing = total_missing / total_cells  # ratio of missing data
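A single overall ratio hides which columns are worst; a small per-column view (my addition, using only the objects defined above):
percent_missing_by_column = missing_values_count / len(nfl_data) * 100  # percent of missing values in each column
percent_missing_by_column.sort_values(ascending=False).head(10)  # columns with the most missing data first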
# A value can be NaN because it wasn't recorded or because it doesn't exist; that distinction guides whether to drop or impute
nfl_data.dropna()  # remove all the rows that contain a missing value
nfl_data.dropna(axis=1)  # remove all the columns that contain a missing value
subset_nfl_data.fillna(0)  # replace all NA's with 0
subset_nfl_data.fillna(method='bfill', axis=0).fillna(0)  # replace each NA with the value directly after it in the same column, then fill any remaining NA's with 0
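Before dropping anything, it is worth checking how much data would disappear; a quick check on the same nfl_data frame (my addition):
columns_with_na_dropped = nfl_data.dropna(axis=1)
print("Columns in original dataset: %d" % nfl_data.shape[1])
print("Columns with na's dropped: %d" % columns_with_na_dropped.shape[1])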
# 1. Drop Columns with Missing Values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]  # get names of columns with missing values
reduced_X_train = X_train.drop(cols_with_missing, axis=1)  # drop those columns in the training data
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)  # drop the same columns in the validation data
# 2. Imputation: replace missing values with the mean value along each column
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
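SimpleImputer() fills with the column mean by default; the same scikit-learn class accepts other strategies when the mean is a poor fit:
median_imputer = SimpleImputer(strategy='median')  # impute with the column median instead
mode_imputer = SimpleImputer(strategy='most_frequent')  # also works for categorical columns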
# 3. An Extension to Imputation
# make copies to avoid changing the original data
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
for col in cols_with_missing:  # make new columns indicating which values will be imputed
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
# then impute X_train_plus and X_valid_plus the same way as in method 2
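To decide between the three approaches, a small helper that fits a model and reports mean absolute error can be reused for each pair of frames. This is a sketch assuming a regression target in y_train / y_valid; the choice of RandomForestRegressor is only an illustration:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    # fit a simple model and report MAE on the validation set
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)  # approach 1: drop columns
score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)  # approach 2: impute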
2. Scaling and Normalization
In scaling, you're changing the range of your data.
In normalization, you're changing the shape of the distribution of your data.
from mlxtend.preprocessing import minmax_scaling
from scipy import stats
original_data = np.random.exponential(size=1000)  # generate 1000 data points randomly drawn from an exponential distribution
scaled_data = minmax_scaling(original_data, columns=[0])  # min-max scale the data between 0 and 1
normalized_data = stats.boxcox(original_data)  # normalize the exponential data with Box-Cox toward a Gaussian distribution; returns (transformed data, fitted lambda)
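To see what the two transforms actually do, the three distributions can be plotted side by side; this sketch uses seaborn's histplot (the plotting choice is my addition, not part of the original notes):
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 3, figsize=(15, 3))
sns.histplot(original_data, ax=axes[0], kde=True).set_title("original (exponential)")
sns.histplot(scaled_data, ax=axes[1], kde=True).set_title("min-max scaled to [0, 1]")
sns.histplot(normalized_data[0], ax=axes[2], kde=True).set_title("Box-Cox normalized")
plt.show()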
3. Parsing Dates
# Python recognizes dates as objects composed of day, month, and year
import seaborn as sns
import datetime
date_lengths = earthquakes.Date.str.len()  # length of each date string
date_lengths.value_counts()  # count rows per length; most dates are 10 characters (mm/dd/yyyy), a few are 24
indices = np.where([date_lengths == 24])[1]  # positional indices of the rows with the longer format
earthquakes.loc[indices]  # inspect those rows; they need fixing before parsing with a single format (see the sketch below)
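A hedged way to fix those rows in place, assuming the dataframe keeps its default integer index and that the length-24 entries are ISO timestamps such as "1975-02-23T02:58:41.000Z" (as in the Kaggle earthquakes data): rewrite just their date part into mm/dd/yyyy so one format covers every row.
earthquakes.loc[indices, 'Date'] = pd.to_datetime(
    earthquakes.loc[indices, 'Date']).dt.strftime("%m/%d/%Y")  # keep only the date part, reformatted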
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")
day_of_month_earthquakes = earthquakes['date_parsed'].dt.day  # get the day of the month from the parsed date column
# plot the days of the month from the earthquake dataset
day_of_month_earthquakes = day_of_month_earthquakes.dropna()  # remove NA's so the plot doesn't error
sns.distplot(day_of_month_earthquakes, kde=False, bins=31)  # distplot is deprecated in newer seaborn; sns.histplot(day_of_month_earthquakes, bins=31) is the modern equivalent
4. Inconsistent Data Entry
#Text pre-processing
countries = professors['Country'].unique()  # get all the unique values in the 'Country' column
# convert to lower case
professors['Country'] = professors['Country'].str.lower()
# remove leading and trailing white space
professors['Country'] = professors['Country'].str.strip()
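Even after lower-casing and stripping, near-duplicate spellings can remain (the Kaggle professors data has, for example, both "southkorea" and "south korea", though that specific pair is an assumption about the dataset rather than something shown above); re-checking the sorted unique values makes them easy to spot:
countries = professors['Country'].unique()
countries.sort()  # sort alphabetically so near-duplicates sit next to each other
countries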
# Fuzzy matching
import fuzzywuzzy
from fuzzywuzzy import process

def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    # get a list of unique strings
    strings = df[column].unique()
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
    # keep only the matches with a score of at least min_ratio
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]
    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)  # boolean mask
    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")  # collapse close matches into "south korea"
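A quick sanity check after the replacement (a usage note of my own, not from the original):
professors['Country'].unique()  # "south korea" should now cover the former near-duplicate spellings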
5. Categorical Variables
#1. Drop Categorical Variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)  # names of the categorical (object-dtype) columns
drop_X_train = X_train.select_dtypes(exclude=['object'])  # keep only the numerical columns
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
# 2. Ordinal Encoding: assigns each unique value to a different integer
from sklearn.preprocessing import OrdinalEncoder
# make copies to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
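OrdinalEncoder raises an error if the validation data contains a category it never saw during fitting; a common workaround (a sketch assuming the same X_train / X_valid frames) is to encode only the columns whose validation values are a subset of the training values and drop the rest:
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]  # safe to ordinal-encode
bad_label_cols = list(set(object_cols) - set(good_label_cols))        # would break the encoder; drop these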
# 3. One-Hot Encoding: creates new columns indicating the presence (or absence) of each possible value in the original data
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # on scikit-learn >= 1.2, use sparse_output=False instead of sparse=False
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
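One-hot encoding can blow up the column count when a categorical feature has many distinct values, so it is usually applied only to low-cardinality columns; a sketch of that check (the threshold of 10 is an arbitrary illustration):
object_nunique = X_train[object_cols].nunique()  # number of unique values per categorical column
low_cardinality_cols = [col for col in object_cols
                        if X_train[col].nunique() < 10]  # good candidates for one-hot encoding
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))  # consider dropping or ordinal-encoding these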