处理缺失值
import pandas as pd
import numpy as np
# Load the dataset, using the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer="./Narrativedata.csv", index_col=0)
# Print dtypes and non-null counts per column to spot missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 714 non-null float64
1 Sex 891 non-null object
2 Embarked 889 non-null object
3 Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Impute missing ages with the column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the df["Age"] selection: that chained in-place call
# raises a FutureWarning on recent pandas and leaves df unchanged when
# Copy-on-Write is enabled, so the rows would be dropped below instead of
# being imputed.
df["Age"] = df["Age"].fillna(df["Age"].mean())
# Drop the rows that still contain a missing value in any column.
df.dropna(axis=0, inplace=True)
df.shape
(889, 4)
preprocessing.LabelEncoder:标签专用,能够将分类转换为分类数值
from sklearn.preprocessing import LabelEncoder

# The label is the last column of df.
label = df.iloc[:, -1]
# Learn the class vocabulary first, then map every label to its integer code.
le = LabelEncoder().fit(label)
le_data = le.transform(label)
# Overwrite the text labels with their integer codes.
df["Survived"] = le_data
df.shape
(889, 4)
# Map the integer codes back to the original labels and preview the first rows.
pd.DataFrame(le.inverse_transform(le_data)).head()
|   | 0   |
|---|-----|
| 0 | No  |
| 1 | Yes |
| 2 | Yes |
| 3 | Yes |
| 4 | No  |
# Distinct labels seen during fit; a label's position is its integer code.
le.classes_
array(['No', 'Unknown', 'Yes'], dtype=object)
preprocessing.OrdinalEncoder:特征专用,能够将分类特征转换为分类数值
from sklearn.preprocessing import OrdinalEncoder

# Work on a deep copy so the original frame is left untouched.
df_ = df.copy(deep=True)
# Categorical feature columns: everything between the first and last column.
feature = df_.iloc[:, 1:-1]
# Learn the categories of each column, then encode them as ordinal codes.
ordinal = OrdinalEncoder(categories="auto").fit(feature)
result = ordinal.transform(feature)
ordinal.categories_
[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]
# Names of the columns the encoder was fitted on.
ordinal.feature_names_in_
array(['Sex', 'Embarked'], dtype=object)
# Shape of df itself; the ordinal encoder was fitted on a slice of the copy df_.
df.shape
(889, 4)
preprocessing.OneHotEncoder:独热编码,创建哑变量
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical feature columns (all but the first and last).
onehot = OneHotEncoder(categories="auto")
# fit_transform returns a sparse matrix; densify it for inspection.
result = onehot.fit_transform(df.iloc[:, 1:-1]).toarray()
result
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.],
...,
[1., 0., 0., 0., 1.],
[0., 1., 1., 0., 0.],
[0., 1., 0., 1., 0.]])
# Names of the generated dummy columns, in the column order of the encoded matrix.
onehot.get_feature_names_out()
array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
dtype=object)
# Wrap the dense one-hot matrix in a DataFrame to inspect its dimensions.
features = pd.DataFrame(data=result)
features.shape
(889, 5)
# Attach the one-hot columns to the original frame.
# pd.concat(axis=1) aligns rows by index label, not by position. df still
# carries its original labels (with gaps left by the earlier dropna), whereas a
# bare pd.DataFrame(result) gets a fresh 0..n-1 RangeIndex. Concatenating those
# two pairs one-hot rows with the wrong records and creates NaN rows, which a
# later dropna silently discards. Reusing df.index keeps the rows matched
# one-to-one.
onehot_frame = pd.DataFrame(
    result,
    index=df.index,
    columns=onehot.get_feature_names_out(),
)
data_full = pd.concat([df, onehot_frame], axis=1)
# No dropna is needed here: both inputs are complete and share the same index,
# so the concatenation cannot introduce missing values.
data_full.head()
|   | Age  | Sex    | Embarked | Survived | 0   | 1   | 2   | 3   | 4   |
|---|------|--------|----------|----------|-----|-----|-----|-----|-----|
| 0 | 22.0 | male   | S        | 0.0      | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 1 | 38.0 | female | C        | 2.0      | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 26.0 | female | S        | 2.0      | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | 35.0 | female | S        | 2.0      | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 4 | 35.0 | male   | S        | 0.0      | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
# Remove the raw categorical columns now that their one-hot versions exist.
data_full = data_full.drop(columns=["Sex", "Embarked"])
# Give every remaining column a descriptive name (assigned positionally).
data_full.columns = [
    "Age",
    "Survived",
    "Sex_female",
    "Sex_male",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S",
]
data_full.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 887 entries, 0 to 888
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 887 non-null float64
1 Survived 887 non-null float64
2 Sex_female 887 non-null float64
3 Sex_male 887 non-null float64
4 Embarked_C 887 non-null float64
5 Embarked_Q 887 non-null float64
6 Embarked_S 887 non-null float64
dtypes: float64(7)
memory usage: 55.4 KB