import numpy as np
import pandas as pd
df = pd.read_csv('train.csv')
df.head(3)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
缺省值查看
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
df.head(6)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
---|
df.drop_duplicates().head(4)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
df.to_csv('test_clear.csv')
df.head(5)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | AgeBand |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 3 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
---|
df = pd.read_csv('test_clear.csv')
df['AgeBand'] = pd.cut(df['Age'],[0,5,15,30,50,80],labels = ['1','2','3','4','5'])
df.head(5)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | AgeBand |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 3 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
---|
df.to_csv('test_cut.csv')
df['AgeBand'] = pd.cut(df['Age'],5,labels = ['1','2','3','4','5'])
df.head(5)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | AgeBand |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 3 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 2 |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 3 |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 3 |
---|
pd.value_counts(df['AgeBand'])
2 346
3 188
1 100
4 69
5 11
Name: AgeBand, dtype: int64
df.to_csv('test_ave.csv')
df['AgeBand'] = pd.qcut(df['Age'],[0,0.1,0.3,0.5,0.7,1.],labels = ['1','2','3','4','5'])
df.head(5)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | AgeBand |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
---|
pd.value_counts(df['AgeBand'])
5 195
4 157
2 154
3 131
1 77
Name: AgeBand, dtype: int64
df.to_csv('test_ave.csv')
df['Sex'].value_counts()
male 577
female 314
Name: Sex, dtype: int64
df['Sex_num'] = df['Sex'].replace(['female','male'],[2,1])
df.head(5)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | AgeBand | Sex_num |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 | 1 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 | 2 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 | 2 |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 | 2 |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 | 1 |
---|
df['Embarked'].value_counts()
S 644
C 168
Q 77
Name: Embarked, dtype: int64
df['Embarked_num'] = df['Embarked'].map({'S' : 1, 'C' : 2, 'Q': 3})
df.head(3)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | AgeBand | Sex_num | Embarked_num |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 | 1 | 1.0 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 | 2 | 2.0 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 | 2 | 1.0 |
---|
缺省值的处理
- dropna 根据每个标签的值是否是缺失数据来筛选轴标签,并根据允许丢失的数据量来确定阈值
- fillna 用某些值来填充缺失的数据或使用插值方法
参数: value标量值,或者字典型对象用于填充缺失值
method插值方法,如果没有其他参数,默认‘ffill’
limit 前向或者后向填充时的最大填充范围
inplace 修改被调用的对象,而不是生成一个备份
axis 需要填充的轴,默认axis = 0 - isnull 返回表明那些值是缺失值的布尔值
- notnull isnull的反函数
df[df['Age'] == np.nan] = 0
df.head(6)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
---|
df['Age'].isnull()
0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
from numpy import nan as NA
data = pd.DataFrame(np.random.randn(7,3))
data.iloc[4:6,0]= NA
data.iloc[3:,1] = NA
data.iloc[5:,2] = NA
data
| 0 | 1 | 2 |
---|
0 | 0.255784 | 2.167973 | -0.583178 |
---|
1 | -0.066963 | 1.567319 | -0.213410 |
---|
2 | -2.612350 | 0.314180 | -0.843624 |
---|
3 | -0.087889 | NaN | 0.447406 |
---|
4 | NaN | NaN | -0.022148 |
---|
5 | NaN | NaN | NaN |
---|
6 | 2.363004 | NaN | NaN |
---|
data.dropna()
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
data.dropna(how = 'all')
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
3 | -0.368937 | NaN | 0.039350 |
---|
4 | NaN | NaN | 0.018442 |
---|
6 | -1.392818 | NaN | NaN |
---|
data.dropna(thresh = 2)
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
3 | -0.368937 | NaN | 0.039350 |
---|
data.dropna(thresh = 5,axis = 1)
| 0 | 2 |
---|
0 | 1.211654 | 0.469220 |
---|
1 | 1.147614 | 0.884146 |
---|
2 | -0.652302 | 1.268530 |
---|
3 | -0.368937 | 0.039350 |
---|
4 | NaN | 0.018442 |
---|
5 | NaN | NaN |
---|
6 | -1.392818 | NaN |
---|
data.fillna(0)
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
3 | -0.368937 | 0.000000 | 0.039350 |
---|
4 | 0.000000 | 0.000000 | 0.018442 |
---|
5 | 0.000000 | 0.000000 | 0.000000 |
---|
6 | -1.392818 | 0.000000 | 0.000000 |
---|
data.fillna({0:1,1:8,2:9})
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
3 | -0.368937 | 8.000000 | 0.039350 |
---|
4 | 1.000000 | 8.000000 | 0.018442 |
---|
5 | 1.000000 | 8.000000 | 9.000000 |
---|
6 | -1.392818 | 8.000000 | 9.000000 |
---|
data.fillna(method = 'ffill')
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
3 | -0.368937 | 0.542338 | 0.039350 |
---|
4 | -0.368937 | 0.542338 | 0.018442 |
---|
5 | -0.368937 | 0.542338 | 0.018442 |
---|
6 | -1.392818 | 0.542338 | 0.018442 |
---|
data.fillna(method = 'ffill',limit = 2)
| 0 | 1 | 2 |
---|
0 | 1.211654 | 1.644143 | 0.469220 |
---|
1 | 1.147614 | 0.011054 | 0.884146 |
---|
2 | -0.652302 | 0.542338 | 1.268530 |
---|
3 | -0.368937 | 0.542338 | 0.039350 |
---|
4 | -0.368937 | 0.542338 | 0.018442 |
---|
5 | -0.368937 | NaN | 0.018442 |
---|
6 | -1.392818 | NaN | 0.018442 |
---|
data.fillna(method = 'ffill',limit = 2,inplace = True)
data
| 0 | 1 | 2 |
---|
0 | 0.255784 | 2.167973 | -0.583178 |
---|
1 | -0.066963 | 1.567319 | -0.213410 |
---|
2 | -2.612350 | 0.314180 | -0.843624 |
---|
3 | -0.087889 | 0.314180 | 0.447406 |
---|
4 | -0.087889 | 0.314180 | -0.022148 |
---|
5 | -0.087889 | NaN | -0.022148 |
---|
6 | 2.363004 | NaN | -0.022148 |
---|
df[df['Age'] ==None] =0
df.head(3)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
重复数值处理
- duplicated 返回一个布尔值Series,这个Series反映的是每一行是否存在重复(即是否与之前出现的行相同)的情况
- drop_duplicates 返回的是DataFrame,内容是duplicate返回数组中为Flase的部分
- 两个函数默认都是保留第一个观测的数值,参数keep = ‘last’将会返回重复值中最后一个
df[df.duplicated()]
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
data1 = pd.DataFrame({'k1':['one' , 'two'] * 3 + ['two'],
'k2':[1,1,2,3,3,4,4]})
data1
| k1 | k2 |
---|
0 | one | 1 |
---|
1 | two | 1 |
---|
2 | one | 2 |
---|
3 | two | 3 |
---|
4 | one | 3 |
---|
5 | two | 4 |
---|
6 | two | 4 |
---|
data1.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data1[data1.duplicated()]
data1.drop_duplicates()
| k1 | k2 |
---|
0 | one | 1 |
---|
1 | two | 1 |
---|
2 | one | 2 |
---|
3 | two | 3 |
---|
4 | one | 3 |
---|
5 | two | 4 |
---|
data1['v1'] = 0
data1.drop_duplicates(['k1'])
data1['v1'] = 0
data1.drop_duplicates(keep = 'last')
| k1 | k2 | v1 |
---|
0 | one | 1 | 0 |
---|
1 | two | 1 | 0 |
---|
2 | one | 2 | 0 |
---|
3 | two | 3 | 0 |
---|
4 | one | 3 | 0 |
---|
6 | two | 4 | 0 |
---|
分箱操作
- cut函数 自定义取间/等区间分
- qcut函数 基于样本分位数分箱类似[0,0.1,0.5,1.] 还可以进行样本数据量相等的分箱
ages = [20,22,25,27,21,34,56,36,42,28,61]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (35, 60], (35, 60], (25, 35], (60, 100]]
Length: 11
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 1, 2, 2, 2, 1, 3], dtype=int8)
pd.value_counts(cats)
(18, 25] 4
(25, 35] 3
(35, 60] 3
(60, 100] 1
dtype: int64
cats = pd.cut(ages,5,labels = [1,2,3,4,5])
cats
[1, 1, 1, 1, 1, ..., 5, 2, 3, 1, 5]
Length: 11
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
cats = pd.qcut(ages,[0,0.1,0.5,0.7,0.8,0.9],labels = [1,2,3,4,5])
cats
[1, 2, 2, 2, 1, ..., 5, 3, 4, 2, NaN]
Length: 11
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
替换
- replace replace(被替代值,替代值),参数可以数组可以,字典传递
- map
data2 = pd.Series([1,-99,3,-99,-100,8])
data2
0 1
1 -99
2 3
3 -99
4 -100
5 8
dtype: int64
data2.replace([-99,8],0)
0 1
1 0
2 3
3 0
4 -100
5 0
dtype: int64
data2.replace([-99,8],[np.nan,0])
0 1.0
1 NaN
2 3.0
3 NaN
4 -100.0
5 0.0
dtype: float64
data2.replace({-99:np.nan,
8:0})
0 1.0
1 NaN
2 3.0
3 NaN
4 -100.0
5 0.0
dtype: float64
data2.map({8:0,-99:np.nan,1:7})
0 7.0
1 NaN
2 NaN
3 NaN
4 NaN
5 0.0
dtype: float64