数据转换
移除重复数据
import pandas as pd
import numpy as np
from pandas import Series
# Demo frame: seven rows whose (k1, k2) pairs repeat on purpose, so the
# de-duplication helpers below have something to find.
data = pd.DataFrame(dict(
    k1=['one'] * 3 + ['two'] * 4,
    k2=[1, 1, 2, 3, 3, 4, 4],
))
data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
duplicated方法返回一个布尔型Series,表示各行是否是重复行
# Boolean mask: True for each row that repeats an earlier row (all columns compared).
data.duplicated(keep='first')
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
drop_duplicates方法直接返回去重后的DataFrame
# Return a copy without the repeated rows; the first occurrence of each row is kept.
data.drop_duplicates(keep='first')
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
可以指定列进行重复项判断
# Add a running-number column, then de-duplicate on k1 alone:
# only the first row of each distinct k1 value survives.
data['v1'] = range(7)
data.drop_duplicates(subset=['k1'])
k1 k2 v1
0 one 1 0
3 two 3 3
duplicated和drop_duplicates保留的都是第一个出现的值,传入keep='last'则保留最后一个
data
k1 k2 v1
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6
# By default the first row of every duplicated (k1, k2) group is the one kept.
data.drop_duplicates(subset=['k1', 'k2'], keep='first')
k1 k2 v1
0 one 1 0
2 one 2 2
3 two 3 3
5 two 4 5
# keep='last' retains the last row of every duplicated (k1, k2) group instead.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
k1 k2 v1
1 one 1 1
2 one 2 2
4 two 3 4
6 two 4 6
利用函数或映射进行数据转换
Series的map方法可以接收一个函数或含有映射关系的字典型对象
# Food names (with inconsistent capitalisation) and a numeric 'ounces' column.
data1 = pd.DataFrame(dict(
    food=[
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
))
data1
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
# Lookup table from lower-case food name to the animal the meat comes from;
# it is used to add an 'animal' column to data1.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])
# The lookup keys are all lower-case while some food names are capitalised,
# so normalise the case first and only then translate through the table.
data1['animal'] = data1['food'].map(str.lower).map(meat_to_animal)
data1
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
lambda函数关系映射
# Same translation in a single pass: the lambda lower-cases each food name
# and looks it up in meat_to_animal.
data1['food'].map(lambda food_name: meat_to_animal[food_name.lower()])
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
替换值
# Float series containing the values -999 and -1000 that the replace
# examples below substitute.
data2 = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data2
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
# Replace every -999 with NaN; returns a new Series.
data2.replace(to_replace=-999, value=np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
替换多个值
# Replace several values at once with the same substitute.
data2.replace(to_replace=[-999, -1000], value=np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
对不同的值进行不同的替换:传入一个由替换值组成的列表,与被替换值一一对应
# Pairwise replacement: -999 becomes NaN and -1000 becomes 0.
data2.replace(to_replace=[-999, -1000], value=[np.nan, 0])
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
传入的参数是字典映射
# The same pairwise replacement expressed as an {old: new} mapping.
data2.replace(to_replace={-999: np.nan, -1000: 0})
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
重命名轴索引
# 3x4 integer frame (values 0..11) with string row and column labels.
data3 = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data3
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
# Assigning to .index replaces the row labels of data3 itself.
data3.index = data3.index.str.upper()
data3
# rename builds a transformed copy and leaves data3 untouched: row labels are
# passed through str.title, column labels through str.upper.
data3.rename(index=str.title, columns=str.upper)
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
# rename可以结合字典型对象一对一进行轴标签的更新
data3.rename(index={'Ohio':'New_Ohio'},columns={'three':'peekaboo'})
one two peekaboo four
OHIO 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
# rename returns a copy by default; inplace=True modifies data3 itself instead.
data3.rename({'OHIO': 'inplace'}, axis='index', inplace=True)
data3
one two three four
inplace 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
离散化和面元划分
cut和qcut对分位数分析和分组分析非常重要
# Sort the values into the bins (18, 25], (25, 35], (35, 60] and (60, 100].
args = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# Default behaviour: each interval is open on the left and closed on the right.
cats = pd.cut(args, bins=bins, right=True)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# Count the members of each bin. The labels use mathematical interval
# notation: a round bracket marks an open end, a square bracket a closed end
# (pass right=False to pd.cut to close the left side instead).
# The top-level pd.value_counts is deprecated in recent pandas releases;
# Series.value_counts is the supported equivalent and also sorts by
# descending count.
pd.Series(cats).value_counts()
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# right=False flips the convention: intervals are closed on the left and
# open on the right.
cut1 = pd.cut(args, bins=bins, right=False)
cut1
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
# Per-bin counts for the left-closed bins. Uses Series.value_counts because
# the top-level pd.value_counts is deprecated in recent pandas releases.
pd.Series(cut1).value_counts()
[25, 35) 4
[18, 25) 4
[35, 60) 3
[60, 100) 1
dtype: int64
# Give the bins readable names instead of the raw interval labels;
# one label per bin, in bin order.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cut2 = pd.cut(args, bins=bins, labels=group_names)
cut2
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# Per-label counts for the named bins. Uses Series.value_counts because the
# top-level pd.value_counts is deprecated in recent pandas releases.
pd.Series(cut2).value_counts()
Youth 5
MiddleAged 3
YoungAdult 3
Senior 1
dtype: int64
自动划分区间
# NOTE: this rebinds `data`, which earlier named the de-duplication DataFrame,
# to an array of 20 random floats.
data = np.random.rand(20)
# Passing an integer asks pd.cut for that many equal-width bins over the data
# range; precision controls the number of digits used for the bin labels.
pd.cut(data, bins=4, precision=2)
[(0.27, 0.5], (0.031, 0.27], (0.74, 0.97], (0.031, 0.27], (0.27, 0.5], ..., (0.74, 0.97], (0.031, 0.27], (0.27, 0.5], (0.74, 0.97], (0.5, 0.74]]
Length: 20
Categories (4, interval[float64]): [(0.031, 0.27] < (0.27, 0.5] < (0.5, 0.74] < (0.74, 0.97]]
# qcut works like cut but places the bin edges at sample quantiles.
# Unlike cut, it can therefore put the same number of data points in each bin.
data4 = np.random.randn(1000)
cat4 = pd.qcut(data4, q=4)
cat4
[(-3.114, -0.713], (-0.713, -0.0478], (-0.0478, 0.618], (-0.713, -0.0478], (0.618, 2.917], ..., (-0.0478, 0.618], (-0.0478, 0.618], (-3.114, -0.713], (-0.0478, 0.618], (0.618, 2.917]]
Length: 1000
Categories (4, interval[float64]): [(-3.114, -0.713] < (-0.713, -0.0478] < (-0.0478, 0.618] < (0.618, 2.917]]
# The quartile bins each hold the same number of points. Uses
# Series.value_counts because the top-level pd.value_counts is deprecated in
# recent pandas releases.
pd.Series(cat4).value_counts()
(0.618, 2.917] 250
(-0.0478, 0.618] 250
(-0.713, -0.0478] 250
(-3.114, -0.713] 250
dtype: int64
# As with cut's explicit edges, qcut accepts custom quantiles
# (fractions between 0 and 1 inclusive).
pd.qcut(data4, q=[0, 0.1, 0.5, 0.9, 1.0])
[(-3.114, -1.263], (-1.263, -0.0478], (-0.0478, 1.247], (-1.263, -0.0478], (1.247, 2.917], ..., (-0.0478, 1.247], (-0.0478, 1.247], (-1.263, -0.0478], (-0.0478, 1.247], (-0.0478, 1.247]]
Length: 1000
Categories (4, interval[float64]): [(-3.114, -1.263] < (-1.263, -0.0478] < (-0.0478, 1.247] < (1.247, 2.917]]
检测和过滤异常值
# Seed NumPy's global random generator so the draws below are reproducible:
# the same seed yields the same sequence on every run. Any value may be
# chosen; it only selects which sequence is produced.
np.random.seed(0)
data5 = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))
data5.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.062966 -0.002087 -0.025777 -0.010981
std 0.983517 0.967146 0.983671 0.993891
min -3.740101 -3.046143 -3.116857 -3.392300
25% -0.755720 -0.683680 -0.684833 -0.686776
50% -0.029995 -0.023210 -0.025068 -0.038192
75% 0.604792 0.652095 0.624139 0.648778
max 2.929096 2.662727 3.801660 3.427539
# Pick the entries of column 3 whose absolute value exceeds 3.
col = data5[3]
col[col.abs() > 3]
861 3.427539
919 -3.392300
Name: 3, dtype: float64
# Select every row containing a value above 3 or below -3 by combining a
# boolean DataFrame with any(). The frame being indexed must be data5 itself;
# `data` is bound to an unrelated NumPy array at this point.
data5[np.abs(data5) > 3]  # values with |x| > 3; everything else becomes NaN
# axis is passed by keyword: recent pandas no longer accepts it positionally.
data5[(np.abs(data5) > 3).any(axis=1)]  # rows with at least one |x| > 3
0 1 2 3
147 0.823681 -2.929552 1.721550 1.039882
263 -1.326474 0.873638 -1.556238 -1.072714
504 0.991843 -1.198124 -0.060144 -1.802440
770 1.251980 0.801589 0.644481 1.106683
779 1.014776 0.088887 0.785261 0.849345
861 -0.736245 -1.258751 1.385519 0.509164
865 -0.034140 -0.825346 -0.921655 -0.023471
919 -0.580442 -0.347443 0.309293 -1.018100
938 -0.327996 0.703850 0.227200 2.131917
# Cap the values to the interval [-3, 3]. np.sign yields 1 for positive,
# -1 for negative and 0 for zero entries, so np.sign(data5) * 3 is the bound
# on the matching side. Both the mask and the replacement are derived from
# data5 so that their labels align.
data5[np.abs(data5) > 3] = np.sign(data5) * 3
data5.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.062966 -0.002087 -0.025777 -0.010981
std 0.983517 0.967146 0.983671 0.993891
min -3.740101 -3.046143 -3.116857 -3.392300
25% -0.755720 -0.683680 -0.684833 -0.686776
50% -0.029995 -0.023210 -0.025068 -0.038192
75% 0.604792 0.652095 0.624139 0.648778
max 2.929096 2.662727 3.801660 3.427539