In [1]:
import pandas as pd
import numpy as np
删除缺失值¶
In [2]:
df = pd.DataFrame(np.random.randn(6,3))
df.iloc[:4,1] = np.NaN
df.iloc[:2,2] = np.NaN
df
Out[2]:
In [3]:
#有2个以上的nan才删除
df.dropna(thresh=2)
Out[3]:
In [4]:
df.dropna(axis=1,thresh=3)
Out[4]:
In [7]:
df.duplicated()
Out[7]:
In [8]:
df.drop_duplicates()
Out[8]:
In [9]:
df.drop_duplicates(keep='last')
Out[9]:
In [10]:
df.drop_duplicates(['k2'])
Out[10]:
离散化和分箱¶
In [13]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
Out[13]:
In [14]:
cats.codes #ages的数据标签
Out[14]:
In [15]:
cats.categories
Out[15]:
In [16]:
pd.value_counts(cats)
Out[16]:
In [17]:
#改为左闭右开
pd.cut(ages,bins,right=False)
Out[17]:
In [18]:
#自定义箱名
pd.cut(ages,bins,labels=['Youth','YoungAdult','MiddleAged','Senior'])
Out[18]:
In [19]:
#定义箱子个数
data = np.random.rand(20)
pd.cut(data,4,precision=2)
Out[19]:
In [20]:
#根据样本的分位数进行分箱
data = np.random.randn(1000)
cats = pd.qcut(data,4)
cats
Out[20]:
In [21]:
pd.value_counts(cats)
Out[21]:
In [22]:
#自定义分位数
pd.qcut(data,[0,0.1,0.5,0.9,1.])
Out[22]:
检测和过滤异常值¶
In [23]:
data = pd.DataFrame(np.random.randn(1000,4))
#拥有绝对值大于3的行
data[(np.abs(data)>3).any(1)]
Out[23]:
In [24]:
data.iloc[500] = [4,5,6,-7]
#绝对值都大于3的行
data[(np.abs(data)>3).all(1)]
Out[24]:
In [25]:
np.sign(data).head()
Out[25]:
In [28]:
pd.get_dummies(df['key'])
Out[28]:
In [29]:
#一行属于多个类别时,比较复杂
movies = pd.read_csv('datasets/movielens/movies.dat',sep='::',header= None,names=['movie_id','title','genres'])
movies.head()
Out[29]:
In [30]:
all_genres = []
for genres in movies['genres']:
all_genres.extend(genres.split('|'))
genres = pd.unique(all_genres)
In [31]:
genres
Out[31]:
In [32]:
count_df = pd.DataFrame(np.zeros(shape=(len(movies),len(genres))),columns=genres)
count_df.head()
Out[32]:
In [33]:
#遍历分类可以避免遍历数据较多的movies,速度较快
for genre in genres:
count_df.loc[movies['genres'].str.contains(genre),genre] = 1
count_df.head()
Out[33]:
正则表达式¶
In [34]:
val = 'a,b, guido'
val.index(',')
Out[34]:
In [35]:
#index找不到时会报错,find找不到时返回-1
val.find(':')
Out[35]:
In [36]:
import re
text = 'foo bar\t baz \tqux'
#正则表达式对象
regex = re.compile('\s+')
regex.findall(text)
Out[36]:
In [37]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
#pattern里加上括号返回的是元组
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern=pattern,flags=re.IGNORECASE)
regex.findall(text)
Out[37]:
In [38]:
print(regex.sub(r'Username:\1,Domain:\2,Suffix:\3',text))
数据合并¶
In [39]:
left = pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
left
Out[39]:
In [40]:
right = pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])
right
Out[40]:
In [41]:
#使用right的行索引作为它的连接键
pd.merge(left,right,left_on='key',right_index=True)
Out[41]:
In [42]:
#不存在相同索引时,可以考虑concat
df1 = pd.DataFrame(np.arange(6).reshape(3,2),index=list('abc'),columns=['one','two'])
df1
Out[42]:
In [43]:
df2 = pd.DataFrame(np.arange(5,9).reshape(2,2),index=list('ac'),columns=['three','four'])
df2
Out[43]:
In [44]:
pd.concat([df1,df2],keys=['df1','df2'],sort=True)
Out[44]:
In [45]:
pd.concat([df1,df2],axis=1,keys=['df1','df2'],sort=True)
Out[45]:
In [46]:
#不保留原来的索引,而是产生新的索引
pd.concat([df1,df2],ignore_index=True,sort=True)
Out[46]:
In [47]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
'b': [np.nan, 2., np.nan, 6.],
'c': range(2, 18, 4)})
df1
Out[47]:
In [48]:
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
'b': [np.nan, 3., 4., 6., 8.]})
df2
Out[48]:
In [49]:
#相当于np.where(pd.isnull(a),b,a)
df1.combine_first(df2)
Out[49]:
In [51]:
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
periods
Out[51]:
In [52]:
columns = pd.Index(['realgdp','infl','unemp'],name='item')
columns
Out[52]:
In [53]:
data = data.reindex(columns=columns)
data.head()
Out[53]:
In [54]:
data.index = periods.to_timestamp('D','e')
In [55]:
data.head()
Out[55]:
In [56]:
data = data.stack().reset_index().rename(columns={0:'value'})
data.head()
Out[56]:
In [57]:
data.pivot('date','item','value')[:5]#行,列,值
Out[57]:
In [58]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
'A': [1, 2, 3],
'B': [4, 5, 6],
'C': [7, 8, 9]})
df
Out[58]:
In [59]:
#pivot的反向操作
pd.melt(df,['key'])
Out[59]: