pandas总结
pandas基本介绍
如果说numpy类似于列表,那么pandas就类似于字典:它在数组的基础上为行和列加上了标签(索引)。
# pandas basics: building Series / DataFrame objects and inspecting them.
# NOTE: the sample outputs in the comments were recorded by the original
# author; random values and some reprs differ between runs / pandas versions.
import numpy as np
import pandas as pd

# A Series built without an index gets the default labels 0, 1, 2, ...
s = pd.Series([1, 3, np.nan, 44])
print(s)
# 0     1.0
# 1     3.0
# 2     NaN
# 3    44.0
# dtype: float64

# Four consecutive days starting at 2022-02-02; used below as the row labels
# of a DataFrame (which is essentially a labelled matrix).
dates = pd.date_range('20220202', periods=4)
print(dates)
# DatetimeIndex(['2022-02-02', '2022-02-03', '2022-02-04', '2022-02-05'], dtype='datetime64[ns]', freq='D')

# A 4x3 random matrix with explicit row and column labels.
print(pd.DataFrame(np.random.randn(4, 3), index=dates, columns=['a', 'b', 'c']))
#                    a         b         c
# 2022-02-02  1.007896 -1.582359  0.566451
# 2022-02-03 -0.720459  0.632218 -1.350577
# 2022-02-04  1.323772 -0.117397 -2.228370
# 2022-02-05 -0.496703  0.783604 -0.247601

# Without labels, both rows and columns are numbered from 0.
print(pd.DataFrame(np.arange(12).reshape((4, 3))))
#    0   1   2
# 0  0   1   2
# 1  3   4   5
# 2  6   7   8
# 3  9  10  11

# Building a DataFrame from a dict: each key becomes a column. Scalar values
# ('a', 'b', 'f') are repeated to match the length of the longest column.
df = pd.DataFrame({
    'a': 1.,
    'b': pd.Timestamp('20130102'),
    'c': pd.Series(1, index=list(range(4)), dtype='float32'),
    'd': np.array([3] * 4, dtype='int32'),
    'e': pd.Categorical(['test', 'train', 'test', 'train']),
    'f': 'foo',
})
print(df)
#      a          b    c  d      e    f
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

print(df.dtypes)  # data type of every column
# a           float64
# b    datetime64[ns]
# c           float32
# d             int32
# e          category
# f            object
# dtype: object

print(df.index)  # row labels
# Int64Index([0, 1, 2, 3], dtype='int64')

print(df.columns)  # column labels
# Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

print(df.values)  # all values as a NumPy array
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

print(df.describe())  # summary statistics of the numeric columns
#          a    c    d
# count  4.0  4.0  4.0   <- number of non-missing values
# mean   1.0  1.0  3.0   <- mean
# std    0.0  0.0  0.0   <- standard deviation (not the variance)
# min    1.0  1.0  3.0   <- minimum
# 25%    1.0  1.0  3.0   <- first quartile
# 50%    1.0  1.0  3.0   <- median
# 75%    1.0  1.0  3.0   <- third quartile
# max    1.0  1.0  3.0   <- maximum

print(df.T)  # transpose

# axis=1 sorts by column label; ascending=False gives descending order.
print(df.sort_index(axis=1, ascending=False))
#      f      e  d    c          b    a
# 0  foo   test  3  1.0 2013-01-02  1.0
# 1  foo  train  3  1.0 2013-01-02  1.0
# 2  foo   test  3  1.0 2013-01-02  1.0
# 3  foo  train  3  1.0 2013-01-02  1.0

print(df.sort_values(by='e'))  # sort rows by the values in column 'e'
#      a          b    c  d      e    f
# 0  1.0 2013-01-02  1.0  3   test  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
pandas选择数据
# Selecting data from a DataFrame by column, label, position and condition.
# NOTE: sample outputs were recorded by the original author; the integer
# dtype shown (int32) depends on the platform.
import numpy as np
import pandas as pd

dates = pd.date_range('20220202', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2022-02-02   0   1   2   3
# 2022-02-03   4   5   6   7
# 2022-02-04   8   9  10  11
# 2022-02-05  12  13  14  15
# 2022-02-06  16  17  18  19
# 2022-02-07  20  21  22  23

print(df['A'])  # a single column; equivalent to df.A
# 2022-02-02     0
# 2022-02-03     4
# 2022-02-04     8
# 2022-02-05    12
# 2022-02-06    16
# 2022-02-07    20
# Freq: D, Name: A, dtype: int32

print(df[0:3])  # first three rows; equivalent to df['20220202':'20220204']
#             A  B   C   D
# 2022-02-02  0  1   2   3
# 2022-02-03  4  5   6   7
# 2022-02-04  8  9  10  11

# --- Selection by label (.loc) ---
print(df.loc['20220202'])  # one row, looked up by its label rather than its position
# A    0
# B    1
# C    2
# D    3
# Name: 2022-02-02 00:00:00, dtype: int32

print(df.loc[:, ['A', 'B']])  # every row of columns A and B
#              A   B
# 2022-02-02   0   1
# 2022-02-03   4   5
# 2022-02-04   8   9
# 2022-02-05  12  13
# 2022-02-06  16  17
# 2022-02-07  20  21

# --- Selection by position (.iloc) ---
print(df.iloc[2])  # the third row
# A     8
# B     9
# C    10
# D    11
# Name: 2022-02-04 00:00:00, dtype: int32

print(df.iloc[2, 1:3])  # equivalent to df.iloc[2, [1, 2]]
# B     9
# C    10
# Name: 2022-02-04 00:00:00, dtype: int32

# --- Selection by boolean condition ---
print(df[df.A > 8])  # every row whose A value is greater than 8
#              A   B   C   D
# 2022-02-05  12  13  14  15
# 2022-02-06  16  17  18  19
# 2022-02-07  20  21  22  23
pandas设置值
# Assigning values in a DataFrame by position, label and condition, and
# adding new columns.
import numpy as np
import pandas as pd

dates = pd.date_range('20220202', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

df.iloc[0, 0] = -1111           # assign by position
df.loc['20220202', 'B'] = 2222  # assign by label
df[df.A > 10] = 0               # assign by condition: every row with A > 10 becomes all zeros
# To change only column A in those rows, use a single .loc call:
#     df.loc[df.A > 10, 'A'] = 0
# Avoid the chained form `df.A[df.A > 10] = 0`: it assigns through an
# intermediate object and is not guaranteed to modify df.
print(df)
#                A     B   C   D
# 2022-02-02 -1111  2222   2   3
# 2022-02-03     4     5   6   7
# 2022-02-04     8     9  10  11
# 2022-02-05     0     0   0   0
# 2022-02-06     0     0   0   0
# 2022-02-07     0     0   0   0

df['F'] = np.nan  # new column; every row holds np.nan
# New column from a Series; values are aligned to df by index label.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20220202', periods=6))
print(df)
#                A     B   C   D   F  E
# 2022-02-02 -1111  2222   2   3 NaN  1
# 2022-02-03     4     5   6   7 NaN  2
# 2022-02-04     8     9  10  11 NaN  3
# 2022-02-05     0     0   0   0 NaN  4
# 2022-02-06     0     0   0   0 NaN  5
# 2022-02-07     0     0   0   0 NaN  6
pandas处理丢失数据
矩阵中可能会缺失一些数据,例如某些位置的值是np.nan
# Handling missing data (NaN) in a DataFrame: drop, fill and detect.
import numpy as np
import pandas as pd

dates = pd.date_range('20220202', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# NaN is a float, so the integer columns that will receive it are converted
# to float explicitly instead of relying on implicit upcasting during
# assignment (which newer pandas versions deprecate).
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
#              A     B     C   D
# 2022-02-02   0   NaN   2.0   3
# 2022-02-03   4   5.0   NaN   7
# 2022-02-04   8   9.0  10.0  11
# 2022-02-05  12  13.0  14.0  15
# 2022-02-06  16  17.0  18.0  19
# 2022-02-07  20  21.0  22.0  23

# axis=0 drops rows, axis=1 drops columns.
# how='any' drops when at least one value is NaN; how='all' only when all are NaN.
print(df.dropna(axis=0, how='any'))
#              A     B     C   D
# 2022-02-04   8   9.0  10.0  11
# 2022-02-05  12  13.0  14.0  15
# 2022-02-06  16  17.0  18.0  19
# 2022-02-07  20  21.0  22.0  23

print(df.fillna(value=0))  # replace missing values with 0
#              A     B     C   D
# 2022-02-02   0   0.0   2.0   3
# 2022-02-03   4   5.0   0.0   7
# 2022-02-04   8   9.0  10.0  11
# 2022-02-05  12  13.0  14.0  15
# 2022-02-06  16  17.0  18.0  19
# 2022-02-07  20  21.0  22.0  23

print(df.isnull())  # boolean matrix: True where the value is missing
#                 A      B      C      D
# 2022-02-02  False   True  False  False
# 2022-02-03  False  False   True  False
# 2022-02-04  False  False  False  False
# 2022-02-05  False  False  False  False
# 2022-02-06  False  False  False  False
# 2022-02-07  False  False  False  False

# True if any value in the whole matrix is missing, otherwise False;
# a quick way to check whether the data contains missing values.
print(df.isnull().values.any())
# True
pandas导入导出
读取表格建议使用read_csv;保存中间结果时也常用read_pickle/to_pickle,因为pickle是Python自带的对象序列化格式(并不是压缩格式),可以原样保留DataFrame的数据类型和索引。注意不要对来源不可信的pickle文件使用read_pickle。至于保存成什么样的格式,直接将read改为to即可,也就是to_csv、to_pickle。
# Reading and writing data: load a CSV file and save it in pickle format.
import numpy as np
import pandas as pd

# Expects a file named 'student.csv' in the current working directory.
data = pd.read_csv('student.csv')
print(data)

# Every read_* function has a matching to_* writer (to_csv, to_pickle, ...).
# NOTE: pickle files should only be loaded back from trusted sources.
data.to_pickle('student.pickle')
pandas合并concat
# Combining DataFrames with pd.concat.
import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# axis=0 stacks the frames vertically. ignore_index=True renumbers the rows;
# with ignore_index=False the index would be 0 1 2 0 1 2 0 1 2.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# --- The join parameter: 'inner' or 'outer' ---
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# 'outer' (the default) keeps the union of the columns; cells a frame did not
# have are filled with NaN.
print(pd.concat([df1, df2], join='outer'))
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

# 'inner' keeps only the columns common to all frames, so no new NaN appears.
print(pd.concat([df1, df2], join='inner'))
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
pandas合并merge
# Combining DataFrames with pd.merge (database-style joins).
# NOTE: sample outputs were recorded by the original author; the row order of
# some joins may differ between pandas versions.
import numpy as np
import pandas as pd

# --- Merging on a single key column ---
left = pd.DataFrame({'key': ['k0', 'k1', 'k2', 'k3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['k0', 'k1', 'k2', 'k3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
# Rows whose value in the `on` column matches are joined together.
print(pd.merge(left, right, on='key'))
#   key   A   B   C   D
# 0  k0  A0  B0  C0  D0
# 1  k1  A1  B1  C1  D1
# 2  k2  A2  B2  C2  D2
# 3  k3  A3  B3  C3  D3

# --- Merging on two key columns; the `how` parameter ---
left = pd.DataFrame({'key1': ['k0', 'k0', 'k1', 'k2'],
                     'key2': ['k0', 'k1', 'k0', 'k1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['k0', 'k1', 'k1', 'k2'],
                      'key2': ['k0', 'k0', 'k0', 'k0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

# how is one of 'inner', 'outer', 'left', 'right'; 'inner' is the default.
# inner: only rows where both key1 and key2 match are kept.
print(pd.merge(left, right, on=['key1', 'key2'], how='inner'))
#   key1 key2   A   B   C   D
# 0   k0   k0  A0  B0  C0  D0
# 1   k1   k0  A2  B2  C1  D1
# 2   k1   k0  A2  B2  C2  D2

# outer: keep every key pair from both frames; where the other frame has no
# matching (key1, key2) the cells are filled with NaN.
print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))
#   key1 key2    A    B    C    D
# 0   k0   k0   A0   B0   C0   D0
# 1   k0   k1   A1   B1  NaN  NaN
# 2   k1   k0   A2   B2   C1   D1
# 3   k1   k0   A2   B2   C2   D2
# 4   k2   k1   A3   B3  NaN  NaN
# 5   k2   k0  NaN  NaN   C3   D3

# left: keep every (key1, key2) pair of the left frame.
print(pd.merge(left, right, on=['key1', 'key2'], how='left'))
#   key1 key2   A   B    C    D
# 0   k0   k0  A0  B0   C0   D0
# 1   k0   k1  A1  B1  NaN  NaN
# 2   k1   k0  A2  B2   C1   D1
# 3   k1   k0  A2  B2   C2   D2
# 4   k2   k1  A3  B3  NaN  NaN

# right: keep every (key1, key2) pair of the right frame.
print(pd.merge(left, right, on=['key1', 'key2'], how='right'))
#   key1 key2    A    B   C   D
# 0   k0   k0   A0   B0  C0  D0
# 1   k1   k0   A2   B2  C1  D1
# 2   k1   k0   A2   B2  C2  D2
# 3   k2   k0  NaN  NaN  C3  D3

# --- The `indicator` parameter ---
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

# indicator defaults to False. With True an extra column `_merge` is added:
# left_only = only the left frame has the row, right_only = only the right
# frame has it, both = present in both frames.
print(pd.merge(df1, df2, on='col1', how='outer', indicator=True))
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# If indicator is a string, that string is used as the name of the extra column.
print(pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column'))
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only

# --- Merging on the row index ---
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['k0', 'k1', 'k2'])
right = pd.DataFrame({'C': ['C0', 'C1', 'C2'],
                      'D': ['D0', 'D1', 'D2']},
                     index=['k0', 'k1', 'k2'])
print(left)
#      A   B
# k0  A0  B0
# k1  A1  B1
# k2  A2  B2
print(right)
#      C   D
# k0  C0  D0
# k1  C1  D1
# k2  C2  D2

# left_index / right_index join on the row index of each frame instead of a column.
print(pd.merge(left, right, left_index=True, right_index=True, how='outer'))
#      A   B   C   D
# k0  A0  B0  C0  D0
# k1  A1  B1  C1  D1
# k2  A2  B2  C2  D2

# --- Overlapping column names and `suffixes` ---
boys = pd.DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [4, 5, 6]})

# Both frames have an `age` column that is not a join key, so pandas renames
# them automatically with the default suffixes _x and _y.
print(pd.merge(boys, girls, on='k', how='inner'))
#     k  age_x  age_y
# 0  k0      1      4
# 1  k1      2      5
# 2  k2      3      6

# `suffixes` supplies the strings appended to the overlapping column names.
print(pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner'))
#     k  age_boy  age_girl
# 0  k0        1         4
# 1  k1        2         5
# 2  k2        3         6
pandas plot 画图
这里只简单介绍最基本的线性图,还有很多其余的图没有介绍,例如饼状图,散点图等等
# Plotting with pandas: basic line plots of a Series and of a DataFrame.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Line plot of a Series: 1000 random values, accumulated with cumsum().
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()  # running total
data.plot()
plt.show()

# Line plot of a DataFrame: one line is drawn per column (A, B, C, D).
data = pd.DataFrame(np.random.randn(100, 4), index=np.arange(100), columns=list("ABCD"))
data.plot()
plt.show()
作者:孙建钊
出处:http://www.cnblogs.com/sunjianzhao/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利。