pandas

 

创建Series的三种方式
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
# Three ways to build a Series, followed by the basic operations on it.
# 1) From a list: the index defaults to 0..n-1.
bb = Series([0, 1, 11, 2, 22])
print(bb[1])
# 2) From a list plus an explicit index.
cc = Series(['AA', 'BB', 11, 'cc', 22], index=['a', 'b', 'c', 'd', 'e'])
print(cc['a'])

# 3) From a dict: the dict keys become the index.
ff = Series({'name': 'tst', 'age': 18})

print(cc[['a', 'd', 'e']])  # select several values by label

print(cc * 2)  # scalar multiplication, applied element by element

print(np.exp(bb))  # apply a NumPy function to every element

print(bb[bb > 10])  # filter with a boolean mask

# A Series can be treated like a dict: `in` tests against the index labels.
print('d' in cc)

# ---- Checking for missing values ----

print(ff.isnull())

print(ff.notnull())
print(pd.isnull(ff))
print(pd.notnull(ff))
======索引是可以修改的,DataFrame的索引通过set_index修改===========
# The index of a Series can be replaced wholesale by assigning a new list
# of labels (one label per element).
bb = Series([0, 1, 11])
bb.index = ['a', 'b', 'c']
print(bb)
============将某列改成索引===set_index的反操作是reset_index
# Promote a column to the row index with set_index.
dd = DataFrame({'zz': [11, 22, 33], 'bb': [33, 44, 55], 'cc': [33, 77, 10]})
print(dd.set_index('cc'))  # returns a new frame; dd itself is unchanged
print(dd.set_index('cc', drop=False))  # keep 'cc' as a column as well
# inplace=True mutates dd and returns None, so print dd afterwards rather
# than the call's return value. This step comes last because 'cc' stops
# being a column once it has become the index.
dd.set_index('cc', inplace=True)
print(dd)
=======创建DataFrame========
# 1) From a dict of equal-length lists: each key becomes a column.
dd = DataFrame({'ZZ': [11, 22], 'BB': [33, 44], 'cc': [33, 44]})
print(dd)
# 2) `columns` fixes the column order while passing the dict. A requested
#    column that is missing from the data ('YY') is not an error; it is
#    filled with NaN. An `index` longer than the data does raise an error.
bb = DataFrame({'ZZ': [11, 22], 'BB': [33, 44], 'cc': [33, 44]},
               columns=['BB', 'cc', 'ZZ', 'YY'], index=[4, 5])

# 3) From a nested dict: outer keys become columns, inner keys become
#    row labels.
bb = DataFrame({'ZZ': {1988: 11, 2000: 22},
                'BB': {1999: 33, 2000: 44},
                'cc': {2000: 33, 1988: 44}})

 


 索引列

# Selecting a column gives a Series indexed by the frame's row labels.
cc = bb['cc']
print(list(cc.index))
print('--------')
# Selecting a row gives a Series indexed by the frame's column names.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
# NOTE(review): assumes bb is the nested-dict frame with row labels
# 1988/1999/2000; the original label 4 does not exist in that frame.
cc = bb.loc[2000]
print(list(cc.index))

====通过列表或数组给DataFrame添加一列或修改一列,必须长度与DataFrame一致==================

# Assigning a list to a column name overwrites that column (or adds it if
# absent); the list must have one value per row.
bb = DataFrame({'ZZ': [11, 22], 'BB': [33, 44], 'cc': [33, 44]},
               columns=['BB', 'cc', 'ZZ', 'YY'], index=[4, 5])
bb['ZZ'] = [1, 2]
print(bb)

 

===========

bb = bb.T  # transpose: rows become columns and vice versa
===========通过ix对Series或DataFrame进行切片(ix已在pandas 1.0中移除,新代码请用loc/iloc)===
# Pick rows and columns by label in a single step.
bb = DataFrame({'ZZ': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
# `.ix` was removed in pandas 1.0; `.loc` does the same label-based lookup.
print(bb.loc[[4, 7], ['ZZ', 'BB']])
========drop 返回在指定轴上删除指定项(行或列)后的新对象,原对象不变=================
# drop returns a new frame each time; bb itself is left untouched.
bb = DataFrame({'ZZ': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
print(bb.drop(4))  # drop one row by label
print(bb.drop('cc', axis=1))  # drop one column
print(bb.drop(['cc', 'BB'], axis=1))  # drop several columns

========================

del bb['ZZ'] # delete the 'ZZ' column from bb itself (in place)

 =================

DataFrame 按标签切片(如 loc)与 Python 的切片运算不同,末端是包含的
=============
# Column selection uses [] directly; row selection by label uses .loc.
bb = DataFrame({'ZZ': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
print(bb['cc'])  # select a column; a list of names selects several columns
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
print(bb.loc[4])  # select a row; a list of labels selects several rows
===========
在将Series或DataFrame相加时,结果索引是两个对象索引的并集,不重叠的位置为NaN,NaN会被传播
对于DataFrame,对齐操作会同时发生在行和列上
# Adding two frames aligns on both row labels and column names. These two
# frames share no row label, so every cell of the result is NaN.
bb = DataFrame({'aa': {1988: 11, 2000: 22},
                'BB': {1999: 33, 2000: 44},
                'cc': {2000: 33, 1988: 44}})
dd = DataFrame({'ZZ': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
print(bb + dd)

为了防止出现NaN,当某个对象在某个位置上没有值时,可以用fill_value为其填充一个值

# add(..., fill_value=0) substitutes 0 for a value missing on one side
# before adding, so non-overlapping rows keep their own values.
bb = DataFrame({'aa': {1988: 11, 2000: 22},
                'BB': {1988: 33, 2000: 44},
                'cc': {2000: 33, 1988: 44}})
dd = DataFrame({'aa': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
print(bb.add(dd, fill_value=0))  # whichever side lacks the value gets 0

  

=========DataFrame与Series相运算======

# Arithmetic between a DataFrame and a Series taken from it.
dd = DataFrame({'aa': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])

# `.ix` was removed in pandas 1.0; `.loc` fetches row 5 by label.
# The row's labels are the column names, so it is subtracted from each row.
print(dd - dd.loc[5])

# axis=0 matches the Series against the row labels instead, so the 'cc'
# column is subtracted from each column.
print(dd.sub(dd['cc'], axis=0))

 

=================

# The same two operations with a Series whose labels are dd's ROW labels.
dd = DataFrame({'aa': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
ss = Series([1, 1, 1], index=[4, 5, 7])
# Plain `-` matches ss's labels against dd's column names. 4/5/7 match no
# column, so the result is the union of labels filled entirely with NaN.
print(dd - ss)

# axis=0 matches ss against the row labels: 1 is subtracted from every
# value in each row.
print(dd.sub(ss, axis=0))

 ========================

# DataFrame.apply calls a function on each column (default) or each row.
dd = DataFrame({'aa': [11, 22, 33], 'BB': [33, 44, 55], 'cc': [33, 44, 66]},
               index=[4, 5, 7])
f = lambda x: x.max()
print(dd.apply(f))  # applied per column: one result per column

# Returning a Series yields several results per column (or per row).
f = lambda x: Series([x.max(), x.min()], index=['max', 'min'])
print(dd.apply(f))  # per column: rows 'max' and 'min'

print(dd.apply(f, axis=1))  # per row: columns 'max' and 'min'

===========
对每个元素做运算,DataFrame用applymap,Series用map
====按索引排序 ======
# Sorting by labels (sort_index) versus sorting by values (sort_values).
bb = Series([0, -1, 11], index=['c', 'a', 'z'])
print(bb.sort_index())
dd = DataFrame({'zz': [11, 22, 33], 'bb': [33, 44, 55], 'cc': [33, 77, 66]},
               index=[4, 77, 7])
print(dd.sort_index(axis=1))  # sort by column name
# sort_index(by=...) has been removed from pandas; sorting rows by the
# values of a column is done with sort_values.
print(dd.sort_values(by='cc'))

===================
# idxmin reports, for each column, the row label where the minimum occurs.
dd = DataFrame({'zz': [11, 22, 33], 'bb': [33, 44, 55], 'cc': [33, 77, 10]})
print(dd.idxmin())

 

还有其他计算相关系数、协方差等现成的函数,用到的时候百度即可

==========================

# NOTE(review): bb is expected to be a Series here — confirm against the
# most recent assignment to bb earlier in the file.
print(bb.unique())  # distinct values (Series only)
print(bb.value_counts())  # occurrences of each value (Series only)
print(bb[bb.isin([0, 1])])  # keep elements whose value is in the list
 

==============NA处理方法

可以只drop全是NA的行(how='all'),也可以只保留非NA值个数达到某个阈值的行(thresh)

# Row-wise NA filtering needs a DataFrame (`thresh` is not accepted by
# Series.dropna), so build a small frame with missing values to show it.
cc = DataFrame({'a': [1, np.nan, np.nan],
                'b': [2, np.nan, 5],
                'c': [3, np.nan, np.nan]})
print(cc.dropna(how='all'))  # drop only rows in which every value is NA
print(cc.dropna(thresh=2))  # keep rows holding at least 2 non-NA values





posted on 2019-03-17 11:17  我和你并没有不同  阅读(183)  评论(0)  编辑  收藏  举报