pandas
创建Series的三种方式
from pandas import Series
bb= Series([0,1,11,2,22])
print bb[1]
cc= Series(['AA','BB',11,'cc',22],index=['a','b','c','d','e'])
print cc['a']
ff=Series({'name':'tst','age':18}) #把Series看做一个字典,字典的键就是索引
print cc[['a','d','e']] #选取一组值
print cc*2 #标量乘
print np.exp(bb) #应用数学函数
print bb[bb>10] #过滤
可以把Series看做一个字典,可以用in判断某个索引是否存在: print 'd' in cc
========判断是否有缺失数据==========
print ff.isnull()
print ff.notnull()
print pd.isnull(ff)
print pd.notnull(ff)
======索引是可以修改的,DataFrame的索引通过set_index修改===========
bb= Series([0,1,11])
bb.index=['a','b','c']
print bb
============将某列改成索引===set_index的反操作是reset_index
dd= DataFrame({'zz':[11,22,33],'bb':[33,44,55],'cc':[33,77,10]})
print dd.set_index('cc')
print dd.set_index('cc',inplace=True) #修改索引
print dd.set_index('cc',drop=False)
=======创建DataFrame========
1)dd=DataFrame({'ZZ':[11,22],'BB':[33,44],'cc':[33,44]})
print dd
2)bb= DataFrame({'ZZ':[11,22],'BB':[33,44],'cc':[33,44]},columns=['BB','cc','ZZ','YY'],index=[4,5])
#传入字典的同时,可以指定列的顺序,YY不存在,不会报错,会补NaN,index的长度若与数据的行数不一致,则会报错
3)嵌套的字典创建DataFrame
bb= DataFrame({'ZZ':{1988:11,2000:22},'BB':{1999:33,2000:44},'cc':{2000:33,1988:44}})
索引列
cc= bb['cc']
print list(cc.index)
print '--------'
cc= bb.ix[4]
print list(cc.index)
====通过列表或数组给DataFrame添加一列或修改一列,长度必须与DataFrame的行数一致==================
bb= DataFrame({'ZZ':[11,22],'BB':[33,44],'cc':[33,44]},columns=['BB','cc','ZZ','YY'],index=[4,5])
bb['ZZ']=[1,2]
print bb
===========
bb=bb.T #转置
===========通过ix对Series或DataFrame进行切片===
bb= DataFrame({'ZZ':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
print bb.ix[[4,7],['ZZ','BB']]
========drop 返回在指定轴上删除指定项(行或列)后的新对象=================
bb= DataFrame({'ZZ':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
print bb.drop(4) #删除某一行
print bb.drop('cc',axis=1) #删除某一列
print bb.drop(['cc','BB'],axis=1) #删除多列
========================
del bb['ZZ'] #删除列
=================
DataFrame 按标签切片与Python的切片运算不同,末端是包含的
=============
bb= DataFrame({'ZZ':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
print bb['cc'] #选取列,可以是多列
print bb.ix[4] #选取行,可以是多行
===========
在将Series或DataFrame相加时,结果索引是两个对象索引的并集,不重叠的索引处会产生NaN,且NaN会被传播
对于DataFrame 对齐操作会发生在行和列
bb= DataFrame({'aa':{1988:11,2000:22},'BB':{1999:33,2000:44},'cc':{2000:33,1988:44}})
dd= DataFrame({'ZZ':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
print bb+dd
为了防止出现NaN,当某个对象在某个轴上没有值时,可以用fill_value填充一个值
bb= DataFrame({'aa':{1988:11,2000:22},'BB':{1988:33,2000:44},'cc':{2000:33,1988:44}})
dd= DataFrame({'aa':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
print bb.add(dd,fill_value=0) #谁没有就给谁补0
=========DataFrame与Series相运算======
dd= DataFrame({'aa':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
print dd-dd.ix[5] #逐行减去一个Series
print dd.sub(dd['cc'],axis=0) #逐列减去一个Series
=================
dd= DataFrame({'aa':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
ss=Series([1,1,1],index=[4,5,7])
print dd-ss #逐行减ss
print dd.sub(ss,axis=0) #逐列减ss
========================
dd= DataFrame({'aa':[11,22,33],'BB':[33,44,55],'cc':[33,44,66]},index=[4,5,7])
f=lambda x:x.max()
print dd.apply(f) #将函数应用在列上,一列一个结果
f=lambda x:Series([x.max(),x.min()],index=['max','min']) #将函数应用在列上,一列返回多个结果
print dd.apply(f)
print dd.apply(f,axis=1) #将函数应用在行上,此处f返回Series,所以一行返回多个结果(max,min)
===========
对每个元素做运算,DataFrame用applymap,Series用map
====按索引排序 ======
bb= Series([0,-1,11],index=['c','a','z'])
print bb.sort_index()
dd= DataFrame({'zz':[11,22,33],'bb':[33,44,55],'cc':[33,77,66]},index=[4,77,7])
print dd.sort_index(axis=1) # 根据列名排序
print dd.sort_index(by='cc') #sort_index还可以用来根据某一列进行排序
===================
dd= DataFrame({'zz':[11,22,33],'bb':[33,44,55],'cc':[33,77,10]})
print dd.idxmin() #寻找最小值的位置
还有其他计算相关系数、协方差等现成的函数,用到的时候百度即可
==========================
print bb.unique() #唯一值,仅Series
print bb.value_counts() #值计数,仅Series
print bb[bb.isin([0,1])] #筛选,Series和DataFrame都有这个方法
==============NA处理方法
可以只drop全是NA的行,也可以drop NA数超过某个阈值的行
print cc.dropna(how='all')
print cc.dropna(thresh=2)