numpy&pandas基础

numpy基础

import numpy as np

定义array

In [156]: np.ones(3)
Out[156]: array([1., 1., 1.])

In [157]: np.ones((3,5))
Out[157]: 
array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [158]: 

In [158]: np.zeros(4)
Out[158]: array([0., 0., 0., 0.])

In [159]: np.zeros((2,5))
Out[159]: 
array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [160]: 
In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])

In [147]: print(a)
[[1 3 5 2]
 [4 2 6 1]]

In [148]:
In [161]: np.arange(10)
Out[161]: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [162]: np.arange(3,13)
Out[162]: array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [163]: np.arange(3,13).reshape((2,5))
Out[163]: 
array([[ 3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12]])

In [164]: 
In [169]: np.arange(2,25,2)
Out[169]: array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24])

In [170]: np.arange(2,25,2).reshape(3,4)
Out[170]: 
array([[ 2,  4,  6,  8],
       [10, 12, 14, 16],
       [18, 20, 22, 24]])

In [171]: 

In [176]:  np.linspace(1,10,4)
Out[176]: array([ 1.,  4.,  7., 10.])

In [177]:

array基本运算

In [7]: a = np.array([[1,2],[3,4]])

In [8]: b = np.arange(5,9).reshape((2,2))
In [10]: print(a)
[[1 2]
 [3 4]]

In [11]: print(b)
[[5 6]
 [7 8]]

In [12]:

In [12]: a+b
Out[12]: 
array([[ 6,  8],
       [10, 12]])

In [13]: a-b
Out[13]: 
array([[-4, -4],
       [-4, -4]])

In [14]: a*b     # 对应元素相乘
Out[14]: 
array([[ 5, 12],
       [21, 32]])

In [17]: a/b     # Python 2 下两个整型数组相除为整数除法，故结果全为0；Python 3 下返回浮点数
Out[17]: 
array([[0, 0],
       [0, 0]])

In [18]: 

In [18]: a**2
Out[18]: 
array([[ 1,  4],
       [ 9, 16]])

In [19]:



In [15]: np.dot(a,b)   # 矩阵乘法
Out[15]: 
array([[19, 22],
       [43, 50]])

In [16]: a.dot(b)
Out[16]: 
array([[19, 22],
       [43, 50]])

In [17]:



In [54]: print(a)
[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]

In [55]: np.sum(a)
Out[55]: 90

In [56]: np.min(a)
Out[56]: 2

In [57]: np.max(a)
Out[57]: 13

In [58]: 

In [58]: np.sum(a,axis=1)
Out[58]: array([14, 30, 46])

In [59]: np.sum(a,axis=0)
Out[59]: array([18, 21, 24, 27])

In [60]:





# 三角函数结合random生成一组随机数据
In [74]: N = 10

In [75]: t = np.linspace(0, 2*np.pi, N)

In [76]: print(t)
[0.         0.6981317  1.3962634  2.0943951  2.7925268  3.4906585
 4.1887902  4.88692191 5.58505361 6.28318531]

In [77]: y = np.sin(t) + 0.02*np.random.randn(N)

In [78]: print(y)
[-0.00947902  0.64196198  0.96567468  0.89394571  0.33830193 -0.3015316
 -0.86943758 -0.95954123 -0.62526393  0.02872202]

In [79]: M = 3 

In [80]: for ii, vv in zip(np.random.rand(M)*N, np.random.randn(M)):
    ...:     y[int(ii):] += vv
    ...:     

In [81]: print(y)
[-0.00947902  0.64196198  1.47685437  1.55309848  0.99745469  0.35762117
 -0.21028481 -0.30038846 -0.29746375  0.35652221]

In [82]: 





In [101]: a = np.arange(2,14).reshape((3,4)) 

In [102]: print(a)
[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]

In [103]: print(np.argmin(a))  # 最小值的索引
0

In [104]: print(np.argmax(a))  # 最大值的索引
11

In [105]: np.cumsum(a)         # 从0元素开始的累计和
Out[105]: array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])

In [106]: np.cumprod(a)        # 从1元素开始的累计乘
Out[106]: 
array([         2,          6,         24,        120,        720,
             5040,      40320,     362880,    3628800,   39916800,
        479001600, 6227020800])

In [107]: 
In [129]: a
Out[129]: 
array([[ 2,  3,  4,  5],
       [ 6,  7,  8,  9],
       [10, 11, 12, 13]])

In [130]: np.cumsum(a,axis=1)
Out[130]: 
array([[ 2,  5,  9, 14],
       [ 6, 13, 21, 30],
       [10, 21, 33, 46]])

In [131]: np.cumsum(a,axis=0)
Out[131]: 
array([[ 2,  3,  4,  5],
       [ 8, 10, 12, 14],
       [18, 21, 24, 27]])

In [132]:
In [133]: np.cumprod(a,axis=1)
Out[133]: 
array([[    2,     6,    24,   120],
       [    6,    42,   336,  3024],
       [   10,   110,  1320, 17160]])

In [134]: np.cumprod(a,axis=0)
Out[134]: 
array([[  2,   3,   4,   5],
       [ 12,  21,  32,  45],
       [120, 231, 384, 585]])

In [135]: 




In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])

In [147]: print(a)
[[1 3 5 2]
 [4 2 6 1]]

In [148]: a.shape
Out[148]: (2, 4)

In [149]: a.ndim
Out[149]: 2

In [150]: a.size
Out[150]: 8

In [151]: np.diff(a)      # 累差运算
Out[151]: 
array([[ 2,  2, -3],
       [-2,  4, -5]])

In [152]: np.diff(a,axis=1)
Out[152]: 
array([[ 2,  2, -3],
       [-2,  4, -5]])

In [153]: np.diff(a,axis=0)
Out[153]: array([[ 3, -1,  1, -1]])

In [154]: 





In [108]: a = np.array([10,7,11,9,8,13,12,9])

In [109]: a.ndim
Out[109]: 1

In [110]: a.shape
Out[110]: (8,)

In [111]: a.size
Out[111]: 8

In [112]: a.mean()      # 均值
Out[112]: 9.875

In [113]: a.var()       # 方差
Out[113]: 3.609375

In [114]: a.std()       # 标准差
Out[114]: 1.899835519196333

In [115]:
In [117]: np.median(a)  # 中位数
Out[117]: 9.5

In [118]: 
In [138]: z = (a-a.mean())/a.std()   # z-score

In [139]: print(z)
[ 0.06579517 -1.5132889   0.59215653 -0.46056619 -0.98692754  1.64487924
  1.11851788 -0.46056619]

In [140]: 




In [198]: a = np.arange(-3,3).reshape((2,3))

In [199]: a
Out[199]: 
array([[-3, -2, -1],
       [ 0,  1,  2]])

In [200]: np.nonzero(a)  # 查找非0元素
Out[200]: (array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))

In [201]: print(np.nonzero(a))
(array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))

In [202]: 



In [207]: a = np.arange(14,2,-1).reshape((3,4))

In [208]: print(a)
[[14 13 12 11]
 [10  9  8  7]
 [ 6  5  4  3]]

In [209]: np.sort(a)     # 排序
Out[209]: 
array([[11, 12, 13, 14],
       [ 7,  8,  9, 10],
       [ 3,  4,  5,  6]])

In [210]: 

In [210]: np.sort(a,axis=1)
Out[210]: 
array([[11, 12, 13, 14],
       [ 7,  8,  9, 10],
       [ 3,  4,  5,  6]])

In [211]: np.sort(a,axis=0)
Out[211]: 
array([[ 6,  5,  4,  3],
       [10,  9,  8,  7],
       [14, 13, 12, 11]])

In [212]: 




# 矩阵的转置
In [212]: a = np.arange(14,2,-1).reshape((3,4))

In [213]: print(a)
[[14 13 12 11]
 [10  9  8  7]
 [ 6  5  4  3]]

In [214]: 

In [215]: print(np.transpose(a))
[[14 10  6]
 [13  9  5]
 [12  8  4]
 [11  7  3]]

In [216]: a.T
Out[216]: 
array([[14, 10,  6],
       [13,  9,  5],
       [12,  8,  4],
       [11,  7,  3]])

In [217]: 

In [220]: a.T.dot(a)  # 先转置，再进行矩阵乘法
Out[220]: 
array([[332, 302, 272, 242],
       [302, 275, 248, 221],
       [272, 248, 224, 200],
       [242, 221, 200, 179]])

In [221]: 



# 矩阵的clip，处理最大值和最小值
In [221]: print(a)
[[14 13 12 11]
 [10  9  8  7]
 [ 6  5  4  3]]

In [222]: np.clip(a,5,11)
Out[222]: 
array([[11, 11, 11, 11],
       [10,  9,  8,  7],
       [ 6,  5,  5,  5]])

In [223]:

卷积运算

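numpy.convolve(weight,array)
 
weight = [a,b,c]
array = [i,j,k,m,n]
 
Result：[ai, bi+aj, ci+bj+ak, cj+bk+am, ck+bm+an, cm+bn, cn][N-1:-N+1]
（其中 N = len(weight)，此处 N=3；完整列表是默认 mode='full' 的输出，切片 [N-1:-N+1] 只保留窗口与数据完全重叠的部分，即 [ci+bj+ak, cj+bk+am, ck+bm+an]，相当于 mode='valid'。）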


针对移动平均算法来预测下一个数据，越接近待预测点的数据权重越大，
那么就需要让 i, j, k, m, n 的系数逐渐增大即可；即让 a > b > c ，并且 a+b+c=1 。



示例：
In [223]: weight = np.ones(3)/3

In [224]: print(weight)
[0.33333333 0.33333333 0.33333333]

In [225]: arr = np.array([8,11,9,7,10])

In [226]: np.convolve(weight,arr)
Out[226]: 
array([2.66666667, 6.33333333, 9.33333333, 9.        , 8.66666667,
       5.66666667, 3.33333333])

In [227]: 

In [227]: weight = np.array([0.8,0.1,0.1])

In [228]: np.convolve(weight,arr)
Out[228]: array([6.4, 9.6, 9.1, 7.6, 9.6, 1.7, 1. ])

In [229]:

random常用操作

# 生成随机浮点数，范围是半开区间[0.0, 1.0)
In [19]: a = np.random.random((2,3))

In [20]: print(a)
[[0.02185901 0.69585563 0.04555439]
 [0.37331857 0.32903986 0.62448246]]

In [21]:

# 生成随机整数，可指定起止范围
In [48]: np.random.randint(3)
Out[48]: 2

In [49]: np.random.randint(low=3,high=9)
Out[49]: 6

In [50]: np.random.randint(low=3,high=9,size=(3,4))
Out[50]: 
array([[5, 6, 7, 8],
       [8, 7, 3, 8],
       [5, 4, 5, 5]])

In [51]: 
In [68]: np.random.randint(low=-5,high=2,size=(3,4))
Out[68]: 
array([[-4, -4, -2,  1],
       [ 1,  0,  0,  1],
       [-4, -3,  1, -5]])

In [69]: 

# 生成正态分布，又名高斯分布（Gaussian distribution）随机数
In [64]: np.random.normal()
Out[64]: -0.5399414561419419

In [65]: np.random.normal(loc=0,scale=1,size=(2,3))
Out[65]: 
array([[-0.50318082, -0.38614219,  0.30450427],
       [ 0.41711087,  0.29990928, -0.7843322 ]])

In [66]:
In [66]: np.random.normal(loc=2,scale=3,size=(2,3))
Out[66]: 
array([[ 3.37067379,  6.23517315,  2.3267659 ],
       [ 6.46832646, -2.76363304,  5.77883853]])

In [67]:

# 生成标准正态分布（"standard normal" distribution）随机数，标准正态分布的平均值为0，方差为1，服从N（0，1）分布。
In [83]: np.random.randn()
Out[83]: 0.502482341264108

In [84]: np.random.randn(3,4)
Out[84]: 
array([[ 0.34507555, -0.26868132, -0.56103417,  0.86176617],
       [-0.16535555, -0.38045904,  0.48176385, -1.09005206],
       [-0.60780266,  1.74113117, -0.72427329, -0.51232408]])

In [85]:

# 生成[0, 1)间随机数
In [99]: np.random.rand()
Out[99]: 0.607701127768974

In [100]: np.random.rand(3,4)
Out[100]: 
array([[0.73020695, 0.53993878, 0.46693879, 0.82611629],
       [0.76117076, 0.16522599, 0.85129611, 0.74448772],
       [0.6450236 , 0.49994053, 0.04115063, 0.30081311]])

In [101]:

array索引

# 一维数组的索引和list类似
略

# 二维数组的索引
In [13]: import numpy as np

In [14]: a = np.arange(3,15).reshape((3,4))

In [15]: print(a)
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]

In [16]: a[1]
Out[16]: array([ 7,  8,  9, 10])

In [17]: a[1,2]
Out[17]: 9

In [18]: a[1][2]              # 等价于 a[1,2]
Out[18]: 9

In [19]: 

In [19]: a[1,1:-1]            # 获取第二行，除去首尾元素
Out[19]: array([8, 9])

In [20]: a[1,1:2]             # 获取第二行第二个元素
Out[20]: array([8])

In [21]:
In [24]: a[1:-1,2]            # 获取第三列（索引为2），除去首尾元素
Out[24]: array([9])

In [26]: a[:,2]               # 获取第三列（索引为2）元素
Out[26]: array([ 5,  9, 13])

In [27]:

迭代array

# 迭代行
In [27]: print(a)
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]

In [28]: for row in a:
    ...:     print(row)
    ...:     
[3 4 5 6]
[ 7  8  9 10]
[11 12 13 14]

In [29]:     

# 迭代列
In [29]: print(a.T)
[[ 3  7 11]
 [ 4  8 12]
 [ 5  9 13]
 [ 6 10 14]]

In [30]: for column in a.T:
    ...:     print(column)
    ...:     
[ 3  7 11]
[ 4  8 12]
[ 5  9 13]
[ 6 10 14]

In [31]: 




# 二维矩阵，多行转换成一行，迭代每一个item
In [31]: print(a)
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]

In [32]: print(a.flat)
<numpy.flatiter object at 0x7f392e3545c0>

In [33]: print(a.flatten())
[ 3  4  5  6  7  8  9 10 11 12 13 14]

In [34]: for item in a.flat:
    ...:     print(item)
    ...:     
3
4
5
6
7
8
9
10
11
12
13
14

In [35]:

合并array

In [39]: a = np.array([1,2,3])

In [40]: b = np.array([2,2,2])

In [41]: c = np.vstack((a,b))     # vertical stack，上下合并

In [42]: print(c)
[[1 2 3]
 [2 2 2]]

In [43]: c.shape
Out[43]: (2, 3)

In [44]: c.ndim
Out[44]: 2

In [45]: c.size
Out[45]: 6

In [46]: 



In [47]: d = np.hstack((a,b))     # horizontal stack，左右合并

In [48]: print(d)
[1 2 3 2 2 2]

In [49]: d.shape
Out[49]: (6,)

In [50]: d.ndim
Out[50]: 1

In [51]: d.size
Out[51]: 6

In [52]: 




# newaxis改变数组维度
In [54]: print(a)
[1 2 3]

In [55]: e = a[np.newaxis,:]

In [56]: print(e)
[[1 2 3]]

In [57]: f = a[:,np.newaxis]

In [58]: print(f)
[[1]
 [2]
 [3]]

In [59]: 




In [59]: a = np.array([1,2,3])[:,np.newaxis]

In [60]: b = np.array([2,2,2])[:,np.newaxis]

In [61]: print(a)
[[1]
 [2]
 [3]]

In [62]: print(b)
[[2]
 [2]
 [2]]

In [63]: c = np.vstack((a,b))

In [64]: print(c)
[[1]
 [2]
 [3]
 [2]
 [2]
 [2]]

In [65]: d = np.hstack((a,b))        # 合并两个array

In [66]: print(d)
[[1 2]
 [2 2]
 [3 2]]

In [67]: 
In [74]: d = np.hstack((a,b,b,a))    # 合并多个array

In [75]: print(d)
[[1 2 2 1]
 [2 2 2 2]
 [3 2 2 3]]

In [76]: 




# concatenate 常用来合并多个矩阵或序列，axis可以方便的指定维度
In [76]: a = np.array([1,2,3])

In [77]: b = np.array([2,2,2])

In [78]: a = a[:,np.newaxis]

In [79]: b = b[:,np.newaxis]

In [80]: c = np.concatenate((a,b,b,a),axis=0)

In [81]: print(c)
[[1]
 [2]
 [3]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [3]]

In [82]: c = np.concatenate((a,b,b,a),axis=1)

In [83]: print(c)
[[1 2 2 1]
 [2 2 2 2]
 [3 2 2 3]]

In [84]:

分割array

In [92]: a = np.arange(12).reshape((3,4))

In [93]: print(a)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

In [94]: c = np.split(a,2,axis=1)               # 等项分割

In [95]: len(c)
Out[95]: 2

In [96]: c[0]
Out[96]: 
array([[0, 1],
       [4, 5],
       [8, 9]])

In [97]: c[1]
Out[97]: 
array([[ 2,  3],
       [ 6,  7],
       [10, 11]])

In [98]: 

In [98]: print(c)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]

In [99]: 


In [99]: d = np.array_split(a,3,axis=1)         # 不等项分割

In [100]: len(d)
Out[100]: 3

In [101]: print(d)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]

In [102]: d[0]
Out[102]: 
array([[0, 1],
       [4, 5],
       [8, 9]])

In [103]: d[1]
Out[103]: 
array([[ 2],
       [ 6],
       [10]])

In [104]: d[2]
Out[104]: 
array([[ 3],
       [ 7],
       [11]])

In [105]: 





In [111]: print(a)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

In [112]: b = np.hsplit(a,2)          # horizontal split，水平分割

In [113]: print(b)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]

In [114]: b[0]
Out[114]: 
array([[0, 1],
       [4, 5],
       [8, 9]])

In [115]: b[1]
Out[115]: 
array([[ 2,  3],
       [ 6,  7],
       [10, 11]])

In [116]: 

In [116]: c = np.vsplit(a,3)          # vertical split，垂直分割

In [117]: len(c)
Out[117]: 3

In [118]: print(c)
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

In [119]: c[0]
Out[119]: array([[0, 1, 2, 3]])

In [120]: c[1]
Out[120]: array([[4, 5, 6, 7]])

In [121]: c[2]
Out[121]: array([[ 8,  9, 10, 11]])

In [122]:

Numpy.copy()

In [150]: a = np.arange(4)

In [151]: print(a)
[0 1 2 3]

In [152]: b = a

In [153]: b is a
Out[153]: True

In [154]: a[0] = 99

In [155]: print(b)
[99  1  2  3]

In [156]: 

In [156]: c = a.copy()      # deep copy

In [157]: c is a
Out[157]: False

In [159]: print(a)
[99  1  2  3]

In [160]: a[1:3] = [7,8]

In [161]: print(a)
[99  7  8  3]

In [163]: print(b)
[99  7  8  3]

In [164]: print(c)
[99  1  2  3]

In [165]:

Numpy其他

In [169]: a = np.array([-9,7,12,-4,-3,6,2])

In [170]: print(a)
[-9  7 12 -4 -3  6  2]

In [171]: np.abs(a)
Out[171]: array([ 9,  7, 12,  4,  3,  6,  2])

In [172]: np.where(np.abs(a)>6)
Out[172]: (array([0, 1, 2]),)

In [173]:

numpy参考：http://pda.readthedocs.io/en/latest/chp4.html

Pandas基础

import pandas as pd

Series

In [173]: import pandas as pd

In [174]: import numpy as np

In [175]: s = pd.Series([1,3,6,np.nan,44,1])                  # 定义pandas.Series

In [176]: print(s)
0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

In [177]:

Base Time Series Frequencies

Aggregate for duplicate Indices

In [157]: dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000','1/3/2000','1/3/2000'])

In [158]: dates
Out[158]: 
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-02',
               '2000-01-03', '2000-01-03'],
              dtype='datetime64[ns]', freq=None)

In [159]: dup_ts = pd.Series(np.arange(6), index=dates)

In [160]: dup_ts
Out[160]: 
2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
2000-01-03    5
dtype: int64

In [161]: dup_ts.index.is_unique
Out[161]: False

In [162]: dup_ts['2000-01-01']
Out[162]: 0

In [163]: dup_ts['2000-01-02']
Out[163]: 
2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [164]: dup_ts['2000-01-03']
Out[164]: 
2000-01-03    4
2000-01-03    5
dtype: int64

In [165]: 

In [165]: grouped = dup_ts.groupby(level=0)

In [166]: grouped.mean()
Out[166]: 
2000-01-01    0.0
2000-01-02    2.0
2000-01-03    4.5
dtype: float64

In [167]: grouped.count()
Out[167]: 
2000-01-01    1
2000-01-02    3
2000-01-03    2
dtype: int64

In [168]: grouped.sum()
Out[168]: 
2000-01-01    0
2000-01-02    6
2000-01-03    9
dtype: int64

In [169]:

Group by month or weekday by passing a function that accesses those fields on the time series’s index.

In [90]: rng = pd.date_range('1/1/2000', periods=100, freq='D')

In [91]: ts = pd.Series(np.arange(100), index=rng)

In [92]: ts.groupby(lambda x: x.month).mean()
Out[92]: 
1    15
2    45
3    75
4    95
dtype: int64

In [93]: ts.groupby(lambda x: x.month).sum()
Out[93]: 
1     465
2    1305
3    2325
4     855
dtype: int64

In [94]: ts.groupby(lambda x: x.month).max()
Out[94]: 
1    30
2    59
3    90
4    99
dtype: int64

In [95]: ts.groupby(lambda x: x.weekday).mean()
Out[95]: 
0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64

In [96]: ts.groupby(lambda x: x.weekday).sum()
Out[96]: 
0    665
1    679
2    693
3    707
4    721
5    735
6    750
dtype: int64

In [97]:

Resample method arguments

Resampling and Frequency Conversion

In [50]: rng = pd.date_range('1/1/2000', periods=100, freq='D')

In [51]: ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [52]: ts
Out[52]: 
2000-01-01    0.030631
2000-01-02   -2.087034
2000-01-03    1.238687
2000-01-04   -1.297059
2000-01-05   -1.341296
2000-01-06   -0.353311
2000-01-07   -0.854693
2000-01-08    0.426789
                ...   
2000-03-27    1.262705
2000-03-28   -0.646236
2000-03-29   -0.349658
2000-03-30   -1.093438
2000-03-31   -0.254758
2000-04-01    0.146417
2000-04-02    1.774502
2000-04-03   -0.712635
2000-04-04   -1.552352
2000-04-05    0.303172
2000-04-06   -0.023492
2000-04-07   -1.418930
2000-04-08    0.789877
2000-04-09    1.767594
Freq: D, Length: 100, dtype: float64

In [53]: 

In [53]: ts.resample('M').mean()
Out[53]: 
2000-01-31    0.003531
2000-02-29    0.030067
2000-03-31   -0.106783
2000-04-30    0.119350
Freq: M, dtype: float64

In [54]: ts.resample('M',kind='period').mean()
Out[54]: 
2000-01    0.003531
2000-02    0.030067
2000-03   -0.106783
2000-04    0.119350
Freq: M, dtype: float64

In [55]:

Aggregate this data into five-minute chunks or bars by taking the sum of each group.

In [71]: rng = pd.date_range('1/1/2000', periods=24, freq='T')

In [72]: rng
Out[72]: 
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
               '2000-01-01 00:02:00', '2000-01-01 00:03:00',
               '2000-01-01 00:04:00', '2000-01-01 00:05:00',
               '2000-01-01 00:06:00', '2000-01-01 00:07:00',
               '2000-01-01 00:08:00', '2000-01-01 00:09:00',
               '2000-01-01 00:10:00', '2000-01-01 00:11:00',
               '2000-01-01 00:12:00', '2000-01-01 00:13:00',
               '2000-01-01 00:14:00', '2000-01-01 00:15:00',
               '2000-01-01 00:16:00', '2000-01-01 00:17:00',
               '2000-01-01 00:18:00', '2000-01-01 00:19:00',
               '2000-01-01 00:20:00', '2000-01-01 00:21:00',
               '2000-01-01 00:22:00', '2000-01-01 00:23:00'],
              dtype='datetime64[ns]', freq='T')

In [73]: ts = pd.Series(np.arange(24), index=rng)

In [74]: ts
Out[74]: 
2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
2000-01-01 00:12:00    12
2000-01-01 00:13:00    13
2000-01-01 00:14:00    14
2000-01-01 00:15:00    15
2000-01-01 00:16:00    16
2000-01-01 00:17:00    17
2000-01-01 00:18:00    18
2000-01-01 00:19:00    19
2000-01-01 00:20:00    20
2000-01-01 00:21:00    21
2000-01-01 00:22:00    22
2000-01-01 00:23:00    23
Freq: T, dtype: int64

In [75]: ts.resample('5min').sum()
Out[75]: 
2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    60
2000-01-01 00:15:00    85
2000-01-01 00:20:00    86
Freq: 5T, dtype: int64

In [76]: ts.resample('5min',closed='left').sum()
Out[76]: 
2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    60
2000-01-01 00:15:00    85
2000-01-01 00:20:00    86
Freq: 5T, dtype: int64

In [77]: 

In [77]: ts.resample('5min').max()
Out[77]: 
2000-01-01 00:00:00     4
2000-01-01 00:05:00     9
2000-01-01 00:10:00    14
2000-01-01 00:15:00    19
2000-01-01 00:20:00    23
Freq: 5T, dtype: int64

In [78]: 

In [78]: ts.resample('5min',closed='right').sum()
Out[78]: 
1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    65
2000-01-01 00:15:00    90
2000-01-01 00:20:00    66
Freq: 5T, dtype: int64

In [79]: 

In [79]: ts.resample('5min',loffset='-1s').sum()
Out[79]: 
1999-12-31 23:59:59    10
2000-01-01 00:04:59    35
2000-01-01 00:09:59    60
2000-01-01 00:14:59    85
2000-01-01 00:19:59    86
Freq: 5T, dtype: int64

In [80]:



# Open-High-Low-Close (OHLC) resampling
In [81]: ts.resample('5min').ohlc()
Out[81]: 
                     open  high  low  close
2000-01-01 00:00:00     0     4    0      4
2000-01-01 00:05:00     5     9    5      9
2000-01-01 00:10:00    10    14   10     14
2000-01-01 00:15:00    15    19   15     19
2000-01-01 00:20:00    20    23   20     23

In [82]:

Resampling with Periods

In [118]: frame = pd.DataFrame(np.random.randn(24, 4),
     ...:     index=pd.period_range('1-2000', '12-2001', freq='M'),
     ...:     columns=['Beijing', 'Luoyang', 'New York', 'Tokyo'])

In [119]: frame
Out[119]: 
          Beijing   Luoyang  New York     Tokyo
2000-01  1.120268 -1.120345 -1.154800  0.443861
2000-02  0.611443  0.200576 -1.163600 -1.137567
2000-03  0.658112  2.332235 -1.718285  1.589246
2000-04 -0.863050  1.890877  2.046202  0.410414
2000-05  0.710052 -0.041623  0.122719 -1.141112
2000-06  0.299393  1.227689  0.718627  1.004851
2000-07  1.287335 -0.179045 -0.476422  0.949235
2000-08 -2.140590  0.433699 -0.783202  1.073706
2000-09 -0.149710 -0.580780  0.755274  0.514259
2000-10  0.190940 -0.187451  1.710803 -1.631272
2000-11  0.419288  0.565235  0.470381  0.599020
2000-12  0.951111  0.464671 -0.854858 -0.009189
2001-01 -1.383493 -0.147035 -0.379006  0.472686
2001-02  1.803475 -1.628368 -0.896757 -0.508827
2001-03  0.575910 -0.528299  1.182473  0.159452
2001-04 -1.056161 -0.475357  0.861852  1.168667
2001-05 -1.316565  0.354719  1.354205 -0.369083
2001-06  0.497406 -1.799904 -0.512882 -0.092718
2001-07  0.896944 -1.276022  0.137365  0.087199
2001-08 -0.046908 -0.650024  0.958182 -0.048369
2001-09  0.085401  1.067235  0.541318  0.853376
2001-10  1.165047 -0.794425  1.137002  0.064595
2001-11 -0.438006  0.706564  1.464403  0.278069
2001-12 -0.094644  0.666789  0.220349 -0.386617

In [120]: frame[:5]
Out[120]: 
          Beijing   Luoyang  New York     Tokyo
2000-01  1.120268 -1.120345 -1.154800  0.443861
2000-02  0.611443  0.200576 -1.163600 -1.137567
2000-03  0.658112  2.332235 -1.718285  1.589246
2000-04 -0.863050  1.890877  2.046202  0.410414
2000-05  0.710052 -0.041623  0.122719 -1.141112

In [121]: annual_frame = frame.resample('A-DEC').mean()

In [122]: annual_frame
Out[122]: 
       Beijing   Luoyang  New York     Tokyo
2000  0.257883  0.417145 -0.027263  0.222121
2001  0.057367 -0.375344  0.505709  0.139869

In [123]: 

In [123]: annual_frame_max = frame.resample('A-DEC').max()

In [124]: annual_frame_max
Out[124]: 
       Beijing   Luoyang  New York     Tokyo
2000  1.287335  2.332235  2.046202  1.589246
2001  1.803475  1.067235  1.464403  1.168667

In [125]:

DataFrame

# 第一种定义pandas.DataFrame方式：直接导入numpy的数据
In [186]: df1 = pd.DataFrame(np.arange(12).reshape((3,4)))    # 定义pandas.DataFrame

In [187]: print(df1)
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11

In [188]: 

In [178]: dates = pd.date_range('20160101',periods=6)

In [179]: print(dates)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [180]: 

# 定义pandas.DataFrame，并指定列名和行名
In [184]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])

In [185]: print(df)
                   a         b         c         d
2016-01-01  1.193589  0.165348  1.598806 -0.478980
2016-01-02  1.188886 -1.232185 -0.633066  0.594805
2016-01-03  2.707996 -0.116420  1.622761  0.399708
2016-01-04  0.416469  1.593061 -0.044390 -0.031153
2016-01-05 -0.637080  1.680110  1.371026  0.821549
2016-01-06 -0.079359  1.421577  0.042537  1.058749

In [186]: 


# 第二种定义pandas.DataFrame方式：把参数当做字典传入DataFrame
In [188]: df2 = pd.DataFrame({'A' : 1.,
     ...:                     'B' : pd.Timestamp('20130102'),
     ...:                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
     ...:                     'D' : np.array([3] * 4,dtype='int32'),
     ...:                     'E' : pd.Categorical(["test","train","test","train"]),
     ...:                     'F' : 'foo'})

In [189]: print(df2)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo

In [190]:
In [190]: print(df2.dtypes)                      # 查看DataFrame内容的类型
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [191]:
In [191]: print(df2.index)                       # 打印DataFrame行的名字（行索引）
Int64Index([0, 1, 2, 3], dtype='int64')

In [192]: 
In [192]: print(df2.columns)                     # 打印DataFrame列的名字
Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')

In [193]: 
 
In [194]: print(df2.values)                      # 打印DataFrame的内容
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

In [195]: 





In [196]: print(df2)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo

In [197]: 

In [197]: print(df2.describe())                  # 打印出DataFrame数值列的统计摘要（count、mean、std、最值、分位数）
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0

In [198]: 


In [200]: print(df2.T)                           # 把DataFrame进行transpose，即转置
                     0                    1                    2                    3
A                    1                    1                    1                    1
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00
C                    1                    1                    1                    1
D                    3                    3                    3                    3
E                 test                train                 test                train
F                  foo                  foo                  foo                  foo

In [201]: 






# 对DataFrame排序
In [203]: print(df2)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo

In [204]: df2.sort_index(axis=1, ascending=False)   # 按照index（列名）排序
Out[204]: 
     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0

In [205]:
In [205]: df2.sort_index(axis=0, ascending=False)   # 按照行名排序
Out[205]: 
     A          B    C  D      E    F
3  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
0  1.0 2013-01-02  1.0  3   test  foo

In [206]: 



In [207]: df2.sort_values(by='E')                   # 指定value进行排序
Out[207]: 
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
3  1.0 2013-01-02  1.0  3  train  foo

In [208]:

Pandas筛选数据

In [212]: dates = pd.date_range('20160101',periods=6)

In [213]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])

In [214]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [215]: 

In [215]: print(df['A'])            # 选取指定列
2016-01-01     0
2016-01-02     4
2016-01-03     8
2016-01-04    12
2016-01-05    16
2016-01-06    20
Freq: D, Name: A, dtype: int64
 
In [216]: print(df.A)               # 等价于 df['A']
2016-01-01     0
2016-01-02     4
2016-01-03     8
2016-01-04    12
2016-01-05    16
2016-01-06    20
Freq: D, Name: A, dtype: int64

In [217]:

In [217]: print(df[0:3])            # 切片方式选取某些行
            A  B   C   D
2016-01-01  0  1   2   3
2016-01-02  4  5   6   7
2016-01-03  8  9  10  11

In [218]: print(df['2016-01-01':'2016-01-03'])   # 等价于 df[0:3]
            A  B   C   D
2016-01-01  0  1   2   3
2016-01-02  4  5   6   7
2016-01-03  8  9  10  11

In [219]: 




# select by label : loc
In [220]: print(df.loc['2016-01-02'])
A    4
B    5
C    6
D    7
Name: 2016-01-02 00:00:00, dtype: int64

In [221]: 
In [221]: print(df.loc['2016-01-02']['B'])
5

In [222]: 

In [227]: print(df.loc[:,['A','B']])
             A   B
2016-01-01   0   1
2016-01-02   4   5
2016-01-03   8   9
2016-01-04  12  13
2016-01-05  16  17
2016-01-06  20  21

In [228]: 
In [228]: print(df.loc['2016-01-03',['A','B']])
A    8
B    9
Name: 2016-01-03 00:00:00, dtype: int64

In [229]: 
In [232]: print(df.loc['2016-01-03':'2016-01-05',['A','B']])
             A   B
2016-01-03   8   9
2016-01-04  12  13
2016-01-05  16  17

In [233]: 




# select by position : iloc
In [235]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [236]: print(df.iloc[3])
A    12
B    13
C    14
D    15
Name: 2016-01-04 00:00:00, dtype: int64

In [237]: print(df.iloc[3,1])
13

In [238]: 

In [238]: print(df.iloc[3:5,1:3])
             B   C
2016-01-04  13  14
2016-01-05  17  18

In [239]: 

In [240]: print(df.iloc[[1,3,5],1:3])
             B   C
2016-01-02   5   6
2016-01-04  13  14
2016-01-06  21  22

In [241]: 




# mixed selection : ix
In [243]: print(df.ix[:3,['A','C']])
/usr/local/anaconda2/bin/ipython2:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  #!/usr/local/anaconda2/bin/python
            A   C
2016-01-01  0   2
2016-01-02  4   6
2016-01-03  8  10

In [244]: 




# Boolean indexing
In [9]: print(df[df.A>8])
             A   B   C   D
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [10]:

df.head(n)      # 返回DataFrame前n行

df.tail(n)      # 返回DataFrame后n行

Pandas设置值

# 给DataFrame设置值
In [1]: import numpy as np

In [2]: import pandas as pd

In [3]: dates = pd.date_range('20160101',periods=6)

In [4]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])

In [5]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [6]:
In [7]: df.iloc[2,2] = 99

In [10]: df.loc['2016-01-02','B'] = 100

In [11]: print(df)
             A    B   C   D
2016-01-01   0    1   2   3
2016-01-02   4  100   6   7
2016-01-03   8    9  99  11
2016-01-04  12   13  14  15
2016-01-05  16   17  18  19
2016-01-06  20   21  22  23

In [12]:




In [17]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [18]: df.A[df.A>4] = 0

In [19]: print(df)
            A   B   C   D
2016-01-01  0   1   2   3
2016-01-02  4   5   6   7
2016-01-03  0   9  10  11
2016-01-04  0  13  14  15
2016-01-05  0  17  18  19
2016-01-06  0  21  22  23

In [20]: 



In [21]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [22]: df[df.A>4] = 0

In [23]: print(df)
            A  B  C  D
2016-01-01  0  1  2  3
2016-01-02  4  5  6  7
2016-01-03  0  0  0  0
2016-01-04  0  0  0  0
2016-01-05  0  0  0  0
2016-01-06  0  0  0  0

In [24]: 




In [30]: df['E'] = np.nan        # 增加一列，赋值为NaN

In [31]: print(df)
             A   B   C   D   E
2016-01-01   0   1   2   3 NaN
2016-01-02   4   5   6   7 NaN
2016-01-03   8   9  10  11 NaN
2016-01-04  12  13  14  15 NaN
2016-01-05  16  17  18  19 NaN
2016-01-06  20  21  22  23 NaN

In [32]: 
                                 # 增加一列，需要指定行名（index）
In [46]: df['F'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20160101',periods=6)) 

In [47]: print(df)
             A   B   C   D   E  F
2016-01-01   0   1   2   3 NaN  1
2016-01-02   4   5   6   7 NaN  2
2016-01-03   8   9  10  11 NaN  3
2016-01-04  12  13  14  15 NaN  4
2016-01-05  16  17  18  19 NaN  5
2016-01-06  20  21  22  23 NaN  6

In [48]:

Pandas删除DataFrame数据

In [1]: import numpy as np

In [2]: import pandas as pd

In [3]: values = np.arange(12).reshape((3,4))

In [4]: print(values)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

In [5]:
In [8]: df = pd.DataFrame(values,index=['row1','row2','row3'],columns=['A','B','C','D'])

In [9]: print(df)
      A  B   C   D
row1  0  1   2   3
row2  4  5   6   7
row3  8  9  10  11

In [10]:
In [10]: print(df.shape)
(3, 4)

In [11]:
In [11]: df.drop(columns='A',axis=1)
Out[11]: 
      B   C   D
row1  1   2   3
row2  5   6   7
row3  9  10  11

In [12]: df.drop(columns=['A','C'],axis=1)
Out[12]: 
      B   D
row1  1   3
row2  5   7
row3  9  11

In [13]: 

In [13]: df.drop(index='row2',axis=0)
Out[13]: 
      A  B   C   D
row1  0  1   2   3
row3  8  9  10  11

In [14]: df.drop(index=['row2','row3'],axis=0)
Out[14]: 
      A  B  C  D
row1  0  1  2  3

In [15]:

如果index用的是 “pd.date_range('20160101',periods=6)”

In [43]: print(df)
                   a         b         c         d
2016-01-01  1.273748  0.949407 -0.446053 -0.126789
2016-01-02 -0.770801  1.641150  0.840216 -0.991219
2016-01-03 -0.164625 -1.459954  1.214388  0.281621
2016-01-04  1.863281  1.163653  0.319549 -1.545655
2016-01-05  0.452804  0.203472 -1.232536  0.681963
2016-01-06  0.171324  0.353359  1.674004 -2.026071

In [44]: print(df.index)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [45]: 

In [45]: df.drop(index=pd.datetime(2016,1,4),axis=0)
Out[45]: 
                   a         b         c         d
2016-01-01  1.273748  0.949407 -0.446053 -0.126789
2016-01-02 -0.770801  1.641150  0.840216 -0.991219
2016-01-03 -0.164625 -1.459954  1.214388  0.281621
2016-01-05  0.452804  0.203472 -1.232536  0.681963
2016-01-06  0.171324  0.353359  1.674004 -2.026071

In [46]: df.drop(index=[pd.datetime(2016,1,2),pd.datetime(2016,1,5)],axis=0)
Out[46]: 
                   a         b         c         d
2016-01-01  1.273748  0.949407 -0.446053 -0.126789
2016-01-03 -0.164625 -1.459954  1.214388  0.281621
2016-01-04  1.863281  1.163653  0.319549 -1.545655
2016-01-06  0.171324  0.353359  1.674004 -2.026071

In [47]:

Pandas处理丢失的数据

# 处理丢失数据

In [7]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

In [8]: df.iloc[0,1] = np.nan

In [9]: df.iloc[1,2] = np.nan

In [10]: print(df)
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

In [11]: print(df.dropna(axis=1,how='any'))  # 删除NaN数据所在列，how = {'any','all'}
             A   D
2016-01-01   0   3
2016-01-02   4   7
2016-01-03   8  11
2016-01-04  12  15
2016-01-05  16  19
2016-01-06  20  23

In [12]: print(df.dropna(axis=0,how='any'))  # 删除NaN数据所在行，how = {'any','all'} 
             A     B     C   D
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

In [13]: 
In [13]: print(df.dropna(axis=0,how='all'))
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

In [14]: 
In [14]: print(df.dropna(axis=1,how='all'))
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

In [15]: 



In [15]: df.fillna(value=0)                # 把NaN填充为指定数值
Out[15]: 
             A     B     C   D
2016-01-01   0   0.0   2.0   3
2016-01-02   4   5.0   0.0   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

In [16]: 






In [19]: print(df.isnull())                # 把数值为NaN的位置标识出来
                A      B      C      D
2016-01-01  False   True  False  False
2016-01-02  False  False   True  False
2016-01-03  False  False  False  False
2016-01-04  False  False  False  False
2016-01-05  False  False  False  False
2016-01-06  False  False  False  False

In [20]: 


In [22]: print(np.any(df.isnull()) == True)   # 检查DataFrame是否含有NaN值
True

In [23]:

Pandas导入导出示例

In [33]: import pandas as pd

In [34]: data = pd.read_csv('student.csv')

In [35]: print(data)
    Student ID  name   age  gender
0         1100  Kelly   22  Female
1         1101    Clo   21  Female
2         1102  Tilly   22  Female
3         1103   Tony   24    Male
4         1104  David   20    Male
5         1105  Catty   22  Female
6         1106      M    3  Female
7         1107      N   43    Male
8         1108      A   13    Male
9         1109      S   12    Male
10        1110  David   33    Male
11        1111     Dw    3  Female
12        1112      Q   23    Male
13        1113      W   21  Female

In [36]: print(type(data))
<class 'pandas.core.frame.DataFrame'>

In [37]: data.to_pickle('student.pickle')

In [38]: data.to_json('student.json')

In [39]:

更多IO Tools参考：官方介绍

Pandas concat合并

# pandas 合并

# concatenating
In [40]: import numpy as np

In [41]: import pandas as pd

In [42]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])

In [43]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])

In [44]: df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])

In [45]: print(df1)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0

In [46]: print(df2)
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0

In [47]: print(df3)
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

In [48]: result = pd.concat([df1,df2,df3],axis=0)   # vertical 垂直合并

In [49]: print(result)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

In [50]: 
In [50]: result = pd.concat([df1,df2,df3],axis=0,ignore_index=True)  # 序号重新排列

In [51]: print(result)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0

In [52]:





# join合并   ['inner','outer']
In [63]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])

In [64]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])

In [65]: print(df1)
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0

In [66]: print(df2)
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0

In [67]: 
In [67]: result = pd.concat([df1,df2])     # 即 pd.concat([df1,df2],join='outer') ， 默认就是outer模式
/usr/local/anaconda2/bin/ipython2:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  #!/usr/local/anaconda2/bin/python

In [68]: 

In [68]: print(result)
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0

In [69]:

In [70]: result = pd.concat([df1,df2],join='inner')  # inner模式

In [71]: print(result)
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0

In [72]: 
In [72]: result = pd.concat([df1,df2],join='inner',ignore_index=True)

In [73]: print(result)
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0

In [74]: 




# join_axes合并
In [78]: res = pd.concat([df1, df2], axis=1)

In [79]: print(res)
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

In [80]: 
In [74]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])

In [75]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])

In [76]: res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])

In [77]: print(res)
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0

In [78]: 
In [80]: res = pd.concat([df1, df2], axis=1, join_axes=[df2.index])

In [81]: print(res)
     a    b    c    d    b    c    d    e
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

In [82]: 




# append合并

In [87]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])

In [88]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])

In [89]: df1.append(df2,ignore_index=True)
Out[89]: 
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0

In [90]: df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])

In [91]: df1.append([df2,df3],ignore_index=True)
Out[91]: 
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  1.0  1.0  1.0  1.0
7  1.0  1.0  1.0  1.0
8  1.0  1.0  1.0  1.0

In [92]: 


# 添加一行数据到DataFrame
In [92]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])

In [93]: s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])

In [94]: res = df1.append(s1,ignore_index=True)

In [95]: print(res)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0

In [96]:

Pandas merge合并

# merge合并
In [99]: import pandas as pd

In [100]: left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
     ...:                      'A': ['A0', 'A1', 'A2', 'A3'],
     ...:                      'B': ['B0', 'B1', 'B2', 'B3']})

In [101]: right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
     ...:                       'C': ['C0', 'C1', 'C2', 'C3'],
     ...:                       'D': ['D0', 'D1', 'D2', 'D3']})

In [102]: 

In [102]: print(left)
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3

In [103]: print(right)
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3

In [104]: 
In [104]: res = pd.merge(left,right,on='key')

In [105]: print(res)
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3

In [106]: 


# consider two keys
In [106]: left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
     ...:                       'key2': ['K0', 'K1', 'K0', 'K1'],
     ...:                       'A': ['A0', 'A1', 'A2', 'A3'],
     ...:                       'B': ['B0', 'B1', 'B2', 'B3']})

In [107]: right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
     ...:                        'key2': ['K0', 'K0', 'K0', 'K0'],
     ...:                        'C': ['C0', 'C1', 'C2', 'C3'],
     ...:                        'D': ['D0', 'D1', 'D2', 'D3']})

In [108]: print(left)
    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1

In [109]: print(right)
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0

In [110]: res = pd.merge(left,right,on=['key1','key2'])

In [111]: print(res)
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2


# how={'left','right','inner','outer'}
In [112]: res = pd.merge(left,right,on=['key1','key2'],how='inner')  # 默认就是inner模式

In [113]: print(res)
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2

In [114]: res = pd.merge(left,right,on=['key1','key2'],how='outer')

In [115]: print(res)
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3

In [116]: 
In [116]: res = pd.merge(left,right,on=['key1','key2'],how='left')

In [117]: print(res)
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN

In [118]: res = pd.merge(left,right,on=['key1','key2'],how='right')

In [119]: print(res)
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3

In [120]: 



# indicator
In [121]: df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})

In [122]: df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})

In [123]: print(df1)
   col1 col_left
0     0        a
1     1        b

In [124]: print(df2)
   col1  col_right
0     1          2
1     2          2
2     2          2

In [125]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=True) # 给一个提示

In [126]: print(res)
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only

In [127]:
In [129]: res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') # 指定提示的列名

In [130]: print(res)
   col1 col_left  col_right indicator_column
0     0        a        NaN        left_only
1     1        b        2.0             both
2     2      NaN        2.0       right_only
3     2      NaN        2.0       right_only

In [131]: 
In [127]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=False)

In [128]: print(res)
   col1 col_left  col_right
0     0        a        NaN
1     1        b        2.0
2     2      NaN        2.0
3     2      NaN        2.0

In [129]: 






In [131]: left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
     ...:                      'B': ['B0', 'B1', 'B2']},
     ...:                      index=['K0', 'K1', 'K2'])

In [132]: right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
     ...:                       'D': ['D0', 'D2', 'D3']},
     ...:                      index=['K0', 'K2', 'K3'])

In [133]: print(left)
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2

In [134]: print(right)
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3

In [135]: res = pd.merge(left, right, left_index=True, right_index=True, how='outer')

In [136]: print(res)
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3

In [137]: res = pd.merge(left, right, left_index=True, right_index=True, how='inner')

In [138]: print(res)
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2

In [139]: 






# handle overlapping
In [139]: boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})

In [140]: girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

In [141]: print(boys)
   age   k
0    1  K0
1    2  K1
2    3  K2

In [142]: print(girls)
   age   k
0    4  K0
1    5  K0
2    6  K3

In [143]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')

In [144]: print(res)
   age_boy   k  age_girl
0        1  K0         4
1        1  K0         5

In [145]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')

In [146]: print(res)
   age_boy   k  age_girl
0      1.0  K0       4.0
1      1.0  K0       5.0
2      2.0  K1       NaN
3      3.0  K2       NaN
4      NaN  K3       6.0

In [147]:

关于Concat 函数、Merge 函数和 Join 函数

Pandas Moving Window Functions

Pandas plot可视化

#!/usr/bin/python2.7

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Series: draw 1000 standard-normal samples, index them 0..999,
# and plot their running (cumulative) sum as a single line.
samples = np.random.randn(1000)
positions = np.arange(1000)
data = pd.Series(samples, index=positions).cumsum()

data.plot()
plt.show()

#!/usr/bin/python2.7

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame: 1000 rows x 4 columns (A-D) of standard-normal samples,
# accumulated column by column and plotted as one line per column.
row_count = 1000
column_names = list("ABCD")
samples = np.random.randn(row_count, 4)
data = pd.DataFrame(samples, index=np.arange(row_count), columns=column_names).cumsum()
# To inspect the first rows while debugging: print(data.head(6))

data.plot()
plt.show()

#!/usr/bin/python2.7

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame: 1000 rows x 4 columns (A-D) of standard-normal samples,
# accumulated column by column.
row_count = 1000
samples = np.random.randn(row_count, 4)
data = pd.DataFrame(samples, index=np.arange(row_count), columns=list("ABCD")).cumsum()
# To inspect the first rows while debugging: print(data.head(6))

# Other plot methods available on data.plot:
#     'bar','hist','box','kde','area','scatter','pie','hexbin'...
# Draw both scatter series on one shared axes: the first call creates the
# axes (ax=None is the default), the second call reuses it.
ax = None
for y_column, colour, legend in (('B', 'DarkBlue', 'Class AB'),
                                 ('C', 'DarkGreen', 'Class AC')):
    ax = data.plot.scatter(x='A', y=y_column, color=colour, label=legend, ax=ax)
plt.show()

补充：Matplotlib 3D图像

#!/usr/bin/python2.7

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
# Create the 3D axes through the figure so it is always attached to it.
# Constructing Axes3D(fig) directly relied on the axes adding itself to the
# figure, which Matplotlib deprecated in 3.4; on recent releases that form
# leaves the figure empty. The Axes3D import at the top of the script is
# still what registers the '3d' projection on older Matplotlib versions.
ax = fig.add_subplot(111, projection='3d')

# X,Y value: a 32x32 grid covering [-4, 4) in steps of 0.25
X = np.arange(-4,4,0.25)
Y = np.arange(-4,4,0.25)
X,Y = np.meshgrid(X,Y)
# R is the distance of each grid point from the origin
R = np.sqrt(X**2+Y**2)

# height value
Z = np.sin(R)

# rstride/cstride=1 draws every grid row and column
ax.plot_surface(X,Y,Z,rstride=1,cstride=1,cmap=plt.get_cmap('rainbow'))

plt.show()

#!/usr/bin/python2.7

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
# Create the 3D axes through the figure so it is always attached to it.
# Constructing Axes3D(fig) directly relied on the axes adding itself to the
# figure, which Matplotlib deprecated in 3.4; on recent releases that form
# leaves the figure empty. The Axes3D import at the top of the script is
# still what registers the '3d' projection on older Matplotlib versions.
ax = fig.add_subplot(111, projection='3d')

# X,Y value: a 32x32 grid covering [-4, 4) in steps of 0.25
X = np.arange(-4,4,0.25)
Y = np.arange(-4,4,0.25)
X,Y = np.meshgrid(X,Y)
# R is the distance of each grid point from the origin
R = np.sqrt(X**2+Y**2)

# height value
Z = np.sin(R)

# rstride/cstride=1 draws every grid row and column
ax.plot_surface(X,Y,Z,rstride=1,cstride=1,cmap=plt.get_cmap('rainbow'))

# Add filled contours of Z, projected along the z axis onto the plane z = -2
ax.contourf(X,Y,Z,zdir='z',offset=-2,cmap='rainbow')

# Extend the z range down to -2 so the projected contours are visible
ax.set_zlim(-2,2)

plt.show()

参考：https://github.com/MorvanZhou

参考：https://morvanzhou.github.io/tutorials/

posted @ 2018-08-12 13:29 马贡多在下雨阅读(936) 评论(0) 收藏举报

刷新页面返回顶部

DataTalkClub

记录和分享

numpy&pandas基础

numpy基础

定义array

array基本运算

卷积运算

random常用操作

array索引

迭代array

合并array

分割array

Numpy.copy()

Numpy其他

Pandas基础

Series

Base Time Series Frequencies

Aggregate for duplicate Indices

Group by month or weekday by passing a function that accesses those fields on the time series’s index.

Resample method arguments

Resampling and Frequency Conversion

Aggregate this data into five-minute chunks or bars by taking the sum of each group.

Resampling with Periods

DataFrame

Pandas筛选数据

Pandas设置值

Pandas删除DataFrame数据

Pandas处理丢失的数据

Pandas导入导出示例

Pandas concat合并

Pandas merge合并

Pandas Moving Window Functions

Pandas plot可视化

补充：Matplotlib 3D图像

公告