Numpy and Pandas

一、为什么要使用Numpy and Pandas？

　　运算速度快：numpy 和 pandas 都是采用 C 语言编写, pandas 又是基于 numpy, 是 numpy 的升级版本。

　　消耗资源少：采用的是矩阵运算，会比 python 自带的字典或者列表快好多。

　　numpy 和 pandas是科学计算中很重要的两个模块，可以应用于数据分析，机器学习和深度学习

二、Numpy 和 Pandas 安装

　　可以通过 pip install numpy 和 pip install pandas 进行安装（不建议使用 sudo pip，推荐在虚拟环境中安装）。

三、Numpy 的学习

　　ndim：维度

　　shape：行数和列数

　　size：元素个数

　　①、numpy属性：

 1 import numpy as np #为了方便使用numpy 采用np简写
 2 
 3 array = np.array([[1,2,3],[2,3,4]])  #列表转化为矩阵
 4 print(array)
 5 
 6 print('number of dim:',array.ndim)  # 维度
 7 # number of dim: 2
 8 
 9 print('shape :',array.shape)    # 行数和列数
10 # shape : (2, 3)
11 
12 print('size:',array.size)   # 元素个数
13 # size: 6

　　②、numpy创建array

　　　　关键字：array：创建数组

　　　　　　　　dtype：指定数据类型

　　　　　　　　zeros：创建数据全为0

　　　　　　　　 ones：创建数据全为1

　　　　　　　　empty：创建未初始化的数组（元素是内存中的任意值，往往接近0，但并不保证）

　　　　　　　　arange：按指定范围创建数据

　　　　　　　　linspace：创建线段

　　　　创建数组：

1 a = np.array([2,23,4])  # list 1d
2 print(a)
3 # [2 23 4]

　　　　指定数据 dtype：

 1 a = np.array([2,23,4],dtype=int)  # np.int 已在 NumPy 1.24 中移除，直接使用内置 int
 2 print(a.dtype)
 3 # int64
 4 
 5 a = np.array([2,23,4],dtype=np.int32)
 6 print(a.dtype)
 7 # int32
 8 
 9 a = np.array([2,23,4],dtype=float)  # np.float 已在 NumPy 1.24 中移除，直接使用内置 float
10 print(a.dtype)
11 # float64
12 
13 a = np.array([2,23,4],dtype=np.float32)
14 print(a.dtype)
15 # float32

　　　　创建特定数据：

 1 a = np.array([[2,23,4],[2,32,4]])  # 2d 矩阵 2行3列
 2 print(a)
 3 """
 4 [[ 2 23  4]
 5  [ 2 32  4]]
 6 """
 7 
 8 #创建全零数组
 9 a = np.zeros((3,4)) # 数据全为0，3行4列
10 """
11 array([[ 0.,  0.,  0.,  0.],
12        [ 0.,  0.,  0.,  0.],
13        [ 0.,  0.,  0.,  0.]])
14 """
15 
16 #创建全一数组, 同时也能指定这些特定数据的 dtype:
17 a = np.ones((3,4),dtype = int)   # 数据为1，3行4列（np.int 已在 NumPy 1.24 中移除，直接使用内置 int）
18 """
19 array([[1, 1, 1, 1],
20        [1, 1, 1, 1],
21        [1, 1, 1, 1]])
22 """
23 
24 #创建全空数组: empty 不会初始化内存, 数组内容是任意值(这里恰好都是接近于零的数):
25 a = np.empty((3,4)) # 数据为empty，3行4列
26 """
27 array([[  0.00000000e+000,   4.94065646e-324,   9.88131292e-324,
28           1.48219694e-323],
29        [  1.97626258e-323,   2.47032823e-323,   2.96439388e-323,
30           3.45845952e-323],
31        [  3.95252517e-323,   4.44659081e-323,   4.94065646e-323,
32           5.43472210e-323]])
33 """
34 
35 #用 arange 创建连续数组:
36 a = np.arange(10,20,2) # 10-19 的数据，2步长
37 """
38 array([10, 12, 14, 16, 18])
39 """
40 
41 #使用 reshape 改变数据的形状
42 a = np.arange(12).reshape((3,4))    # 3行4列，0到11
43 """
44 array([[ 0,  1,  2,  3],
45        [ 4,  5,  6,  7],
46        [ 8,  9, 10, 11]])
47 """
48 
49 #用 linspace 创建线段型数据:
50 a = np.linspace(1,10,20)    # 开始端1，结束端10，且分割成20个数据，生成线段
51 """
52 array([  1.        ,   1.47368421,   1.94736842,   2.42105263,
53          2.89473684,   3.36842105,   3.84210526,   4.31578947,
54          4.78947368,   5.26315789,   5.73684211,   6.21052632,
55          6.68421053,   7.15789474,   7.63157895,   8.10526316,
56          8.57894737,   9.05263158,   9.52631579,  10.        ])
57 """
58 
59 #进行 reshape 工作:
60 a = np.linspace(1,10,20).reshape((5,4)) # 更改shape
61 """
62 array([[  1.        ,   1.47368421,   1.94736842,   2.42105263],
63        [  2.89473684,   3.36842105,   3.84210526,   4.31578947],
64        [  4.78947368,   5.26315789,   5.73684211,   6.21052632],
65        [  6.68421053,   7.15789474,   7.63157895,   8.10526316],
66        [  8.57894737,   9.05263158,   9.52631579,  10.        ]])
67 """

　　③、numpy的基础运算1：

 1 import numpy as np
 2 a=np.array([10,20,30,40])   # array([10, 20, 30, 40])
 3 b=np.arange(4)              # array([0, 1, 2, 3])
 4 
 5 #numpy 的几种基本运算：
 6 c=a-b  # array([10, 19, 28, 37])
 7 
 8 c=a+b   # array([10, 21, 32, 43])
 9 
10 c=a*b   # array([  0,  20,  60, 120])
11 
12 c=b**2  # array([0, 1, 4, 9])
13 
14 c=10*np.sin(a)  
15 # array([-5.44021111,  9.12945251, -9.88031624,  7.4511316 ])
16 
17 print(b<3)  
18 # array([ True,  True,  True, False], dtype=bool)
19 
20 #对多行多维度的矩阵进行操作，需要对开始的脚本进行一些修改：
21 a=np.array([[1,1],[0,1]])
22 b=np.arange(4).reshape((2,2))
23 
24 print(a)
25 # array([[1, 1],
26 #       [0, 1]])
27 
28 print(b)
29 # array([[0, 1],
30 #       [2, 3]])
31 
32 #矩阵乘法
33 c_dot = np.dot(a,b)
34 # array([[2, 4],
35 #       [2, 3]])
36 
37 c_dot_2 = a.dot(b)
38 # array([[2, 4],
39 #       [2, 3]])
40 
41 #sum(), min(), max()的使用：
42 import numpy as np
43 a=np.random.random((2,4))
44 print(a)
45 # array([[ 0.23651224,  0.41900661,  0.84869417,  0.46456022],
46 #       [ 0.60771087,  0.9043845 ,  0.36603285,  0.55746074]])
47 
48 np.sum(a)   # 4.4043622002745959
49 np.min(a)   # 0.23651223533671784
50 np.max(a)   # 0.90438450240606416
51 
52 #当axis的值为0的时候，将会以列作为查找单元， 当axis的值为1的时候，将会以行作为查找单元。
53 print("a =",a)
54 # a = [[ 0.23651224  0.41900661  0.84869417  0.46456022]
55 # [ 0.60771087  0.9043845   0.36603285  0.55746074]]
56 
57 print("sum =",np.sum(a,axis=1))
58 # sum = [ 1.96877324  2.43558896]
59 
60 print("min =",np.min(a,axis=0))
61 # min = [ 0.23651224  0.41900661  0.36603285  0.46456022]
62 
63 print("max =",np.max(a,axis=1))
64 # max = [ 0.84869417  0.9043845 ]

　　④、numpy 的几种基本运算2：

 1 import numpy as np
 2 A = np.arange(2,14).reshape((3,4)) 
 3 
 4 # array([[ 2, 3, 4, 5]
 5 #        [ 6, 7, 8, 9]
 6 #        [10,11,12,13]])
 7 
 8 #最小值和最大值的索引号         
 9 print(np.argmin(A))    # 0
10 print(np.argmax(A))    # 11
11 
12 #整个矩阵的均值
13 print(np.mean(A))        # 7.5
14 print(np.average(A))     # 7.5
15 
16 #mean()函数还有另外一种写法：
17 print(A.mean())          # 7.5
18 
19 #求解中位数的函数：
20 print(np.median(A))      # 7.5（ndarray 没有 median 方法，需要使用 np.median）
21 
22 #cumsum()累加函数
23 print(np.cumsum(A)) 
24 # [2 5 9 14 20 27 35 44 54 65 77 90]
25 
26 #累差运算函数：该函数计算的便是每一行中后一项与前一项之差。故一个3行4列矩阵通过函数计算得到的矩阵便是3行3列的矩阵。
27 print(np.diff(A))    
28 
29 # [[1 1 1]
30 #  [1 1 1]
31 #  [1 1 1]]
32 
33 #nonzero()函数：将所有非零元素的行与列坐标分割开，重构成两个分别关于行和列的矩阵。
34 print(np.nonzero(A))    
35 
36 # (array([0,0,0,0,1,1,1,1,2,2,2,2]),array([0,1,2,3,0,1,2,3,0,1,2,3]))
37 
38 #排序函数仍然仅针对每一行进行从小到大排序操作：
39 import numpy as np
40 A = np.arange(14,2, -1).reshape((3,4)) 
41 
42 # array([[14, 13, 12, 11],
43 #       [10,  9,  8,  7],
44 #       [ 6,  5,  4,  3]])
45 
46 print(np.sort(A))    
47 
48 # array([[11,12,13,14]
49 #        [ 7, 8, 9,10]
50 #        [ 3, 4, 5, 6]])
51 
52 #矩阵的转置有两种表示方法：
53 print(np.transpose(A))    
54 print(A.T)
55 
56 # array([[14,10, 6]
57 #        [13, 9, 5]
58 #        [12, 8, 4]
59 #        [11, 7, 3]])
60 # array([[14,10, 6]
61 #        [13, 9, 5]
62 #        [12, 8, 4]
63 #        [11, 7, 3]])
64 
65 #函数的格式是clip(Array,Array_min,Array_max)，顾名思义，Array指的是将要被执行用的矩阵，而后面的最小值最大值则用于让函数判断矩阵中元素是否有比最小值小的或者比最大值大的元素，并将这些指定的元素转换为最小值或者最大值。
66 print(A)
67 # array([[14,13,12,11]
68 #        [10, 9, 8, 7]
69 #        [ 6, 5, 4, 3]])
70 
71 print(np.clip(A,5,9))    
72 # array([[ 9, 9, 9, 9]
73 #        [ 9, 9, 8, 7]
74 #        [ 6, 5, 5, 5]])

　　⑤、Numpy 索引

 1 #一维索引
 2 import numpy as np
 3 A = np.arange(3,15)
 4 
 5 # array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
 6          
 7 print(A[3])    # 6
 8 
 9 #将矩阵转换为二维的，此时进行同样的操作：
10 #A[2]对应的就是矩阵A中第三行(从0开始算第一行)的所有元素。
11 A = np.arange(3,15).reshape((3,4))
12 """
13 array([[ 3,  4,  5,  6]
14        [ 7,  8,  9, 10]
15        [11, 12, 13, 14]])
16 """
17          
18 print(A[2])         
19 # [11 12 13 14]

 1 A = np.arange(3,15).reshape((3,4))
 2 """
 3 array([[ 3,  4,  5,  6]
 4        [ 7,  8,  9, 10]
 5        [11, 12, 13, 14]])
 6 """
 7 print(A[1][1])      # 8
 8 print(A[1, 1:3])    # [8 9]
 9 
10 #利用for函数进行打印：
11 for row in A:
12     print(row)
13 """    
14 [ 3,  4,  5, 6]
15 [ 7,  8,  9, 10]
16 [11, 12, 13, 14]
17 """
18 
19 #进行逐列打印
20 for column in A.T:
21     print(column)
22 """  
23 [ 3,  7,  11]
24 [ 4,  8,  12]
25 [ 5,  9,  13]
26 [ 6, 10,  14]
27 """
28 
29 #flatten是一个展开性质的函数，将多维的矩阵进行展开成1行的数列。而flat是一个迭代器，本身是一个object属性。
30 import numpy as np
31 A = np.arange(3,15).reshape((3,4))
32          
33 print(A.flatten())   
34 # array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
35 
36 for item in A.flat:
37     print(item)
38     
39 # 3
40 # 4
41 # ……
42 # 14

　　⑥、Numpy array 合并

 1 #vertical stack本身属于一种上下合并，即对括号中的两个整体进行对应操作。
 2 import numpy as np
 3 A = np.array([1,1,1])
 4 B = np.array([2,2,2])
 5          
 6 print(np.vstack((A,B)))    # vertical stack
 7 """
 8 [[1,1,1]
 9  [2,2,2]]
10 """
11 
12 #对组合而成的矩阵进行属性探究：利用shape属性可以让我们很容易地知道A和C的形状，从打印出的结果来看，A仅仅是一个拥有3项元素的数组（数列），而合并后得到的C是一个2行3列的矩阵。
13 C = np.vstack((A,B))      
14 print(A.shape,C.shape)
15 
16 # (3,) (2,3)
17 
18 #左右合并：
19 D = np.hstack((A,B))       # horizontal stack
20 
21 print(D)
22 # [1,1,1,2,2,2]
23 
24 print(A.shape,D.shape)
25 # (3,) (6,)
26 
27 #array转换为了1行3列以及3行1列的矩阵
28 print(A[np.newaxis,:])
29 # [[1 1 1]]
30 
31 print(A[np.newaxis,:].shape)
32 # (1,3)
33 
34 print(A[:,np.newaxis])
35 """
36 [[1]
37 [1]
38 [1]]
39 """
40 
41 print(A[:,np.newaxis].shape)
42 # (3,1)
43 
44 #综合起来:
45 import numpy as np
46 A = np.array([1,1,1])[:,np.newaxis]
47 B = np.array([2,2,2])[:,np.newaxis]
48          
49 C = np.vstack((A,B))   # vertical stack
50 D = np.hstack((A,B))   # horizontal stack
51 
52 print(D)
53 """
54 [[1 2]
55 [1 2]
56 [1 2]]
57 """
58 
59 print(A.shape,D.shape)
60 # (3,1) (3,2)
61 
62 #合并操作需要针对多个矩阵或序列时，借助concatenate函数更加方便：
63 C = np.concatenate((A,B,B,A),axis=0)
64 
65 print(C)
66 """
67 array([[1],
68        [1],
69        [1],
70        [2],
71        [2],
72        [2],
73        [2],
74        [2],
75        [2],
76        [1],
77        [1],
78        [1]])
79 """
80 
81 D = np.concatenate((A,B,B,A),axis=1)
82 
83 print(D)
84 """
85 array([[1, 2, 2, 1],
86        [1, 2, 2, 1],
87        [1, 2, 2, 1]])
88 """

　　⑦、Numpy array 分割

 1 #创建数据 
 2 import numpy as np
 3 #建立3行4列的Array
 4 A = np.arange(12).reshape((3, 4))
 5 print(A)
 6 """
 7 array([[ 0,  1,  2,  3],
 8     [ 4,  5,  6,  7],
 9     [ 8,  9, 10, 11]])
10 """
11 
12 #纵向分割
13 print(np.split(A, 2, axis=1))
14 """
15 [array([[0, 1],
16         [4, 5],
17         [8, 9]]), array([[ 2,  3],
18         [ 6,  7],
19         [10, 11]])]
20 """
21 
22 #横向分割
23 print(np.split(A, 3, axis=0))
24 
25 # [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
26 
27 #不等量的分割 ,np.array_split()
28 print(np.array_split(A, 3, axis=1))
29 """
30 [array([[0, 1],
31         [4, 5],
32         [8, 9]]), array([[ 2],
33         [ 6],
34         [10]]), array([[ 3],
35         [ 7],
36         [11]])]
37 """
38 
39 #其他的分割方式
40 print(np.vsplit(A, 3)) #等于 print(np.split(A, 3, axis=0))
41 
42 # [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
43 
44 
45 print(np.hsplit(A, 2)) #等于 print(np.split(A, 2, axis=1))
46 """
47 [array([[0, 1],
48        [4, 5],
49        [8, 9]]), array([[ 2,  3],
50         [ 6,  7],
51         [10, 11]])]
52 """

　　⑧、Numpy copy & deep copy

 1 #= 的赋值方式会带有关联性 
 2 import numpy as np
 3 
 4 a = np.arange(4)
 5 # array([0, 1, 2, 3])
 6 
 7 b = a
 8 c = a
 9 d = b
10 #改变a的第一个值，b、c、d的第一个值也会同时改变。
11 a[0] = 11
12 print(a)
13 # array([11,  1,  2,  3])
14 
15 #确认b、c、d是否与a相同
16 b is a  # True
17 c is a  # True
18 d is a  # True
19 
20 #同样更改d的值，a、b、c也会改变
21 d[1:3] = [22, 33]   # array([11, 22, 33,  3])
22 print(a)            # array([11, 22, 33,  3])
23 print(b)            # array([11, 22, 33,  3])
24 print(c)            # array([11, 22, 33,  3])
25 
26 #copy() 的赋值方式没有关联性 
27 b = a.copy()    # deep copy
28 print(b)        # array([11, 22, 33,  3])
29 a[3] = 44
30 print(a)        # array([11, 22, 33, 44])
31 print(b)        # array([11, 22, 33,  3])

四、Pandas学习

　　Numpy 和 Pandas 有什么不同？

　　用 python 的列表和字典来作比较, 那么可以说 Numpy 是列表形式的，没有数值标签，而 Pandas 就是字典形式。Pandas是基于Numpy构建的，让Numpy为中心的应用变得更加简单。

　　要使用pandas，首先需要了解它的两个主要数据结构：Series和DataFrame。

　　①、pandas基本情况

 1 #Series用法
 2 #Series的字符串表现形式为：索引在左边，值在右边。由于我们没有为数据指定索引。于是会自动创建一个0到N-1（N为长度）的整数型索引。
 3 import pandas as pd
 4 import numpy as np
 5 s = pd.Series([1,3,6,np.nan,44,1])
 6 
 7 print(s)
 8 """
 9 0     1.0
10 1     3.0
11 2     6.0
12 3     NaN
13 4    44.0
14 5     1.0
15 dtype: float64
16 """

  1 #DataFrame用法
  2 #DataFrame是一个表格型的数据结构，它包含有一组有序的列，每列可以是不同的值类型（数值，字符串，布尔值等）。DataFrame既有行索引也有列索引， 它可以被看做由Series组成的大字典。
  3 dates = pd.date_range('20160101',periods=6)
  4 df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
  5 
  6 print(df)
  7 """
  8                    a         b         c         d
  9 2016-01-01 -0.253065 -2.071051 -0.640515  0.613663
 10 2016-01-02 -1.147178  1.532470  0.989255 -0.499761
 11 2016-01-03  1.221656 -2.390171  1.862914  0.778070
 12 2016-01-04  1.473877 -0.046419  0.610046  0.204672
 13 2016-01-05 -1.584752 -0.700592  1.487264 -1.778293
 14 2016-01-06  0.633675 -1.414157 -0.277066 -0.442545
 15 """
 16 
 17 #DataFrame 的一些简单运用
 18 print(df['b'])
 19 
 20 """
 21 2016-01-01   -2.071051
 22 2016-01-02    1.532470
 23 2016-01-03   -2.390171
 24 2016-01-04   -0.046419
 25 2016-01-05   -0.700592
 26 2016-01-06   -1.414157
 27 Freq: D, Name: b, dtype: float64
 28 """
 29 
 30 #创建一组没有给定行标签和列标签的数据 df1，默认的从0开始 index. 
 31 df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
 32 print(df1)
 33 
 34 """
 35    0  1   2   3
 36 0  0  1   2   3
 37 1  4  5   6   7
 38 2  8  9  10  11
 39 """
 40 
 41 #还有一种生成 df 的方法
 42 df2 = pd.DataFrame({'A' : 1.,
 43                     'B' : pd.Timestamp('20130102'),
 44                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
 45                     'D' : np.array([3] * 4,dtype='int32'),
 46                     'E' : pd.Categorical(["test","train","test","train"]),
 47                     'F' : 'foo'})
 48                     
 49 print(df2)
 50 
 51 """
 52      A          B    C  D      E    F
 53 0  1.0 2013-01-02  1.0  3   test  foo
 54 1  1.0 2013-01-02  1.0  3  train  foo
 55 2  1.0 2013-01-02  1.0  3   test  foo
 56 3  1.0 2013-01-02  1.0  3  train  foo
 57 """
 58 
 59 #查看数据中的类型
 60 print(df2.dtypes)
 61 
 62 """
 63 df2.dtypes
 64 A           float64
 65 B    datetime64[ns]
 66 C           float32
 67 D             int32
 68 E          category
 69 F            object
 70 dtype: object
 71 """
 72 
 73 #行的序号(index):
 74 print(df2.index)
 75 
 76 # Int64Index([0, 1, 2, 3], dtype='int64')
 77 
 78 #每种数据的名称：
 79 print(df2.columns)
 80 
 81 # Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
 82 
 83 
 84 #所有df2的值:
 85 print(df2.values)
 86 
 87 """
 88 array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
 89        [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
 90        [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
 91        [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']], dtype=object)
 92 """
 93 
 94 #数据的总结, 可以用 describe(): std 为标准差
 95 df2.describe()
 96 
 97 """
 98          A    C    D
 99 count  4.0  4.0  4.0
100 mean   1.0  1.0  3.0
101 std    0.0  0.0  0.0
102 min    1.0  1.0  3.0
103 25%    1.0  1.0  3.0
104 50%    1.0  1.0  3.0
105 75%    1.0  1.0  3.0
106 max    1.0  1.0  3.0
107 """
108 
109 #翻转数据, transpose:
110 print(df2.T)
111 
112 """                   
113 0                    1                    2  \
114 A                    1                    1                    1   
115 B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
116 C                    1                    1                    1   
117 D                    3                    3                    3   
118 E                 test                train                 test   
119 F                  foo                  foo                  foo   
120 
121                      3  
122 A                    1  
123 B  2013-01-02 00:00:00  
124 C                    1  
125 D                    3  
126 E                train  
127 F                  foo  
128 
129 """
130 
131 #按列名(axis=1)对数据进行降序排序并输出:
132 print(df2.sort_index(axis=1, ascending=False))
133 
134 """
135      F      E  D    C          B    A
136 0  foo   test  3  1.0 2013-01-02  1.0
137 1  foo  train  3  1.0 2013-01-02  1.0
138 2  foo   test  3  1.0 2013-01-02  1.0
139 3  foo  train  3  1.0 2013-01-02  1.0
140 """
141 
142 #对数据 值 排序输出:
143 print(df2.sort_values(by='B'))
144 
145 """
146      A          B    C  D      E    F
147 0  1.0 2013-01-02  1.0  3   test  foo
148 1  1.0 2013-01-02  1.0  3  train  foo
149 2  1.0 2013-01-02  1.0  3   test  foo
150 3  1.0 2013-01-02  1.0  3  train  foo
151 """

　　②、Pandas 选择数据

  1 #建立了一个 6X4 的矩阵数据
  2 dates = pd.date_range('20130101', periods=6)
  3 df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates, columns=['A','B','C','D'])
  4 
  5 """
  6              A   B   C   D
  7 2013-01-01   0   1   2   3
  8 2013-01-02   4   5   6   7
  9 2013-01-03   8   9  10  11
 10 2013-01-04  12  13  14  15
 11 2013-01-05  16  17  18  19
 12 2013-01-06  20  21  22  23
 13 """
 14 
 15 #简单的筛选
 16 #选取DataFrame中的数据
 17 print(df['A'])
 18 print(df.A)
 19 
 20 """
 21 2013-01-01     0
 22 2013-01-02     4
 23 2013-01-03     8
 24 2013-01-04    12
 25 2013-01-05    16
 26 2013-01-06    20
 27 Freq: D, Name: A, dtype: int64
 28 """
 29 
 30 #选择跨越多行或多列:
 31 print(df[0:3])
 32  
 33 """
 34             A  B   C   D
 35 2013-01-01  0  1   2   3
 36 2013-01-02  4  5   6   7
 37 2013-01-03  8  9  10  11
 38 """
 39 
 40 print(df['20130102':'20130104'])
 41 
 42 """
 43              A   B   C   D
 44 2013-01-02   4   5   6   7
 45 2013-01-03   8   9  10  11
 46 2013-01-04  12  13  14  15
 47 """
 48 
 49 #根据标签 loc
 50 print(df.loc['20130102'])
 51 """
 52 A    4
 53 B    5
 54 C    6
 55 D    7
 56 Name: 2013-01-02 00:00:00, dtype: int64
 57 """
 58 
 59 print(df.loc[:,['A','B']]) 
 60 """
 61              A   B
 62 2013-01-01   0   1
 63 2013-01-02   4   5
 64 2013-01-03   8   9
 65 2013-01-04  12  13
 66 2013-01-05  16  17
 67 2013-01-06  20  21
 68 """
 69 
 70 print(df.loc['20130102',['A','B']])
 71 """
 72 A    4
 73 B    5
 74 Name: 2013-01-02 00:00:00, dtype: int64
 75 """
 76 
 77 #采用位置进行选择 iloc
 78 print(df.iloc[3,1])
 79 # 13
 80 
 81 print(df.iloc[3:5,1:3])
 82 """
 83              B   C
 84 2013-01-04  13  14
 85 2013-01-05  17  18
 86 """
 87 
 88 print(df.iloc[[1,3,5],1:3])
 89 """
 90              B   C
 91 2013-01-02   5   6
 92 2013-01-04  13  14
 93 2013-01-06  21  22
 94 
 95 """
 96 
 97 #混合选择：旧版本使用 ix（已在 pandas 1.0 中移除），现在可以组合 iloc 和 loc，选择前三行的数据以及’A’和’C’两列。
 98 print(df.iloc[:3].loc[:,['A','C']])
 99 """
100             A   C
101 2013-01-01  0   2
102 2013-01-02  4   6
103 2013-01-03  8  10
104 """
105 #通过判断的筛选
106 print(df[df.A>8])
107 """
108              A   B   C   D
109 2013-01-04  12  13  14  15
110 2013-01-05  16  17  18  19
111 2013-01-06  20  21  22  23
112 """

　　③、Pandas 设置值

 1 #创建数据 
 2 import pandas as pd
 3 import numpy as np
 4 datas = pd.date_range("20130101", periods=6)
 5 df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=datas, columns=['A', 'B', 'C', 'D'])
 6 print(df)
 7 
 8 #利用索引或者标签确定需要修改值的位置：
 9 df.iloc[2, 2] = 1111
10 df.loc['20130101', 'B'] = 2222
11 print(df)
12 
13 #根据条件设置
14 df.loc[df.A>4, 'B'] = 0  # 用 loc 一次完成筛选和赋值，避免链式赋值 df.B[df.A>4] = 0 可能不生效的问题
15 """
16                 A     B     C   D
17 2013-01-01   0  2222     2   3
18 2013-01-02   4     5     6   7
19 2013-01-03   8     0  1111  11
20 2013-01-04  12     0    14  15
21 2013-01-05  16     0    18  19
22 2013-01-06  20     0    22  23 
23 """
24 
25 #按行或列设置，对整列做批处理, 加上一列 ‘F’, 并将 F 列全改为 NaN, 如下:
26 df['F'] = np.nan
27 """
28              A     B     C   D   F
29 2013-01-01   0  2222     2   3 NaN
30 2013-01-02   4     5     6   7 NaN
31 2013-01-03   8     0  1111  11 NaN
32 2013-01-04  12     0    14  15 NaN
33 2013-01-05  16     0    18  19 NaN
34 2013-01-06  20     0    22  23 NaN
35 """
36 
37 #添加数据，用上面的方法也可以加上 Series 序列（Series 会按 index 对齐，index 对不上的位置会变成 NaN）。
38 df['E'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130101',periods=6)) 
39 """
40              A     B     C   D   F  E
41 2013-01-01   0  2222     2   3 NaN  1
42 2013-01-02   4     5     6   7 NaN  2
43 2013-01-03   8     0  1111  11 NaN  3
44 2013-01-04  12     0    14  15 NaN  4
45 2013-01-05  16     0    18  19 NaN  5
46 2013-01-06  20     0    22  23 NaN  6
47 """

　　④、Pandas 处理丢失数据

 1 #如何删除或者是填补这些 NaN 数据
 2 #建立了一个6X4的矩阵数据并且把两个位置置为空
 3 dates = pd.date_range('20130101', periods=6)
 4 df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates, columns=['A','B','C','D'])
 5 df.iloc[0,1] = np.nan
 6 df.iloc[1,2] = np.nan
 7 """
 8              A     B     C   D
 9 2013-01-01   0   NaN   2.0   3
10 2013-01-02   4   5.0   NaN   7
11 2013-01-03   8   9.0  10.0  11
12 2013-01-04  12  13.0  14.0  15
13 2013-01-05  16  17.0  18.0  19
14 2013-01-06  20  21.0  22.0  23
15 """
16 #直接去掉有 NaN 的行或列, 可以使用 dropna (返回新的 DataFrame, 不会修改 df 本身)
17 df.dropna(
18     axis=0,     # 0: 对行进行操作; 1: 对列进行操作
19     how='any'   # 'any': 只要存在 NaN 就 drop 掉; 'all': 必须全部是 NaN 才 drop 
20     ) 
21 """
22              A     B     C   D
23 2013-01-03   8   9.0  10.0  11
24 2013-01-04  12  13.0  14.0  15
25 2013-01-05  16  17.0  18.0  19
26 2013-01-06  20  21.0  22.0  23
27 """
28 
29 #将 NaN 的值用其他值代替, 比如代替成 0:
30 df.fillna(value=0)
31 """
32              A     B     C   D
33 2013-01-01   0   0.0   2.0   3
34 2013-01-02   4   5.0   0.0   7
35 2013-01-03   8   9.0  10.0  11
36 2013-01-04  12  13.0  14.0  15
37 2013-01-05  16  17.0  18.0  19
38 2013-01-06  20  21.0  22.0  23
39 """
40 
41 #判断是否有缺失数据 NaN, 为 True 表示缺失数据:
42 df.isnull() 
43 """
44                 A      B      C      D
45 2013-01-01  False   True  False  False
46 2013-01-02  False  False   True  False
47 2013-01-03  False  False  False  False
48 2013-01-04  False  False  False  False
49 2013-01-05  False  False  False  False
50 2013-01-06  False  False  False  False
51 """
52 
53 #检测在数据中是否存在 NaN, 如果存在就返回 True:
54 np.any(df.isnull()) == True  
55 # True

　　⑤、Pandas 导入导出

　　　　pandas可以读取与存取的资料格式有很多种，像csv、excel、json、html与pickle等

 1 #读取csv
 2 import pandas as pd #加载模块
 3 
 4 #读取csv
 5 data = pd.read_csv('student.csv')
 6 
 7 #打印出data
 8 print(data)
 9 
10 #将资料存取成pickle
11 data.to_pickle('student.pickle')

　　⑥、Pandas 合并 concat

　　　　pandas处理多组数据的时候往往会要用到数据的合并处理,使用 concat是一种基本的合并方式.而且concat中有很多参数可以调整,合并成你想要的数据形式.

  1 #axis (合并方向)
  2 #axis=0是预设值，因此未设定任何参数时，函数默认axis=0
  3 import pandas as pd
  4 import numpy as np
  5 
  6 #定义资料集
  7 df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
  8 df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
  9 df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
 10 
 11 #concat纵向合并
 12 res = pd.concat([df1, df2, df3], axis=0)
 13 
 14 #打印结果
 15 print(res)
 16 #     a    b    c    d
 17 # 0  0.0  0.0  0.0  0.0
 18 # 1  0.0  0.0  0.0  0.0
 19 # 2  0.0  0.0  0.0  0.0
 20 # 0  1.0  1.0  1.0  1.0
 21 # 1  1.0  1.0  1.0  1.0
 22 # 2  1.0  1.0  1.0  1.0
 23 # 0  2.0  2.0  2.0  2.0
 24 # 1  2.0  2.0  2.0  2.0
 25 # 2  2.0  2.0  2.0  2.0
 26 
 27 #ignore_index (重置 index)
 28 #承上一个例子，并将ignore_index设定为True
 29 res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
 30 
 31 #打印结果
 32 print(res)
 33 #     a    b    c    d
 34 # 0  0.0  0.0  0.0  0.0
 35 # 1  0.0  0.0  0.0  0.0
 36 # 2  0.0  0.0  0.0  0.0
 37 # 3  1.0  1.0  1.0  1.0
 38 # 4  1.0  1.0  1.0  1.0
 39 # 5  1.0  1.0  1.0  1.0
 40 # 6  2.0  2.0  2.0  2.0
 41 # 7  2.0  2.0  2.0  2.0
 42 # 8  2.0  2.0  2.0  2.0
 43 
 44 #join (合并方式)
 45 #join='outer'为预设值，因此未设定任何参数时，函数默认join='outer'。此方式是依照column来做纵向合并，有相同的column上下合并在一起，其他独自的column各自成列，原本没有值的位置皆以NaN填充。
 46 import pandas as pd
 47 import numpy as np
 48 
 49 #定义资料集
 50 df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
 51 df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
 52 
 53 #纵向"外"合并df1与df2
 54 res = pd.concat([df1, df2], axis=0, join='outer')
 55 
 56 print(res)
 57 #     a    b    c    d    e
 58 # 1  0.0  0.0  0.0  0.0  NaN
 59 # 2  0.0  0.0  0.0  0.0  NaN
 60 # 3  0.0  0.0  0.0  0.0  NaN
 61 # 2  NaN  1.0  1.0  1.0  1.0
 62 # 3  NaN  1.0  1.0  1.0  1.0
 63 # 4  NaN  1.0  1.0  1.0  1.0
 64 
 65 #join='inner'，相同的column合并在一起，其他的会被抛弃。
 66 #承上一个例子
 67 
 68 #纵向"内"合并df1与df2
 69 res = pd.concat([df1, df2], axis=0, join='inner')
 70 
 71 #打印结果
 72 print(res)
 73 #     b    c    d
 74 # 1  0.0  0.0  0.0
 75 # 2  0.0  0.0  0.0
 76 # 3  0.0  0.0  0.0
 77 # 2  1.0  1.0  1.0
 78 # 3  1.0  1.0  1.0
 79 # 4  1.0  1.0  1.0
 80 
 81 #重置index并打印结果
 82 res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
 83 print(res)
 84 #     b    c    d
 85 # 0  0.0  0.0  0.0
 86 # 1  0.0  0.0  0.0
 87 # 2  0.0  0.0  0.0
 88 # 3  1.0  1.0  1.0
 89 # 4  1.0  1.0  1.0
 90 # 5  1.0  1.0  1.0
 91 
 92 #join_axes (依照 axes 合并)
 93 import pandas as pd
 94 import numpy as np
 95 
 96 #定义资料集
 97 df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
 98 df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
 99 
100 #依照`df1.index`进行横向合并（join_axes 参数已在 pandas 1.0 中移除，改用 reindex 实现同样的效果）
101 res = pd.concat([df1, df2], axis=1).reindex(df1.index)
102 
103 #打印结果
104 print(res)
105 #     a    b    c    d    b    c    d    e
106 # 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
107 # 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
108 # 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
109 
110 #移除join_axes，并打印结果
111 res = pd.concat([df1, df2], axis=1)
112 print(res)
113 #     a    b    c    d    b    c    d    e
114 # 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
115 # 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
116 # 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
117 # 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
118 
119 #append (添加数据)
120 #append只有纵向合并，没有横向合并（注意：DataFrame.append 已在 pandas 2.0 中移除，新版本请改用 pd.concat）
121 import pandas as pd
122 import numpy as np
123 
124 #定义资料集
125 df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
126 df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
127 df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
128 s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
129 
130 #将df2合并到df1的下面，以及重置index，并打印出结果
131 res = pd.concat([df1, df2], ignore_index=True)  # 等价于旧版的 df1.append(df2, ignore_index=True)
132 print(res)
133 #     a    b    c    d
134 # 0  0.0  0.0  0.0  0.0
135 # 1  0.0  0.0  0.0  0.0
136 # 2  0.0  0.0  0.0  0.0
137 # 3  1.0  1.0  1.0  1.0
138 # 4  1.0  1.0  1.0  1.0
139 # 5  1.0  1.0  1.0  1.0
140 
141 #合并多个df，将df2与df3合并至df1的下面，以及重置index，并打印出结果
142 res = pd.concat([df1, df2, df3], ignore_index=True)  # 等价于旧版的 df1.append([df2, df3], ignore_index=True)
143 print(res)
144 #     a    b    c    d
145 # 0  0.0  0.0  0.0  0.0
146 # 1  0.0  0.0  0.0  0.0
147 # 2  0.0  0.0  0.0  0.0
148 # 3  1.0  1.0  1.0  1.0
149 # 4  1.0  1.0  1.0  1.0
150 # 5  1.0  1.0  1.0  1.0
151 # 6  1.0  1.0  1.0  1.0
152 # 7  1.0  1.0  1.0  1.0
153 # 8  1.0  1.0  1.0  1.0
154 
155 #合并series，将s1合并至df1，以及重置index，并打印出结果
156 res = pd.concat([df1, s1.to_frame().T], ignore_index=True)  # 等价于旧版的 df1.append(s1, ignore_index=True)
157 print(res)
158 #     a    b    c    d
159 # 0  0.0  0.0  0.0  0.0
160 # 1  0.0  0.0  0.0  0.0
161 # 2  0.0  0.0  0.0  0.0
162 # 3  1.0  2.0  3.0  4.0

　　⑦、Pandas 合并 merge

　　　　pandas中的merge和concat类似,但主要是用于两组有key column的数据,统一索引的数据. 通常也被用在Database的处理当中。

  1 #依据一组key合并
  2 import pandas as pd
  3 
  4 #定义资料集并打印出
  5 left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
  6                              'A': ['A0', 'A1', 'A2', 'A3'],
  7                              'B': ['B0', 'B1', 'B2', 'B3']})
  8 right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
  9                               'C': ['C0', 'C1', 'C2', 'C3'],
 10                               'D': ['D0', 'D1', 'D2', 'D3']})
 11 
 12 print(left)
 13 #    A   B key
 14 # 0  A0  B0  K0
 15 # 1  A1  B1  K1
 16 # 2  A2  B2  K2
 17 # 3  A3  B3  K3
 18 
 19 print(right)
 20 #    C   D key
 21 # 0  C0  D0  K0
 22 # 1  C1  D1  K1
 23 # 2  C2  D2  K2
 24 # 3  C3  D3  K3
 25 
 26 #依据key column合并，并打印出
 27 res = pd.merge(left, right, on='key')
 28 
 29 print(res)
 30 #    A   B key   C   D
 31 # 0  A0  B0  K0  C0  D0
 32 # 1  A1  B1  K1  C1  D1
 33 # 2  A2  B2  K2  C2  D2
 34 # 3  A3  B3  K3  C3  D3
 35 
 36 #依据两组key合并
 37 #合并时有4种方法how = ['left', 'right', 'outer', 'inner']，预设值how='inner'。
 38 import pandas as pd
 39 
 40 #定义资料集并打印出
 41 left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
 42                       'key2': ['K0', 'K1', 'K0', 'K1'],
 43                       'A': ['A0', 'A1', 'A2', 'A3'],
 44                       'B': ['B0', 'B1', 'B2', 'B3']})
 45 right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
 46                        'key2': ['K0', 'K0', 'K0', 'K0'],
 47                        'C': ['C0', 'C1', 'C2', 'C3'],
 48                        'D': ['D0', 'D1', 'D2', 'D3']})
 49 
 50 print(left)
 51 #    A   B key1 key2
 52 # 0  A0  B0   K0   K0
 53 # 1  A1  B1   K0   K1
 54 # 2  A2  B2   K1   K0
 55 # 3  A3  B3   K2   K1
 56 
 57 print(right)
 58 #    C   D key1 key2
 59 # 0  C0  D0   K0   K0
 60 # 1  C1  D1   K1   K0
 61 # 2  C2  D2   K1   K0
 62 # 3  C3  D3   K2   K0
 63 
 64 #依据key1与key2 columns进行合并，并打印出四种结果['left', 'right', 'outer', 'inner']
 65 res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
 66 print(res)
 67 #    A   B key1 key2   C   D
 68 # 0  A0  B0   K0   K0  C0  D0
 69 # 1  A2  B2   K1   K0  C1  D1
 70 # 2  A2  B2   K1   K0  C2  D2
 71 
 72 res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
 73 print(res)
 74 #     A    B key1 key2    C    D
 75 # 0   A0   B0   K0   K0   C0   D0
 76 # 1   A1   B1   K0   K1  NaN  NaN
 77 # 2   A2   B2   K1   K0   C1   D1
 78 # 3   A2   B2   K1   K0   C2   D2
 79 # 4   A3   B3   K2   K1  NaN  NaN
 80 # 5  NaN  NaN   K2   K0   C3   D3
 81 
 82 res = pd.merge(left, right, on=['key1', 'key2'], how='left')
 83 print(res)
 84 #    A   B key1 key2    C    D
 85 # 0  A0  B0   K0   K0   C0   D0
 86 # 1  A1  B1   K0   K1  NaN  NaN
 87 # 2  A2  B2   K1   K0   C1   D1
 88 # 3  A2  B2   K1   K0   C2   D2
 89 # 4  A3  B3   K2   K1  NaN  NaN
 90 
 91 res = pd.merge(left, right, on=['key1', 'key2'], how='right')
 92 print(res)
 93 #     A    B key1 key2   C   D
 94 # 0   A0   B0   K0   K0  C0  D0
 95 # 1   A2   B2   K1   K0  C1  D1
 96 # 2   A2   B2   K1   K0  C2  D2
 97 # 3  NaN  NaN   K2   K0  C3  D3
 98 
 99 #Indicator ，indicator=True会将合并的记录放在新的一列。
100 import pandas as pd
101 
102 #定义资料集并打印出
103 df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
104 df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
105 
106 print(df1)
107 #   col1 col_left
108 # 0     0        a
109 # 1     1        b
110 
111 print(df2)
112 #   col1  col_right
113 # 0     1          2
114 # 1     2          2
115 # 2     2          2
116 
117 # 依据col1进行合并，并启用indicator=True，最后打印出
118 res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
119 print(res)
120 #   col1 col_left  col_right      _merge
121 # 0   0.0        a        NaN   left_only
122 # 1   1.0        b        2.0        both
123 # 2   2.0      NaN        2.0  right_only
124 # 3   2.0      NaN        2.0  right_only
125 
126 # 自定indicator column的名称，并打印出
127 res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
128 print(res)
129 #   col1 col_left  col_right indicator_column
130 # 0   0.0        a        NaN        left_only
131 # 1   1.0        b        2.0             both
132 # 2   2.0      NaN        2.0       right_only
133 # 3   2.0      NaN        2.0       right_only
134 
135 #依据index合并
136 import pandas as pd
137 
138 #定义资料集并打印出
139 left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
140                      'B': ['B0', 'B1', 'B2']},
141                      index=['K0', 'K1', 'K2'])
142 right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
143                       'D': ['D0', 'D2', 'D3']},
144                      index=['K0', 'K2', 'K3'])
145 
146 print(left)
147 #     A   B
148 # K0  A0  B0
149 # K1  A1  B1
150 # K2  A2  B2
151 
152 print(right)
153 #     C   D
154 # K0  C0  D0
155 # K2  C2  D2
156 # K3  C3  D3
157 
158 #依据左右资料集的index进行合并，how='outer',并打印出
159 res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
160 print(res)
161 #      A    B    C    D
162 # K0   A0   B0   C0   D0
163 # K1   A1   B1  NaN  NaN
164 # K2   A2   B2   C2   D2
165 # K3  NaN  NaN   C3   D3
166 
167 #依据左右资料集的index进行合并，how='inner',并打印出
168 res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
169 print(res)
170 #     A   B   C   D
171 # K0  A0  B0  C0  D0
172 # K2  A2  B2  C2  D2
173 
174 #解决overlapping的问题 
175 import pandas as pd
176 
177 #定义资料集
178 boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
179 girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
180 
181 #使用suffixes解决overlapping的问题
182 res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
183 print(res)
184 #    age_boy   k  age_girl
185 # 0        1  K0         4
186 # 1        1  K0         5

　　⑧、Pandas plot 出图

 1 #创建一个Series（运行前需要先 import pandas as pd、import numpy as np、import matplotlib.pyplot as plt）
 2 #随机生成1000个数据，Series 默认的 index 就是从0开始的整数，但是这里我显式赋值以便让大家看的更清楚
 3 # 随机生成1000个数据
 4 data = pd.Series(np.random.randn(1000),index=np.arange(1000))
 5  
 6 # 为了方便观看效果, 我们累加这个数据
 7 data = data.cumsum()  # cumsum 返回新对象，需要赋值回 data 才会生效
 8 
 9 # pandas 数据可以直接观看其可视化形式
10 data.plot()
11 
12 plt.show()
13 
14 #Dataframe 可视化
15 #生成一个1000*4 的DataFrame，并对他们累加
16 data = pd.DataFrame(
17     np.random.randn(1000,4),
18     index=np.arange(1000),
19     columns=list("ABCD")
20     )
21 data = data.cumsum()  # cumsum 返回新对象，需要赋值回 data 才会生效
22 data.plot()
23 plt.show()

　　经常会用到的还有 scatter，它会显示散点图。先给大家列一下 pandas 的 plot 提供了哪些绘图方法：bar、hist、box、kde、area、scatter、hexbin 等。

1 #scatter只有x，y两个属性
2 ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
3 #再画一个在同一个ax上面，选择不一样的数据列，不同的 color 和 label
4 #将之下这个 data 画在上一个 ax 上面
5 data.plot.scatter(x='A',y='C',color='LightGreen',label='Class2',ax=ax)
6 plt.show()

posted @ 2019-05-08 21:08 嗨_放飞梦想阅读(452) 评论(0) 收藏举报

刷新页面返回顶部

嗨_放飞梦想

Numpy and Pandas

公告