Python Numpy & Pandas

需要安装Numpy 和 Pandas

Numpy

基础创建

 import numpy as np
# Create an array from nested lists.
# dtype can be an integer (16/32/64-bit), a float (16/32/64-bit), complex, ...
# NOTE: the `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
# it was only ever an alias for the builtin `int`, so `dtype=int` is equivalent.

array = np.array([[1, 2, 3],
                  [4, 5, 6]], dtype=int)
 
# 创建全部为0的矩阵
array = np.zeros((3, 4))
print(array)
'''
output:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
'''
 
# 创建全部为1的矩阵
array = np.ones((3, 4))
print(array)
'''
output:
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
'''
 
# 创建未初始化的矩阵
array = np.empty( (2,3) )
print(array)
'''
[[6.23042070e-307 1.89146896e-307 1.37961302e-306]
 [1.05699242e-307 1.11261638e-306 1.24610927e-306]]
'''
 
 
# 9 numbers from 0 to 2
array = np.linspace( 0, 2, 9 )
print(array)
'''
output:
[0.   0.25 0.5  0.75 1.   1.25 1.5  1.75 2.  ]
'''
 
array = np.arange(15).reshape(3, 5)
print(array)
'''
output:
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
'''
print('number of dim: ', array.ndim)
print('shape: ', array.shape)
print('size: ', array.size)
print('type:', array.dtype)
'''
output:
number of dim:  2
shape:  (3, 5)
size:  15
type: int32
'''

基础运算

 import numpy as np
 
a = np.array( [20,30,40,50] )
b = np.arange( 4 )
print(b)
# output: [0, 1, 2, 3]
# 对应位置元素相减
c = a-b
print(c)
# output: [20, 29, 38, 47]
# b每个元素平方
print(b**2)
# output: [0, 1, 4, 9]
print(10*np.sin(a))
# output: [ 9.12945251, -9.88031624,  7.4511316 , -2.62374854]
# 对于a的每个元素判断是否小于35,并输出结果
print(a<35)
# output: [ True,  True, False, False]

矩阵运算

 import numpy as np
A = np.array( [[1,1],
               [0,1]] )
B = np.array( [[2,0],
               [3,4]] )
# 矩阵对应位置元素相乘
print(A * B)
'''
array([[2, 0],
       [0, 4]])
'''
 
 
# matrix product
print(A @ B)
'''
array([[5, 4],
       [3, 4]])
'''
# another matrix product
print(A.dot(B))
'''
array([[5, 4],
       [3, 4]])
'''
 
# 矩阵求和、最值
a = np.random.random((2,3))
'''
[[0.70284491 0.67667991 0.60919611]
 [0.65004259 0.98493693 0.23953326]]
'''
print(a)
print(a.sum())
print(a.min())
print(a.max())
'''
3.8632337099338896
0.23953325605165765
0.9849369271091678
'''
# axis: 1 对行操作， 0 对列操作
print(a.sum(axis=1))
print(a.min(axis=0))
print(a.max(axis=1))
'''
[1.98872094 1.87451277]
[0.65004259 0.67667991 0.23953326]
[0.70284491 0.98493693]
'''

索引、均值

 import numpy as np
 
a = np.arange(2, 14).reshape((3, 4))
# 最值的索引
print(np.argmin(a))
print(np.argmax(a))
'''
0
11
'''
# 均值
print(a.mean())
print(np.average(a))
'''
7.5
7.5
'''
# 中位数
print(np.median(a))
# 7.5 
# 累加
print(np.cumsum(a))
'''
[ 2  5  9 14 20 27 35 44 54 65 77 90]
'''
# 累差
print(np.diff(a))
'''
[[1 1 1]
 [1 1 1]
 [1 1 1]]
'''
# 非零数 输出对应的行号、列号
print(np.nonzero(a))
# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
# 排序
print(np.sort(a))
 
# 矩阵转置
print(np.transpose(a))
print(a.T)
'''
[[ 2  6 10]
 [ 3  7 11]
 [ 4  8 12]
 [ 5  9 13]]
'''
 
print((a.T).dot(a))
'''
[[140 158 176 194]
 [158 179 200 221]
 [176 200 224 248]
 [194 221 248 275]]
'''
# numpy.clip(a, a_min, a_max, out=None)
# 将数组中的元素限制在a_min, a_max之间，大于a_max的就使得它等于 a_max，小于a_min,的就使得它等于a_min。
print(np.clip(a, 5, 9))
'''
[[5 5 5 5]
 [6 7 8 9]
 [9 9 9 9]]
'''

矩阵合并

 import numpy as np
 
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
 
# 垂直合并
c = np.vstack((a, b))
print(c)
print(a.shape, c.shape)
'''
[[1 1 1]
 [2 2 2]]
(3,) (2, 3)
'''
# 水平合并
d = np.hstack((a, b))
print(d)
print(a.shape, d.shape)
'''
[1 1 1 2 2 2]
(3,) (6,)
'''
 
a = np.array([1, 1, 1])[:, np.newaxis]
b = np.array([2, 2, 2])[:, np.newaxis]
 
print(a)
'''
[[1]
 [1]
 [1]]
'''
# concatenate 可以合并多个矩阵
# axis 可以设置合并方向
e = np.concatenate((a, b, b, a), axis=0)
print(e)
'''
[[1]
 [1]
 [1]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [1]
 [1]
 [1]]
'''
e = np.concatenate((a, b, b, a), axis=1)
print(e)
'''
[[1 2 2 1]
 [1 2 2 1]
 [1 2 2 1]]
'''

Pandas

基本操作

 import numpy as np
import pandas as pd
 
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
'''
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
'''
 
dates = pd.date_range("20210101", periods=6)
print(dates)
 
'''
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')
'''
 
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
print(df)
 
'''
                   A         B         C         D
2021-01-01 -0.817100  0.113590  0.401565 -1.007716
2021-01-02  1.241367  0.816412 -0.460833  0.275248
2021-01-03  1.780875 -1.638779  0.895724  1.522088
2021-01-04  0.872075 -0.136323 -0.828276 -1.705793
2021-01-05  0.939255 -0.459394 -0.331591  0.107772
2021-01-06  0.187501 -1.024592 -1.118402 -2.061401
'''
 
df2 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df2)
'''
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
'''
# 字典,以列的方式输入数据
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
       }
   )
print(df2)
'''
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
'''
# 类型
print(df2.dtypes)
'''
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
'''
# 行号
print(df2.index)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
'''
 
# 列号
print(df2.columns)
'''
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
'''
 
# 数值
print(df2.values)
'''
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
'''
 
# 统计信息
print(df2.describe())
'''
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
'''
 
# 转置 适用于数值型
print(df2.T)
'''
                     0  ...                    3
A                    1  ...                    1
B  2013-01-02 00:00:00  ...  2013-01-02 00:00:00
C                    1  ...                    1
D                    3  ...                    3
E                 test  ...                train
F                  foo  ...                  foo
 
[6 rows x 4 columns]
'''
 
# 排序
# 按列 倒序
print(df2.sort_index(axis=1, ascending=False))
'''
     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0
'''
# 按行 倒序
print(df2.sort_index(axis=0, ascending=False))
'''
     A          B    C  D      E    F
3  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
0  1.0 2013-01-02  1.0  3   test  foo
'''
 
# 按值排序
print(df2.sort_values(by='E'))
'''
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
3  1.0 2013-01-02  1.0  3  train  foo
'''

选择数据

 import numpy as np
import pandas as pd
 
 
dates = pd.date_range("20210101", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD"))
print(df)
'''
             A   B   C   D
2021-01-01   0   1   2   3
2021-01-02   4   5   6   7
2021-01-03   8   9  10  11
2021-01-04  12  13  14  15
2021-01-05  16  17  18  19
2021-01-06  20  21  22  23
'''
 
# 选择某一列 两种方式
print(df['A'])
print(df.A)
'''
2021-01-01     0
2021-01-02     4
2021-01-03     8
2021-01-04    12
2021-01-05    16
2021-01-06    20
Freq: D, Name: A, dtype: int32
'''
 
# 切片
print(df[0:3])
print(df['20210102':'20210104'])
'''
            A  B   C   D
2021-01-01  0  1   2   3
2021-01-02  4  5   6   7
2021-01-03  8  9  10  11
             A   B   C   D
2021-01-02   4   5   6   7
2021-01-03   8   9  10  11
2021-01-04  12  13  14  15
 
'''
 
# select by label: loc
print(df.loc['20210102'])
print(df.loc[:,['A','B']])
print(df.loc['20210102', ['A','B']])
'''
A    4
B    5
C    6
D    7
Name: 2021-01-02 00:00:00, dtype: int32
----------------------------------------
             A   B
2021-01-01   0   1
2021-01-02   4   5
2021-01-03   8   9
2021-01-04  12  13
2021-01-05  16  17
2021-01-06  20  21
---------------------------------------
A    4
B    5
Name: 2021-01-02 00:00:00, dtype: int32
 
'''
 
# select by position: iloc
print(df.iloc[3])
'''
A    12
B    13
C    14
D    15
Name: 2021-01-04 00:00:00, dtype: int32
'''
print(df.iloc[3, 1])
# 13
print(df.iloc[3:5,0:2])
'''
             A   B
2021-01-04  12  13
2021-01-05  16  17
'''
print(df.iloc[[1,2,4],[0,2]])
'''
             A   C
2021-01-02   4   6
2021-01-03   8  10
2021-01-05  16  18
'''
 
# Mixed selection: first three rows by position, columns 'A' and 'C' by label.
# The `.ix` indexer was removed in pandas 1.0; translating the positional row
# slice into labels via df.index lets a single .loc call do the same selection.
print(df.loc[df.index[:3], ['A', 'C']])
'''
            A   C
2021-01-01  0   2
2021-01-02  4   6
2021-01-03  8  10
'''
 
# Boolean indexing
print(df[df.A > 0])
'''
             A   B   C   D
2021-01-02   4   5   6   7
2021-01-03   8   9  10  11
2021-01-04  12  13  14  15
2021-01-05  16  17  18  19
2021-01-06  20  21  22  23
'''

赋值

 import numpy as np
import pandas as pd
 
 
dates = pd.date_range("20210101", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD"))
print(df)
'''
             A   B   C   D
2021-01-01   0   1   2   3
2021-01-02   4   5   6   7
2021-01-03   8   9  10  11
2021-01-04  12  13  14  15
2021-01-05  16  17  18  19
2021-01-06  20  21  22  23
'''
# Set a single cell by integer position (row 2, column 2).
df.iloc[2, 2] = 1111
# Set a single cell by row/column label.
df.loc['2021-01-03', 'D'] = 2222
# Conditional assignment: select rows with a boolean mask and the column by
# label in ONE .loc call. The chained form `df.A[df.A > 0] = 0` assigns into
# an intermediate object, which triggers SettingWithCopyWarning and has no
# effect on `df` when pandas Copy-on-Write is enabled.
df.loc[df.A > 0, 'A'] = 0
# Add a new column filled with NaN.
df['F'] = np.nan
# Add a new column from a Series; values are aligned on the index.
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20210101', periods=6))
print(df)
'''
            A   B     C     D   F  G
2021-01-01  0   1     2     3 NaN  1
2021-01-02  0   5     6     7 NaN  2
2021-01-03  0   9  1111  2222 NaN  3
2021-01-04  0  13    14    15 NaN  4
2021-01-05  0  17    18    19 NaN  5
2021-01-06  0  21    22    23 NaN  6
'''

处理丢失数据

 import pandas as pd
import numpy as np
 
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A', 'B', 'C', 'D'])
 
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
'''
             A     B     C   D
2021-01-01   0   NaN   2.0   3
2021-01-02   4   5.0   NaN   7
2021-01-03   8   9.0  10.0  11
2021-01-04  12  13.0  14.0  15
2021-01-05  16  17.0  18.0  19
2021-01-06  20  21.0  22.0  23
'''
 
# 丢掉数据： 按行,存在空值丢掉该行
print(df.dropna(axis=0, how='any'))   # how={'any', 'all'}
'''
             A     B     C   D
2021-01-03   8   9.0  10.0  11
2021-01-04  12  13.0  14.0  15
2021-01-05  16  17.0  18.0  19
2021-01-06  20  21.0  22.0  23
'''
 
# 丢掉数据： 按列,存在空值丢掉该列
print(df.dropna(axis=1, how='any'))   # how={'any', 'all'}
'''
             A   D
2021-01-01   0   3
2021-01-02   4   7
2021-01-03   8  11
2021-01-04  12  15
2021-01-05  16  19
2021-01-06  20  23
'''
 
# 空值补零
print(df.fillna(value=0))
'''
             A     B     C   D
2021-01-01   0   0.0   2.0   3
2021-01-02   4   5.0   0.0   7
2021-01-03   8   9.0  10.0  11
2021-01-04  12  13.0  14.0  15
2021-01-05  16  17.0  18.0  19
2021-01-06  20  21.0  22.0  23
'''
 
# 检查是否缺失数据
print(pd.isnull(df))
'''
                A      B      C      D
2021-01-01  False   True  False  False
2021-01-02  False  False   True  False
2021-01-03  False  False  False  False
2021-01-04  False  False  False  False
2021-01-05  False  False  False  False
2021-01-06  False  False  False  False
'''
 
# 如果数据太多，使用这种方法进行检查空值
# True 表示至少存在一个空值
print(np.any(df.isnull()))
# output: True

导入导出数据

支持的数据类型:

CSV 推荐使用

HDF5

Excel

Gotchas

 import pandas as pd
 
# read from
data = pd.read_csv('student.csv')
print(data)
 
# save to
data.to_pickle('student.pickle')

数据合并

concat

 import pandas as pd
import numpy as np
 
# concatenating
# ignore index
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
# print(df1)
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
'''
# print(df2)
'''
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
 
'''
# print(df3)
'''
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
 
'''
# 按行合并
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
# print(res)
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
'''
 
# join, ('inner', 'outer')
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d', 'e'], index=[2,3,4])
print(df1)
'''
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
'''
 
print(df2)
'''
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
'''
 
#  join outer 保留所有列，没有的补NaN
res = pd.concat([df1, df2], axis=0, join='outer')
print(res)
'''
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
'''
# join inner - 只保留相交的部分
res = pd.concat([df1, df2], axis=0, join='inner')
print(res)
'''
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
'''
 
# Column-wise concat restricted to df1's row labels.
# The `join_axes` argument of pd.concat was removed in pandas 1.0; reindexing
# df2 on df1.index first gives the same result: only df1's rows are kept and
# rows that df2 does not have are filled with NaN.
res = pd.concat([df1, df2.reindex(df1.index)], axis=1)
print(res)
'''
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
 
'''
# Appending rows.
# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the replacement and is used for every case below.
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d', 'e'], index=[2,3,4])
# Append df2 below df1 and renumber the rows 0..n-1.
res = pd.concat([df1, df2], ignore_index=True)
# Append several frames at once; original row labels are kept and columns
# missing from a frame are filled with NaN.
res = pd.concat([df1, df2, df3])
print(res)

'''
     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
0  1.0  1.0  1.0  1.0  NaN
1  1.0  1.0  1.0  1.0  NaN
2  1.0  1.0  1.0  1.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0

'''
# Append a Series as one new row: convert it to a single-row frame first so
# that its index labels ('a'..'d') line up with df1's columns.
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
'''

merge

 import pandas as pd
 
# merging two df by key/keys. (may be used in database)
# simple example
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
'''
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
'''
print(right)
'''
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
'''
# on 根据key列进行合并
res = pd.merge(left, right, on='key')
print(res)
'''
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
'''
 
# consider two keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
'''
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
'''
print(right)
'''
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
'''
 
# default for how='inner'
# 仅保留 两个key值相同的数据
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
'''
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
'''
 
# how = ['left', 'right', 'outer', 'inner']
# outer 保留所有行，数值不存在的补NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
'''
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3
'''
# 根据 left的key进行合并
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
'''
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
'''
 
# 根据 right的key进行合并
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
'''
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3
'''
 
# indicator 显示合并的依据
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
'''
   col1 col_left
0     0        a
1     1        b
'''
print(df2)
'''
   col1  col_right
0     1          2
1     2          2
2     2          2
'''
#
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
'''
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only
'''
# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
'''
   col1 col_left  col_right indicator_column
0     0        a        NaN        left_only
1     1        b        2.0             both
2     2      NaN        2.0       right_only
3     2      NaN        2.0       right_only
'''
 
# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                     index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                       index=['K0', 'K2', 'K3'])
print(left)
'''
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
'''
print(right)
'''
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
'''
# left_index and right_index
# 根据行号合并
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
'''
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
'''
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
'''
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
 
'''
 
 
# handle overlapping
 
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
print(boys)
'''
    k  age
0  K0    1
1  K1    2
2  K2    3
'''
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(girls)
'''
    k  age
0  K0    4
1  K0    5
2  K3    6
'''
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
'''
    k  age_boy  age_girl
0  K0        1         4
1  K0        1         5
'''
 
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
print(res)
'''
    k  age_boy  age_girl
0  K0      1.0       4.0
1  K0      1.0       5.0
2  K1      2.0       NaN
3  K2      3.0       NaN
4  K3      NaN       6.0
'''

绘图

 import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
# plot data
 
# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# 数据累加
data = data.cumsum()
data.plot()
 
# DataFrame
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
 
plt.show()

参考资料

NumPy quickstart

10 minutes to pandas

Numpy & Pandas

posted @ 2021-06-18 01:36 ZTianming 阅读(62) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了
· 上周热点回顾（2.24-3.2）

公告

2025年3月

日

一

二

三

四

五

六

当前浏览器不支持canvas,请更换浏览器

ZTianming

种一棵树，最好的时间是十年前，其次是现在。

Python Numpy & Pandas

Numpy

基础创建

基础运算

矩阵运算

索引、均值

矩阵合并

Pandas

基本操作

选择数据

赋值

导入导出数据

CSV 推荐使用

HDF5

Excel

Gotchas

数据合并

concat

merge

绘图

参考资料

NumPy quickstart

10 minutes to pandas

Numpy & Pandas

公告

搜索

常用链接

我的标签

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论

喜欢请打赏

	import numpy as np
	# Create an array from nested lists.
	# dtype can be an integer (16/32/64-bit), a float (16/32/64-bit), complex, ...
	# NOTE: the `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
	# it was only ever an alias for the builtin `int`, so `dtype=int` is equivalent.

	array = np.array([[1, 2, 3],
	                  [4, 5, 6]], dtype=int)

	# 创建全部为0的矩阵
	array = np.zeros((3, 4))
	print(array)
	'''
	output:
	[[0. 0. 0. 0.]
	[0. 0. 0. 0.]
	[0. 0. 0. 0.]]
	'''

	# 创建全部为1的矩阵
	array = np.ones((3, 4))
	print(array)
	'''
	output:
	[[1. 1. 1. 1.]
	[1. 1. 1. 1.]
	[1. 1. 1. 1.]]
	'''

	# 创建未初始化的矩阵
	array = np.empty( (2,3) )
	print(array)
	'''
	[[6.23042070e-307 1.89146896e-307 1.37961302e-306]
	[1.05699242e-307 1.11261638e-306 1.24610927e-306]]
	'''


	# 9 numbers from 0 to 2
	array = np.linspace( 0, 2, 9 )
	print(array)
	'''
	output:
	[0. 0.25 0.5 0.75 1. 1.25 1.5 1.75 2. ]
	'''

	array = np.arange(15).reshape(3, 5)
	print(array)
	'''
	output:
	[[ 0 1 2 3 4]
	[ 5 6 7 8 9]
	[10 11 12 13 14]]
	'''
	print('number of dim: ', array.ndim)
	print('shape: ', array.shape)
	print('size: ', array.size)
	print('type:', array.dtype)
	'''
	output:
	number of dim: 2
	shape: (3, 5)
	size: 15
	type: int32
	'''

	import numpy as np

	a = np.array( [20,30,40,50] )
	b = np.arange( 4 )
	print(b)
	# output: [0, 1, 2, 3]
	# 对应位置元素相减
	c = a-b
	print(c)
	# output: [20, 29, 38, 47]
	# b每个元素平方
	print(b**2)
	# output: [0, 1, 4, 9]
	print(10*np.sin(a))
	# output: [ 9.12945251, -9.88031624, 7.4511316 , -2.62374854]
	# 对于a的每个元素判断是否小于35,并输出结果
	print(a<35)
	# output: [ True, True, False, False]

	import numpy as np
	A = np.array( [[1,1],
	[0,1]] )
	B = np.array( [[2,0],
	[3,4]] )
	# 矩阵对应位置元素相乘
	print(A * B)
	'''
	array([[2, 0],
	[0, 4]])
	'''


	# matrix product
	print(A @ B)
	'''
	array([[5, 4],
	[3, 4]])
	'''
	# another matrix product
	print(A.dot(B))
	'''
	array([[5, 4],
	[3, 4]])
	'''

	# 矩阵求和、最值
	a = np.random.random((2,3))
	'''
	[[0.70284491 0.67667991 0.60919611]
	[0.65004259 0.98493693 0.23953326]]
	'''
	print(a)
	print(a.sum())
	print(a.min())
	print(a.max())
	'''
	3.8632337099338896
	0.23953325605165765
	0.9849369271091678
	'''
	# axis: 1 对行操作， 0 对列操作
	print(a.sum(axis=1))
	print(a.min(axis=0))
	print(a.max(axis=1))
	'''
	[1.98872094 1.87451277]
	[0.65004259 0.67667991 0.23953326]
	[0.70284491 0.98493693]
	'''

	import numpy as np

	a = np.arange(2, 14).reshape((3, 4))
	# 最值的索引
	print(np.argmin(a))
	print(np.argmax(a))
	'''
	0
	11
	'''
	# 均值
	print(a.mean())
	print(np.average(a))
	'''
	7.5
	7.5
	'''
	# 中位数
	print(np.median(a))
	# 7.5
	# 累加
	print(np.cumsum(a))
	'''
	[ 2 5 9 14 20 27 35 44 54 65 77 90]
	'''
	# 累差
	print(np.diff(a))
	'''
	[[1 1 1]
	[1 1 1]
	[1 1 1]]
	'''
	# 非零数输出对应的行号、列号
	print(np.nonzero(a))
	# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
	# 排序
	print(np.sort(a))

	# 矩阵转置
	print(np.transpose(a))
	print(a.T)
	'''
	[[ 2 6 10]
	[ 3 7 11]
	[ 4 8 12]
	[ 5 9 13]]
	'''

	print((a.T).dot(a))
	'''
	[[140 158 176 194]
	[158 179 200 221]
	[176 200 224 248]
	[194 221 248 275]]
	'''
	# numpy.clip(a, a_min, a_max, out=None)
	# 将数组中的元素限制在a_min, a_max之间，大于a_max的就使得它等于 a_max，小于a_min,的就使得它等于a_min。
	print(np.clip(a, 5, 9))
	'''
	[[5 5 5 5]
	[6 7 8 9]
	[9 9 9 9]]
	'''

	import numpy as np

	a = np.array([1, 1, 1])
	b = np.array([2, 2, 2])

	# 垂直合并
	c = np.vstack((a, b))
	print(c)
	print(a.shape, c.shape)
	'''
	[[1 1 1]
	[2 2 2]]
	(3,) (2, 3)
	'''
	# 水平合并
	d = np.hstack((a, b))
	print(d)
	print(a.shape, d.shape)
	'''
	[1 1 1 2 2 2]
	(3,) (6,)
	'''

	a = np.array([1, 1, 1])[:, np.newaxis]
	b = np.array([2, 2, 2])[:, np.newaxis]

	print(a)
	'''
	[[1]
	[1]
	[1]]
	'''
	# concatenate 可以合并多个矩阵
	# axis 可以设置合并方向
	e = np.concatenate((a, b, b, a), axis=0)
	print(e)
	'''
	[[1]
	[1]
	[1]
	[2]
	[2]
	[2]
	[2]
	[2]
	[2]
	[1]
	[1]
	[1]]
	'''
	e = np.concatenate((a, b, b, a), axis=1)
	print(e)
	'''
	[[1 2 2 1]
	[1 2 2 1]
	[1 2 2 1]]
	'''

	import numpy as np
	import pandas as pd

	s = pd.Series([1, 3, 5, np.nan, 6, 8])
	print(s)
	'''
	0 1.0
	1 3.0
	2 5.0
	3 NaN
	4 6.0
	5 8.0
	dtype: float64
	'''

	dates = pd.date_range("20210101", periods=6)
	print(dates)

	'''
	DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
	'2021-01-05', '2021-01-06'],
	dtype='datetime64[ns]', freq='D')
	'''

	df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
	print(df)

	'''
	A B C D
	2021-01-01 -0.817100 0.113590 0.401565 -1.007716
	2021-01-02 1.241367 0.816412 -0.460833 0.275248
	2021-01-03 1.780875 -1.638779 0.895724 1.522088
	2021-01-04 0.872075 -0.136323 -0.828276 -1.705793
	2021-01-05 0.939255 -0.459394 -0.331591 0.107772
	2021-01-06 0.187501 -1.024592 -1.118402 -2.061401
	'''

	df2 = pd.DataFrame(np.arange(12).reshape(3, 4))
	print(df2)
	'''
	0 1 2 3
	0 0 1 2 3
	1 4 5 6 7
	2 8 9 10 11
	'''
	# 字典,以列的方式输入数据
	df2 = pd.DataFrame(
	{
	"A": 1.0,
	"B": pd.Timestamp("20130102"),
	"C": pd.Series(1, index=list(range(4)), dtype="float32"),
	"D": np.array([3] * 4, dtype="int32"),
	"E": pd.Categorical(["test", "train", "test", "train"]),
	"F": "foo",
	}
	)
	print(df2)
	'''
	A B C D E F
	0 1.0 2013-01-02 1.0 3 test foo
	1 1.0 2013-01-02 1.0 3 train foo
	2 1.0 2013-01-02 1.0 3 test foo
	3 1.0 2013-01-02 1.0 3 train foo
	'''
	# 类型
	print(df2.dtypes)
	'''
	A float64
	B datetime64[ns]
	C float32
	D int32
	E category
	F object
	dtype: object
	'''
	# 行号
	print(df2.index)
	'''
	Int64Index([0, 1, 2, 3], dtype='int64')
	'''

	# 列号
	print(df2.columns)
	'''
	Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
	'''

	# 数值
	print(df2.values)
	'''
	[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
	[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
	[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
	[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
	'''

	# 统计信息
	print(df2.describe())
	'''
	A C D
	count 4.0 4.0 4.0
	mean 1.0 1.0 3.0
	std 0.0 0.0 0.0
	min 1.0 1.0 3.0
	25% 1.0 1.0 3.0
	50% 1.0 1.0 3.0
	75% 1.0 1.0 3.0
	max 1.0 1.0 3.0
	'''

	# 转置适用于数值型
	print(df2.T)
	'''
	0 ... 3
	A 1 ... 1
	B 2013-01-02 00:00:00 ... 2013-01-02 00:00:00
	C 1 ... 1
	D 3 ... 3
	E test ... train
	F foo ... foo

	[6 rows x 4 columns]
	'''

	# 排序
	# 按列倒序
	print(df2.sort_index(axis=1, ascending=False))
	'''
	F E D C B A
	0 foo test 3 1.0 2013-01-02 1.0
	1 foo train 3 1.0 2013-01-02 1.0
	2 foo test 3 1.0 2013-01-02 1.0
	3 foo train 3 1.0 2013-01-02 1.0
	'''
	# 按行倒序
	print(df2.sort_index(axis=0, ascending=False))
	'''
	A B C D E F
	3 1.0 2013-01-02 1.0 3 train foo
	2 1.0 2013-01-02 1.0 3 test foo
	1 1.0 2013-01-02 1.0 3 train foo
	0 1.0 2013-01-02 1.0 3 test foo
	'''

	# 按值排序
	print(df2.sort_values(by='E'))
	'''
	A B C D E F
	0 1.0 2013-01-02 1.0 3 test foo
	2 1.0 2013-01-02 1.0 3 test foo
	1 1.0 2013-01-02 1.0 3 train foo
	3 1.0 2013-01-02 1.0 3 train foo
	'''

	import pandas as pd
	import numpy as np

	dates = pd.date_range('20210101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A', 'B', 'C', 'D'])

	df.iloc[0,1] = np.nan
	df.iloc[1,2] = np.nan
	print(df)
	'''
	A B C D
	2021-01-01 0 NaN 2.0 3
	2021-01-02 4 5.0 NaN 7
	2021-01-03 8 9.0 10.0 11
	2021-01-04 12 13.0 14.0 15
	2021-01-05 16 17.0 18.0 19
	2021-01-06 20 21.0 22.0 23
	'''

	# 丢掉数据：按行,存在空值丢掉该行
	print(df.dropna(axis=0, how='any')) # how={'any', 'all'}
	'''
	A B C D
	2021-01-03 8 9.0 10.0 11
	2021-01-04 12 13.0 14.0 15
	2021-01-05 16 17.0 18.0 19
	2021-01-06 20 21.0 22.0 23
	'''

	# 丢掉数据：按列,存在空值丢掉该列
	print(df.dropna(axis=1, how='any')) # how={'any', 'all'}
	'''
	A D
	2021-01-01 0 3
	2021-01-02 4 7
	2021-01-03 8 11
	2021-01-04 12 15
	2021-01-05 16 19
	2021-01-06 20 23
	'''

	# 空值补零
	print(df.fillna(value=0))
	'''
	A B C D
	2021-01-01 0 0.0 2.0 3
	2021-01-02 4 5.0 0.0 7
	2021-01-03 8 9.0 10.0 11
	2021-01-04 12 13.0 14.0 15
	2021-01-05 16 17.0 18.0 19
	2021-01-06 20 21.0 22.0 23
	'''

	# 检查是否缺失数据
	print(pd.isnull(df))
	'''
	A B C D
	2021-01-01 False True False False
	2021-01-02 False False True False
	2021-01-03 False False False False
	2021-01-04 False False False False
	2021-01-05 False False False False
	2021-01-06 False False False False
	'''

	# 如果数据太多，使用这种方法进行检查空值
	# True 表示至少存在一个空值
	print(np.any(df.isnull()))
	# output: True

	import pandas as pd

	# read from
	data = pd.read_csv('student.csv')
	print(data)

	# save to
	data.to_pickle('student.pickle')

	import pandas as pd
	import numpy as np

	# concatenating
	# ignore index
	df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
	df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
	df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
	# print(df1)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	'''
	# print(df2)
	'''
	a b c d
	0 1.0 1.0 1.0 1.0
	1 1.0 1.0 1.0 1.0
	2 1.0 1.0 1.0 1.0

	'''
	# print(df3)
	'''
	a b c d
	0 2.0 2.0 2.0 2.0
	1 2.0 2.0 2.0 2.0
	2 2.0 2.0 2.0 2.0

	'''
	# 按行合并
	res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
	# print(res)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	3 1.0 1.0 1.0 1.0
	4 1.0 1.0 1.0 1.0
	5 1.0 1.0 1.0 1.0
	6 2.0 2.0 2.0 2.0
	7 2.0 2.0 2.0 2.0
	8 2.0 2.0 2.0 2.0
	'''

	# join, ('inner', 'outer')
	df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
	df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d', 'e'], index=[2,3,4])
	print(df1)
	'''
	a b c d
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	3 0.0 0.0 0.0 0.0
	'''

	print(df2)
	'''
	b c d e
	2 1.0 1.0 1.0 1.0
	3 1.0 1.0 1.0 1.0
	4 1.0 1.0 1.0 1.0
	'''

	# join outer 保留所有列，没有的补NaN
	res = pd.concat([df1, df2], axis=0, join='outer')
	print(res)
	'''
	a b c d e
	1 0.0 0.0 0.0 0.0 NaN
	2 0.0 0.0 0.0 0.0 NaN
	3 0.0 0.0 0.0 0.0 NaN
	2 NaN 1.0 1.0 1.0 1.0
	3 NaN 1.0 1.0 1.0 1.0
	4 NaN 1.0 1.0 1.0 1.0
	'''
	# join inner - 只保留相交的部分
	res = pd.concat([df1, df2], axis=0, join='inner')
	print(res)
	'''
	b c d
	1 0.0 0.0 0.0
	2 0.0 0.0 0.0
	3 0.0 0.0 0.0
	2 1.0 1.0 1.0
	3 1.0 1.0 1.0
	4 1.0 1.0 1.0
	'''

	# Column-wise concat restricted to df1's row labels.
	# The `join_axes` argument of pd.concat was removed in pandas 1.0; reindexing
	# df2 on df1.index first gives the same result: only df1's rows are kept and
	# rows that df2 does not have are filled with NaN.
	res = pd.concat([df1, df2.reindex(df1.index)], axis=1)
	print(res)
	'''
	a b c d b c d e
	1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
	2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
	3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0

	'''
	# Appending rows.
	# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
	# pd.concat is the replacement and is used for every case below.
	df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
	df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
	df3 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d', 'e'], index=[2,3,4])
	# Append df2 below df1 and renumber the rows 0..n-1.
	res = pd.concat([df1, df2], ignore_index=True)
	# Append several frames at once; original row labels are kept and columns
	# missing from a frame are filled with NaN.
	res = pd.concat([df1, df2, df3])
	print(res)

	'''
	a b c d e
	0 0.0 0.0 0.0 0.0 NaN
	1 0.0 0.0 0.0 0.0 NaN
	2 0.0 0.0 0.0 0.0 NaN
	0 1.0 1.0 1.0 1.0 NaN
	1 1.0 1.0 1.0 1.0 NaN
	2 1.0 1.0 1.0 1.0 NaN
	2 NaN 1.0 1.0 1.0 1.0
	3 NaN 1.0 1.0 1.0 1.0
	4 NaN 1.0 1.0 1.0 1.0

	'''
	# Append a Series as one new row: convert it to a single-row frame first so
	# that its index labels ('a'..'d') line up with df1's columns.
	s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
	res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
	print(res)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	3 1.0 2.0 3.0 4.0
	'''

	import pandas as pd

	# Merge two DataFrames on key column(s) -- the pandas analogue of a
	# database join.
	# Simplest case: both frames carry the same single key column.
	left = pd.DataFrame({
		'key': ['K%d' % i for i in range(4)],
		'A': ['A%d' % i for i in range(4)],
		'B': ['B%d' % i for i in range(4)],
	})
	right = pd.DataFrame({
		'key': ['K%d' % i for i in range(4)],
		'C': ['C%d' % i for i in range(4)],
		'D': ['D%d' % i for i in range(4)],
	})
	print(left)
	'''
	key A B
	0 K0 A0 B0
	1 K1 A1 B1
	2 K2 A2 B2
	3 K3 A3 B3
	'''
	print(right)
	'''
	key C D
	0 K0 C0 D0
	1 K1 C1 D1
	2 K2 C2 D2
	3 K3 C3 D3
	'''
	# on='key': match rows whose value in the 'key' column is equal.
	res = left.merge(right, on='key')
	print(res)
	'''
	key A B C D
	0 K0 A0 B0 C0 D0
	1 K1 A1 B1 C1 D1
	2 K2 A2 B2 C2 D2
	3 K3 A3 B3 C3 D3
	'''

	# Merging on a pair of key columns (key1, key2).
	left = pd.DataFrame({
		'key1': ['K0', 'K0', 'K1', 'K2'],
		'key2': ['K0', 'K1', 'K0', 'K1'],
		'A': ['A0', 'A1', 'A2', 'A3'],
		'B': ['B0', 'B1', 'B2', 'B3'],
	})
	right = pd.DataFrame({
		'key1': ['K0', 'K1', 'K1', 'K2'],
		'key2': ['K0', 'K0', 'K0', 'K0'],
		'C': ['C0', 'C1', 'C2', 'C3'],
		'D': ['D0', 'D1', 'D2', 'D3'],
	})
	print(left)
	'''
	key1 key2 A B
	0 K0 K0 A0 B0
	1 K0 K1 A1 B1
	2 K1 K0 A2 B2
	3 K2 K1 A3 B3
	'''
	print(right)
	'''
	key1 key2 C D
	0 K0 K0 C0 D0
	1 K1 K0 C1 D1
	2 K1 K0 C2 D2
	3 K2 K0 C3 D3
	'''

	# how='inner' (the default): keep only the rows whose (key1, key2) pair
	# is present in both frames.
	res = left.merge(right, how='inner', on=['key1', 'key2'])
	print(res)
	'''
	key1 key2 A B C D
	0 K0 K0 A0 B0 C0 D0
	1 K1 K0 A2 B2 C1 D1
	2 K1 K0 A2 B2 C2 D2
	'''

	# `how` accepts 'left', 'right', 'outer' or 'inner'.
	# how='outer': keep every row from both sides; values that do not exist
	# on one side are filled with NaN.
	res = left.merge(right, how='outer', on=['key1', 'key2'])
	print(res)
	'''
	key1 key2 A B C D
	0 K0 K0 A0 B0 C0 D0
	1 K0 K1 A1 B1 NaN NaN
	2 K1 K0 A2 B2 C1 D1
	3 K1 K0 A2 B2 C2 D2
	4 K2 K1 A3 B3 NaN NaN
	5 K2 K0 NaN NaN C3 D3
	'''
	# how='left': keep every key pair found in `left`.
	res = left.merge(right, how='left', on=['key1', 'key2'])
	print(res)
	'''
	key1 key2 A B C D
	0 K0 K0 A0 B0 C0 D0
	1 K0 K1 A1 B1 NaN NaN
	2 K1 K0 A2 B2 C1 D1
	3 K1 K0 A2 B2 C2 D2
	4 K2 K1 A3 B3 NaN NaN
	'''

	# how='right': keep every key pair found in `right`.
	res = left.merge(right, how='right', on=['key1', 'key2'])
	print(res)
	'''
	key1 key2 A B C D
	0 K0 K0 A0 B0 C0 D0
	1 K1 K0 A2 B2 C1 D1
	2 K1 K0 A2 B2 C2 D2
	3 K2 K0 NaN NaN C3 D3
	'''

	# indicator: add a column telling which side each merged row came from.
	df1 = pd.DataFrame({
		'col1': [0, 1],
		'col_left': ['a', 'b'],
	})
	df2 = pd.DataFrame({
		'col1': [1, 2, 2],
		'col_right': [2, 2, 2],
	})
	print(df1)
	'''
	col1 col_left
	0 0 a
	1 1 b
	'''
	print(df2)
	'''
	col1 col_right
	0 1 2
	1 2 2
	2 2 2
	'''
	# indicator=True: the extra column gets the default name `_merge`.
	res = df1.merge(df2, how='outer', on='col1', indicator=True)
	print(res)
	'''
	col1 col_left col_right _merge
	0 0 a NaN left_only
	1 1 b 2.0 both
	2 2 NaN 2.0 right_only
	3 2 NaN 2.0 right_only
	'''
	# Passing a string instead of True uses it as the indicator column's name.
	res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
	print(res)
	'''
	col1 col_left col_right indicator_column
	0 0 a NaN left_only
	1 1 b 2.0 both
	2 2 NaN 2.0 right_only
	3 2 NaN 2.0 right_only
	'''

	# Merging on the row index instead of on a key column.
	left = pd.DataFrame(
		{'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
		index=['K0', 'K1', 'K2'],
	)
	right = pd.DataFrame(
		{'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
		index=['K0', 'K2', 'K3'],
	)
	print(left)
	'''
	A B
	K0 A0 B0
	K1 A1 B1
	K2 A2 B2
	'''
	print(right)
	'''
	C D
	K0 C0 D0
	K2 C2 D2
	K3 C3 D3
	'''
	# left_index / right_index: use each frame's row labels as the join key.
	# how='outer' keeps every label from either frame.
	res = left.merge(right, how='outer', left_index=True, right_index=True)
	print(res)
	'''
	A B C D
	K0 A0 B0 C0 D0
	K1 A1 B1 NaN NaN
	K2 A2 B2 C2 D2
	K3 NaN NaN C3 D3
	'''
	# how='inner' keeps only the labels present in both frames.
	res = left.merge(right, how='inner', left_index=True, right_index=True)
	print(res)
	'''
	A B C D
	K0 A0 B0 C0 D0
	K2 A2 B2 C2 D2

	'''


	# Overlapping column names: both frames have an 'age' column that is not
	# part of the join key, so `suffixes` is used to tell the two apart.

	boys = pd.DataFrame({
		'k': ['K0', 'K1', 'K2'],
		'age': [1, 2, 3],
	})
	print(boys)
	'''
	k age
	0 K0 1
	1 K1 2
	2 K2 3
	'''
	girls = pd.DataFrame({
		'k': ['K0', 'K0', 'K3'],
		'age': [4, 5, 6],
	})
	print(girls)
	'''
	k age
	0 K0 4
	1 K0 5
	2 K3 6
	'''
	# Inner join on 'k': only K0 exists on both sides (twice in `girls`).
	res = boys.merge(girls, how='inner', on='k', suffixes=['_boy', '_girl'])
	print(res)
	'''
	k age_boy age_girl
	0 K0 1 4
	1 K0 1 5
	'''

	# Outer join on 'k': every key is kept, missing ages become NaN.
	res = boys.merge(girls, how='outer', on='k', suffixes=['_boy', '_girl'])
	print(res)
	'''
	k age_boy age_girl
	0 K0 1.0 4.0
	1 K0 1.0 5.0
	2 K1 2.0 NaN
	3 K2 3.0 NaN
	4 K3 NaN 6.0
	'''

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt

	# Plotting pandas data (the figures are shown through matplotlib).

	# Series: 1000 random samples turned into a running total, drawn as a line.
	data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
	data.plot()

	# DataFrame: four columns A-D of random samples, each as a running total.
	data = pd.DataFrame(
		np.random.randn(1000, 4),
		index=np.arange(1000),
		columns=['A', 'B', 'C', 'D'],
	).cumsum()
	# Other plot methods available on `data.plot`:
	# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
	# Draw A-vs-B first, then pass the returned axes back in via `ax=` so that
	# A-vs-C is drawn on the same figure.
	ax = data.plot.scatter(
		x='A',
		y='B',
		color='DarkBlue',
		label="Class 1",
	)
	data.plot.scatter(
		x='A',
		y='C',
		color='LightGreen',
		label='Class 2',
		ax=ax,
	)

	plt.show()