Fork me on GitHub

Python Numpy & Pandas

Python Numpy & Pandas

需要安装Numpy 和 Pandas

Numpy

基础创建

import numpy as np
# Create an array from nested lists.
# Common dtypes: int16/int32/int64, float16/float32/float64, complex.
# NOTE: the `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `int` is the drop-in replacement (same platform default integer).
array = np.array([[1, 2, 3],
                  [4, 5, 6]], dtype=int)
# 创建全部为0的矩阵
array = np.zeros((3, 4))
print(array)
'''
output:
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
'''
# 创建全部为1的矩阵
array = np.ones((3, 4))
print(array)
'''
output:
[[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]]
'''
# 创建未初始化的矩阵
array = np.empty( (2,3) )
print(array)
'''
[[6.23042070e-307 1.89146896e-307 1.37961302e-306]
[1.05699242e-307 1.11261638e-306 1.24610927e-306]]
'''
# 9 evenly spaced numbers from 0 to 2 (both endpoints included)
array = np.linspace(0, 2, 9)
print(array)
'''
output:
[0. 0.25 0.5 0.75 1. 1.25 1.5 1.75 2. ]
'''
array = np.arange(15).reshape(3, 5)
print(array)
'''
output:
[[ 0 1 2 3 4]
[ 5 6 7 8 9]
[10 11 12 13 14]]
'''
print('number of dim: ', array.ndim)
print('shape: ', array.shape)
print('size: ', array.size)
print('type:', array.dtype)
'''
output:
number of dim: 2
shape: (3, 5)
size: 15
type: int32
'''

基础运算

import numpy as np
# Two small 1-D arrays used to demonstrate element-wise arithmetic.
a = np.array([20, 30, 40, 50])
b = np.arange(4)
print(b)
# output: [0 1 2 3]

# Element-wise subtraction
c = a - b
print(c)
# output: [20 29 38 47]

# Square every element of b
print(b ** 2)
# output: [0 1 4 9]

print(10 * np.sin(a))
# output: [ 9.12945251 -9.88031624  7.4511316  -2.62374854]

# Element-wise comparison: is each element of a smaller than 35?
print(a < 35)
# output: [ True  True False False]

矩阵运算

import numpy as np
A = np.array( [[1,1],
[0,1]] )
B = np.array( [[2,0],
[3,4]] )
# 矩阵对应位置元素相乘
print(A * B)
'''
array([[2, 0],
[0, 4]])
'''
# matrix product
print(A @ B)
'''
array([[5, 4],
[3, 4]])
'''
# another matrix product
print(A.dot(B))
'''
array([[5, 4],
[3, 4]])
'''
# 矩阵求和、最值
a = np.random.random((2,3))
'''
[[0.70284491 0.67667991 0.60919611]
[0.65004259 0.98493693 0.23953326]]
'''
print(a)
print(a.sum())
print(a.min())
print(a.max())
'''
3.8632337099338896
0.23953325605165765
0.9849369271091678
'''
# axis: 1 对行操作, 0 对列操作
print(a.sum(axis=1))
print(a.min(axis=0))
print(a.max(axis=1))
'''
[1.98872094 1.87451277]
[0.65004259 0.67667991 0.23953326]
[0.70284491 0.98493693]
'''

索引、均值

import numpy as np
a = np.arange(2, 14).reshape((3, 4))
# 最值的索引
print(np.argmin(a))
print(np.argmax(a))
'''
0
11
'''
# 均值
print(a.mean())
print(np.average(a))
'''
7.5
7.5
'''
# 中位数
print(np.median(a))
# 7.5
# 累加
print(np.cumsum(a))
'''
[ 2 5 9 14 20 27 35 44 54 65 77 90]
'''
# 累差
print(np.diff(a))
'''
[[1 1 1]
[1 1 1]
[1 1 1]]
'''
# 非零数 输出对应的行号、列号
print(np.nonzero(a))
# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
# 排序
print(np.sort(a))
# 矩阵转置
print(np.transpose(a))
print(a.T)
'''
[[ 2 6 10]
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]]
'''
print((a.T).dot(a))
'''
[[140 158 176 194]
[158 179 200 221]
[176 200 224 248]
[194 221 248 275]]
'''
# numpy.clip(a, a_min, a_max, out=None)
# 将数组中的元素限制在a_min, a_max之间,大于a_max的就使得它等于 a_max,小于a_min,的就使得它等于a_min。
print(np.clip(a, 5, 9))
'''
[[5 5 5 5]
[6 7 8 9]
[9 9 9 9]]
'''

矩阵合并

import numpy as np
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
# 垂直合并
c = np.vstack((a, b))
print(c)
print(a.shape, c.shape)
'''
[[1 1 1]
[2 2 2]]
(3,) (2, 3)
'''
# 水平合并
d = np.hstack((a, b))
print(d)
print(a.shape, d.shape)
'''
[1 1 1 2 2 2]
(3,) (6,)
'''
a = np.array([1, 1, 1])[:, np.newaxis]
b = np.array([2, 2, 2])[:, np.newaxis]
print(a)
'''
[[1]
[1]
[1]]
'''
# concatenate 可以合并多个矩阵
# axis 可以设置合并方向
e = np.concatenate((a, b, b, a), axis=0)
print(e)
'''
[[1]
[1]
[1]
[2]
[2]
[2]
[2]
[2]
[2]
[1]
[1]
[1]]
'''
e = np.concatenate((a, b, b, a), axis=1)
print(e)
'''
[[1 2 2 1]
[1 2 2 1]
[1 2 2 1]]
'''

Pandas

基本操作

import numpy as np
import pandas as pd
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
'''
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
'''
dates = pd.date_range("20210101", periods=6)
print(dates)
'''
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
'2021-01-05', '2021-01-06'],
dtype='datetime64[ns]', freq='D')
'''
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
print(df)
'''
A B C D
2021-01-01 -0.817100 0.113590 0.401565 -1.007716
2021-01-02 1.241367 0.816412 -0.460833 0.275248
2021-01-03 1.780875 -1.638779 0.895724 1.522088
2021-01-04 0.872075 -0.136323 -0.828276 -1.705793
2021-01-05 0.939255 -0.459394 -0.331591 0.107772
2021-01-06 0.187501 -1.024592 -1.118402 -2.061401
'''
df2 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df2)
'''
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
'''
# 字典,以列的方式输入数据
df2 = pd.DataFrame(
{
"A": 1.0,
"B": pd.Timestamp("20130102"),
"C": pd.Series(1, index=list(range(4)), dtype="float32"),
"D": np.array([3] * 4, dtype="int32"),
"E": pd.Categorical(["test", "train", "test", "train"]),
"F": "foo",
}
)
print(df2)
'''
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
'''
# 类型
print(df2.dtypes)
'''
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
'''
# 行号
print(df2.index)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
'''
# 列号
print(df2.columns)
'''
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
'''
# 数值
print(df2.values)
'''
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
'''
# 统计信息
print(df2.describe())
'''
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
'''
# 转置 适用于数值型
print(df2.T)
'''
0 ... 3
A 1 ... 1
B 2013-01-02 00:00:00 ... 2013-01-02 00:00:00
C 1 ... 1
D 3 ... 3
E test ... train
F foo ... foo
[6 rows x 4 columns]
'''
# 排序
# 按列 倒序
print(df2.sort_index(axis=1, ascending=False))
'''
F E D C B A
0 foo test 3 1.0 2013-01-02 1.0
1 foo train 3 1.0 2013-01-02 1.0
2 foo test 3 1.0 2013-01-02 1.0
3 foo train 3 1.0 2013-01-02 1.0
'''
# 按行 倒序
print(df2.sort_index(axis=0, ascending=False))
'''
A B C D E F
3 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
0 1.0 2013-01-02 1.0 3 test foo
'''
# 按值排序
print(df2.sort_values(by='E'))
'''
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
3 1.0 2013-01-02 1.0 3 train foo
'''

选择数据

import numpy as np
import pandas as pd
dates = pd.date_range("20210101", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD"))
print(df)
'''
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
'''
# 选择某一列 两种方式
print(df['A'])
print(df.A)
'''
2021-01-01 0
2021-01-02 4
2021-01-03 8
2021-01-04 12
2021-01-05 16
2021-01-06 20
Freq: D, Name: A, dtype: int32
'''
# 切片
print(df[0:3])
print(df['20210102':'20210104'])
'''
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
A B C D
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
'''
# select by label: loc
print(df.loc['20210102'])
print(df.loc[:,['A','B']])
print(df.loc['20210102', ['A','B']])
'''
A 4
B 5
C 6
D 7
Name: 2021-01-02 00:00:00, dtype: int32
----------------------------------------
A B
2021-01-01 0 1
2021-01-02 4 5
2021-01-03 8 9
2021-01-04 12 13
2021-01-05 16 17
2021-01-06 20 21
---------------------------------------
A 4
B 5
Name: 2021-01-02 00:00:00, dtype: int32
'''
# select by position: iloc
print(df.iloc[3])
'''
A 12
B 13
C 14
D 15
Name: 2021-01-04 00:00:00, dtype: int32
'''
print(df.iloc[3, 1])
# 13
print(df.iloc[3:5,0:2])
'''
A B
2021-01-04 12 13
2021-01-05 16 17
'''
print(df.iloc[[1,2,4],[0,2]])
'''
A C
2021-01-02 4 6
2021-01-03 8 10
2021-01-05 16 18
'''
# Mixed selection: first three rows by position, columns by label.
# `DataFrame.ix` was deprecated in pandas 0.20 and removed in 1.0, so select
# the rows positionally with iloc and then pick the columns by name.
print(df.iloc[:3][['A', 'C']])
'''
A C
2021-01-01 0 2
2021-01-02 4 6
2021-01-03 8 10
'''
# Boolean indexing
print(df[df.A > 0])
'''
A B C D
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
'''

赋值

import numpy as np
import pandas as pd
# Build a 6x4 integer frame indexed by six consecutive days.
dates = pd.date_range("20210101", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD"))
print(df)
'''
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
'''
# Assign a single cell by position ...
df.iloc[2, 2] = 1111
# ... or by label.
df.loc['2021-01-03', 'D'] = 2222
# Conditional assignment: use one .loc call with a boolean mask.
# The chained form `df.A[df.A > 0] = 0` may write to a temporary copy and is
# a silent no-op under pandas copy-on-write.
df.loc[df.A > 0, 'A'] = 0
# Add a new column filled with NaN.
df['F'] = np.nan
# Add a new column from a Series; values are aligned on the index.
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20210101', periods=6))
print(df)
'''
A B C D F G
2021-01-01 0 1 2 3 NaN 1
2021-01-02 0 5 6 7 NaN 2
2021-01-03 0 9 1111 2222 NaN 3
2021-01-04 0 13 14 15 NaN 4
2021-01-05 0 17 18 19 NaN 5
2021-01-06 0 21 22 23 NaN 6
'''

处理丢失数据

import pandas as pd
import numpy as np
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A', 'B', 'C', 'D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
'''
A B C D
2021-01-01 0 NaN 2.0 3
2021-01-02 4 5.0 NaN 7
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
'''
# 丢掉数据: 按行,存在空值丢掉该行
print(df.dropna(axis=0, how='any')) # how={'any', 'all'}
'''
A B C D
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
'''
# 丢掉数据: 按列,存在空值丢掉该列
print(df.dropna(axis=1, how='any')) # how={'any', 'all'}
'''
A D
2021-01-01 0 3
2021-01-02 4 7
2021-01-03 8 11
2021-01-04 12 15
2021-01-05 16 19
2021-01-06 20 23
'''
# 空值补零
print(df.fillna(value=0))
'''
A B C D
2021-01-01 0 0.0 2.0 3
2021-01-02 4 5.0 0.0 7
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
'''
# 检查是否缺失数据
print(pd.isnull(df))
'''
A B C D
2021-01-01 False True False False
2021-01-02 False False True False
2021-01-03 False False False False
2021-01-04 False False False False
2021-01-05 False False False False
2021-01-06 False False False False
'''
# 如果数据太多,使用这种方法进行检查空值
# True 表示至少存在一个空值
print(np.any(df.isnull()))
# output: True

导入导出数据

支持的数据类型:

CSV 推荐使用

HDF5

Excel

Gotchas

import pandas as pd
# read from
data = pd.read_csv('student.csv')
print(data)
# save to
data.to_pickle('student.pickle')

数据合并

concat

import pandas as pd
import numpy as np
# concatenating
# ignore index
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
# print(df1)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
'''
# print(df2)
'''
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
'''
# print(df3)
'''
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
'''
# 按行合并
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
# print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
'''
# join: 'inner' or 'outer'
# Two frames with partially overlapping columns and row labels.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
'''
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
'''
print(df2)
'''
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
'''
# join='outer': keep every column, fill the missing cells with NaN
res = pd.concat([df1, df2], axis=0, join='outer')
print(res)
'''
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
'''
# join='inner': keep only the columns present in both frames
res = pd.concat([df1, df2], axis=0, join='inner')
print(res)
'''
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
'''
# Column-wise concat aligned on df1's row labels; rows missing from df2 are NaN.
# The `join_axes` argument was removed in pandas 1.0; reindexing the
# concatenated result on df1.index produces the same frame.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)
'''
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
'''
# Appending rows.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported replacement and is used throughout below.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# Append one frame and renumber the rows.
res = pd.concat([df1, df2], ignore_index=True)
# Append several frames at once, keeping the original row labels.
res = pd.concat([df1, df2, df3])
print(res)
'''
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
0 1.0 1.0 1.0 1.0 NaN
1 1.0 1.0 1.0 1.0 NaN
2 1.0 1.0 1.0 1.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
'''
# Append a Series as a new row: convert it to a one-row frame first so its
# index labels line up with df1's columns.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
'''

merge

import pandas as pd
# merging two df by key/keys. (may be used in database)
# simple example
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
'''
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
'''
print(right)
'''
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
'''
# on 根据key列进行合并
res = pd.merge(left, right, on='key')
print(res)
'''
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
'''
# consider two keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
'''
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
'''
print(right)
'''
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
'''
# default for how='inner'
# 仅保留 两个key值相同的数据
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
'''
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
'''
# how = ['left', 'right', 'outer', 'inner']
# outer 保留所有行,数值不存在的补NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
'''
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
'''
# 根据 left的key进行合并
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
'''
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
'''
# 根据 right的key进行合并
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
'''
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
'''
# indicator 显示合并的依据
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
'''
col1 col_left
0 0 a
1 1 b
'''
print(df2)
'''
col1 col_right
0 1 2
1 2 2
2 2 2
'''
#
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
'''
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
'''
# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
'''
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
'''
# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
'''
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
'''
print(right)
'''
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
'''
# left_index and right_index
# 根据行号合并
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
'''
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
'''
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
'''
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
'''
# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
print(boys)
'''
k age
0 K0 1
1 K1 2
2 K2 3
'''
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(girls)
'''
k age
0 K0 4
1 K0 5
2 K3 6
'''
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
'''
k age_boy age_girl
0 K0 1 4
1 K0 1 5
'''
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
print(res)
'''
k age_boy age_girl
0 K0 1.0 4.0
1 K0 1.0 5.0
2 K1 2.0 NaN
3 K2 3.0 NaN
4 K3 NaN 6.0
'''

绘图

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# plot data
# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# 数据累加
data = data.cumsum()
data.plot()
# DataFrame
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()


参考资料

NumPy quickstart

10 minutes to pandas

Numpy & Pandas

posted @   ZTianming  阅读(62)  评论(0)  编辑  收藏  举报
编辑推荐:
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
阅读排行:
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 上周热点回顾(2.24-3.2)
欢迎阅读『Python Numpy & Pandas』

喜欢请打赏

扫描二维码打赏

支付宝打赏

点击右上角即可分享
微信分享提示