python 基础数据处理(代码)
import numpy as np
array = np.array([[1,2,3],
[2,3,4]])
print(array)
[[1 2 3]
[2 3 4]]
print(array)
print('number of dim:', array.ndim)
print('shape:', array.shape)
print('size', array.size)
[[1 2 3]
[2 3 4]]
number of dim: 2
shape: (2, 3)
size 6
创建arrpy
a = np.array([2,23,4])
print(a)
[ 2 23 4]
b = np.array([2,23,4], dtype = np.int) #指定array的数据类型 np.int np.float
print(b.dtype)
int32
a = np.zeros((3, 4)) #生成一个3行4列的矩阵,所有值为 0
print(a)
a = np.ones((3, 4)) #生成一个3行4列的矩阵,所有值为 1
print(a)
a = np.empty((3, 4), dtype = float) #生成一个3行4列的矩阵,所有值为 0
print(a)
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
[[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]]
[[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]]
a = np.arange(10, 20) #生成从 10 到 20 的数列 arange(start, end, setp)
print(a)
a = np.arange(12).reshape((3, 4)) #重新定义大小
print(a)
[10 11 12 13 14 15 16 17 18 19]
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
#生成一个线段
a = np.linspace(1, 10, 20)
print(a)
a = np.linspace(1, 10, 20).reshape((5, 4))
[ 1. 1.47368421 1.94736842 2.42105263 2.89473684 3.36842105
3.84210526 4.31578947 4.78947368 5.26315789 5.73684211 6.21052632
6.68421053 7.15789474 7.63157895 8.10526316 8.57894737 9.05263158
9.52631579 10. ]
a = np.array([10, 20, 30, 40])
b = np.arange(4)
print(a, b)
[10 20 30 40] [0 1 2 3]
c = a - b
print(c)
c = a + b
print(c)
c = b**2 # b 的 平方
print(c)
c = 10 * np.sin(a)
print(c)
c = 10 * np.cos(a)
print(c)
[10 19 28 37]
[10 21 32 43]
[0 1 4 9]
[-5.44021111 9.12945251 -9.88031624 7.4511316 ]
[-8.39071529 4.08082062 1.5425145 -6.66938062]
print(b, b < 3, b == 3)
[0 1 2 3] [ True True True False] [False False False True]
a = np.array([[1, 1],
[0,1]])
b = np.arange(4).reshape((2, 2))
print(a)
print(b)
c = a * b #元素对应相乘
c_dot = np.dot(a, b) #矩阵乘法
print(c)
print(c_dot)
[[1 1]
[0 1]]
[[0 1]
[2 3]]
[[0 1]
[0 3]]
[[2 4]
[2 3]]
a = np.random.random((2, 4)) #随机生成[0, 1]的小数
print(a)
print(np.sum(a))
print(np.min(a))
print(np.max(a))
[[0.42529947 0.17039602 0.19660961 0.46083628]
[0.94819732 0.24865026 0.94451173 0.08625252]]
3.48075321320014
0.08625252476463197
0.9481973213759178
A = np.arange(2, 14).reshape((3, 4))
print(A)
print(np.argmin(A)) #找到最小值的索引
print(np.argmax(A))
print(A.mean()) #平均数
print(np.mean(A))
print(np.median(A)) #中位数
print(np.cumsum(A)) #逐步累加
print(np.diff(A)) #累差
print(np.sort(A)) #逐行排序
print(np.transpose(A)) #转置
print(A.T)
print(np.clip(A, 5, 10)) #小于5的数让他等于5,大于9的数等于9
print(np.mean(A, axis=0)) #列的平均值
print(np.mean(A, axis=1)) #行的平均值
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
0
11
7.5
7.5
7.5
[ 2 5 9 14 20 27 35 44 54 65 77 90]
[[1 1 1]
[1 1 1]
[1 1 1]]
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
[[ 2 6 10]
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]]
[[ 2 6 10]
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]]
[[ 5 5 5 5]
[ 6 7 8 9]
[10 10 10 10]]
[6. 7. 8. 9.]
[ 3.5 7.5 11.5]
A = np.arange(3, 15)
print(A)
print(A[3])
A = np.arange(3, 15).reshape((3, 4))
print(A)
print(A[2])
print(A[2][1])
print(A[:, 1]) #所有行的第一行数
#迭代行
for row in A:
print(row)
#迭代列
for column in A.T:
print(column)
print(A.flatten()) #返回值
for item in A.flat:
print(item)
[ 3 4 5 6 7 8 9 10 11 12 13 14]
6
[[ 3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]]
[11 12 13 14]
12
[ 4 8 12]
[3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
[ 6 10 14]
[ 3 4 5 6 7 8 9 10 11 12 13 14]
3
4
5
6
7
8
9
10
11
12
13
14
A = np.array([1,1,1])
B = np.array([2,2,2])
C = np.vstack((A, B))
D = np.hstack((A, B))
print(D)
print(A.shape)
print(D.shape)
print(A[np.newaxis, :]) #加了一个维度
print(A[np.newaxis, :].shape)
print(A[:,np.newaxis])
print(A[:,np.newaxis].shape)
[1 1 1 2 2 2]
(3,)
(6,)
[[1 1 1]]
(1, 3)
[[1]
[1]
[1]]
(3, 1)
A = np.arange(12).reshape((3, 4))
print(A)
print(np.split(A, 2, axis = 1)) #按列分割成 2 个 列数能被块数整数
print(np.split(A, 3, axis = 0)) #按行分割成 2 个 行数能被块数整数
print(np.array_split(A, 3, axis = 1)) #不等分割
print(np.vsplit(A, 3)) #按行分割
print(np.hsplit(A,2)) #按列分割
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2],
[ 6],
[10]]), array([[ 3],
[ 7],
[11]])]
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
import pandas as pd
s = pd.Series([1,3,4, np.nan,44,1]) #创建一个序列
print(s)
0 1.0
1 3.0
2 4.0
3 NaN
4 44.0
5 1.0
dtype: float64
dates = pd.date_range('20210922', periods=6) #生成日期序列
print(dates)
DatetimeIndex(['2021-09-22', '2021-09-23', '2021-09-24', '2021-09-25',
'2021-09-26', '2021-09-27'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns=['a', 'b', 'c', 'd'])
df
a | b | c | d | |
---|---|---|---|---|
2021-09-22 | 1.032756 | -0.529079 | -0.161499 | -0.436609 |
2021-09-23 | -1.059394 | 2.122808 | -0.519834 | -0.822384 |
2021-09-24 | -0.002292 | 0.119694 | -0.749122 | 1.148803 |
2021-09-25 | 0.456498 | -1.414832 | -0.386781 | -2.410006 |
2021-09-26 | 0.540748 | -1.741398 | 0.339414 | -0.667571 |
2021-09-27 | -0.591441 | 2.414537 | -0.105859 | 0.755189 |
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df2 = pd.DataFrame({'A' : 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index = list(range(4)), dtype = 'float32'),
'D': np.array([3] * 4, dtype='int32'),
'E':pd.Categorical(["test", "train", "test", "train"]),
'F':'foo'})
print(df2)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
print(df2.dtypes)
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
print(df2.index)
Int64Index([0, 1, 2, 3], dtype='int64')
print(df2.columns) #打印列的名字
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.describe()) #描述方差,平均值,最大最小值
print(df2.T) #转置
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
0 1 2 \
A 1 1 1
B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00
C 1 1 1
D 3 3 3
E test train test
F foo foo foo
3
A 1
B 2013-01-02 00:00:00
C 1
D 3
E train
F foo
print(df2.sort_index(axis = 1, ascending = False)) # index 排序
F E D C B A
0 foo test 3 1.0 2013-01-02 1.0
1 foo train 3 1.0 2013-01-02 1.0
2 foo test 3 1.0 2013-01-02 1.0
3 foo train 3 1.0 2013-01-02 1.0
print(df2.sort_values(by='E')) #按照某行的值进行排序
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
3 1.0 2013-01-02 1.0 3 train foo
dates = pd.date_range('20130101', periods = 6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index = dates, columns = ['A', 'B', 'C', 'D'])
print(df)
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
print(df['A'], df.A)
print(df[0:3]) #切片选择
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32 2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
#select by lable
print(df.loc['20130102']) #某行数据
print(df.loc[:,['A', 'B']])
print(df.loc['20130102',['A', 'B']])
#select by position iloc
print(df.iloc[3])
#mixed selection:ix
print("mix")
print(df.ix[:3,['A','C']])
print(df[df.A > 8])
A 4
B 5
C 6
D 7
Name: 2013-01-02 00:00:00, dtype: int32
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21
A 4
B 5
Name: 2013-01-02 00:00:00, dtype: int32
A 12
B 13
C 14
D 15
Name: 2013-01-04 00:00:00, dtype: int32
mix
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
C:\Users\dell\Anaconda\lib\site-packages\ipykernel_launcher.py:9: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
if __name__ == '__main__':
df.iloc[2,2] = 1111
df.loc['20130101', 'B'] = 2222
df.B[df.A>4] = 0
df['F'] = np.nan
df['E'] = pd.Series([1,2,3,4,5,6], index = pd.date_range('20130101', periods = 6))
print(df)
A B C D F E
2013-01-01 0 2222 2 3 NaN 1
2013-01-02 4 5 6 7 NaN 2
2013-01-03 8 0 1111 11 NaN 3
2013-01-04 12 0 14 15 NaN 4
2013-01-05 16 0 18 19 NaN 5
2013-01-06 20 0 22 23 NaN 6
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df.dropna(axis = 0, how = 'any')) #某一行含有nan的应该丢弃 how == any 只有一个 , all 某一行全部为nan 则全部丢弃
print(df.dropna(axis = 1, how = 'any')) #某一列含有nan的应该丢弃 how == any 只有一个 , all 某一列全部为nan 则全部丢弃
Empty DataFrame
Columns: [A, B, C, D, F, E]
Index: []
A D E
2013-01-01 0 3 1
2013-01-02 4 7 2
2013-01-03 8 11 3
2013-01-04 12 15 4
2013-01-05 16 19 5
2013-01-06 20 23 6
print(df.fillna(value = 0)) #将 nan 替换为 value
A B C D F E
2013-01-01 0 0.0 2.0 3 0.0 1
2013-01-02 4 5.0 0.0 7 0.0 2
2013-01-03 8 0.0 1111.0 11 0.0 3
2013-01-04 12 0.0 14.0 15 0.0 4
2013-01-05 16 0.0 18.0 19 0.0 5
2013-01-06 20 0.0 22.0 23 0.0 6
print(df.isnull()) #是否含有nan
A B C D F E
2013-01-01 False True False False True False
2013-01-02 False False True False True False
2013-01-03 False False False False True False
2013-01-04 False False False False True False
2013-01-05 False False False False True False
2013-01-06 False False False False True False
print(np.any(df.isnull()) == True) #大数据中直接进行判断
True
data = pd.read_csv('student.csv')
print(data)
Student ID name age gender
0 1101 df 1 Female
1 1102 sdf 2 Female
2 1103 asdf 3 Male
3 1104 gf 4 Female
4 1105 cb 5 Male
5 1106 ec 6 Female
6 1107 h 7 Male
7 1108 rty 8 Male
8 1109 rty 9 Male
9 1110 hhh 10 Female
10 1111 t 11 Female
11 1112 y 12 Female
12 1113 u 13 Male
13 1114 NaN 14 Male
14 1115 p 15 Female
15 1116 o 16 Female
16 1117 k 17 Female
17 1118 l 18 Female
18 1119 m 19 Male
19 1120 e 20 Male
20 1121 hh 21 Male
data.to_pickle('student.pickle')
data1 = pd.read_pickle('student.pickle')
print(data1)
Student ID name age gender
0 1101 df 1 Female
1 1102 sdf 2 Female
2 1103 asdf 3 Male
3 1104 gf 4 Female
4 1105 cb 5 Male
5 1106 ec 6 Female
6 1107 h 7 Male
7 1108 rty 8 Male
8 1109 rty 9 Male
9 1110 hhh 10 Female
10 1111 t 11 Female
11 1112 y 12 Female
12 1113 u 13 Male
13 1114 NaN 14 Male
14 1115 p 15 Female
15 1116 o 16 Female
16 1117 k 17 Female
17 1118 l 18 Female
18 1119 m 19 Male
19 1120 e 20 Male
20 1121 hh 21 Male
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns = ['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns = ['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns = ['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print(df3)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
res = pd.concat([df1, df2, df3], axis = 0) #axis = 0 竖向的, axis = 1横向的
print(res)
res = pd.concat([df1, df2, df3], axis = 1)
print(res)
res = pd.concat([df1, df2, df3], axis = 1, ignore_index = True) # 忽略id 重新排序
print(res)
res = pd.concat([df1, df2, df3], axis = 0, ignore_index = True) # 忽略id 重新排序
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
0 1 2 3 4 5 6 7 8 9 10 11
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns = ['a', 'b', 'c', 'd'], index= [1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns = ['b', 'c', 'd', 'e'], index = [2, 3, 4])
print(df1)
print(df2)
res = pd.concat([df1, df2])
print(res)
res = pd.concat([df1, df2], join = 'inner') #把相同的列的合并在一起
print(res)
res = pd.concat([df1, df2], join = 'inner', ignore_index = True) #把相同的列的合并在一起, id 重新进行排序
print(res)
res = pd.concat([df1, df2], axis = 1, join_axes = [df1.index]) #把相同的列的合并在一起, id 重新进行排序
print(res)
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
C:\Users\dell\Anaconda\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
"""
res = df1.append([df2, df3])
print(res)
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0 NaN
1 2.0 2.0 2.0 2.0 NaN
2 2.0 2.0 2.0 2.0 NaN
C:\Users\dell\Anaconda\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
sort=sort)
s1 = pd.Series([1, 2, 3, 4], index = ['a', 'b', 'c', 'd'])
res = df1.append(s1, ignore_index =True)
print('df1\n', df1)
print('s1\n', s1)
print(res)
df1
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
s1
a 1
b 2
c 3
d 4
dtype: int64
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
left = pd.DataFrame({'key': ['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0','K1','K2','K3'],
'C':['C0', 'C1', 'C2', 'C3'],
'D':['D0', 'D1', 'D2', 'D3']})
print(left, right)
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3 key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
res = pd.merge(left, right, on='key') #on 表示按照 某一个属性,然后值相同的进行合并
print(res)
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
left = pd.DataFrame({'key1': ['K0','K0','K1','K2'],
'key2': ['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0','K1','K1','K2'],
'key2': ['K0','K0','K0','K0'],
'C':['C0', 'C1', 'C2', 'C3'],
'D':['D0', 'D1', 'D2', 'D3']})
print(left,'\n', right)
res = pd.merge(left, right, on=['key1', 'key2']) #默认 how = 'inner', 两个必须为相同值 'outer' 一不一样都进行合并,没有值为 nan, left
print(res)
res = pd.merge(left, right, on=['key1', 'key2'], how = 'left') #how = 'left', 'left' 一不一样都进行合并,没有值为 nan, left
print('how == left\n', res)
res = pd.merge(left, right, on=['key1', 'key2'], how = 'right') #默认 how = 'inner', 'outer' 一不一样都进行合并,没有值为 nan, left
print('how == right\n', res)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
how == left
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
how == right
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
res = pd.merge(left, right, on=['key1', 'key2'], how = 'right', indicator = True) # 显示如何 how 合并
print(res)
key1 key2 A B C D _merge
0 K0 K0 A0 B0 C0 D0 both
1 K1 K0 A2 B2 C1 D1 both
2 K1 K0 A2 B2 C2 D2 both
3 K2 K0 NaN NaN C3 D3 right_only
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000), index = np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
data = pd.DataFrame(np.random.randn(1000, 4),
index = np.arange(1000),
columns = list("ABCD"))
print(data.head(3))
A B C D
0 1.867890 -1.189657 0.055617 2.624250
1 1.211239 -0.961285 1.139995 -2.090965
2 0.597090 -1.287032 -0.622284 -1.484287
data = data.cumsum()
data.plot()
plt.show()
#plot methods:
# 'bar', hist, box, kde, area, scatter, hexbin, pie
ax = data.plot.scatter(x = 'A', y = 'B', color = 'DarkBlue', label = 'Class1')
data.plot.scatter(x = 'A', y = 'C', color = 'DarkGreen', label = 'Class2', ax = ax)
plt.show()