Python Numpy & Pandas
需要安装Numpy 和 Pandas
Numpy
基础创建
| import numpy as np |
| |
| |
| |
| array = np.array([[1, 2, 3], |
| [4, 5, 6]], dtype=int) |
| |
| |
| array = np.zeros((3, 4)) |
| print(array) |
| ''' |
| output: |
| [[0. 0. 0. 0.] |
| [0. 0. 0. 0.] |
| [0. 0. 0. 0.]] |
| ''' |
| |
| |
| array = np.ones((3, 4)) |
| print(array) |
| ''' |
| output: |
| [[1. 1. 1. 1.] |
| [1. 1. 1. 1.] |
| [1. 1. 1. 1.]] |
| ''' |
| |
| |
| array = np.empty( (2,3) ) |
| print(array) |
| ''' |
| [[6.23042070e-307 1.89146896e-307 1.37961302e-306] |
| [1.05699242e-307 1.11261638e-306 1.24610927e-306]] |
| ''' |
| |
| |
| |
| array = np.linspace( 0, 2, 9 ) |
| print(array) |
| ''' |
| output: |
| [0. 0.25 0.5 0.75 1. 1.25 1.5 1.75 2. ] |
| ''' |
| |
| array = np.arange(15).reshape(3, 5) |
| print(array) |
| ''' |
| output: |
| [[ 0 1 2 3 4] |
| [ 5 6 7 8 9] |
| [10 11 12 13 14]] |
| ''' |
| print('number of dim: ', array.ndim) |
| print('shape: ', array.shape) |
| print('size: ', array.size) |
| print('type:', array.dtype) |
| ''' |
| output: |
| number of dim: 2 |
| shape: (3, 5) |
| size: 15 |
| type: int32 |
| ''' |
基础运算
| import numpy as np |
| |
| a = np.array( [20,30,40,50] ) |
| b = np.arange( 4 ) |
| print(b) |
| |
| |
| c = a-b |
| print(c) |
| |
| |
| print(b**2) |
| |
| print(10*np.sin(a)) |
| |
| |
| print(a<35) |
| |
矩阵运算
| import numpy as np |
| A = np.array( [[1,1], |
| [0,1]] ) |
| B = np.array( [[2,0], |
| [3,4]] ) |
| |
| print(A * B) |
| ''' |
| array([[2, 0], |
| [0, 4]]) |
| ''' |
| |
| |
| |
| print(A @ B) |
| ''' |
| array([[5, 4], |
| [3, 4]]) |
| ''' |
| |
| print(A.dot(B)) |
| ''' |
| array([[5, 4], |
| [3, 4]]) |
| ''' |
| |
| |
| a = np.random.random((2,3)) |
| ''' |
| [[0.70284491 0.67667991 0.60919611] |
| [0.65004259 0.98493693 0.23953326]] |
| ''' |
| print(a) |
| print(a.sum()) |
| print(a.min()) |
| print(a.max()) |
| ''' |
| 3.8632337099338896 |
| 0.23953325605165765 |
| 0.9849369271091678 |
| ''' |
| |
| print(a.sum(axis=1)) |
| print(a.min(axis=0)) |
| print(a.max(axis=1)) |
| ''' |
| [1.98872094 1.87451277] |
| [0.65004259 0.67667991 0.23953326] |
| [0.70284491 0.98493693] |
| ''' |
| |
| |
索引、均值
| import numpy as np |
| |
| a = np.arange(2, 14).reshape((3, 4)) |
| |
| print(np.argmin(a)) |
| print(np.argmax(a)) |
| ''' |
| 0 |
| 11 |
| ''' |
| |
| print(a.mean()) |
| print(np.average(a)) |
| ''' |
| 7.5 |
| 7.5 |
| ''' |
| |
| print(np.median(a)) |
| |
| |
| print(np.cumsum(a)) |
| ''' |
| [ 2 5 9 14 20 27 35 44 54 65 77 90] |
| ''' |
| |
| print(np.diff(a)) |
| ''' |
| [[1 1 1] |
| [1 1 1] |
| [1 1 1]] |
| ''' |
| |
| print(np.nonzero(a)) |
| |
| |
| print(np.sort(a)) |
| |
| |
| print(np.transpose(a)) |
| print(a.T) |
| ''' |
| [[ 2 6 10] |
| [ 3 7 11] |
| [ 4 8 12] |
| [ 5 9 13]] |
| ''' |
| |
| print((a.T).dot(a)) |
| ''' |
| [[140 158 176 194] |
| [158 179 200 221] |
| [176 200 224 248] |
| [194 221 248 275]] |
| ''' |
| |
| |
| print(np.clip(a, 5, 9)) |
| ''' |
| [[5 5 5 5] |
| [6 7 8 9] |
| [9 9 9 9]] |
| ''' |
矩阵合并
| import numpy as np |
| |
| a = np.array([1, 1, 1]) |
| b = np.array([2, 2, 2]) |
| |
| |
| c = np.vstack((a, b)) |
| print(c) |
| print(a.shape, c.shape) |
| ''' |
| [[1 1 1] |
| [2 2 2]] |
| (3,) (2, 3) |
| ''' |
| |
| d = np.hstack((a, b)) |
| print(d) |
| print(a.shape, d.shape) |
| ''' |
| [1 1 1 2 2 2] |
| (3,) (6,) |
| ''' |
| |
| a = np.array([1, 1, 1])[:, np.newaxis] |
| b = np.array([2, 2, 2])[:, np.newaxis] |
| |
| print(a) |
| ''' |
| [[1] |
| [1] |
| [1]] |
| ''' |
| |
| |
| e = np.concatenate((a, b, b, a), axis=0) |
| print(e) |
| ''' |
| [[1] |
| [1] |
| [1] |
| [2] |
| [2] |
| [2] |
| [2] |
| [2] |
| [2] |
| [1] |
| [1] |
| [1]] |
| ''' |
| e = np.concatenate((a, b, b, a), axis=1) |
| print(e) |
| ''' |
| [[1 2 2 1] |
| [1 2 2 1] |
| [1 2 2 1]] |
| ''' |
Pandas
基本操作
| import numpy as np |
| import pandas as pd |
| |
| s = pd.Series([1, 3, 5, np.nan, 6, 8]) |
| print(s) |
| ''' |
| 0 1.0 |
| 1 3.0 |
| 2 5.0 |
| 3 NaN |
| 4 6.0 |
| 5 8.0 |
| dtype: float64 |
| ''' |
| |
| dates = pd.date_range("20210101", periods=6) |
| print(dates) |
| |
| ''' |
| DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', |
| '2021-01-05', '2021-01-06'], |
| dtype='datetime64[ns]', freq='D') |
| ''' |
| |
| df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD")) |
| print(df) |
| |
| ''' |
| A B C D |
| 2021-01-01 -0.817100 0.113590 0.401565 -1.007716 |
| 2021-01-02 1.241367 0.816412 -0.460833 0.275248 |
| 2021-01-03 1.780875 -1.638779 0.895724 1.522088 |
| 2021-01-04 0.872075 -0.136323 -0.828276 -1.705793 |
| 2021-01-05 0.939255 -0.459394 -0.331591 0.107772 |
| 2021-01-06 0.187501 -1.024592 -1.118402 -2.061401 |
| ''' |
| |
| df2 = pd.DataFrame(np.arange(12).reshape(3, 4)) |
| print(df2) |
| ''' |
| 0 1 2 3 |
| 0 0 1 2 3 |
| 1 4 5 6 7 |
| 2 8 9 10 11 |
| ''' |
| |
| df2 = pd.DataFrame( |
| { |
| "A": 1.0, |
| "B": pd.Timestamp("20130102"), |
| "C": pd.Series(1, index=list(range(4)), dtype="float32"), |
| "D": np.array([3] * 4, dtype="int32"), |
| "E": pd.Categorical(["test", "train", "test", "train"]), |
| "F": "foo", |
| } |
| ) |
| print(df2) |
| ''' |
| A B C D E F |
| 0 1.0 2013-01-02 1.0 3 test foo |
| 1 1.0 2013-01-02 1.0 3 train foo |
| 2 1.0 2013-01-02 1.0 3 test foo |
| 3 1.0 2013-01-02 1.0 3 train foo |
| ''' |
| |
| print(df2.dtypes) |
| ''' |
| A float64 |
| B datetime64[ns] |
| C float32 |
| D int32 |
| E category |
| F object |
| dtype: object |
| ''' |
| |
| print(df2.index) |
| ''' |
| Index([0, 1, 2, 3], dtype='int64') |
| ''' |
| |
| |
| print(df2.columns) |
| ''' |
| Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object') |
| ''' |
| |
| |
| print(df2.values) |
| ''' |
| [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo'] |
| [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo'] |
| [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo'] |
| [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']] |
| ''' |
| |
| |
| print(df2.describe()) |
| ''' |
| A C D |
| count 4.0 4.0 4.0 |
| mean 1.0 1.0 3.0 |
| std 0.0 0.0 0.0 |
| min 1.0 1.0 3.0 |
| 25% 1.0 1.0 3.0 |
| 50% 1.0 1.0 3.0 |
| 75% 1.0 1.0 3.0 |
| max 1.0 1.0 3.0 |
| ''' |
| |
| |
| print(df2.T) |
| ''' |
| 0 ... 3 |
| A 1 ... 1 |
| B 2013-01-02 00:00:00 ... 2013-01-02 00:00:00 |
| C 1 ... 1 |
| D 3 ... 3 |
| E test ... train |
| F foo ... foo |
| |
| [6 rows x 4 columns] |
| ''' |
| |
| |
| |
| print(df2.sort_index(axis=1, ascending=False)) |
| ''' |
| F E D C B A |
| 0 foo test 3 1.0 2013-01-02 1.0 |
| 1 foo train 3 1.0 2013-01-02 1.0 |
| 2 foo test 3 1.0 2013-01-02 1.0 |
| 3 foo train 3 1.0 2013-01-02 1.0 |
| ''' |
| |
| print(df2.sort_index(axis=0, ascending=False)) |
| ''' |
| A B C D E F |
| 3 1.0 2013-01-02 1.0 3 train foo |
| 2 1.0 2013-01-02 1.0 3 test foo |
| 1 1.0 2013-01-02 1.0 3 train foo |
| 0 1.0 2013-01-02 1.0 3 test foo |
| ''' |
| |
| |
| print(df2.sort_values(by='E')) |
| ''' |
| A B C D E F |
| 0 1.0 2013-01-02 1.0 3 test foo |
| 2 1.0 2013-01-02 1.0 3 test foo |
| 1 1.0 2013-01-02 1.0 3 train foo |
| 3 1.0 2013-01-02 1.0 3 train foo |
| ''' |
选择数据
| import numpy as np |
| import pandas as pd |
| |
| |
| dates = pd.date_range("20210101", periods=6) |
| df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD")) |
| print(df) |
| ''' |
| A B C D |
| 2021-01-01 0 1 2 3 |
| 2021-01-02 4 5 6 7 |
| 2021-01-03 8 9 10 11 |
| 2021-01-04 12 13 14 15 |
| 2021-01-05 16 17 18 19 |
| 2021-01-06 20 21 22 23 |
| ''' |
| |
| |
| print(df['A']) |
| print(df.A) |
| ''' |
| 2021-01-01 0 |
| 2021-01-02 4 |
| 2021-01-03 8 |
| 2021-01-04 12 |
| 2021-01-05 16 |
| 2021-01-06 20 |
| Freq: D, Name: A, dtype: int32 |
| ''' |
| |
| |
| print(df[0:3]) |
| print(df['20210102':'20210104']) |
| ''' |
| A B C D |
| 2021-01-01 0 1 2 3 |
| 2021-01-02 4 5 6 7 |
| 2021-01-03 8 9 10 11 |
| A B C D |
| 2021-01-02 4 5 6 7 |
| 2021-01-03 8 9 10 11 |
| 2021-01-04 12 13 14 15 |
| |
| ''' |
| |
| |
| print(df.loc['20210102']) |
| print(df.loc[:,['A','B']]) |
| print(df.loc['20210102', ['A','B']]) |
| ''' |
| A 4 |
| B 5 |
| C 6 |
| D 7 |
| Name: 2021-01-02 00:00:00, dtype: int32 |
| ---------------------------------------- |
| A B |
| 2021-01-01 0 1 |
| 2021-01-02 4 5 |
| 2021-01-03 8 9 |
| 2021-01-04 12 13 |
| 2021-01-05 16 17 |
| 2021-01-06 20 21 |
| --------------------------------------- |
| A 4 |
| B 5 |
| Name: 2021-01-02 00:00:00, dtype: int32 |
| |
| ''' |
| |
| |
| print(df.iloc[3]) |
| ''' |
| A 12 |
| B 13 |
| C 14 |
| D 15 |
| Name: 2021-01-04 00:00:00, dtype: int32 |
| ''' |
| print(df.iloc[3, 1]) |
| |
| print(df.iloc[3:5,0:2]) |
| ''' |
| A B |
| 2021-01-04 12 13 |
| 2021-01-05 16 17 |
| ''' |
| print(df.iloc[[1,2,4],[0,2]]) |
| ''' |
| A C |
| 2021-01-02 4 6 |
| 2021-01-03 8 10 |
| 2021-01-05 16 18 |
| ''' |
| |
| |
| print(df.loc[df.index[:3], ['A', 'C']]) |
| ''' |
| A C |
| 2021-01-01 0 2 |
| 2021-01-02 4 6 |
| 2021-01-03 8 10 |
| ''' |
| |
| |
| print(df[df.A > 0]) |
| ''' |
| A B C D |
| 2021-01-02 4 5 6 7 |
| 2021-01-03 8 9 10 11 |
| 2021-01-04 12 13 14 15 |
| 2021-01-05 16 17 18 19 |
| 2021-01-06 20 21 22 23 |
| ''' |
赋值
| import numpy as np |
| import pandas as pd |
| |
| |
| dates = pd.date_range("20210101", periods=6) |
| df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD")) |
| print(df) |
| ''' |
| A B C D |
| 2021-01-01 0 1 2 3 |
| 2021-01-02 4 5 6 7 |
| 2021-01-03 8 9 10 11 |
| 2021-01-04 12 13 14 15 |
| 2021-01-05 16 17 18 19 |
| 2021-01-06 20 21 22 23 |
| ''' |
| df.iloc[2, 2] = 1111 |
| df.loc['2021-01-03', 'D'] = 2222 |
| df.loc[df.A > 0, 'A'] = 0 |
| df['F'] = np.nan |
| df['G'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20210101', periods=6)) |
| print(df) |
| ''' |
| A B C D F G |
| 2021-01-01 0 1 2 3 NaN 1 |
| 2021-01-02 0 5 6 7 NaN 2 |
| 2021-01-03 0 9 1111 2222 NaN 3 |
| 2021-01-04 0 13 14 15 NaN 4 |
| 2021-01-05 0 17 18 19 NaN 5 |
| 2021-01-06 0 21 22 23 NaN 6 |
| ''' |
处理丢失数据
| import pandas as pd |
| import numpy as np |
| |
| dates = pd.date_range('20210101', periods=6) |
| df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A', 'B', 'C', 'D']) |
| |
| df = df.astype({'B': 'float64', 'C': 'float64'}) |
| df.iloc[0,1] = np.nan |
| df.iloc[1,2] = np.nan |
| print(df) |
| ''' |
| A B C D |
| 2021-01-01 0 NaN 2.0 3 |
| 2021-01-02 4 5.0 NaN 7 |
| 2021-01-03 8 9.0 10.0 11 |
| 2021-01-04 12 13.0 14.0 15 |
| 2021-01-05 16 17.0 18.0 19 |
| 2021-01-06 20 21.0 22.0 23 |
| ''' |
| |
| |
| print(df.dropna(axis=0, how='any')) |
| ''' |
| A B C D |
| 2021-01-03 8 9.0 10.0 11 |
| 2021-01-04 12 13.0 14.0 15 |
| 2021-01-05 16 17.0 18.0 19 |
| 2021-01-06 20 21.0 22.0 23 |
| ''' |
| |
| |
| print(df.dropna(axis=1, how='any')) |
| ''' |
| A D |
| 2021-01-01 0 3 |
| 2021-01-02 4 7 |
| 2021-01-03 8 11 |
| 2021-01-04 12 15 |
| 2021-01-05 16 19 |
| 2021-01-06 20 23 |
| ''' |
| |
| |
| print(df.fillna(value=0)) |
| ''' |
| A B C D |
| 2021-01-01 0 0.0 2.0 3 |
| 2021-01-02 4 5.0 0.0 7 |
| 2021-01-03 8 9.0 10.0 11 |
| 2021-01-04 12 13.0 14.0 15 |
| 2021-01-05 16 17.0 18.0 19 |
| 2021-01-06 20 21.0 22.0 23 |
| ''' |
| |
| |
| print(pd.isnull(df)) |
| ''' |
| A B C D |
| 2021-01-01 False True False False |
| 2021-01-02 False False True False |
| 2021-01-03 False False False False |
| 2021-01-04 False False False False |
| 2021-01-05 False False False False |
| 2021-01-06 False False False False |
| ''' |
| |
| |
| |
| print(np.any(df.isnull())) |
| |
导入导出数据
支持的数据类型:
CSV 推荐使用
HDF5
Excel
Gotchas
| import pandas as pd |
| |
| |
| data = pd.read_csv('student.csv') |
| print(data) |
| |
| |
| data.to_pickle('student.pickle') |
数据合并
concat
| import pandas as pd |
| import numpy as np |
| |
| |
| |
| df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d']) |
| df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d']) |
| df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d']) |
| |
| ''' |
| a b c d |
| 0 0.0 0.0 0.0 0.0 |
| 1 0.0 0.0 0.0 0.0 |
| 2 0.0 0.0 0.0 0.0 |
| ''' |
| |
| ''' |
| a b c d |
| 0 1.0 1.0 1.0 1.0 |
| 1 1.0 1.0 1.0 1.0 |
| 2 1.0 1.0 1.0 1.0 |
| |
| ''' |
| |
| ''' |
| a b c d |
| 0 2.0 2.0 2.0 2.0 |
| 1 2.0 2.0 2.0 2.0 |
| 2 2.0 2.0 2.0 2.0 |
| |
| ''' |
| |
| res = pd.concat([df1, df2, df3], axis=0, ignore_index=True) |
| |
| ''' |
| a b c d |
| 0 0.0 0.0 0.0 0.0 |
| 1 0.0 0.0 0.0 0.0 |
| 2 0.0 0.0 0.0 0.0 |
| 3 1.0 1.0 1.0 1.0 |
| 4 1.0 1.0 1.0 1.0 |
| 5 1.0 1.0 1.0 1.0 |
| 6 2.0 2.0 2.0 2.0 |
| 7 2.0 2.0 2.0 2.0 |
| 8 2.0 2.0 2.0 2.0 |
| ''' |
| |
| |
| df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3]) |
| df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d', 'e'], index=[2,3,4]) |
| print(df1) |
| ''' |
| a b c d |
| 1 0.0 0.0 0.0 0.0 |
| 2 0.0 0.0 0.0 0.0 |
| 3 0.0 0.0 0.0 0.0 |
| ''' |
| |
| print(df2) |
| ''' |
| b c d e |
| 2 1.0 1.0 1.0 1.0 |
| 3 1.0 1.0 1.0 1.0 |
| 4 1.0 1.0 1.0 1.0 |
| ''' |
| |
| |
| res = pd.concat([df1, df2], axis=0, join='outer') |
| print(res) |
| ''' |
| a b c d e |
| 1 0.0 0.0 0.0 0.0 NaN |
| 2 0.0 0.0 0.0 0.0 NaN |
| 3 0.0 0.0 0.0 0.0 NaN |
| 2 NaN 1.0 1.0 1.0 1.0 |
| 3 NaN 1.0 1.0 1.0 1.0 |
| 4 NaN 1.0 1.0 1.0 1.0 |
| ''' |
| |
| res = pd.concat([df1, df2], axis=0, join='inner') |
| print(res) |
| ''' |
| b c d |
| 1 0.0 0.0 0.0 |
| 2 0.0 0.0 0.0 |
| 3 0.0 0.0 0.0 |
| 2 1.0 1.0 1.0 |
| 3 1.0 1.0 1.0 |
| 4 1.0 1.0 1.0 |
| ''' |
| |
| |
| |
| res = pd.concat([df1, df2], axis=1).reindex(df1.index) |
| print(res) |
| ''' |
| a b c d b c d e |
| 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN |
| 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 |
| 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 |
| |
| ''' |
| |
| df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d']) |
| df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d']) |
| df3 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d', 'e'], index=[2,3,4]) |
| res = pd.concat([df1, df2], ignore_index=True) |
| res = pd.concat([df1, df2, df3]) |
| print(res) |
| |
| ''' |
| a b c d e |
| 0 0.0 0.0 0.0 0.0 NaN |
| 1 0.0 0.0 0.0 0.0 NaN |
| 2 0.0 0.0 0.0 0.0 NaN |
| 0 1.0 1.0 1.0 1.0 NaN |
| 1 1.0 1.0 1.0 1.0 NaN |
| 2 1.0 1.0 1.0 1.0 NaN |
| 2 NaN 1.0 1.0 1.0 1.0 |
| 3 NaN 1.0 1.0 1.0 1.0 |
| 4 NaN 1.0 1.0 1.0 1.0 |
| |
| ''' |
| |
| s1 = pd.Series([1,2,3,4], index=['a','b','c','d']) |
| res = pd.concat([df1, s1.to_frame().T], ignore_index=True) |
| print(res) |
| ''' |
| a b c d |
| 0 0.0 0.0 0.0 0.0 |
| 1 0.0 0.0 0.0 0.0 |
| 2 0.0 0.0 0.0 0.0 |
| 3 1.0 2.0 3.0 4.0 |
| ''' |
| |
merge
| import pandas as pd |
| |
| |
| |
| left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], |
| 'A': ['A0', 'A1', 'A2', 'A3'], |
| 'B': ['B0', 'B1', 'B2', 'B3']}) |
| right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], |
| 'C': ['C0', 'C1', 'C2', 'C3'], |
| 'D': ['D0', 'D1', 'D2', 'D3']}) |
| print(left) |
| ''' |
| key A B |
| 0 K0 A0 B0 |
| 1 K1 A1 B1 |
| 2 K2 A2 B2 |
| 3 K3 A3 B3 |
| ''' |
| print(right) |
| ''' |
| key C D |
| 0 K0 C0 D0 |
| 1 K1 C1 D1 |
| 2 K2 C2 D2 |
| 3 K3 C3 D3 |
| ''' |
| |
| res = pd.merge(left, right, on='key') |
| print(res) |
| ''' |
| key A B C D |
| 0 K0 A0 B0 C0 D0 |
| 1 K1 A1 B1 C1 D1 |
| 2 K2 A2 B2 C2 D2 |
| 3 K3 A3 B3 C3 D3 |
| ''' |
| |
| |
| left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], |
| 'key2': ['K0', 'K1', 'K0', 'K1'], |
| 'A': ['A0', 'A1', 'A2', 'A3'], |
| 'B': ['B0', 'B1', 'B2', 'B3']}) |
| right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], |
| 'key2': ['K0', 'K0', 'K0', 'K0'], |
| 'C': ['C0', 'C1', 'C2', 'C3'], |
| 'D': ['D0', 'D1', 'D2', 'D3']}) |
| print(left) |
| ''' |
| key1 key2 A B |
| 0 K0 K0 A0 B0 |
| 1 K0 K1 A1 B1 |
| 2 K1 K0 A2 B2 |
| 3 K2 K1 A3 B3 |
| ''' |
| print(right) |
| ''' |
| key1 key2 C D |
| 0 K0 K0 C0 D0 |
| 1 K1 K0 C1 D1 |
| 2 K1 K0 C2 D2 |
| 3 K2 K0 C3 D3 |
| ''' |
| |
| |
| |
| res = pd.merge(left, right, on=['key1', 'key2'], how='inner') |
| print(res) |
| ''' |
| key1 key2 A B C D |
| 0 K0 K0 A0 B0 C0 D0 |
| 1 K1 K0 A2 B2 C1 D1 |
| 2 K1 K0 A2 B2 C2 D2 |
| ''' |
| |
| |
| |
| res = pd.merge(left, right, on=['key1', 'key2'], how='outer') |
| print(res) |
| ''' |
| key1 key2 A B C D |
| 0 K0 K0 A0 B0 C0 D0 |
| 1 K0 K1 A1 B1 NaN NaN |
| 2 K1 K0 A2 B2 C1 D1 |
| 3 K1 K0 A2 B2 C2 D2 |
| 4 K2 K1 A3 B3 NaN NaN |
| 5 K2 K0 NaN NaN C3 D3 |
| ''' |
| |
| res = pd.merge(left, right, on=['key1', 'key2'], how='left') |
| print(res) |
| ''' |
| key1 key2 A B C D |
| 0 K0 K0 A0 B0 C0 D0 |
| 1 K0 K1 A1 B1 NaN NaN |
| 2 K1 K0 A2 B2 C1 D1 |
| 3 K1 K0 A2 B2 C2 D2 |
| 4 K2 K1 A3 B3 NaN NaN |
| ''' |
| |
| |
| res = pd.merge(left, right, on=['key1', 'key2'], how='right') |
| print(res) |
| ''' |
| key1 key2 A B C D |
| 0 K0 K0 A0 B0 C0 D0 |
| 1 K1 K0 A2 B2 C1 D1 |
| 2 K1 K0 A2 B2 C2 D2 |
| 3 K2 K0 NaN NaN C3 D3 |
| ''' |
| |
| |
| df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']}) |
| df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) |
| print(df1) |
| ''' |
| col1 col_left |
| 0 0 a |
| 1 1 b |
| ''' |
| print(df2) |
| ''' |
| col1 col_right |
| 0 1 2 |
| 1 2 2 |
| 2 2 2 |
| ''' |
| |
| res = pd.merge(df1, df2, on='col1', how='outer', indicator=True) |
| print(res) |
| ''' |
| col1 col_left col_right _merge |
| 0 0 a NaN left_only |
| 1 1 b 2.0 both |
| 2 2 NaN 2.0 right_only |
| 3 2 NaN 2.0 right_only |
| ''' |
| |
| res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') |
| print(res) |
| ''' |
| col1 col_left col_right indicator_column |
| 0 0 a NaN left_only |
| 1 1 b 2.0 both |
| 2 2 NaN 2.0 right_only |
| 3 2 NaN 2.0 right_only |
| ''' |
| |
| |
| left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], |
| 'B': ['B0', 'B1', 'B2']}, |
| index=['K0', 'K1', 'K2']) |
| right = pd.DataFrame({'C': ['C0', 'C2', 'C3'], |
| 'D': ['D0', 'D2', 'D3']}, |
| index=['K0', 'K2', 'K3']) |
| print(left) |
| ''' |
| A B |
| K0 A0 B0 |
| K1 A1 B1 |
| K2 A2 B2 |
| ''' |
| print(right) |
| ''' |
| C D |
| K0 C0 D0 |
| K2 C2 D2 |
| K3 C3 D3 |
| ''' |
| |
| |
| res = pd.merge(left, right, left_index=True, right_index=True, how='outer') |
| print(res) |
| ''' |
| A B C D |
| K0 A0 B0 C0 D0 |
| K1 A1 B1 NaN NaN |
| K2 A2 B2 C2 D2 |
| K3 NaN NaN C3 D3 |
| ''' |
| res = pd.merge(left, right, left_index=True, right_index=True, how='inner') |
| print(res) |
| ''' |
| A B C D |
| K0 A0 B0 C0 D0 |
| K2 A2 B2 C2 D2 |
| |
| ''' |
| |
| |
| |
| |
| boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]}) |
| print(boys) |
| ''' |
| k age |
| 0 K0 1 |
| 1 K1 2 |
| 2 K2 3 |
| ''' |
| girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]}) |
| print(girls) |
| ''' |
| k age |
| 0 K0 4 |
| 1 K0 5 |
| 2 K3 6 |
| ''' |
| res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner') |
| print(res) |
| ''' |
| k age_boy age_girl |
| 0 K0 1 4 |
| 1 K0 1 5 |
| ''' |
| |
| res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer') |
| print(res) |
| ''' |
| k age_boy age_girl |
| 0 K0 1.0 4.0 |
| 1 K0 1.0 5.0 |
| 2 K1 2.0 NaN |
| 3 K2 3.0 NaN |
| 4 K3 NaN 6.0 |
| ''' |
绘图
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| |
| |
| |
| |
| data = pd.Series(np.random.randn(1000), index=np.arange(1000)) |
| |
| data = data.cumsum() |
| data.plot() |
| |
| |
| data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD")) |
| data = data.cumsum() |
| |
| |
| ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1") |
| data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax) |
| |
| plt.show() |


参考资料
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 上周热点回顾(2.24-3.2)