python 基础数据处理(代码)

import numpy as np

# A nested list becomes a two-dimensional array (2 rows by 3 columns).
array = np.array([[1, 2, 3], [2, 3, 4]])
print(array)

[[1 2 3]
 [2 3 4]]

# Show the array followed by its basic attributes.
print(array)
print(f'number of dim: {array.ndim}')  # number of axes
print(f'shape: {array.shape}')  # length of each axis
print(f'size {array.size}')  # total number of elements

[[1 2 3]
 [2 3 4]]
number of dim: 2
shape: (2, 3)
size 6

创建 array

# One-dimensional array built from a plain Python sequence.
a = np.array((2, 23, 4))
print(a)

[ 2 23  4]

# Choose the element type explicitly with `dtype`.
# The np.int / np.float aliases were deprecated in NumPy 1.20 and removed in
# 1.24; pass the builtin int / float (or a sized type such as np.int32,
# np.float64) instead.
b = np.array([2, 23, 4], dtype=int)
print(b.dtype)

int32

# 3x4 matrix filled with 0.0
a = np.zeros(shape=(3, 4))
print(a)
# 3x4 matrix filled with 1.0
a = np.ones(shape=(3, 4))
print(a)
# 3x4 matrix whose memory is allocated but NOT initialised: the values are
# whatever happened to be in that memory, not guaranteed zeros.
a = np.empty(shape=(3, 4), dtype=float)
print(a)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

 a = np.arange(10, 20) #生成从 10 到 20 的数列  arange(start, end, setp) 
print(a)
a = np.arange(12).reshape((3, 4)) #重新定义大小
print(a)

[10 11 12 13 14 15 16 17 18 19]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

# 20 evenly spaced samples covering the closed interval [1, 10]
a = np.linspace(start=1, stop=10, num=20)
print(a)
# The same samples arranged as a 5x4 matrix (not printed)
a = np.linspace(start=1, stop=10, num=20).reshape(5, 4)

[ 1.          1.47368421  1.94736842  2.42105263  2.89473684  3.36842105
  3.84210526  4.31578947  4.78947368  5.26315789  5.73684211  6.21052632
  6.68421053  7.15789474  7.63157895  8.10526316  8.57894737  9.05263158
  9.52631579 10.        ]

# Two length-4 operands for the arithmetic examples that follow.
a = np.array((10, 20, 30, 40))
b = np.arange(0, 4)
print(a, b)

[10 20 30 40] [0 1 2 3]

# Arithmetic operators and ufuncs act element by element: difference, sum,
# square of b, then 10*sin(a) and 10*cos(a).
for c in (a - b, a + b, b ** 2, 10 * np.sin(a), 10 * np.cos(a)):
    print(c)

[10 19 28 37]
[10 21 32 43]
[0 1 4 9]
[-5.44021111  9.12945251 -9.88031624  7.4511316 ]
[-8.39071529  4.08082062  1.5425145  -6.66938062]

# Comparisons are applied element-wise and produce boolean arrays.
print(b, b < 3, b == 3)

[0 1 2 3] [ True  True  True False] [False False False  True]

a = np.array([[1, 1], [0, 1]])
b = np.arange(4).reshape(2, 2)

# `*` multiplies matching elements; np.dot performs the matrix product.
c = a * b
c_dot = np.dot(a, b)

# Operands first, then both products, one per line.
print(a, b, c, c_dot, sep='\n')

[[1 1]
 [0 1]]
[[0 1]
 [2 3]]
[[0 1]
 [0 3]]
[[2 4]
 [2 3]]

# 2x4 matrix of uniform random floats from the half-open interval [0, 1)
a = np.random.random(size=(2, 4))

print(a)
# Reductions over every element: total, smallest, largest
print(np.sum(a), np.min(a), np.max(a), sep='\n')

[[0.42529947 0.17039602 0.19660961 0.46083628]
 [0.94819732 0.24865026 0.94451173 0.08625252]]
3.48075321320014
0.08625252476463197
0.9481973213759178

A = np.arange(2, 14).reshape(3, 4)
print(A)
# Positions (within the flattened array) of the minimum and the maximum
print(np.argmin(A), np.argmax(A), sep='\n')
# Mean as a method and as a function, followed by the median
print(A.mean(), np.mean(A), np.median(A), sep='\n')
print(np.cumsum(A))  # running total over the flattened array
print(np.diff(A))  # difference between neighbouring elements within each row
print(np.sort(A))  # sort every row independently
# Two spellings of the transpose
print(np.transpose(A), A.T, sep='\n')
print(np.clip(A, 5, 10))  # values below 5 become 5, values above 10 become 10
# Mean of each column (axis=0), then mean of each row (axis=1)
print(np.mean(A, axis=0), np.mean(A, axis=1), sep='\n')

[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]
0
11
7.5
7.5
7.5
[ 2  5  9 14 20 27 35 44 54 65 77 90]
[[1 1 1]
 [1 1 1]
 [1 1 1]]
[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]
[[ 2  6 10]
 [ 3  7 11]
 [ 4  8 12]
 [ 5  9 13]]
[[ 2  6 10]
 [ 3  7 11]
 [ 4  8 12]
 [ 5  9 13]]
[[ 5  5  5  5]
 [ 6  7  8  9]
 [10 10 10 10]]
[6. 7. 8. 9.]
[ 3.5  7.5 11.5]

A = np.arange(3, 15)
print(A, A[3], sep='\n')  # the 1-D array, then its element at index 3
A = np.arange(3, 15).reshape(3, 4)
print(A)
print(A[2])  # row with index 2
print(A[2, 1])  # element at row 2, column 1
print(A[:, 1])  # column with index 1, taken from every row

# Iterating an array walks its first axis, i.e. row by row
for row in A:
    print(row)

# Iterate over the transpose to walk column by column
for column in A.T:
    print(column)

print(A.flatten())  # all elements as a one-dimensional array

# A.flat yields the elements one at a time
for item in A.flat:
    print(item)

[ 3  4  5  6  7  8  9 10 11 12 13 14]
6
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
[11 12 13 14]
12
[ 4  8 12]
[3 4 5 6]
[ 7  8  9 10]
[11 12 13 14]
[ 3  7 11]
[ 4  8 12]
[ 5  9 13]
[ 6 10 14]
[ 3  4  5  6  7  8  9 10 11 12 13 14]
3
4
5
6
7
8
9
10
11
12
13
14

A = np.array([1, 1, 1])
B = np.array([2, 2, 2])

C = np.vstack((A, B))  # vertical stack: one input per row
D = np.hstack((A, B))  # horizontal stack: inputs joined end to end
print(D, A.shape, D.shape, sep='\n')
# np.newaxis inserts an axis of length 1.
# In front: shape (3,) becomes (1, 3)
print(A[np.newaxis, :], A[np.newaxis, :].shape, sep='\n')
# At the end: shape (3,) becomes (3, 1)
print(A[:, np.newaxis], A[:, np.newaxis].shape, sep='\n')

[1 1 1 2 2 2]
(3,)
(6,)
[[1 1 1]]
(1, 3)
[[1]
 [1]
 [1]]
(3, 1)

A = np.arange(12).reshape(3, 4)

print(A)
# np.split needs the axis length to be divisible by the section count:
# first 2 column blocks (axis=1), then 3 row blocks (axis=0).
print(np.split(A, 2, axis=1), np.split(A, 3, axis=0), sep='\n')
# np.array_split accepts sections of unequal size (4 columns into 3 parts)
print(np.array_split(A, 3, axis=1))
# Shorthands: vsplit cuts along rows, hsplit cuts along columns
print(np.vsplit(A, 3), np.hsplit(A, 2), sep='\n')

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]

import pandas as pd

# A Series is a labelled one-dimensional column; np.nan marks a missing value.
s = pd.Series(data=[1, 3, 4, np.nan, 44, 1])
print(s)

0     1.0
1     3.0
2     4.0
3     NaN
4    44.0
5     1.0
dtype: float64

# Six consecutive calendar days beginning on 2021-09-22
dates = pd.date_range(start='20210922', periods=6)

print(dates)

DatetimeIndex(['2021-09-22', '2021-09-23', '2021-09-24', '2021-09-25',
               '2021-09-26', '2021-09-27'],
              dtype='datetime64[ns]', freq='D')

# 6x4 frame of standard-normal samples with `dates` as the row index
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('abcd'))

df

	a	b	c	d
2021-09-22	1.032756	-0.529079	-0.161499	-0.436609
2021-09-23	-1.059394	2.122808	-0.519834	-0.822384
2021-09-24	-0.002292	0.119694	-0.749122	1.148803
2021-09-25	0.456498	-1.414832	-0.386781	-2.410006
2021-09-26	0.540748	-1.741398	0.339414	-0.667571
2021-09-27	-0.591441	2.414537	-0.105859	0.755189

# With no index/columns given, both axes get the default labels 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))

print(df1)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11

# Build a frame from a dict: each key is a column name. Scalars are repeated
# for every row, and each column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})

print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo

# Data type of every column
print(df2.dtypes)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Row labels
print(df2.index)

Int64Index([0, 1, 2, 3], dtype='int64')

print(df2.columns) # column names

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

print(df2.describe())  # summary statistics of the numeric columns: count, mean, std, min, quartiles, max
print(df2.T)  # transpose: rows become columns and vice versa

         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
                     0                    1                    2  \
A                    1                    1                    1   
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
C                    1                    1                    1   
D                    3                    3                    3   
E                 test                train                 test   
F                  foo                  foo                  foo   

                     3  
A                    1  
B  2013-01-02 00:00:00  
C                    1  
D                    3  
E                train  
F                  foo

print(df2.sort_index(axis = 1, ascending = False)) # sort by the column labels (axis=1) in descending order

     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0

print(df2.sort_values(by='E'))  # sort the rows by the values in column 'E'

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
3  1.0 2013-01-02  1.0  3  train  foo

# 6x4 frame holding 0..23, indexed by six consecutive days from 2013-01-01
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list('ABCD'))

print(df)

             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23

# Two equivalent ways to pick one column: item access and attribute access
print(df['A'], df.A)
print(df[0:3]) # slice selection: the first three rows

2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int32 2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int32
            A  B   C   D
2013-01-01  0  1   2   3
2013-01-02  4  5   6   7
2013-01-03  8  9  10  11

# Select by label with .loc
print(df.loc['20130102'])  # one row, looked up by its index label
print(df.loc[:, ['A', 'B']])  # every row, columns A and B
print(df.loc['20130102', ['A', 'B']])  # one row, columns A and B
# Select by integer position with .iloc
print(df.iloc[3])  # the row at position 3
# Mixed selection: rows by position, columns by label.
# DataFrame.ix was deprecated and later removed from pandas, so the lookup is
# written as a positional row slice followed by a label-based column pick.
print("mix")
print(df.iloc[:3].loc[:, ['A', 'C']])
# Boolean mask: keep only the rows whose column A exceeds 8
print(df[df.A > 8])

A    4
B    5
C    6
D    7
Name: 2013-01-02 00:00:00, dtype: int32
             A   B
2013-01-01   0   1
2013-01-02   4   5
2013-01-03   8   9
2013-01-04  12  13
2013-01-05  16  17
2013-01-06  20  21
A    4
B    5
Name: 2013-01-02 00:00:00, dtype: int32
A    12
B    13
C    14
D    15
Name: 2013-01-04 00:00:00, dtype: int32
mix
            A   C
2013-01-01  0   2
2013-01-02  4   6
2013-01-03  8  10
             A   B   C   D
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23


C:\Users\dell\Anaconda\lib\site-packages\ipykernel_launcher.py:9: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':

# Assign single cells by position (.iloc) and by label (.loc)
df.iloc[2, 2] = 1111
df.loc['20130101', 'B'] = 2222
# Conditional assignment: set B to 0 wherever A > 4.
# A single .loc call is used instead of the chained form `df.B[mask] = 0`,
# which triggers SettingWithCopyWarning and does not modify `df` when pandas
# copy-on-write is enabled.
df.loc[df.A > 4, 'B'] = 0
# New column filled with NaN
df['F'] = np.nan
# New column from a Series; values are aligned on the date index
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
print(df)

             A     B     C   D   F  E
2013-01-01   0  2222     2   3 NaN  1
2013-01-02   4     5     6   7 NaN  2
2013-01-03   8     0  1111  11 NaN  3
2013-01-04  12     0    14  15 NaN  4
2013-01-05  16     0    18  19 NaN  5
2013-01-06  20     0    22  23 NaN  6

# Introduce two missing values
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# how='any' drops a row/column that contains at least one NaN;
# how='all' would drop it only when every value in it is NaN.
print(
    df.dropna(axis=0, how='any'),  # axis=0: drop rows
    df.dropna(axis=1, how='any'),  # axis=1: drop columns
    sep='\n',
)

Empty DataFrame
Columns: [A, B, C, D, F, E]
Index: []
             A   D  E
2013-01-01   0   3  1
2013-01-02   4   7  2
2013-01-03   8  11  3
2013-01-04  12  15  4
2013-01-05  16  19  5
2013-01-06  20  23  6

print(df.fillna(value = 0)) # replace every NaN with `value`

             A    B       C   D    F  E
2013-01-01   0  0.0     2.0   3  0.0  1
2013-01-02   4  5.0     0.0   7  0.0  2
2013-01-03   8  0.0  1111.0  11  0.0  3
2013-01-04  12  0.0    14.0  15  0.0  4
2013-01-05  16  0.0    18.0  19  0.0  5
2013-01-06  20  0.0    22.0  23  0.0  6

print(df.isnull()) # boolean frame: True where the cell is NaN

                A      B      C      D     F      E
2013-01-01  False   True  False  False  True  False
2013-01-02  False  False   True  False  True  False
2013-01-03  False  False  False  False  True  False
2013-01-04  False  False  False  False  True  False
2013-01-05  False  False  False  False  True  False
2013-01-06  False  False  False  False  True  False

# Single yes/no answer for a large frame: is any cell missing?
# Reducing the underlying boolean array always yields one scalar, and the
# redundant `== True` comparison is dropped.
print(df.isnull().values.any())

True

# Load a CSV file into a DataFrame (the relative path is resolved against the working directory)
data = pd.read_csv('student.csv')
print(data)

    Student ID  name  age  gender
0         1101    df    1  Female
1         1102   sdf    2  Female
2         1103  asdf    3    Male
3         1104    gf    4  Female
4         1105    cb    5    Male
5         1106    ec    6  Female
6         1107     h    7    Male
7         1108   rty    8    Male
8         1109   rty    9    Male
9         1110   hhh   10  Female
10        1111     t   11  Female
11        1112     y   12  Female
12        1113     u   13    Male
13        1114   NaN   14    Male
14        1115     p   15  Female
15        1116     o   16  Female
16        1117     k   17  Female
17        1118     l   18  Female
18        1119     m   19    Male
19        1120     e   20    Male
20        1121    hh   21    Male

# Serialise the frame to disk in pickle format
data.to_pickle('student.pickle')

# Read it back.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
data1 = pd.read_pickle('student.pickle')
print(data1)

    Student ID  name  age  gender
0         1101    df    1  Female
1         1102   sdf    2  Female
2         1103  asdf    3    Male
3         1104    gf    4  Female
4         1105    cb    5    Male
5         1106    ec    6  Female
6         1107     h    7    Male
7         1108   rty    8    Male
8         1109   rty    9    Male
9         1110   hhh   10  Female
10        1111     t   11  Female
11        1112     y   12  Female
12        1113     u   13    Male
13        1114   NaN   14    Male
14        1115     p   15  Female
15        1116     o   16  Female
16        1117     k   17  Female
17        1118     l   18  Female
18        1119     m   19    Male
19        1120     e   20    Male
20        1121    hh   21    Male

# Three 3x4 frames sharing the columns a..d, filled with 0.0, 1.0 and 2.0
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=list('abcd'))
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=list('abcd'))
print(df1, df2, df3, sep='\n')

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

# axis='index' (0) stacks the frames vertically;
# axis='columns' (1) places them side by side.
res = pd.concat([df1, df2, df3], axis='index')
print(res)
res = pd.concat([df1, df2, df3], axis='columns')
print(res)
# ignore_index=True discards the labels along the concatenation axis and
# renumbers them 0..n-1.
res = pd.concat([df1, df2, df3], axis='columns', ignore_index=True)
print(res)
res = pd.concat([df1, df2, df3], axis='index', ignore_index=True)
print(res)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
     a    b    c    d    a    b    c    d    a    b    c    d
0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
    0    1    2    3    4    5    6    7    8    9    10   11
0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0

# Two frames whose columns and row labels only partly overlap
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
print(df2)
# Default join='outer': keep the union of the columns and fill the gaps with
# NaN. sort=False is passed explicitly so the column order is the same on
# every pandas version and no FutureWarning is emitted.
res = pd.concat([df1, df2], sort=False)
print(res)
# join='inner': keep only the columns present in both frames
res = pd.concat([df1, df2], join='inner')
print(res)
# Same, with the row labels renumbered 0..n-1
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)
# Side-by-side concatenation restricted to df1's row labels.
# The `join_axes` argument was removed in pandas 1.0; reindexing the result
# on df1.index is the documented replacement.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)

     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0


C:\Users\dell\Anaconda\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """

# Stack df2 and df3 underneath df1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the replacement. sort=False keeps the columns in order of first
# appearance and avoids the alignment FutureWarning on older versions.
res = pd.concat([df1, df2, df3], sort=False)
print(res)

     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0  NaN
1  2.0  2.0  2.0  2.0  NaN
2  2.0  2.0  2.0  2.0  NaN


C:\Users\dell\Anaconda\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort)

# Add a Series as one extra row of df1.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# DataFrame.append was removed in pandas 2.0. Turn the Series into a one-row
# frame (its index labels become the column names) and concatenate instead;
# ignore_index=True renumbers the rows 0..n-1.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print('df1\n', df1)
print('s1\n', s1)
print(res)

df1
      a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
s1
 a    1
b    2
c    3
d    4
dtype: int64
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0

# Two frames sharing a 'key' column with identical values K0..K3
left = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
print(left, right)

  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3   key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3

res = pd.merge(left, right, on='key') # `on` names the column to join on; rows with equal values in it are combined
print(res)

  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3

left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left, '\n', right)
# Default how='inner': keep only the (key1, key2) pairs found in both frames.
# how='outer' would instead keep every pair from either side, with NaN
# wherever one side has no match.
res = pd.merge(left, right, on=['key1', 'key2'])
print(res)
# how='left': keep every row of `left`; unmatched right-hand columns become NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(f'how == left\n {res}')
# how='right': keep every row of `right`; unmatched left-hand columns become NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(f'how == right\n {res}')

  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3 
   key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
how == left
   key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
how == right
   key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3

res = pd.merge(left, right, on=['key1', 'key2'], how = 'right', indicator = True) # indicator adds a `_merge` column telling which side(s) each row came from
print(res)

  key1 key2    A    B   C   D      _merge
0   K0   K0   A0   B0  C0  D0        both
1   K1   K0   A2   B2  C1  D1        both
2   K1   K0   A2   B2  C2  D2        both
3   K2   K0  NaN  NaN  C3  D3  right_only

import matplotlib.pyplot as plt

# Random walk: running sum of 1000 standard-normal steps
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()

data.plot()
plt.show()

# Four independent columns of 1000 standard-normal samples each
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
)
print(data.head(3))  # first three rows only

          A         B         C         D
0  1.867890 -1.189657  0.055617  2.624250
1  1.211239 -0.961285  1.139995 -2.090965
2  0.597090 -1.287032 -0.622284 -1.484287

# Turn each column into a running sum and draw all columns as lines on one figure
data = data.cumsum()
data.plot()
plt.show()

# Other plot methods available on DataFrame.plot:
# 'bar', hist, box, kde, area, scatter, hexbin, pie
# Draw two scatter series on the same axes: the first call returns the Axes,
# the second call reuses it through `ax=`.
ax = data.plot.scatter(x = 'A', y = 'B', color = 'DarkBlue', label = 'Class1')
data.plot.scatter(x = 'A', y = 'C', color = 'DarkGreen', label = 'Class2', ax = ax)
plt.show()

posted @ 2021-09-26 15:52 owo_owo 阅读(115) 评论(0) 编辑收藏举报

刷新页面返回顶部

lalalala~

python 基础数据处理(代码)

公告