Quick Guide to the Python pandas Module
1. Basic introduction
1.1 Creating a Series
import pandas as pd
import numpy as np
# create a Series
s = pd.Series([1,2,6,np.nan,44,1])
print(s)
# 0 1.0
# 1 2.0
# 2 6.0
# 3 NaN
# 4 44.0
# 5 1.0
# dtype: float64
import pandas as pd
import numpy as np
dates = pd.date_range('20160101',periods=6) # periods=6 generates 6 consecutive dates
print(dates)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
# '2016-01-05', '2016-01-06'],
# dtype='datetime64[ns]', freq='D')
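date_range uses a daily frequency by default; a different step can be requested with the freq parameter (a small sketch, not part of the original example):
# month-end dates instead of daily ones: 2016-01-31, 2016-02-29, ..., 2016-06-30
print(pd.date_range('20160101',periods=6,freq='M'))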
1.2 Creating a DataFrame
Use index and columns to specify the row labels and column labels respectively.
import pandas as pd
import numpy as np
dates = pd.date_range('20160101',periods=6) # periods=6 generates 6 consecutive dates
print(dates)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
# '2016-01-05', '2016-01-06'],
# dtype='datetime64[ns]', freq='D')
# define a DataFrame whose data is np.random.randn(6,4), i.e. 6 rows and 4 columns
# the row index is index=dates, the dates generated above
# the column index is columns=['a','b','c','d']
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
# a b c d
# 2016-01-01 1.022074 0.569502 -0.100940 0.115481
# 2016-01-02 -0.297564 0.323012 -0.006327 0.397941
# 2016-01-03 -0.153181 -1.084450 1.296298 0.004898
# 2016-01-04 -0.133755 0.917742 -0.526164 -0.510440
# 2016-01-05 -0.452221 0.527037 1.007127 -1.242781
# 2016-01-06 -0.599568 -0.102234 0.498403 1.041024
When no row or column labels are specified, default integer labels are used:
import pandas as pd
import numpy as np
# use the default row and column labels
new_df = pd.DataFrame(np.arange(12).reshape(3,4))
print(new_df)
# 0 1 2 3
# 0 0 1 2 3
# 1 4 5 6 7
# 2 8 9 10 11
Creating a DataFrame from a dictionary
import pandas as pd
import numpy as np
# create a DataFrame from a dictionary
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200707'),
'C':pd.Series(1,index=list(range(4))),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2)
#      A          B  C  D      E    F
# 0 1.0 2020-07-07 1 3 test foo
# 1 1.0 2020-07-07 1 3 train foo
# 2 1.0 2020-07-07 1 3 test foo
# 3 1.0 2020-07-07 1 3 train foo
1.3 Some DataFrame attributes
dtypes returns the data type of each column
import pandas as pd
import numpy as np
# create a DataFrame from a dictionary
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200707'),
'C':pd.Series(1,index=list(range(4))),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2.dtypes)
# A float64
# B datetime64[ns]
# C int64
# D int32
# E category
# F object
# dtype: object
index returns all row labels; columns returns all column labels
import pandas as pd
import numpy as np
# create a DataFrame from a dictionary
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200707'),
'C':pd.Series(1,index=list(range(4))),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2.index)
# Int64Index([0, 1, 2, 3], dtype='int64')
print(df2.columns)
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
values returns all the values as a NumPy array
import pandas as pd
import numpy as np
# create a DataFrame from a dictionary
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200707'),
'C':pd.Series(1,index=list(range(4))),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2.values)
# [[1.0 Timestamp('2020-07-07 00:00:00') 1 3 'test' 'foo']
# [1.0 Timestamp('2020-07-07 00:00:00') 1 3 'train' 'foo']
# [1.0 Timestamp('2020-07-07 00:00:00') 1 3 'test' 'foo']
# [1.0 Timestamp('2020-07-07 00:00:00') 1 3 'train' 'foo']]
1.4 describe
describe() returns summary statistics for each numeric column: count, mean, standard deviation, min, max, and the quartiles.
import pandas as pd
import numpy as np
# describe() summarizes numeric columns only, so reuse the numeric DataFrame from section 1.2
dates = pd.date_range('20160101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df.describe())
# a b c d
# count 6.000000 6.000000 6.000000 6.000000
# mean -0.017155 0.384125 0.130974 0.212341
# std 1.873479 0.688730 1.259808 1.516203
# min -2.103500 -0.258377 -1.265493 -0.993860
# 25% -1.552483 -0.107568 -1.019783 -0.796689
# 50% -0.200061 0.141639 0.340015 -0.327878
# 75% 1.420934 0.790088 1.224424 0.568211
# max 2.442717 1.464250 1.325128 3.009710
1.5 Transposing
Use T to transpose the DataFrame:
import pandas as pd
import numpy as np
# create a DataFrame from a dictionary
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200707'),
'C':pd.Series(1,index=list(range(4))),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2.T)
# 0 ... 3
# A 1 ... 1
# B 2020-07-07 00:00:00 ... 2020-07-07 00:00:00
# C 1 ... 1
# D 3 ... 3
# E test ... train
# F foo ... foo
#
# [6 rows x 4 columns]
1.6 Sorting
1.6.1 Sorting by index
print(df2.sort_index(axis=1,ascending=False)) # sort by column labels in descending order
# F E D C B A
# 0 foo test 3 1 2020-07-07 1.0
# 1 foo train 3 1 2020-07-07 1.0
# 2 foo test 3 1 2020-07-07 1.0
# 3 foo train 3 1 2020-07-07 1.0
print(df2.sort_index(axis=0,ascending=False)) # sort by row labels (index) in descending order
# A B C D E F
# 3 1.0 2020-07-07 1 3 train foo
# 2 1.0 2020-07-07 1 3 test foo
# 1 1.0 2020-07-07 1 3 train foo
# 0 1.0 2020-07-07 1 3 test foo
1.6.2 Sorting by values
# sort by values
print(df2.sort_values(by='E')) # by specifies which column to sort on
# A B C D E F
# 0 1.0 2020-07-07 1 3 test foo
# 2 1.0 2020-07-07 1 3 test foo
# 1 1.0 2020-07-07 1 3 train foo
# 3 1.0 2020-07-07 1 3 train foo
2. Selecting data with pandas
2.1 Selecting a column by name
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select a single column
print(df['A'])
# 2020-07-07 0
# 2020-07-08 4
# 2020-07-09 8
# 2020-07-10 12
# 2020-07-11 16
# 2020-07-12 20
# Freq: D, Name: A, dtype: int32
# select a single column; equivalent to df['A']
print(df.A)
# 2020-07-07 0
# 2020-07-08 4
# 2020-07-09 8
# 2020-07-10 12
# 2020-07-11 16
# 2020-07-12 20
# Freq: D, Name: A, dtype: int32
2.2 Selecting by slicing
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select rows by slicing
# select the first three rows (positions 0, 1 and 2)
print(df[0:3])
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# select the rows from 2020-07-10 through 2020-07-12 (label slices include both endpoints)
print(df['2020-07-10':'2020-07-12'])
# A B C D
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
2.3 select by label: loc
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select a row by label with loc
print(df.loc['2020-07-10'])
# A 12
# B 13
# C 14
# D 15
# Name: 2020-07-10 00:00:00, dtype: int32
# keep all rows and pick out only columns A and C
print(df.loc[:,['A','C']])
# A C
# 2020-07-07 0 2
# 2020-07-08 4 6
# 2020-07-09 8 10
# 2020-07-10 12 14
# 2020-07-11 16 18
# 2020-07-12 20 22
# select columns A, B and C of the row 20200709
print(df.loc['20200709',['A','B','C']])
# A 8
# B 9
# C 10
2.4 select by position: iloc
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select by position: iloc
# select the row at position 3 (the fourth row)
print(df.iloc[3])
# A 12
# B 13
# C 14
# D 15
# Name: 2020-07-10 00:00:00, dtype: int32
# select the rows at positions 2 and 3 (the end of the slice is exclusive)
print(df.iloc[2:4])
# A B C D
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# select the rows at positions 2 to 4 and the columns at positions 0 and 3
print(df.iloc[2:5,[0,3]])
# A D
# 2020-07-09 8 11
# 2020-07-10 12 15
# 2020-07-11 16 19
# select rows 1, 3 and 5 and the columns at positions 1 to 2
print(df.iloc[[1,3,5],1:3])
# B C
# 2020-07-08 5 6
# 2020-07-10 13 14
# 2020-07-12 21 22
2.5 mixed selection: ix
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# mixed selection: ix
# select the first three rows and columns A and C
print(df.ix[:3,['A','C']])
Unfortunately, though, ix has since been deprecated (and removed in newer pandas versions)!
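A minimal sketch of the modern replacement, using the same df as above: iloc handles the positional part and loc handles the label part.
# equivalent of the old ix mixed selection
print(df.iloc[:3,[0,2]]) # first three rows, columns A and C by position
print(df.loc[df.index[:3],['A','C']]) # the same selection expressed purely with labels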
2.6 Boolean indexing
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# Boolean indexing
# select the rows where column A is greater than 8
print(df[df.A>8])
# A B C D
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
3. Setting values in pandas
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select an element with iloc and modify its value
df.iloc[3,1] = 1133
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 1133 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select an element with loc and modify its value
df.loc['20200709','B'] = 222
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 222 10 11
# 2020-07-10 12 1133 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
# select elements by Boolean indexing and assign to them
# set every element of column A that is greater than 8 to 100
# (df.loc[df.A > 8, 'A'] = 100 is the recommended form; the chained indexing below may trigger a SettingWithCopyWarning)
df.A[df.A>8] = 100
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 222 10 11
# 2020-07-10 100 1133 14 15
# 2020-07-11 100 17 18 19
# 2020-07-12 100 21 22 23
# assign a scalar to a whole column; F does not exist yet, so a new column filled with NaN is created
df['F'] = np.nan
print(df)
# A B C D F
# 2020-07-07 0 1 2 3 NaN
# 2020-07-08 4 5 6 7 NaN
# 2020-07-09 8 222 10 11 NaN
# 2020-07-10 100 1133 14 15 NaN
# 2020-07-11 100 17 18 19 NaN
# 2020-07-12 100 21 22 23 NaN
# add a new column from a Series (aligned on the index)
df['E'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20200707',periods=6))
print(df)
# A B C D F E
# 2020-07-07 0 1 2 3 NaN 1
# 2020-07-08 4 5 6 7 NaN 2
# 2020-07-09 8 222 10 11 NaN 3
# 2020-07-10 100 1133 14 15 NaN 4
# 2020-07-11 100 17 18 19 NaN 5
# 2020-07-12 100 21 22 23 NaN 6
4. Handling missing data in pandas
4.1 Dropping NaN data
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
# when NaN values are present, drop the affected rows
# axis = 0 drops rows, axis = 1 drops columns
# how={'any','all'}: 'all' drops a row only when every value in it is NaN; 'any' drops a row as soon as it contains a single NaN
print(df.dropna(axis=0,how='any'))
# A B C D
# 2020-07-09 8 9.0 10.0 11
# 2020-07-10 12 13.0 14.0 15
# 2020-07-11 16 17.0 18.0 19
# 2020-07-12 20 21.0 22.0 23
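For comparison, a quick sketch of the other dropna options on the same df (these lines are not from the original post):
# how='all' keeps every row here, because no row is entirely NaN
print(df.dropna(axis=0,how='all'))
# axis=1 drops the columns that contain a NaN instead (here columns B and C)
print(df.dropna(axis=1,how='any'))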
4.2 Filling NaN data
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
# replace every NaN value with 0
print(df.fillna(value=0))
# A B C D
# 2020-07-07 0 0.0 2.0 3
# 2020-07-08 4 5.0 0.0 7
# 2020-07-09 8 9.0 10.0 11
# 2020-07-10 12 13.0 14.0 15
# 2020-07-11 16 17.0 18.0 19
# 2020-07-12 20 21.0 22.0 23
4.3 Checking for NaN data
import pandas as pd
import numpy as np
dates = pd.date_range('20200707',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df)
# A B C D
# 2020-07-07 0 1 2 3
# 2020-07-08 4 5 6 7
# 2020-07-09 8 9 10 11
# 2020-07-10 12 13 14 15
# 2020-07-11 16 17 18 19
# 2020-07-12 20 21 22 23
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
# check which entries are NaN
print(df.isnull())
#                 A      B      C      D
# 2020-07-07 False True False False
# 2020-07-08 False False True False
# 2020-07-09 False False False False
# 2020-07-10 False False False False
# 2020-07-11 False False False False
# 2020-07-12 False False False False
# returns True if at least one value in the data is NaN
print(np.any(df.isnull()) == True) # True
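A related check worth knowing is counting the NaN values per column; a small sketch on the same df:
# count the NaN values in each column
print(df.isnull().sum())
# A    0
# B    1
# C    1
# D    0
# dtype: int64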
5. Importing and exporting with pandas
Format Type | Data Description | Reader | Writer |
---|---|---|---|
text | CSV | read_csv | to_csv |
text | Fixed-Width Text File | read_fwf | |
text | JSON | read_json | to_json |
text | HTML | read_html | to_html |
text | Local clipboard | read_clipboard | to_clipboard |
binary | MS Excel | read_excel | to_excel |
binary | OpenDocument | read_excel | |
binary | HDF5 Format | read_hdf | to_hdf |
binary | Feather Format | read_feather | to_feather |
binary | Parquet Format | read_parquet | to_parquet |
binary | ORC Format | read_orc | |
binary | Msgpack | read_msgpack | to_msgpack |
binary | Stata | read_stata | to_stata |
binary | SAS | read_sas | |
binary | SPSS | read_spss | |
binary | Python Pickle Format | read_pickle | to_pickle |
SQL | SQL | read_sql | to_sql |
SQL | Google BigQuery | read_gbq | to_gbq |
When processing or storing data that we later want to open in Excel, we generally use the CSV format.
# import a file
import pandas as pd
data = pd.read_csv('F:/student.csv')
print(data)
# (the original post showed a screenshot of the printed student.csv contents here)
# import a file
import pandas as pd
data = pd.read_csv('F:/student.csv')
# export to a file
data.to_pickle('D:/student.pickle')
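Since CSV is the usual interchange format, writing data back out works the same way; a minimal sketch (the output path is just an example):
# export back to CSV; index=False skips writing the row index as an extra column (path is hypothetical)
data.to_csv('D:/student_copy.csv',index=False)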
6. Merging with pandas
6.1 Merging with concat
concatenating
import pandas as pd
import numpy as np
# concatenating
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
print(df2)
# a b c d
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
print(df3)
# concatenate
# axis = 0 stacks the rows vertically, axis = 1 concatenates the columns side by side
res = pd.concat([df1,df2,df3],axis=0)
print(res)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
# 0 2.0 2.0 2.0 2.0
# 1 2.0 2.0 2.0 2.0
# 2 2.0 2.0 2.0 2.0
# ignore_index=True rebuilds a continuous index for the result
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0
Parameter: join, ['inner','outer']
import pandas as pd
import numpy as np
# join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
print(df1)
# a b c d
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0
print(df2)
# b c d e
# 2 1.0 1.0 1.0 1.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# the default join mode is 'outer': keep all columns and fill missing values with NaN
res = pd.concat([df1,df2],join='outer')
print(res)
# a b c d e
# 1 0.0 0.0 0.0 0.0 NaN
# 2 0.0 0.0 0.0 0.0 NaN
# 3 0.0 0.0 0.0 0.0 NaN
# 2 NaN 1.0 1.0 1.0 1.0
# 3 NaN 1.0 1.0 1.0 1.0
# 4 NaN 1.0 1.0 1.0 1.0
# 'inner' keeps only the columns that both DataFrames share
res = pd.concat([df1,df2],join='inner')
print(res)
# b c d
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 0.0 0.0 0.0
# 2 1.0 1.0 1.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0
res = pd.concat([df1,df2],join='inner',ignore_index=True)
print(res)
# b c d
# 0 0.0 0.0 0.0
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0
# 5 1.0 1.0 1.0
Parameter: join_axes (deprecated!)
import numpy as np
import pandas as pd
# join_axes
df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
print(df1)
# a b c d
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0
res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])
print(res)
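Because join_axes has been removed from recent pandas versions, the call above fails there; a minimal sketch of the modern equivalent uses reindex on the concatenated result instead:
# keep only df1's index after a column-wise concat (replacement for join_axes)
res = pd.concat([df1,df2],axis=1).reindex(df1.index)
print(res)
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0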
Method: append
import numpy as np
import pandas as pd
# append adds rows to the bottom of a DataFrame
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'],index=[2,3,4])
res = df1.append(df2,ignore_index=True)
print(res)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
res = df1.append([df2,df3])
print(res)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
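Note that DataFrame.append has since been deprecated (and removed in pandas 2.0); pd.concat gives the same result, for example:
# pd.concat equivalent of df1.append(df2,ignore_index=True)
res = pd.concat([df1,df2],ignore_index=True)
print(res)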
6.2 Merging with merge
import pandas as pd
df1 = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
df2 = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(df1)
# key A B
# 0 K0 A0 B0
# 1 K1 A1 B1
# 2 K2 A2 B2
# 3 K3 A3 B3
print(df2)
# key C D
# 0 K0 C0 D0
# 1 K1 C1 D1
# 2 K2 C2 D2
# 3 K3 C3 D3
# merge on the 'key' column
res = pd.merge(df1,df2,on='key')
print(res)
# key A B C D
# 0 K0 A0 B0 C0 D0
# 1 K1 A1 B1 C1 D1
# 2 K2 A2 B2 C2 D2
# 3 K3 A3 B3 C3 D3
When the two DataFrames are merged on keys, how can take four values: how = ['outer','inner','left','right']
outer : keep every key combination from either side, filling missing values with NaN
inner : keep only the key combinations that appear in both DataFrames
left : merge based on the keys of the left DataFrame
right : merge based on the keys of the right DataFrame
import pandas as pd
# consider two keys
df1 = pd.DataFrame({'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
df2 = pd.DataFrame({'key1':['K0','K1','K1','K2'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(df1)
print(df2)
# key1 key2 A B
# 0 K0 K0 A0 B0
# 1 K0 K1 A1 B1
# 2 K1 K0 A2 B2
# 3 K2 K1 A3 B3
# key1 key2 C D
# 0 K0 K0 C0 D0
# 1 K1 K0 C1 D1
# 2 K1 K0 C2 D2
# 3 K2 K0 C3 D3
# the default merge mode is how='inner'
res = pd.merge(df1,df2,on=['key1','key2'])
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K1 K0 A2 B2 C1 D1
# 2 K1 K0 A2 B2 C2 D2
# merge with how='outer'
res = pd.merge(df1,df2,on=['key1','key2'],how='outer')
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K0 K1 A1 B1 NaN NaN
# 2 K1 K0 A2 B2 C1 D1
# 3 K1 K0 A2 B2 C2 D2
# 4 K2 K1 A3 B3 NaN NaN
# 5 K2 K0 NaN NaN C3 D3
# merge with how='left'
res = pd.merge(df1,df2,on=['key1','key2'],how='left')
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K0 K1 A1 B1 NaN NaN
# 2 K1 K0 A2 B2 C1 D1
# 3 K1 K0 A2 B2 C2 D2
# 4 K2 K1 A3 B3 NaN NaN
# merge with how='right'
res = pd.merge(df1,df2,on=['key1','key2'],how='right')
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K1 K0 A2 B2 C1 D1
# 2 K1 K0 A2 B2 C2 D2
# 3 K2 K0 NaN NaN C3 D3
Parameter: indicator
import pandas as pd
# indicator adds a column showing whether each key appears only in the left DataFrame, only in the right one, or in both
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
# col1 col_left
# 0 0 a
# 1 1 b
print(df2)
# col1 col_right
# 0 1 2
# 1 2 2
# 2 2 2
res = pd.merge(df1,df2,on='col1',how='outer',indicator=True)
print(res)
# col1 col_left col_right _merge
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
# give the indicator column a custom name
res = pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_column')
print(res)
# col1 col_left col_right indicator_column
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
Merge by index
import pandas as pd
# merged by index
df1 = pd.DataFrame({'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['K0','K1','K2'])
df2 = pd.DataFrame({'C':['C0','C2','C3'],
'D':['D0','D2','D3']},
index=['K0','K2','K3'])
print(df1)
print(df2)
res = pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K1 A1 B1 NaN NaN
# K2 A2 B2 C2 D2
# K3 NaN NaN C3 D3
res = pd.merge(df1,df2,left_index=True,right_index=True,how='inner')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K2 A2 B2 C2 D2
Handling overlapping column names
The suffixes parameter appends suffixes to distinguish columns that share the same name:
import pandas as pd
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})
print(boys)
# k age
# 0 K0 1
# 1 K1 2
# 2 K2 3
print(girls)
# k age
# 0 K0 4
# 1 K0 5
# 2 K3 6
# suffixes
new_res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(new_res)
# k age_boy age_girl
# 0 K0 1 4
# 1 K0 1 5
7. Plotting with pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# plot data
# Series
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# DataFrame
data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
data = data.cumsum()
print(data.head()) # print the first 5 rows
# A B C D
# 0 0.933066 -2.517019 -0.857727 -0.089567
# 1 -0.816617 -1.805794 0.239254 -0.797639
# 2 -2.233603 -1.963204 -0.538308 0.058029
# 3 -0.767217 -2.666023 -0.795966 -0.321899
# 4 -0.333429 -3.775623 0.171886 -1.584609
data.plot()
plt.show()
Plot methods (only scatter is demonstrated below; a quick hist sketch follows this list):
- bar
- hist
- box
- kde
- area
- scatter
- hexbin
- pie
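The other methods are all called through data.plot.<name>(); as a quick sketch (not from the original post), a histogram of one column:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# same kind of data as above: cumulative sums of random values
data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
data = data.cumsum()
# histogram of column A; bins controls how fine-grained the bars are
data['A'].plot.hist(bins=30)
plt.show()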
scatter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# DataFrame
data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
data = data.cumsum()
print(data.head()) # print the first 5 rows
# A B C D
# 0 0.933066 -2.517019 -0.857727 -0.089567
# 1 -0.816617 -1.805794 0.239254 -0.797639
# 2 -2.233603 -1.963204 -0.538308 0.058029
# 3 -0.767217 -2.666023 -0.795966 -0.321899
# 4 -0.333429 -3.775623 0.171886 -1.584609
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label="Class 1")
# ax=ax makes this second scatter plot draw onto the same axes as the first
data.plot.scatter(x='A',y='C',color='Pink',label='Class 2',ax=ax)
plt.show()