Pandas 学习
核心数据结构
DataFrame
import numpy as np
data = np.random.normal(0 ,1 ,(10 ,5 ))
data
array([[ 1.21218494, -0.30678674, -0.66781485, 0.13414137, 0.79799153] ,
[-1.35105539, 1.26378017, 0.04932303, -0.93626636, -0.1161889 ] ,
[-0.01031587, -0.5311115 , 0.17573352, 1.44961339, 0.9510395 ] ,
[-0.38485007, -0.11271772, 0.86289105, -0.7046349 , -0.67357859] ,
[-0.23342328, -1.15661425, 1.729237 , -1.1405832 , -0.06817234] ,
[-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989] ,
[-0.02421679, 0.45015635, -0.57010218, 1.00306539, 0.49455672] ,
[ 1.12410909, -0.59830918, -0.96323314, 1.2907916 , 1.0353455 ] ,
[-1.5828247 , -0.60952617, 1.10117806, 1.57818777, 1.69498839] ,
[-0.02737215, 0.51650423, -0.12322063, 1.84409656, -1.20511615] ])
import pandas as pd
pd.DataFrame(data)
0 1 2 3 4 0 1.212185 -0.306787 -0.667815 0.134141 0.797992 1 -1.351055 1.263780 0.049323 -0.936266 -0.116189 2 -0.010316 -0.531111 0.175734 1.449613 0.951039 3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579 4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172 5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420 6 -0.024217 0.450156 -0.570102 1.003065 0.494557 7 1.124109 -0.598309 -0.963233 1.290792 1.035346 8 -1.582825 -0.609526 1.101178 1.578188 1.694988 9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
num = ["num{}" .format (i) for i in range (10 )]
pd.DataFrame(data,index=num)
0 1 2 3 4 num0 1.212185 -0.306787 -0.667815 0.134141 0.797992 num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189 num2 -0.010316 -0.531111 0.175734 1.449613 0.951039 num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579 num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172 num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420 num6 -0.024217 0.450156 -0.570102 1.003065 0.494557 num7 1.124109 -0.598309 -0.963233 1.290792 1.035346 num8 -1.582825 -0.609526 1.101178 1.578188 1.694988 num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
num2 = ["index{}" .format (i) for i in range (5 )]
data = pd.DataFrame(data,index=num,columns=num2)
data
index0 index1 index2 index3 index4 num0 1.212185 -0.306787 -0.667815 0.134141 0.797992 num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189 num2 -0.010316 -0.531111 0.175734 1.449613 0.951039 num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579 num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172 num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420 num6 -0.024217 0.450156 -0.570102 1.003065 0.494557 num7 1.124109 -0.598309 -0.963233 1.290792 1.035346 num8 -1.582825 -0.609526 1.101178 1.578188 1.694988 num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
属性
data.shape
(10 , 5 )
data.index
Index (['num0' , 'num1' , 'num2' , 'num3' , 'num4' , 'num5' , 'num6' , 'num7' , 'num8' ,
'num9' ],
dtype='object' )
data.columns
Index (['index0' , 'index1' , 'index2' , 'index3' , 'index4' ], dtype='object' )
data.values
array([[ 1.21218494, -0.30678674, -0.66781485, 0.13414137, 0.79799153] ,
[-1.35105539, 1.26378017, 0.04932303, -0.93626636, -0.1161889 ] ,
[-0.01031587, -0.5311115 , 0.17573352, 1.44961339, 0.9510395 ] ,
[-0.38485007, -0.11271772, 0.86289105, -0.7046349 , -0.67357859] ,
[-0.23342328, -1.15661425, 1.729237 , -1.1405832 , -0.06817234] ,
[-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989] ,
[-0.02421679, 0.45015635, -0.57010218, 1.00306539, 0.49455672] ,
[ 1.12410909, -0.59830918, -0.96323314, 1.2907916 , 1.0353455 ] ,
[-1.5828247 , -0.60952617, 1.10117806, 1.57818777, 1.69498839] ,
[-0.02737215, 0.51650423, -0.12322063, 1.84409656, -1.20511615] ])
data.T
num0 num1 num2 num3 num4 num5 num6 num7 num8 num9 index0 1.212185 -1.351055 -0.010316 -0.384850 -0.233423 -1.129472 -0.024217 1.124109 -1.582825 -0.027372 index1 -0.306787 1.263780 -0.531111 -0.112718 -1.156614 -0.281252 0.450156 -0.598309 -0.609526 0.516504 index2 -0.667815 0.049323 0.175734 0.862891 1.729237 -0.710164 -0.570102 -0.963233 1.101178 -0.123221 index3 0.134141 -0.936266 1.449613 -0.704635 -1.140583 -0.226608 1.003065 1.290792 1.578188 1.844097 index4 0.797992 -0.116189 0.951039 -0.673579 -0.068172 -1.885420 0.494557 1.035346 1.694988 -1.205116
data.head()
index0 index1 index2 index3 index4 num0 1.212185 -0.306787 -0.667815 0.134141 0.797992 num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189 num2 -0.010316 -0.531111 0.175734 1.449613 0.951039 num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579 num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172
data.tail()
index0 index1 index2 index3 index4 num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420 num6 -0.024217 0.450156 -0.570102 1.003065 0.494557 num7 1.124109 -0.598309 -0.963233 1.290792 1.035346 num8 -1.582825 -0.609526 1.101178 1.578188 1.694988 num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
data.reset_index()
index index0 index1 index2 index3 index4 0 num0 1.212185 -0.306787 -0.667815 0.134141 0.797992 1 num1 -1.351055 1.263780 0.049323 -0.936266 -0.116189 2 num2 -0.010316 -0.531111 0.175734 1.449613 0.951039 3 num3 -0.384850 -0.112718 0.862891 -0.704635 -0.673579 4 num4 -0.233423 -1.156614 1.729237 -1.140583 -0.068172 5 num5 -1.129472 -0.281252 -0.710164 -0.226608 -1.885420 6 num6 -0.024217 0.450156 -0.570102 1.003065 0.494557 7 num7 1.124109 -0.598309 -0.963233 1.290792 1.035346 8 num8 -1.582825 -0.609526 1.101178 1.578188 1.694988 9 num9 -0.027372 0.516504 -0.123221 1.844097 -1.205116
df = pd.DataFrame({'month' : [1 , 4 , 7 , 10 ],
'year' : [2012 , 2014 , 2013 , 2014 ],
'sale' :[55 , 40 , 84 , 31 ]})
df
month year sale 0 1 2012 55 1 4 2014 40 2 7 2013 84 3 10 2014 31
df.set_index("month" , drop=True )
year sale month 1 2012 55 4 2014 40 7 2013 84 10 2014 31
new_df = df.set_index(["year" , "month" ])
new_df.index
MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]] ,
labels=[[0, 2, 1, 2], [0, 1, 2, 3]] ,
names=['year' , 'month' ])
new_df.index.names
FrozenList(['year' , 'month' ] )
new_df.index.levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]] )
Panel
p = pd.Panel(np.arange(24 ).reshape(4 ,3 ,2 ),
items=list ('ABCD' ),
major_axis=pd.date_range('20130101' , periods=3 ),
minor_axis=['first' , 'second' ])
p
C:\Users\28599 \AppData\Roaming\Python\Python37\site-packages\IPython\core\interactiveshell.py:3418 : FutureWarning:
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3 -dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http:
Pandas provides a `.to_xarray()` method to help automate this conversion.
exec(code_obj, self.user_global_ns, self.user_ns)
<class 'pandas.core.panel.Panel' >
Dimensions: 4 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: A to D
Major_axis axis: 2013 -01 -01 00 :00 :00 to 2013 -01 -03 00 :00 :00
Minor_axis axis: first to second
p["A" ]
first second 2013-01-01 0 1 2013-01-02 2 3 2013-01-03 4 5
p.major_xs("2013-01-03" )
A B C D first 4 10 16 22 second 5 11 17 23
p.minor_xs("first" )
A B C D 2013-01-01 0 6 12 18 2013-01-02 2 8 14 20 2013-01-03 4 10 16 22
Series
sr = data.iloc[1 ,:]
sr
Name: num1, dtype: float64
sr.index
Index (['index0' , 'index1' , 'index2' , 'index3' , 'index4' ], dtype='object' )
sr.values
array ([-1.35105539 , 1.26378017 , 0.04932303 , -0.93626636 , -0.1161889 ])
pd.Series(np.arange(10 ))
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
pd.Series(np.arange(3 , 9 , 2 ), index=["a" , "b" , "c" ])
a 3
b 5
c 7
dtype: int32
基本数据操作
data = pd.read_csv("./stock_day.csv" )
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
data = data.drop(["ma5" ,"ma10" ,"ma20" ,"v_ma5" ,"v_ma10" ,"v_ma20" ], axis=1 )
data.head()
open high close low volume price_change p_change turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
data["open" ]["2018-02-26" ]
22.8
data.loc["2018-02-26" ]["open" ]
22.8
data.iloc[1 ,0 ]
22.8
data.loc[data.index[0 :4 ], ['open' , 'close' , 'high' , 'low' ]]
open close high low 2018-02-27 23.53 24.16 25.88 23.53 2018-02-26 22.80 23.53 23.78 22.80 2018-02-23 22.88 22.82 23.37 22.71 2018-02-22 22.25 22.28 22.76 22.02
data.open =100
data.head()
open high close low volume price_change p_change turnover 2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53 2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32 2018-02-22 100 22.76 22.28 22.02 36105.01 0.36 1.64 0.90 2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
data.sort_values(by=['high' ,'p_change' ],ascending=True ).head()
open high close low volume price_change p_change turnover 2015-03-02 100 12.67 12.52 12.20 96291.73 0.32 2.62 3.30 2015-03-04 100 12.92 12.90 12.61 67075.44 0.20 1.57 2.30 2015-03-03 100 13.06 12.70 12.52 139071.61 0.18 1.44 4.76 2015-09-07 100 13.38 12.77 12.63 52490.04 0.37 2.98 1.80 2015-03-05 100 13.45 13.16 12.87 93180.39 0.26 2.02 3.19
data.sort_index().head()
open high close low volume price_change p_change turnover 2015-03-02 100 12.67 12.52 12.20 96291.73 0.32 2.62 3.30 2015-03-03 100 13.06 12.70 12.52 139071.61 0.18 1.44 4.76 2015-03-04 100 12.92 12.90 12.61 67075.44 0.20 1.57 2.30 2015-03-05 100 13.45 13.16 12.87 93180.39 0.26 2.02 3.19 2015-03-06 100 14.48 14.28 13.13 179831.72 1.12 8.51 6.16
运算
data["open" ].sub(3 ).head()
2018-02-27 97
2018-02-26 97
2018-02-23 97
2018-02-22 97
2018-02-14 97
Name: open, dtype: int64
data[data['p_change' ] > 2 ].head()
open high close low volume price_change p_change turnover 2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53 2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32 2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58 2018-02-12 100 21.40 21.19 20.63 32445.39 0.82 4.03 0.81
data[(data["p_change" ] > 2 ) & (data["low" ] > 15 )].head()
open high close low volume price_change p_change turnover 2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53 2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32 2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58 2018-02-12 100 21.40 21.19 20.63 32445.39 0.82 4.03 0.81
data.query("p_change > 2 & low > 15" ).head()
open high close low volume price_change p_change turnover 2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2018-02-26 100 23.78 23.53 22.80 60985.11 0.69 3.02 1.53 2018-02-23 100 23.37 22.82 22.71 52914.01 0.54 2.42 1.32 2018-02-14 100 21.99 21.92 21.48 23331.04 0.44 2.05 0.58 2018-02-12 100 21.40 21.19 20.63 32445.39 0.82 4.03 0.81
data[data["turnover" ].isin([4.19 , 2.39 ])]
open high close low volume price_change p_change turnover 2018-02-27 100 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2017-07-25 100 24.20 23.70 22.64 167489.48 0.67 2.91 4.19 2016-09-28 100 20.98 20.86 19.71 95580.75 0.98 4.93 2.39 2015-04-07 100 17.98 17.54 16.50 122471.85 0.88 5.28 4.19
data.describe()
open high close low volume price_change p_change turnover count 643.0 643.000000 643.000000 643.000000 643.000000 643.000000 643.000000 643.000000 mean 100.0 21.900513 21.336267 20.771835 99905.519114 0.018802 0.190280 2.936190 std 0.0 4.077578 3.942806 3.791968 73879.119354 0.898476 4.079698 2.079375 min 100.0 12.670000 12.360000 12.200000 1158.120000 -3.520000 -10.030000 0.040000 25% 100.0 19.500000 19.045000 18.525000 48533.210000 -0.390000 -1.850000 1.360000 50% 100.0 21.970000 21.450000 20.980000 83175.930000 0.050000 0.260000 2.500000 75% 100.0 24.065000 23.415000 22.850000 127580.055000 0.455000 2.305000 3.915000 max 100.0 36.350000 35.210000 34.010000 501915.410000 3.030000 10.030000 12.560000
data["p_change" ].cumsum().head()
2018-02-27 2.68
2018-02-26 5.70
2018-02-23 8.12
2018-02-22 9.76
2018-02-14 11.81
Name: p_change, dtype: float64
data.max (axis=0 )
open 100.00
high 36.35
close 35.21
low 34.01
volume 501915.41
price_change 3.03
p_change 10.03
turnover 12.56
dtype: float64
data.idxmax(axis=0 )
open 2018-02-27
high 2015-06-10
close 2015-06-12
low 2015-06-12
volume 2017-10-26
price_change 2015-06-09
p_change 2015-08-28
turnover 2017-10-26
dtype: object
data["p_change" ].sort_index().cumsum().plot()
<matplotlib. axes. _subplots. AxesSubplot at 0x1959a1c6b38 >
data.apply(lambda x: x.max () - x.min ())
open 0.00
high 23.68
close 22.85
low 21.81
volume 500757.29
price_change 6.55
p_change 20.06
turnover 12.52
dtype: float64
data["volume" ].max () - data["volume" ].min ()
500757.29
data.plot(x="volume" , y="turnover" , kind="scatter" )
<matplotlib. axes. _subplots. AxesSubplot at 0x1959a228438 >
data.plot(x="high" , y="low" , kind="scatter" )
<matplotlib. axes. _subplots. AxesSubplot at 0x19597e1e3c8 >
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
· 为什么 退出登录 或 修改密码 无法使 token 失效