Pandas 学习笔记一

Pandas 学习

核心数据结构

DataFrame
import numpy as np
data = np.random.normal(0,1,(10,5))
data
array([[ 1.21218494, -0.30678674, -0.66781485,  0.13414137,  0.79799153],
       [-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ],
       [-0.01031587, -0.5311115 ,  0.17573352,  1.44961339,  0.9510395 ],
       [-0.38485007, -0.11271772,  0.86289105, -0.7046349 , -0.67357859],
       [-0.23342328, -1.15661425,  1.729237  , -1.1405832 , -0.06817234],
       [-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989],
       [-0.02421679,  0.45015635, -0.57010218,  1.00306539,  0.49455672],
       [ 1.12410909, -0.59830918, -0.96323314,  1.2907916 ,  1.0353455 ],
       [-1.5828247 , -0.60952617,  1.10117806,  1.57818777,  1.69498839],
       [-0.02737215,  0.51650423, -0.12322063,  1.84409656, -1.20511615]])
import pandas as pd
pd.DataFrame(data)
01234
01.212185-0.306787-0.6678150.1341410.797992
1-1.3510551.2637800.049323-0.936266-0.116189
2-0.010316-0.5311110.1757341.4496130.951039
3-0.384850-0.1127180.862891-0.704635-0.673579
4-0.233423-1.1566141.729237-1.140583-0.068172
5-1.129472-0.281252-0.710164-0.226608-1.885420
6-0.0242170.450156-0.5701021.0030650.494557
71.124109-0.598309-0.9632331.2907921.035346
8-1.582825-0.6095261.1011781.5781881.694988
9-0.0273720.516504-0.1232211.844097-1.205116
# 添加行索引
num = ["num{}".format(i) for i in range(10)]
pd.DataFrame(data,index=num)
01234
num01.212185-0.306787-0.6678150.1341410.797992
num1-1.3510551.2637800.049323-0.936266-0.116189
num2-0.010316-0.5311110.1757341.4496130.951039
num3-0.384850-0.1127180.862891-0.704635-0.673579
num4-0.233423-1.1566141.729237-1.140583-0.068172
num5-1.129472-0.281252-0.710164-0.226608-1.885420
num6-0.0242170.450156-0.5701021.0030650.494557
num71.124109-0.598309-0.9632331.2907921.035346
num8-1.582825-0.6095261.1011781.5781881.694988
num9-0.0273720.516504-0.1232211.844097-1.205116
# 添加列索引
num2 = ["index{}".format(i) for i in range(5)]
data = pd.DataFrame(data,index=num,columns=num2)
data
index0index1index2index3index4
num01.212185-0.306787-0.6678150.1341410.797992
num1-1.3510551.2637800.049323-0.936266-0.116189
num2-0.010316-0.5311110.1757341.4496130.951039
num3-0.384850-0.1127180.862891-0.704635-0.673579
num4-0.233423-1.1566141.729237-1.140583-0.068172
num5-1.129472-0.281252-0.710164-0.226608-1.885420
num6-0.0242170.450156-0.5701021.0030650.494557
num71.124109-0.598309-0.9632331.2907921.035346
num8-1.582825-0.6095261.1011781.5781881.694988
num9-0.0273720.516504-0.1232211.844097-1.205116
属性
data.shape
(10, 5)
data.index
Index(['num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8',
       'num9'],
      dtype='object')
data.columns
Index(['index0', 'index1', 'index2', 'index3', 'index4'], dtype='object')
data.values
array([[ 1.21218494, -0.30678674, -0.66781485,  0.13414137,  0.79799153],
       [-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ],
       [-0.01031587, -0.5311115 ,  0.17573352,  1.44961339,  0.9510395 ],
       [-0.38485007, -0.11271772,  0.86289105, -0.7046349 , -0.67357859],
       [-0.23342328, -1.15661425,  1.729237  , -1.1405832 , -0.06817234],
       [-1.12947171, -0.28125216, -0.71016417, -0.22660793, -1.88541989],
       [-0.02421679,  0.45015635, -0.57010218,  1.00306539,  0.49455672],
       [ 1.12410909, -0.59830918, -0.96323314,  1.2907916 ,  1.0353455 ],
       [-1.5828247 , -0.60952617,  1.10117806,  1.57818777,  1.69498839],
       [-0.02737215,  0.51650423, -0.12322063,  1.84409656, -1.20511615]])
data.T
num0num1num2num3num4num5num6num7num8num9
index01.212185-1.351055-0.010316-0.384850-0.233423-1.129472-0.0242171.124109-1.582825-0.027372
index1-0.3067871.263780-0.531111-0.112718-1.156614-0.2812520.450156-0.598309-0.6095260.516504
index2-0.6678150.0493230.1757340.8628911.729237-0.710164-0.570102-0.9632331.101178-0.123221
index30.134141-0.9362661.449613-0.704635-1.140583-0.2266081.0030651.2907921.5781881.844097
index40.797992-0.1161890.951039-0.673579-0.068172-1.8854200.4945571.0353461.694988-1.205116
data.head()
index0index1index2index3index4
num01.212185-0.306787-0.6678150.1341410.797992
num1-1.3510551.2637800.049323-0.936266-0.116189
num2-0.010316-0.5311110.1757341.4496130.951039
num3-0.384850-0.1127180.862891-0.704635-0.673579
num4-0.233423-1.1566141.729237-1.140583-0.068172
data.tail()
index0index1index2index3index4
num5-1.129472-0.281252-0.710164-0.226608-1.885420
num6-0.0242170.450156-0.5701021.0030650.494557
num71.124109-0.598309-0.9632331.2907921.035346
num8-1.582825-0.6095261.1011781.5781881.694988
num9-0.0273720.516504-0.1232211.844097-1.205116
# 重设索引
data.reset_index()
indexindex0index1index2index3index4
0num01.212185-0.306787-0.6678150.1341410.797992
1num1-1.3510551.2637800.049323-0.936266-0.116189
2num2-0.010316-0.5311110.1757341.4496130.951039
3num3-0.384850-0.1127180.862891-0.704635-0.673579
4num4-0.233423-1.1566141.729237-1.140583-0.068172
5num5-1.129472-0.281252-0.710164-0.226608-1.885420
6num6-0.0242170.450156-0.5701021.0030650.494557
7num71.124109-0.598309-0.9632331.2907921.035346
8num8-1.582825-0.6095261.1011781.5781881.694988
9num9-0.0273720.516504-0.1232211.844097-1.205116
# 设置新索引
df = pd.DataFrame({'month': [1, 4, 7, 10],
                    'year': [2012, 2014, 2013, 2014],
                    'sale':[55, 40, 84, 31]})
df
monthyearsale
01201255
14201440
27201384
310201431
# 以月份设置新的索引
df.set_index("month", drop=True)
yearsale
month
1201255
4201440
7201384
10201431
# 设置多个索引,以年和月份
new_df = df.set_index(["year", "month"])
new_df.index
MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]],
           labels=[[0, 2, 1, 2], [0, 1, 2, 3]],
           names=['year', 'month'])
new_df.index.names # levels的名称
FrozenList(['year', 'month'])
new_df.index.levels # 每个level的元组值
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])
Panel
# panel 三维数据,需要从不同维度访问
p = pd.Panel(np.arange(24).reshape(4,3,2),
                 items=list('ABCD'),
                 major_axis=pd.date_range('20130101', periods=3),
                 minor_axis=['first', 'second'])
p
C:\Users\28599\AppData\Roaming\Python\Python37\site-packages\IPython\core\interactiveshell.py:3418: FutureWarning: 
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)





<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: A to D
Major_axis axis: 2013-01-01 00:00:00 to 2013-01-03 00:00:00
Minor_axis axis: first to second
p["A"]
firstsecond
2013-01-0101
2013-01-0223
2013-01-0345
p.major_xs("2013-01-03")
ABCD
first4101622
second5111723
p.minor_xs("first")
ABCD
2013-01-01061218
2013-01-02281420
2013-01-034101622
Series
# Series 带索引的一维数组
sr = data.iloc[1,:]
sr
index0   -1.351055
index1    1.263780
index2    0.049323
index3   -0.936266
index4   -0.116189
Name: num1, dtype: float64
sr.index
Index(['index0', 'index1', 'index2', 'index3', 'index4'], dtype='object')
sr.values
array([-1.35105539,  1.26378017,  0.04932303, -0.93626636, -0.1161889 ])
pd.Series(np.arange(10))
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
pd.Series(np.arange(3, 9, 2), index=["a", "b", "c"])
a    3
b    5
c    7
dtype: int32

基本数据操作

data = pd.read_csv("./stock_day.csv")
data.head()
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnover
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
data.head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.8824.1623.5395578.030.632.682.39
2018-02-2622.8023.7823.5322.8060985.110.693.021.53
2018-02-2322.8823.3722.8222.7152914.010.542.421.32
2018-02-2222.2522.7622.2822.0236105.010.361.640.90
2018-02-1421.4921.9921.9221.4823331.040.442.050.58
# 先列后行
data["open"]["2018-02-26"] 
22.8
# 按名字索引
data.loc["2018-02-26"]["open"]
# 同 data.loc["2018-02-26","open"]
22.8
# 按数字索引
data.iloc[1,0]
22.8
# 组合索引
# 获取行第1天到第4天,['open', 'close', 'high', 'low']这个四个指标的结果
# data.ix[:4, ['open', 'close', 'high', 'low']] # 将要废弃,不推荐
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
# 同 data.iloc[0:4,data.columns.get_indexer(['open', 'close', 'high', 'low'])]
openclosehighlow
2018-02-2723.5324.1625.8823.53
2018-02-2622.8023.5323.7822.80
2018-02-2322.8822.8223.3722.71
2018-02-2222.2522.2822.7622.02
# 赋值操作 (先索引再赋值即可)
data.open=100
data.head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2610023.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-2210022.7622.2822.0236105.010.361.640.90
2018-02-1410021.9921.9221.4823331.040.442.050.58
# 对内容排序
# ascending=True表示升序 high相同就看p_change
data.sort_values(by=['high','p_change'],ascending=True).head() 
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0210012.6712.5212.2096291.730.322.623.30
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0510013.4513.1612.8793180.390.262.023.19
data.sort_index().head()
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0210012.6712.5212.2096291.730.322.623.30
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-03-0610014.4814.2813.13179831.721.128.516.16

运算

# 算术运算
data["open"].sub(3).head()
# data.open + 3
2018-02-27    97
2018-02-26    97
2018-02-23    97
2018-02-22    97
2018-02-14    97
Name: open, dtype: int64
# 逻辑运算
data[data['p_change'] > 2].head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2610023.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-1410021.9921.9221.4823331.040.442.050.58
2018-02-1210021.4021.1920.6332445.390.824.030.81
data[(data["p_change"] > 2) & (data["low"] > 15)].head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2610023.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-1410021.9921.9221.4823331.040.442.050.58
2018-02-1210021.4021.1920.6332445.390.824.030.81
# 逻辑运算函数
data.query("p_change > 2 & low > 15").head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2610023.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-1410021.9921.9221.4823331.040.442.050.58
2018-02-1210021.4021.1920.6332445.390.824.030.81
# 判断'turnover'是否为4.19, 2.39
data[data["turnover"].isin([4.19, 2.39])]
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2017-07-2510024.2023.7022.64167489.480.672.914.19
2016-09-2810020.9820.8619.7195580.750.984.932.39
2015-04-0710017.9817.5416.50122471.850.885.284.19
# 统计运算
data.describe()
openhighcloselowvolumeprice_changep_changeturnover
count643.0643.000000643.000000643.000000643.000000643.000000643.000000643.000000
mean100.021.90051321.33626720.77183599905.5191140.0188020.1902802.936190
std0.04.0775783.9428063.79196873879.1193540.8984764.0796982.079375
min100.012.67000012.36000012.2000001158.120000-3.520000-10.0300000.040000
25%100.019.50000019.04500018.52500048533.210000-0.390000-1.8500001.360000
50%100.021.97000021.45000020.98000083175.9300000.0500000.2600002.500000
75%100.024.06500023.41500022.850000127580.0550000.4550002.3050003.915000
max100.036.35000035.21000034.010000501915.4100003.03000010.03000012.560000
# 累计统计函数 (类似前缀和)
data["p_change"].cumsum().head()
2018-02-27     2.68
2018-02-26     5.70
2018-02-23     8.12
2018-02-22     9.76
2018-02-14    11.81
Name: p_change, dtype: float64
data.max(axis=0)
open               100.00
high                36.35
close               35.21
low                 34.01
volume          501915.41
price_change         3.03
p_change            10.03
turnover            12.56
dtype: float64
data.idxmax(axis=0)
open            2018-02-27
high            2015-06-10
close           2015-06-12
low             2015-06-12
volume          2017-10-26
price_change    2015-06-09
p_change        2015-08-28
turnover        2017-10-26
dtype: object
# 累计统计函数
data["p_change"].sort_index().cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1959a1c6b38>

33324

# 自定义计算
data.apply(lambda x: x.max() - x.min()) # 每一列最大值减最小值
open                 0.00
high                23.68
close               22.85
low                 21.81
volume          500757.29
price_change         6.55
p_change            20.06
turnover            12.52
dtype: float64
data["volume"].max() - data["volume"].min() # volume列最大值减最小值
500757.29
data.plot(x="volume", y="turnover", kind="scatter")
<matplotlib.axes._subplots.AxesSubplot at 0x1959a228438>

22222

data.plot(x="high", y="low", kind="scatter")
<matplotlib.axes._subplots.AxesSubplot at 0x19597e1e3c8>


1png

posted @   嘿,抬头!  阅读(29)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
· 为什么 退出登录 或 修改密码 无法使 token 失效
点击右上角即可分享
微信分享提示