python学习-时间序列

时间序列需要导入

from datetime import datetime  # stores a date and time with microsecond resolution
from datetime import timedelta  # represents the difference between two datetime values

import numpy as np
import pandas as pd

1、日期类型和字符串互转

# Convert a datetime to a string
stamp = datetime(2011,1,3)  # datetime.datetime(2011, 1, 3, 0, 0)
str(stamp)  # '2011-01-03 00:00:00'
stamp.strftime('%Y-%m-%d')  # '2011-01-03'

# Parse a string into a datetime using an explicit format
value='2011-01-03'
datetime.strptime(value,'%Y-%m-%d')  # datetime.datetime(2011, 1, 3, 0, 0)

# The same format-based parsing applied to every string in a list
datestrs = ["7/6/2011", "8/6/2011"]
[datetime.strptime(x, "%m/%d/%Y") for x in datestrs]


from dateutil.parser import parse #日期解析工具,可以解析大部分格式

# parse infers the format from the string itself
parse('Jan 31 1997 10:45 PM') 
#datetime.datetime(1997, 1, 31, 22, 45)

# pandas parses a whole list of date strings at once
datestrs = ["2011-07-06 12:00:00", "2011-08-06 00:00:00"]
pd.to_datetime(datestrs)

# Missing values such as None are accepted in the input as well
pd.to_datetime(datestrs+[None])

2、时间序列基础

(注:pandas 2.2 起部分别名已更名,旧写法会触发弃用警告,例如 H→h、T→min、S→s、L→ms、U→us、M→ME、BM→BME、Q→QE、A→YE、AS→YS。)
Alias  Offset type  Description
D Day Calendar daily
B BusinessDay Business daily
H Hour Hourly
T or min Minute Minutely
S Second Secondly
L or ms Milli Millisecond (1/1,000 of 1 second)
U Micro Microsecond (1/1,000,000 of 1 second)
M MonthEnd Last calendar day of month
BM BusinessMonthEnd Last business day (weekday) of month
MS MonthBegin First calendar day of month
BMS BusinessMonthBegin First weekday of month
W-MON, W-TUE, ... Week Weekly on given day of week (MON, TUE, WED, THU, FRI, SAT, or SUN)
WOM-1MON, WOM-2MON, ... WeekOfMonth Generate weekly dates in the first, second, third, or fourth week of the month (e.g., WOM-3FRI for the third Friday of each month)
Q-JAN, Q-FEB, ... QuarterEnd Quarterly dates anchored on last calendar day of each month, for year ending in indicated month (JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, or DEC)
BQ-JAN, BQ-FEB, ... BusinessQuarterEnd Quarterly dates anchored on last weekday day of each month, for year ending in indicated month
QS-JAN, QS-FEB, ... QuarterBegin Quarterly dates anchored on first calendar day of each month, for year ending in indicated month
BQS-JAN, BQS-FEB, ... BusinessQuarterBegin Quarterly dates anchored on first weekday day of each month, for year ending in indicated month
A-JAN, A-FEB, ... YearEnd Annual dates anchored on last calendar day of given month (JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, or DEC)
BA-JAN, BA-FEB, ... BusinessYearEnd Annual dates anchored on last weekday of given month
AS-JAN, AS-FEB, ... YearBegin Annual dates anchored on first day of given month
BAS-JAN, BAS-FEB, ... BusinessYearBegin Annual dates anchored on first weekday of given month
# Build a small Series whose index is a list of datetime objects
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.standard_normal(6), index=dates)

stamp=ts.index[1]  # take one label from the index
ts[stamp]  # look up the value stored under that label

# A string that can be interpreted as a date also works as a label
ts['20110110']
ts['2011-01-10']

# Selecting slices from a longer time series
longer_ts = pd.Series(np.random.standard_normal(1000),index=pd.date_range("2000-01-01", periods=1000))

longer_ts['2001']  # passing a year selects that year's slice
longer_ts['2001-05']  # passing year-month selects that month's slice

# Slicing with datetime objects
ts[datetime(2011, 1, 8):]
ts[datetime(2011, 1, 7):datetime(2011, 1, 10)]  # endpoints need not be present in the index (a range query)

ts.truncate(after="2011-01-09")  # another way to slice

# Date-based selection applies to a DataFrame as well
dates = pd.date_range("2000-01-01", periods=100, freq="W-WED")
long_df = pd.DataFrame(np.random.standard_normal((100, 4)),
                       index=dates,columns=["Colorado", "Texas","New York", "Ohio"])
long_df.loc["2001-05"]

带有重复索引的时间序列

# An index in which "2000-01-02" appears three times
dates = pd.DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-02","2000-01-02", "2000-01-03"])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts.index.is_unique  # reports whether the index labels are unique
# Group on index level 0 so that entries sharing a timestamp are aggregated together
grouped = dup_ts.groupby(level=0)
grouped.mean()
grouped.count()

日期的频率、范围、移动

resampler =ts.resample('D')  # resample at daily frequency

# Date ranges: start and end, start plus a count, or end plus a count
index = pd.date_range("2012-04-01", "2012-06-01")
pd.date_range(start="2012-04-01", periods=20)
pd.date_range(end="2012-06-01", periods=20)

# "BM" is the business-month-end alias: the last business day of each month
pd.date_range("2000-01-01", "2000-12-01", freq="BM")

pd.date_range("2012-05-02 12:56:31", periods=5)  # keeps the time-of-day information
pd.date_range("2012-05-02 12:56:31", periods=5, normalize=True)  # normalize=True drops the time-of-day information

#时间偏移量
from pandas.tseries.offsets import Hour, Minute

# A frequency string may carry a multiple: "4H" is every four hours
pd.date_range("2000-01-01", "2000-01-03 23:59", freq="4H")

# WOM-3FRI: the third Friday of each month
monthly_dates = pd.date_range("2012-01-01", "2012-09-01", freq="WOM-3FRI")

# shift moves the data forward or backward along the time axis
ts = pd.Series(np.random.standard_normal(4), index=pd.date_range("2000-01-01", periods=4, freq="M"))
ts.shift(2)
ts.shift(-2) 
ts.shift(2,freq='M')  # with freq given, shift by 2 months
ts.shift(3, freq="D")  # shift by 3 days
ts.shift(1, freq="90T")  # shift by one and a half hours (90 minutes)

from pandas.tseries.offsets import Day, MonthEnd
# Date offsets can be added directly to a datetime.
# The commented Timestamp(...) lines below show the expected results.
now = datetime(2011, 11, 17)
now + 3 * Day()  # add 3 days
# Timestamp('2011-11-20 00:00:00')

now + MonthEnd()  # roll the date forward to the end of the month
# Timestamp('2011-11-30 00:00:00')
now + MonthEnd(2)  # the second month end counted from now
# Timestamp('2011-12-31 00:00:00')

# Roll a date forward or backward explicitly
offset = MonthEnd()
offset.rollforward(now)
offset.rollback(now)

# rollforward can serve as a groupby key function, so dates are grouped by month end
ts = pd.Series(np.random.standard_normal(20),index=pd.date_range("2000-01-15", periods=20, freq="4D"))
ts.groupby(MonthEnd().rollforward).mean()

ts.resample("M").mean()  # resample by month and take the mean

时区和本地化

import pytz

pytz.common_timezones[-5:]  # last five entries of the list of time zone names

tz = pytz.timezone("Asia/Shanghai")  # get a time zone object by name

# Time zone operations on a time series

ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)

pd.date_range("2012-03-09 09:30", periods=10, tz="UTC")  # a time zone can be set when generating a date range
ts_utc = ts.tz_localize("UTC")  # tz_localize attaches a time zone to the index
ts_utc.index

ts_utc.tz_convert("America/New_York")  # once localized, tz_convert moves the series to another time zone

ts.index.tz_localize("Asia/Shanghai")  # tz_localize is also available on the index itself

# Time zone operations on a single Timestamp
stamp = pd.Timestamp("2011-03-12 04:00")
stamp_utc = stamp.tz_localize("utc")
stamp_utc.tz_convert("Asia/Shanghai")
stamp_Shanghai = pd.Timestamp("2011-03-12 04:00", tz="Asia/Shanghai")  # a time zone can be given at construction
# A time-zone-aware Timestamp stores a UTC value internally (nanoseconds since
# 1970-01-01); converting to another time zone leaves that value unchanged
stamp_utc.value
stamp_utc.tz_convert("America/New_York").value

# Build a PeriodIndex from arrays (the "year" and "quarter" columns of the CSV)
data = pd.read_csv("examples/macrodata.csv")
# NOTE(review): the year=/quarter= keywords are deprecated in recent pandas (2.2+)
# in favour of PeriodIndex.from_fields — confirm against the installed version
index = pd.PeriodIndex(year=data["year"], quarter=data["quarter"],freq="Q-DEC")
data.index=index

Period和算术运算

p = pd.Period("2022", freq="A-DEC")  # the whole span from 2022-01-01 to 2022-12-31
p+5  # integer arithmetic on a Period

# A range of monthly periods used as a Series index
periods = pd.period_range("2000-01-01", "2000-06-30", freq="M")
#PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]')
pd.Series(np.random.standard_normal(6), index=periods)

# The PeriodIndex constructor also accepts strings directly
values = ["2001Q3", "2002Q2", "2003Q1"]
index = pd.PeriodIndex(values, freq="Q-DEC")

# Converting a period to another frequency
p = pd.Period("2011", freq="A-DEC")
p
p.asfreq("M", how="start")  # monthly period at the start of the year
p.asfreq("M", how="end")  # monthly period at the end of the year
p.asfreq("M")

# The same conversion for a year that ends in June
p = pd.Period("2011", freq="A-JUN")
p.asfreq("M", how="start")
p.asfreq("M", how="end")

# Converting from a higher frequency (monthly) to a lower one (annual)
p = pd.Period("Aug-2011", "M")
#Period('2011-08', 'M')
p.asfreq("A-JUN")
#Period('2012', 'A-JUN')  -- 2011-08 falls inside the 2012 period

# Frequency conversion on a period-indexed Series works in a similar way
periods = pd.period_range("2006", "2009", freq="A-DEC")
ts = pd.Series(np.random.standard_normal(len(periods)), index=periods)
ts.asfreq("M", how="start")
ts.asfreq("B", how="end")

# Quarterly period frequencies
p = pd.Period("2012Q4", freq="Q-JAN")
p.asfreq("D", how="start")
p.asfreq("D", how="end")
# 4 PM on the second-to-last business day of the quarter: step back one business
# day from the quarter end, convert to minute frequency, then add 16 * 60 minutes
p4pm = (p.asfreq("B", how="end") - 1).asfreq("T", how="start") + 16 * 60 

# A range of quarterly periods
periods = pd.period_range("2011Q3", "2012Q4", freq="Q-JAN")
ts = pd.Series(np.arange(len(periods)), index=periods)

# Converting between timestamps and periods
dates = pd.date_range("2000-01-01", periods=3, freq="M")
ts = pd.Series(np.random.standard_normal(3), index=dates)
pts = ts.to_period()  # timestamp index -> period index
dates = pd.date_range("2000-01-29", periods=6)
ts2 = pd.Series(np.random.standard_normal(6), index=dates)
ts2.to_period("M")  # the target frequency can be given explicitly
pts = ts2.to_period()
pts.to_timestamp(how="end")  # period index -> timestamp index

3、采样

# Resampling and frequency conversion: 100 daily values aggregated by month
dates = pd.date_range("2000-01-01", periods=100)
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)
ts.resample("M").mean()  # monthly mean
ts.resample("M", kind="period").mean()  # same aggregation, requested with kind="period"

# Downsampling: aggregate one-minute data into five-minute bins
dates = pd.date_range("2000-01-01", periods=12, freq="T")
ts = pd.Series(np.arange(len(dates)), index=dates)
ts.resample("5min").sum()
ts.resample("5min", closed="right").sum()  # bins closed on the right edge
ts.resample("5min", closed="right", label="right").sum()  # and labelled with the right edge

# Open-high-low-close (OHLC) resampling
ts = pd.Series(np.random.permutation(np.arange(len(dates))), index=dates)
ts.resample("5min").ohlc()

# Upsampling: two weekly rows converted to a higher frequency
frame = pd.DataFrame(np.random.standard_normal((2, 4)),
                     index=pd.date_range("2000-01-01", periods=2,freq="W-WED"),
                     columns=["Colorado", "Texas", "New York", "Ohio"])
df_daily = frame.resample("D").asfreq()  # convert to daily frequency without aggregating
frame.resample("D").ffill()  # forward-fill the newly created rows
frame.resample("D").ffill(limit=2)  # forward-fill at most 2 rows
frame.resample("W-THU").ffill()  # weekly on Thursday instead of Wednesday

# Resampling data that is indexed by periods
frame = pd.DataFrame(np.random.standard_normal((24, 4)),
                     index=pd.period_range("1-2000", "12-2001",freq="M"),
                     columns=["Colorado", "Texas", "New York", "Ohio"])
annual_frame = frame.resample("A-DEC").mean()  # monthly -> annual means
annual_frame.resample("Q-DEC").ffill()  # annual -> quarterly, forward-filled
annual_frame.resample("Q-DEC", convention="end").asfreq()
annual_frame.resample("Q-MAR").ffill()

4、时间序列绘图

# Load the price table, using the first column as a parsed date index
close_px_all = pd.read_csv("examples/stock_px.csv", parse_dates=True, index_col=0)
close_px = close_px_all[["AAPL", "MSFT", "XOM"]]
close_px["AAPL"].plot()  # daily price chart
close_px["AAPL"].rolling(250).mean().plot()  # 250-day moving average chart

# Moving window functions (missing values are excluded automatically)
# NOTE(review): plt is presumably matplotlib.pyplot; no import for it is visible here
plt.figure()
std250 = close_px["AAPL"].pct_change().rolling(250, min_periods=10).std()
std250.plot()
expanding_mean = std250.expanding().mean()  # mean over an expanding window of std250
close_px.rolling(60).mean().plot(logy=True)  # 60-row moving average, log-scaled y axis
close_px.rolling("20D").mean()  # the window can also be given as a time span ("20D")

 

posted @ 2022-05-30 23:52  行者无疆  阅读(476)  评论(0编辑  收藏  举报