# Python数据分析-07 时间序列处理

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

#时间序列的操作基础
from datetime import datetime
"""
t = datetime(2016,9,10)
print(t)#2016-09-10 00:00:00

data_list = [
    datetime(2016,9,1),
    datetime(2016,9,10),
    datetime(2017,9,1),
    datetime(2017,9,20),
    datetime(2017,10,1)
]
print(data_list)
# [datetime.datetime(2016, 9, 1, 0, 0), datetime.datetime(2016, 9, 10, 0, 0), datetime.datetime(2017, 9, 1, 0, 0), datetime.datetime(2017, 9, 20, 0, 0), datetime.datetime(2017, 10, 1, 0, 0)]
s1 = Series(np.random.rand(5),index=data_list)
print(s1)
# 2016-09-01    0.437216
# 2016-09-10    0.002021
# 2017-09-01    0.990085
# 2017-09-20    0.635123
# 2017-10-01    0.504584
# dtype: float64
print(s1.values)#[0.74523743 0.67846232 0.33464572 0.66881491 0.34169192]
print(s1.index)
# DatetimeIndex(['2016-09-01', '2016-09-10', '2017-09-01', '2017-09-20',
#                '2017-10-01'],
#               dtype='datetime64[ns]', freq=None)
print(s1.iloc[1])#0.3247607714134729
print(s1[datetime(2016,9,10)])#0.3247607714134729
print(s1['2016-09-10'])#0.3247607714134729
print(s1['20160910'])#0.3247607714134729
print(s1['2016-09'])
# 2016-09-01    0.713300
# 2016-09-10    0.265708
# dtype: float64
print(s1['2016'])
# 2016-09-01    0.139233
# 2016-09-10    0.595806
# dtype: float64
"""

"""
data_list_new = pd.date_range("2016-01-01",periods=100,freq="5h")#表示每5小时生成一个时间，一共生成100个
print(data_list_new)
# DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 05:00:00',
#                '2016-01-01 10:00:00', '2016-01-01 15:00:00',
#                '2016-01-01 20:00:00', '2016-01-02 01:00:00',
#                '2016-01-02 06:00:00', '2016-01-02 11:00:00',
#                '2016-01-02 16:00:00', '2016-01-02 21:00:00',
#                '2016-01-03 02:00:00', '2016-01-03 07:00:00',
#                '2016-01-03 12:00:00', '2016-01-03 17:00:00',
#                '2016-01-03 22:00:00', '2016-01-04 03:00:00',
#                '2016-01-04 08:00:00', '2016-01-04 13:00:00',
#                '2016-01-04 18:00:00', '2016-01-04 23:00:00',
#                '2016-01-05 04:00:00', '2016-01-05 09:00:00',
#                '2016-01-05 14:00:00', '2016-01-05 19:00:00',
#                '2016-01-06 00:00:00', '2016-01-06 05:00:00',
#                '2016-01-06 10:00:00', '2016-01-06 15:00:00',
#                '2016-01-06 20:00:00', '2016-01-07 01:00:00',
#                '2016-01-07 06:00:00', '2016-01-07 11:00:00',
#                '2016-01-07 16:00:00', '2016-01-07 21:00:00',
#                '2016-01-08 02:00:00', '2016-01-08 07:00:00',
#                '2016-01-08 12:00:00', '2016-01-08 17:00:00',
#                '2016-01-08 22:00:00', '2016-01-09 03:00:00',
#                '2016-01-09 08:00:00', '2016-01-09 13:00:00',
#                '2016-01-09 18:00:00', '2016-01-09 23:00:00',
#                '2016-01-10 04:00:00', '2016-01-10 09:00:00',
#                '2016-01-10 14:00:00', '2016-01-10 19:00:00',
#                '2016-01-11 00:00:00', '2016-01-11 05:00:00',
#                '2016-01-11 10:00:00', '2016-01-11 15:00:00',
#                '2016-01-11 20:00:00', '2016-01-12 01:00:00',
#                '2016-01-12 06:00:00', '2016-01-12 11:00:00',
#                '2016-01-12 16:00:00', '2016-01-12 21:00:00',
#                '2016-01-13 02:00:00', '2016-01-13 07:00:00',
#                '2016-01-13 12:00:00', '2016-01-13 17:00:00',
#                '2016-01-13 22:00:00', '2016-01-14 03:00:00',
#                '2016-01-14 08:00:00', '2016-01-14 13:00:00',
#                '2016-01-14 18:00:00', '2016-01-14 23:00:00',
#                '2016-01-15 04:00:00', '2016-01-15 09:00:00',
#                '2016-01-15 14:00:00', '2016-01-15 19:00:00',
#                '2016-01-16 00:00:00', '2016-01-16 05:00:00',
#                '2016-01-16 10:00:00', '2016-01-16 15:00:00',
#                '2016-01-16 20:00:00', '2016-01-17 01:00:00',
#                '2016-01-17 06:00:00', '2016-01-17 11:00:00',
#                '2016-01-17 16:00:00', '2016-01-17 21:00:00',
#                '2016-01-18 02:00:00', '2016-01-18 07:00:00',
#                '2016-01-18 12:00:00', '2016-01-18 17:00:00',
#                '2016-01-18 22:00:00', '2016-01-19 03:00:00',
#                '2016-01-19 08:00:00', '2016-01-19 13:00:00',
#                '2016-01-19 18:00:00', '2016-01-19 23:00:00',
#                '2016-01-20 04:00:00', '2016-01-20 09:00:00',
#                '2016-01-20 14:00:00', '2016-01-20 19:00:00',
#                '2016-01-21 00:00:00', '2016-01-21 05:00:00',
#                '2016-01-21 10:00:00', '2016-01-21 15:00:00'],
#               dtype='datetime64[ns]', freq='5H')
s2 = Series(np.random.rand(100),index=data_list_new)
print(s2)
"""

"""
#-----------------
#时间序列数据的采样和画图
#t_range = pd.date_range("2016-01-01","2016-12-31")
#print(t_range)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#                '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
#                '2016-01-09', '2016-01-10',
#                ...
#                '2016-12-22', '2016-12-23', '2016-12-24', '2016-12-25',
#                '2016-12-26', '2016-12-27', '2016-12-28', '2016-12-29',
#                '2016-12-30', '2016-12-31'],
#               dtype='datetime64[ns]', length=366, freq='D')
#s1 = Series(np.random.randn(len(t_range)),index=t_range)
#print(s1)
#print(s1["2016-01"].mean())#0.05316056209771481
# s1_month = s1.resample("M").mean()#取样，每个月取一个值，值为平均值
# print(s1_month)
# 2016-01-31    0.175917
# 2016-02-29   -0.018886
# 2016-03-31   -0.131760
# 2016-04-30   -0.134704
# 2016-05-31    0.147767
# 2016-06-30    0.382015
# 2016-07-31    0.163278
# 2016-08-31   -0.079203
# 2016-09-30    0.184607
# 2016-10-31    0.055851
# 2016-11-30    0.284106
# 2016-12-31   -0.030083
# Freq: M, dtype: float64

#print(s1.resample("H").ffill())
# 2016-01-01 00:00:00   -2.031085
# 2016-01-01 01:00:00   -2.031085
# 2016-01-01 02:00:00   -2.031085
# ........



t_range = pd.date_range("2016-01-01","2016-12-31",freq="h")
print(t_range)
# DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
#                '2016-01-01 02:00:00', '2016-01-01 03:00:00',
#                '2016-01-01 04:00:00', '2016-01-01 05:00:00',
#                '2016-01-01 06:00:00', '2016-01-01 07:00:00',
#                '2016-01-01 08:00:00', '2016-01-01 09:00:00',
#                ...
#                '2016-12-30 15:00:00', '2016-12-30 16:00:00',
#                '2016-12-30 17:00:00', '2016-12-30 18:00:00',
#                '2016-12-30 19:00:00', '2016-12-30 20:00:00',
#                '2016-12-30 21:00:00', '2016-12-30 22:00:00',
#                '2016-12-30 23:00:00', '2016-12-31 00:00:00'],
#               dtype='datetime64[ns]', length=8761, freq='H')
stock_df = DataFrame(index=t_range)
print(stock_df.head())
#Empty DataFrame
# Columns: []
# Index: [2016-01-01 00:00:00, 2016-01-01 01:00:00, 2016-01-01 02:00:00, 2016-01-01 03:00:00, 2016-01-01 04:00:00]
stock_df["BABA"] = np.random.randint(80,160,size=len(t_range))
stock_df["TENCENT"] = np.random.randint(30,50,size=len(t_range))
print(stock_df.head())
#                      BABA  TENCENT
# 2016-01-01 00:00:00   147       47
# 2016-01-01 01:00:00    88       40
# 2016-01-01 02:00:00   143       33
# 2016-01-01 03:00:00   132       47
# 2016-01-01 04:00:00    93       44
# stock_df.plot()
import matplotlib.pyplot as plt
# plt.show()

weekly_df = DataFrame()
weekly_df["BABA"] = stock_df["BABA"].resample("W").mean()
weekly_df["TENCENT"] = stock_df["TENCENT"].resample("W").mean()
print(weekly_df.head())
#                   BABA    TENCENT
# 2016-01-03  113.819444  39.597222
# 2016-01-10  122.696429  39.029762
# 2016-01-17  120.458333  38.845238
# 2016-01-24  119.196429  39.690476
# 2016-01-31  118.315476  38.690476
weekly_df.plot()
plt.show()
"""

"""
#------------------------------
#数据分箱技术Binning
score_list = np.random.randint(25,100,size=20)
print(score_list)#[41 88 82 66 83 84 77 29 72 97 77 81 80 45 30 74 84 46 95 54]
bins = [0,59,70,80,100]
score_cut = pd.cut(score_list,bins)
print(score_cut)
# [(0, 59], (0, 59], (80, 100], (70, 80], (59, 70], ..., (80, 100], (0, 59], (0, 59], (59, 70], (80, 100]]
# Length: 20
# Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
print(pd.Series(score_cut).value_counts())
# (0, 59]      11
# (80, 100]     4
# (59, 70]      3
# (70, 80]      2
# dtype: int64
df = DataFrame()
df["score"] = score_list
import random
import string
df["student"] = ["".join(random.choices(string.ascii_letters + string.digits, k=3)) for i in range(20)]
df["Categories"] = pd.cut(df["score"],bins,labels=["Low","Ok","Good","Great"])
print(df)
#     score student Categories
# 0      71     sCO       Good
# 1      40     AgI        Low
# 2      61     ubC         Ok
# 3      65     P1K         Ok
# 4      78     ebd       Good
# 5      75     oxG       Good
# 6      81     JN0      Great
# 7      35     LpS        Low
# 8      53     L7l        Low
# 9      60     puw         Ok
# 10     27     3KJ        Low
# 11     77     2ID       Good
# 12     63     D26         Ok
# 13     96     jA7      Great
# 14     46     txB        Low
# 15     85     8NF      Great
# 16     96     jne      Great
# 17     71     xBX       Good
# 18     75     3HP       Good
# 19     93     Svl      Great
"""

#------------------------------------------------
#数据分组技术GroupBy
"""
df = pd.read_csv("city_weather.csv")
# print(df)
#           data city  temperature  wind
# 0   03/01/2016   BJ            8     5
# 1   17/01/2016   BJ           12     2
# 2   31/01/2016   BJ           19     2
# 3   03/02/2016   BJ           -3     3
# 4   14/02/2016   BJ           19     2
# 5   13/03/2016   BJ            5     3
# 6   10/03/2016   SH           -4     4
# 7   03/04/2016   SH           19     3
# 8   24/04/2016   SH           20     3
# 9   08/05/2016   SH           17     3
# 10  22/05/2016   SH            4     2
# 11  05/06/2016   SH          -10     4
# 12  19/06/2016   SH            0     5
# 13  03/07/2016   SH            9     5
# 14  17/07/2016   GZ           10     2
# 15  31/07/2016   GZ           -1     5
# 16  14/08/2016   GZ            1     5
# 17  28/08/2016   GZ           25     4
# 18  11/09/2016   SZ           20     1
# 19  25/09/2016   SZ          -10     4

g = df.groupby(df["city"])
print(g)
#<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000297D780>
print(g.groups)
# {'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'), 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'), 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'), 'SZ': Int64Index([18, 19], dtype='int64')}
print(g.get_group("BJ"))
#          data city  temperature  wind
# 0  03/01/2016   BJ            8     5
# 1  17/01/2016   BJ           12     2
# 2  31/01/2016   BJ           19     2
# 3  03/02/2016   BJ           -3     3
# 4  14/02/2016   BJ           19     2
# 5  13/03/2016   BJ            5     3
df_bj = g.get_group("BJ")
print(df_bj.mean(numeric_only=True))
# temperature    10.000000
# wind            2.833333
# dtype: float64
print(g.mean(numeric_only=True))
#       temperature      wind
# city
# BJ         10.000  2.833333
# GZ          8.750  4.000000
# SH          6.875  3.625000
# SZ          5.000  2.500000
"""

# Data aggregation (Aggregation)
# Load the sample weather table and bucket its rows by the "city" column.
df = pd.read_csv(filepath_or_buffer="city_weather.csv")
g = df.groupby(by="city")
# Column-wise minimum inside every city group. The "data" column holds
# strings, so its minimum is the lexicographically smallest string rather
# than the earliest calendar date (see GZ below).
print(g.aggregate("min"))
#             data  temperature  wind
# city
# BJ    03/01/2016           -3     2
# GZ    14/08/2016           -1     2
# SH    03/04/2016          -10     2
# SZ    11/09/2016          -10     1
def foo(attr):
    """Probe aggregator: print what it receives, then return NaN.

    Passed to ``g.agg`` to inspect the objects the aggregation machinery
    hands to a user function.

    Args:
        attr: The object supplied by the caller; its type and value are
            printed as-is.

    Returns:
        ``np.nan`` unconditionally, so every aggregated cell is NaN.
    """
    # Two plain statements instead of ``print(...), print(...)``, which
    # built and discarded a throwaway tuple.
    print(type(attr))
    print(attr)
    return np.nan

# Run the probe over the grouped frame; the trace printed by foo followed by
# the resulting all-NaN table is recorded in the comments that follow.
print(g.aggregate(foo))
# <class 'pandas.core.series.Series'>
# 0    03/01/2016
# 1    17/01/2016
# 2    31/01/2016
# 3    03/02/2016
# 4    14/02/2016
# 5    13/03/2016
# Name: data, dtype: object
# <class 'pandas.core.series.Series'>
# 14    17/07/2016
# 15    31/07/2016
# 16    14/08/2016
# 17    28/08/2016
# Name: data, dtype: object
# <class 'pandas.core.series.Series'>
# 6     10/03/2016
# 7     03/04/2016
# 8     24/04/2016
# 9     08/05/2016
# 10    22/05/2016
# 11    05/06/2016
# 12    19/06/2016
# 13    03/07/2016
# Name: data, dtype: object
# <class 'pandas.core.series.Series'>
# 18    11/09/2016
# 19    25/09/2016
# Name: data, dtype: object
# <class 'pandas.core.series.Series'>
# 0     8
# 1    12
# 2    19
# 3    -3
# 4    19
# 5     5
# Name: temperature, dtype: int64
# <class 'pandas.core.series.Series'>
# 14    10
# 15    -1
# 16     1
# 17    25
# Name: temperature, dtype: int64
# <class 'pandas.core.series.Series'>
# 6     -4
# 7     19
# 8     20
# 9     17
# 10     4
# 11   -10
# 12     0
# 13     9
# Name: temperature, dtype: int64
# <class 'pandas.core.series.Series'>
# 18    20
# 19   -10
# Name: temperature, dtype: int64
# <class 'pandas.core.series.Series'>
# 0    5
# 1    2
# 2    2
# 3    3
# 4    2
# 5    3
# Name: wind, dtype: int64
# <class 'pandas.core.series.Series'>
# 14    2
# 15    5
# 16    5
# 17    4
# Name: wind, dtype: int64
# <class 'pandas.core.series.Series'>
# 6     4
# 7     3
# 8     3
# 9     3
# 10    2
# 11    4
# 12    5
# 13    5
# Name: wind, dtype: int64
# <class 'pandas.core.series.Series'>
# 18    1
# 19    4
# Name: wind, dtype: int64
#       data  temperature  wind
# city
# BJ     NaN          NaN   NaN
# GZ     NaN          NaN   NaN
# SH     NaN          NaN   NaN
# SZ     NaN          NaN   NaN

def foo(attr):
    """Return the spread of *attr*: its largest value minus its smallest.

    Args:
        attr: Any object exposing ``max()`` and ``min()`` whose results
            support subtraction.

    Returns:
        ``attr.max() - attr.min()``.
    """
    largest = attr.max()
    smallest = attr.min()
    return largest - smallest

# Aggregate only the numeric columns: foo subtracts the minimum from the
# maximum, which is undefined for the string-valued "data" column
# (str - str raises TypeError, and pandas >= 2.0 no longer drops such
# columns silently). The selected columns match the table shown below.
print(g[["temperature", "wind"]].agg(foo))
#       temperature  wind
# city
# BJ             22     3
# GZ             26     3
# SH             30     3
# SZ             30     3


# Group on the (city, wind) pair instead of on a single column.
g_new = df.groupby(by=["city", "wind"])
# Each key of .groups is a (city, wind) tuple mapped to the row labels in it.
print(g_new.groups)
# {('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
#  ('BJ', 3): Int64Index([3, 5], dtype='int64'),
#  ('BJ', 5): Int64Index([0], dtype='int64'),
#  ('GZ', 2): Int64Index([14], dtype='int64'),
#  ('GZ', 4): Int64Index([17], dtype='int64'),
#  ('GZ', 5): Int64Index([15, 16], dtype='int64'),
#  ('SH', 2): Int64Index([10], dtype='int64'),
#  ('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
#  ('SH', 4): Int64Index([6, 11], dtype='int64'),
#  ('SH', 5): Int64Index([12, 13], dtype='int64'),
#  ('SZ', 1): Int64Index([18], dtype='int64'),
#  ('SZ', 4): Int64Index([19], dtype='int64')}
# A single group is fetched with the same kind of tuple key.
print(g_new.get_group(name=("BJ", 3)))
#          data city  temperature  wind
# 3  03/02/2016   BJ           -3     3
# 5  13/03/2016   BJ            5     3

#32
# posted @ 2019-07-04 10:40 nikecode 阅读(392) 评论(0) 收藏举报
# 刷新页面返回顶部
# nikecode
#
# python数据分析-07时间序列处理
#
# 公告