05 - pandas
pandas: 数据分析核心工具包
简介
一维数据对象
In [324]: import pandas as pd
In [325]: pd.Series([2,3,4,5])
Out[325]:
0 2
1 3
2 4
3 5
dtype: int64
In [326]: pd.Series([2,3,4,5],index=['a','b','c','d'])
Out[326]:
a 2
b 3
c 4
d 5
dtype: int64
Series - 使用特性
In [324]: import pandas as pd
In [330]: import numpy as np
In [331]: pd.Series(np.arange(5))
Out[331]:
0 0
1 1
2 2
3 3
4 4
dtype: int32
In [346]: sr = pd.Series([2,3,4,5],index=['a','b','c','d'])
In [347]: sr
Out[347]:
a 2
b 3
c 4
d 5
dtype: int64
In [348]: sr[0]
Out[348]: 2
In [349]: sr["a"]
Out[349]: 2
In [351]: sr+2
Out[351]:
a 4
b 5
c 6
d 7
dtype: int64
In [352]: sr+sr
Out[352]:
a 4
b 6
c 8
d 10
dtype: int64
In [361]: sr[[1,2]]
Out[361]:
b 3
c 4
dtype: int64
In [366]: sr[0:2]
Out[366]:
a 2
b 3
dtype: int64
In [379]: sr[sr>3]
Out[379]:
c 4
d 5
dtype: int64
In [389]: sr = pd.Series({"a":1,"b":2})
In [390]: sr
Out[390]:
a 1
b 2
In [400]: "a" in sr
Out[400]: True
In [405]: for i in sr:
...: print(i)
...:
1
2
In [406]: sr.index
Out[406]: Index(['a', 'b'], dtype='object')
In [407]: sr.index[0]
Out[407]: 'a'
In [408]: sr.index[1]
Out[408]: 'b'
In [409]: sr.values
Out[409]: array([1, 2], dtype=int64)
In [431]: sr = pd.Series([1,2,3,4],index=['a','b','c','d'])
In [411]: sr[["a","b"]] # 花式索引
Out[411]:
a 1
b 2
In [430]: sr['a':'c'] # 切片
Out[430]:
a 1
b 2
c 3
dtype: int64
Series - 整数索引
Series - 数据对齐
In [450]: sr1 = pd.Series([12,23,34],index=["c","a","d"])
In [451]: sr1
Out[451]:
c 12
a 23
d 34
dtype: int64
In [452]: sr2 = pd.Series([11,20,10],index=["d","c","a"])
In [453]: sr1
Out[453]:
c 12
a 23
d 34
dtype: int64
In [454]: sr2
Out[454]:
d 11
c 20
a 10
dtype: int64
In [455]: sr1 + sr2
Out[455]:
a 33
c 32
d 45
dtype: int64
In [456]: sr1 = pd.Series([12,23,34],index=["c","a","d"])
In [458]: sr2 = pd.Series([11,20,10,5],index=["d","c","a","b"])
In [459]: sr1
Out[459]:
c 12
a 23
d 34
dtype: int64
In [460]: sr2
Out[460]:
d 11
c 20
a 10
b 5
dtype: int64
In [461]: sr1 + sr2
Out[461]:
a 33.0
b NaN # 数据缺失值
c 32.0
d 45.0
dtype: float64
In [462]: sr1.add(sr2)
Out[462]:
a 33.0
b NaN
c 32.0
d 45.0
dtype: float64
In [463]: sr1.add(sr2,fill_value=0)
Out[463]:
a 33.0
b 5.0
c 32.0
d 45.0
dtype: float64
缺失值处理:
In [3]: sr1 = pd.Series([12,23,34],index=["c","a","d"])
In [4]: sr2 = pd.Series([11,20,10,5],index=["d","c","a","b"])
In [7]: sr = sr1 + sr2
In [8]: sr
Out[8]:
a 33.0
b NaN
c 32.0
d 45.0
dtype: float64
In [9]: sr.isnull()
Out[9]:
a False
b True
c False
d False
dtype: bool
In [11]: sr.notnull()
Out[11]:
a True
b False
c True
d True
dtype: bool
In [13]: sr[sr.notnull()] # 扔掉值
Out[13]:
a 33.0
c 32.0
d 45.0
dtype: float64
In [15]: sr.dropna() # 扔掉值
Out[15]:
a 33.0
c 32.0
d 45.0
dtype: float64
In [22]: sr.fillna(0) # 填充值 0
Out[22]:
a 33.0
b 0.0
c 32.0
d 45.0
dtype: float64
In [27]: sr.fillna(sr.mean()) # 填充 平均数
Out[27]:
a 33.000000
b 36.666667
c 32.000000
d 45.000000
dtype: float64
series小结:
数组 + 字典
整数索引 loc 和 iloc
数据对齐 nan
缺失数据处理 dropna fillna
DataFrame - 二维数据对象
In [29]: pd.DataFrame({'one':[1,2,3],"two":[4,5,6]})
Out[29]:
one two
0 1 4
1 2 5
2 3 6
In [31]: pd.DataFrame({'one':[1,2,3],"two":[4,5,6]},index=["a","b","c"])
Out[31]:
one two
a 1 4
b 2 5
c 3 6
In [34]: pd.DataFrame({'one':pd.Series([1,2,3],index=['a','b','c']),'two':pd.Series([1,2,3,4],index=['b','a','c','d'])}
...: )
Out[34]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d NaN 4
In [46]: pd.read_csv('test.csv') # 读取
Out[46]:
a b c
0 1 2 3
1 4 5 6
2 7 8 9
In [49]: df.to_csv('test.csv') # 写入
In [50]: df
Out[50]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d NaN 4
DataFrame - 常用属性
In [60]: df
Out[60]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d NaN 4
In [61]: df.index
Out[61]: Index(['a', 'b', 'c', 'd'], dtype='object')
In [62]: df.values
Out[62]:
array([[ 1., 2.],
[ 2., 1.],
[ 3., 3.],
[nan, 4.]])
In [63]: df.T
Out[63]:
a b c d
one 1.0 2.0 3.0 NaN
two 2.0 1.0 3.0 4.0
In [66]: df.columns
Out[66]: Index(['one', 'two'], dtype='object')
In [69]: df.describe()
Out[69]:
one two
count 3.0 4.000000
mean 2.0 2.500000
std 1.0 1.290994
min 1.0 1.000000
25% 1.5 1.750000
50% 2.0 2.500000
75% 2.5 3.250000
max 3.0 4.000000
DataFrame小结:
index 行索引
columns 列索引
values 值 (二维数组)
T 转置
describe()
DataFrame索引与切片:
In [93]: df
Out[93]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d NaN 4
In [94]: df['one']['a']
Out[94]: 1.0
In [95]: df.loc['a','one'] # 一般用这种
Out[95]: 1.0
In [109]: df.loc['a']
Out[109]:
one 1.0
two 2.0
Name: a, dtype: float64
In [114]: df.loc[['a','c'],:]
Out[114]:
one two
a 1.0 2
c 3.0 3
In [116]: df.loc[['a','c'],'two']
Out[116]:
a 2
c 3
Name: two, dtype: int64
DataFrame - 索引和切片,数据对齐与缺失数据
In [119]: df = pd.DataFrame({'two':[1,2,3,4],'one':[4,5,6,7]},index=['c','d','b','a'])
In [124]: df
Out[124]:
two one
c 1 4
d 2 5
b 3 6
a 4 7
In [125]: df2
Out[125]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d NaN 4
In [126]: df + df2
Out[126]:
one two
a 8.0 6
b 8.0 4
c 7.0 4
d NaN 6
In [129]: df2
Out[129]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d NaN 4
In [130]: df2.fillna(0)
Out[130]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
d 0.0 4
In [134]: df2.dropna()
Out[134]:
one two
a 1.0 2
b 2.0 1
c 3.0 3
In [140]: df2.loc['d','two'] = np.nan
In [141]: df2.loc['c','two'] = np.nan
In [142]: df2
Out[142]:
one two
a 1.0 2.0
b 2.0 1.0
c 3.0 NaN
d NaN NaN
In [145]: df2.dropna(how="all")
Out[145]:
one two
a 1.0 2.0
b 2.0 1.0
c 3.0 NaN
In [146]: df2.dropna(how="any")
Out[146]:
one two
a 1.0 2.0
b 2.0 1.0
In [143]: df2.dropna()
Out[143]:
one two
a 1.0 2.0
b 2.0 1.0
In [156]: df
Out[156]:
two one
c NaN 4
d 2.0 5
b 3.0 6
a 4.0 7
In [157]: df.dropna(axis=1)
Out[157]:
one
c 4
d 5
b 6
a 7
pandas - 其他常用方法
In [167]: df
Out[167]:
two one
c NaN 4
d 2.0 5
b 3.0 6
a 4.0 7
In [168]: df.mean()
Out[168]:
two 3.0
one 5.5
dtype: float64
In [169]: df.mean(axis=1)
Out[169]:
c 4.0
d 3.5
b 4.5
a 5.5
dtype: float64
In [170]: df.sum()
Out[170]:
two 9.0
one 22.0
dtype: float64
In [173]: df.sum(axis=1)
Out[173]:
c 4.0
d 7.0
b 9.0
a 11.0
dtype: float64
In [190]: df.sort_values(by="two")
Out[190]:
two one
d 2.0 5
b 3.0 6
a 4.0 7
c NaN 4
In [189]: df.sort_values(by="two",ascending=False)
Out[189]:
two one
a 4.0 7
b 3.0 6
d 2.0 5
c NaN 4
In [187]: df.sort_values(by="a",ascending=False,axis=1)
Out[187]:
one two
c 4 NaN
d 5 2.0
b 6 3.0
a 7 4.0
In [192]: df.sort_index()
Out[192]:
two one
a 4.0 7
b 3.0 6
c NaN 4
d 2.0 5
In [194]: df.sort_index(ascending=False)
Out[194]:
two one
d 2.0 5
c NaN 4
b 3.0 6
a 4.0 7
In [200]: df.sort_index(ascending=False,axis=1)
Out[200]:
two one
c NaN 4
d 2.0 5
b 3.0 6
a 4.0 7
pandas - 时间对象处理
In [202]: import datetime
In [205]: datetime.datetime.strptime("2010/01/01",'%Y/%m/%d')
Out[205]: datetime.datetime(2010, 1, 1, 0, 0)
In [206]: datetime.datetime.strptime("2010-01-01",'%Y-%m-%d')
Out[206]: datetime.datetime(2010, 1, 1, 0, 0)
In [207]: import dateutil
In [208]: dateutil.parser.parse('2001-1-1')
Out[208]: datetime.datetime(2001, 1, 1, 0, 0)
In [209]: dateutil.parser.parse('2001/01/01')
Out[209]: datetime.datetime(2001, 1, 1, 0, 0)
In [210]: dateutil.parser.parse('2001-01-01')
Out[210]: datetime.datetime(2001, 1, 1, 0, 0)
In [211]: dateutil.parser.parse('02/03/2001')
Out[211]: datetime.datetime(2001, 2, 3, 0, 0)
In [212]: dateutil.parser.parse('2001-JAN-01')
Out[212]: datetime.datetime(2001, 1, 1, 0, 0)
In [213]: pd.to_datetime(['2001-01-01','2010/Feb/02'])
Out[213]: DatetimeIndex(['2001-01-01', '2010-02-02'], dtype='datetime64[ns]', freq=None)
In [216]: pd.date_range('2010-01-01','2010-5-1')
Out[216]:
DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
'2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
'2010-01-09', '2010-01-10',
...
'2010-04-22', '2010-04-23', '2010-04-24', '2010-04-25',
'2010-04-26', '2010-04-27', '2010-04-28', '2010-04-29',
'2010-04-30', '2010-05-01'],
dtype='datetime64[ns]', length=121, freq='D')
In [217]: pd.date_range('2010-01-01',periods=60) # 60天的
Out[217]:
DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
'2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
'2010-01-09', '2010-01-10', '2010-01-11', '2010-01-12',
'2010-01-13', '2010-01-14', '2010-01-15', '2010-01-16',
'2010-01-17', '2010-01-18', '2010-01-19', '2010-01-20',
'2010-01-21', '2010-01-22', '2010-01-23', '2010-01-24',
'2010-01-25', '2010-01-26', '2010-01-27', '2010-01-28',
'2010-01-29', '2010-01-30', '2010-01-31', '2010-02-01',
'2010-02-02', '2010-02-03', '2010-02-04', '2010-02-05',
'2010-02-06', '2010-02-07', '2010-02-08', '2010-02-09',
'2010-02-10', '2010-02-11', '2010-02-12', '2010-02-13',
'2010-02-14', '2010-02-15', '2010-02-16', '2010-02-17',
'2010-02-18', '2010-02-19', '2010-02-20', '2010-02-21',
'2010-02-22', '2010-02-23', '2010-02-24', '2010-02-25',
'2010-02-26', '2010-02-27', '2010-02-28', '2010-03-01'],
dtype='datetime64[ns]', freq='D')
In [218]: pd.date_range('2010-01-01',periods=60,freq='h') # 按小时算的
Out[218]:
DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 01:00:00',
'2010-01-01 02:00:00', '2010-01-01 03:00:00',
'2010-01-01 04:00:00', '2010-01-01 05:00:00',
'2010-01-01 06:00:00', '2010-01-01 07:00:00',
'2010-01-01 08:00:00', '2010-01-01 09:00:00',
'2010-01-01 10:00:00', '2010-01-01 11:00:00',
'2010-01-01 12:00:00', '2010-01-01 13:00:00',
'2010-01-01 14:00:00', '2010-01-01 15:00:00',
'2010-01-01 16:00:00', '2010-01-01 17:00:00',
'2010-01-01 18:00:00', '2010-01-01 19:00:00',
'2010-01-01 20:00:00', '2010-01-01 21:00:00',
'2010-01-01 22:00:00', '2010-01-01 23:00:00',
'2010-01-02 00:00:00', '2010-01-02 01:00:00',
'2010-01-02 02:00:00', '2010-01-02 03:00:00',
'2010-01-02 04:00:00', '2010-01-02 05:00:00',
'2010-01-02 06:00:00', '2010-01-02 07:00:00',
'2010-01-02 08:00:00', '2010-01-02 09:00:00',
'2010-01-02 10:00:00', '2010-01-02 11:00:00',
'2010-01-02 12:00:00', '2010-01-02 13:00:00',
'2010-01-02 14:00:00', '2010-01-02 15:00:00',
'2010-01-02 16:00:00', '2010-01-02 17:00:00',
'2010-01-02 18:00:00', '2010-01-02 19:00:00',
'2010-01-02 20:00:00', '2010-01-02 21:00:00',
'2010-01-02 22:00:00', '2010-01-02 23:00:00',
'2010-01-03 00:00:00', '2010-01-03 01:00:00',
'2010-01-03 02:00:00', '2010-01-03 03:00:00',
'2010-01-03 04:00:00', '2010-01-03 05:00:00',
'2010-01-03 06:00:00', '2010-01-03 07:00:00',
'2010-01-03 08:00:00', '2010-01-03 09:00:00',
'2010-01-03 10:00:00', '2010-01-03 11:00:00'],
dtype='datetime64[ns]', freq='H')
In [219]: pd.date_range('2010-01-01',periods=60,freq='w') # 每个周日输出
Out[219]:
DatetimeIndex(['2010-01-03', '2010-01-10', '2010-01-17', '2010-01-24',
'2010-01-31', '2010-02-07', '2010-02-14', '2010-02-21',
'2010-02-28', '2010-03-07', '2010-03-14', '2010-03-21',
'2010-03-28', '2010-04-04', '2010-04-11', '2010-04-18',
'2010-04-25', '2010-05-02', '2010-05-09', '2010-05-16',
'2010-05-23', '2010-05-30', '2010-06-06', '2010-06-13',
'2010-06-20', '2010-06-27', '2010-07-04', '2010-07-11',
'2010-07-18', '2010-07-25', '2010-08-01', '2010-08-08',
'2010-08-15', '2010-08-22', '2010-08-29', '2010-09-05',
'2010-09-12', '2010-09-19', '2010-09-26', '2010-10-03',
'2010-10-10', '2010-10-17', '2010-10-24', '2010-10-31',
'2010-11-07', '2010-11-14', '2010-11-21', '2010-11-28',
'2010-12-05', '2010-12-12', '2010-12-19', '2010-12-26',
'2011-01-02', '2011-01-09', '2011-01-16', '2011-01-23',
'2011-01-30', '2011-02-06', '2011-02-13', '2011-02-20'],
dtype='datetime64[ns]', freq='W-SUN')
In [220]: pd.date_range('2010-01-01',periods=60,freq='w-MON') # 每个周一输出
Out[220]:
DatetimeIndex(['2010-01-04', '2010-01-11', '2010-01-18', '2010-01-25',
'2010-02-01', '2010-02-08', '2010-02-15', '2010-02-22',
'2010-03-01', '2010-03-08', '2010-03-15', '2010-03-22',
'2010-03-29', '2010-04-05', '2010-04-12', '2010-04-19',
'2010-04-26', '2010-05-03', '2010-05-10', '2010-05-17',
'2010-05-24', '2010-05-31', '2010-06-07', '2010-06-14',
'2010-06-21', '2010-06-28', '2010-07-05', '2010-07-12',
'2010-07-19', '2010-07-26', '2010-08-02', '2010-08-09',
'2010-08-16', '2010-08-23', '2010-08-30', '2010-09-06',
'2010-09-13', '2010-09-20', '2010-09-27', '2010-10-04',
'2010-10-11', '2010-10-18', '2010-10-25', '2010-11-01',
'2010-11-08', '2010-11-15', '2010-11-22', '2010-11-29',
'2010-12-06', '2010-12-13', '2010-12-20', '2010-12-27',
'2011-01-03', '2011-01-10', '2011-01-17', '2011-01-24',
'2011-01-31', '2011-02-07', '2011-02-14', '2011-02-21'],
dtype='datetime64[ns]', freq='W-MON')
In [221]: pd.date_range('2010-01-01',periods=60,freq='B') # 只有工作日出现
Out[221]:
DatetimeIndex(['2010-01-01', '2010-01-04', '2010-01-05', '2010-01-06',
'2010-01-07', '2010-01-08', '2010-01-11', '2010-01-12',
'2010-01-13', '2010-01-14', '2010-01-15', '2010-01-18',
'2010-01-19', '2010-01-20', '2010-01-21', '2010-01-22',
'2010-01-25', '2010-01-26', '2010-01-27', '2010-01-28',
'2010-01-29', '2010-02-01', '2010-02-02', '2010-02-03',
'2010-02-04', '2010-02-05', '2010-02-08', '2010-02-09',
'2010-02-10', '2010-02-11', '2010-02-12', '2010-02-15',
'2010-02-16', '2010-02-17', '2010-02-18', '2010-02-19',
'2010-02-22', '2010-02-23', '2010-02-24', '2010-02-25',
'2010-02-26', '2010-03-01', '2010-03-02', '2010-03-03',
'2010-03-04', '2010-03-05', '2010-03-08', '2010-03-09',
'2010-03-10', '2010-03-11', '2010-03-12', '2010-03-15',
'2010-03-16', '2010-03-17', '2010-03-18', '2010-03-19',
'2010-03-22', '2010-03-23', '2010-03-24', '2010-03-25'],
dtype='datetime64[ns]', freq='B')
In [237]: pd.date_range('2010-01-01',periods=60,freq='1h20min') # 1小时20分钟
Out[237]:
DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 01:20:00',
'2010-01-01 02:40:00', '2010-01-01 04:00:00',
'2010-01-01 05:20:00', '2010-01-01 06:40:00',
'2010-01-01 08:00:00', '2010-01-01 09:20:00',
'2010-01-01 10:40:00', '2010-01-01 12:00:00',
'2010-01-01 13:20:00', '2010-01-01 14:40:00',
'2010-01-01 16:00:00', '2010-01-01 17:20:00',
'2010-01-01 18:40:00', '2010-01-01 20:00:00',
'2010-01-01 21:20:00', '2010-01-01 22:40:00',
'2010-01-02 00:00:00', '2010-01-02 01:20:00',
'2010-01-02 02:40:00', '2010-01-02 04:00:00',
'2010-01-02 05:20:00', '2010-01-02 06:40:00',
'2010-01-02 08:00:00', '2010-01-02 09:20:00',
'2010-01-02 10:40:00', '2010-01-02 12:00:00',
'2010-01-02 13:20:00', '2010-01-02 14:40:00',
'2010-01-02 16:00:00', '2010-01-02 17:20:00',
'2010-01-02 18:40:00', '2010-01-02 20:00:00',
'2010-01-02 21:20:00', '2010-01-02 22:40:00',
'2010-01-03 00:00:00', '2010-01-03 01:20:00',
'2010-01-03 02:40:00', '2010-01-03 04:00:00',
'2010-01-03 05:20:00', '2010-01-03 06:40:00',
'2010-01-03 08:00:00', '2010-01-03 09:20:00',
'2010-01-03 10:40:00', '2010-01-03 12:00:00',
'2010-01-03 13:20:00', '2010-01-03 14:40:00',
'2010-01-03 16:00:00', '2010-01-03 17:20:00',
'2010-01-03 18:40:00', '2010-01-03 20:00:00',
'2010-01-03 21:20:00', '2010-01-03 22:40:00',
'2010-01-04 00:00:00', '2010-01-04 01:20:00',
'2010-01-04 02:40:00', '2010-01-04 04:00:00',
'2010-01-04 05:20:00', '2010-01-04 06:40:00'],
dtype='datetime64[ns]', freq='80T')
In [234]: dt
Out[234]:
DatetimeIndex(['2010-01-01', '2010-01-04', '2010-01-05', '2010-01-06',
'2010-01-07', '2010-01-08', '2010-01-11', '2010-01-12',
'2010-01-13', '2010-01-14', '2010-01-15', '2010-01-18',
'2010-01-19', '2010-01-20', '2010-01-21', '2010-01-22',
'2010-01-25', '2010-01-26', '2010-01-27', '2010-01-28',
'2010-01-29', '2010-02-01', '2010-02-02', '2010-02-03',
'2010-02-04', '2010-02-05', '2010-02-08', '2010-02-09',
'2010-02-10', '2010-02-11', '2010-02-12', '2010-02-15',
'2010-02-16', '2010-02-17', '2010-02-18', '2010-02-19',
'2010-02-22', '2010-02-23', '2010-02-24', '2010-02-25',
'2010-02-26', '2010-03-01', '2010-03-02', '2010-03-03',
'2010-03-04', '2010-03-05', '2010-03-08', '2010-03-09',
'2010-03-10', '2010-03-11', '2010-03-12', '2010-03-15',
'2010-03-16', '2010-03-17', '2010-03-18', '2010-03-19',
'2010-03-22', '2010-03-23', '2010-03-24', '2010-03-25'],
dtype='datetime64[ns]', freq='B')
In [236]: dt[0].to_pydatetime()
Out[236]: datetime.datetime(2010, 1, 1, 0, 0)
pandas - 时间序列
In [239]: sr = pd.Series(np.arange(100),index=pd.date_range('2017-01-01',periods=100))
In [240]: sr
Out[240]:
2017-01-01 0
2017-01-02 1
2017-01-03 2
2017-01-04 3
2017-01-05 4
2017-01-06 5
2017-01-07 6
2017-01-08 7
2017-01-09 8
2017-01-10 9
2017-01-11 10
2017-01-12 11
2017-01-13 12
2017-01-14 13
2017-01-15 14
2017-01-16 15
2017-01-17 16
2017-01-18 17
2017-01-19 18
2017-01-20 19
2017-01-21 20
2017-01-22 21
2017-01-23 22
2017-01-24 23
2017-01-25 24
2017-01-26 25
2017-01-27 26
2017-01-28 27
2017-01-29 28
2017-01-30 29
..
2017-03-12 70
2017-03-13 71
2017-03-14 72
2017-03-15 73
2017-03-16 74
2017-03-17 75
2017-03-18 76
2017-03-19 77
2017-03-20 78
2017-03-21 79
2017-03-22 80
2017-03-23 81
2017-03-24 82
2017-03-25 83
2017-03-26 84
2017-03-27 85
2017-03-28 86
2017-03-29 87
2017-03-30 88
2017-03-31 89
2017-04-01 90
2017-04-02 91
2017-04-03 92
2017-04-04 93
2017-04-05 94
2017-04-06 95
2017-04-07 96
2017-04-08 97
2017-04-09 98
2017-04-10 99
Freq: D, Length: 100, dtype: int32
In [241]: sr.index
Out[241]:
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
'2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
'2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
'2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
'2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20',
'2017-01-21', '2017-01-22', '2017-01-23', '2017-01-24',
'2017-01-25', '2017-01-26', '2017-01-27', '2017-01-28',
'2017-01-29', '2017-01-30', '2017-01-31', '2017-02-01',
'2017-02-02', '2017-02-03', '2017-02-04', '2017-02-05',
'2017-02-06', '2017-02-07', '2017-02-08', '2017-02-09',
'2017-02-10', '2017-02-11', '2017-02-12', '2017-02-13',
'2017-02-14', '2017-02-15', '2017-02-16', '2017-02-17',
'2017-02-18', '2017-02-19', '2017-02-20', '2017-02-21',
'2017-02-22', '2017-02-23', '2017-02-24', '2017-02-25',
'2017-02-26', '2017-02-27', '2017-02-28', '2017-03-01',
'2017-03-02', '2017-03-03', '2017-03-04', '2017-03-05',
'2017-03-06', '2017-03-07', '2017-03-08', '2017-03-09',
'2017-03-10', '2017-03-11', '2017-03-12', '2017-03-13',
'2017-03-14', '2017-03-15', '2017-03-16', '2017-03-17',
'2017-03-18', '2017-03-19', '2017-03-20', '2017-03-21',
'2017-03-22', '2017-03-23', '2017-03-24', '2017-03-25',
'2017-03-26', '2017-03-27', '2017-03-28', '2017-03-29',
'2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02',
'2017-04-03', '2017-04-04', '2017-04-05', '2017-04-06',
'2017-04-07', '2017-04-08', '2017-04-09', '2017-04-10'],
dtype='datetime64[ns]', freq='D')
In [244]: sr['2017-03'] # 只选取3月的
Out[244]:
2017-03-01 59
2017-03-02 60
2017-03-03 61
2017-03-04 62
2017-03-05 63
2017-03-06 64
2017-03-07 65
2017-03-08 66
2017-03-09 67
2017-03-10 68
2017-03-11 69
2017-03-12 70
2017-03-13 71
2017-03-14 72
2017-03-15 73
2017-03-16 74
2017-03-17 75
2017-03-18 76
2017-03-19 77
2017-03-20 78
2017-03-21 79
2017-03-22 80
2017-03-23 81
2017-03-24 82
2017-03-25 83
2017-03-26 84
2017-03-27 85
2017-03-28 86
2017-03-29 87
2017-03-30 88
2017-03-31 89
Freq: D, dtype: int32
In [254]: sr['2017':'2018-03'] # 也可以
。。。
In [255]: sr['2017-12-25':'2018-02-01'] # 也可以
。。。
In [260]: sr.resample('W').sum() # 每周的和
Out[260]:
2017-01-01 0
2017-01-08 28
2017-01-15 77
2017-01-22 126
2017-01-29 175
2017-02-05 224
In [261]: sr.resample('M').sum() # 每月的和
Out[261]:
2017-01-31 465
2017-02-28 1246
2017-03-31 2294
2017-04-30 3135
2017-05-31 4185
2017-06-30 4965
2017-07-31 6076
In [262]: sr.resample('M').mean() # 每月的平均
Out[262]:
2017-01-31 15.0
2017-02-28 44.5
2017-03-31 74.0
2017-04-30 104.5
2017-05-31 135.0
In [265]: sr.truncate(before='2018-2-3') # 切走之前的
Out[265]:
2018-02-03 398
2018-02-04 399
2018-02-05 400
2018-02-06 401
2018-02-07 402
2018-02-08 403
pandas - 文件处理
In [318]: pd.read_csv('test.csv')
Out[318]:
Unnamed: 0 one two date
0 a 1.0 2 2018/1/2
1 b 2.0 1 2018/1/3
2 c 3.0 3 2018/1/4
3 d NaN 4 2018/1/5
4 NaN 1.0 3 2018/1/6
5 NaN 2.0 3 2018/1/7
In [320]: pd.read_csv('test.csv',index_col='date') # 指定索引列。s = _ / s.index / date是字符串,不是时间对象
Out[320]:
Unnamed: 0 one two
date
2018/1/2 a 1.0 2
2018/1/3 b 2.0 1
2018/1/4 c 3.0 3
2018/1/5 d NaN 4
2018/1/6 NaN 1.0 3
2018/1/7 NaN 2.0 3
In [324]: pd.read_csv('test.csv',index_col='date', parse_dates=True) # 指定索引列。s = _ / s.index / date是时间对象
Out[324]:
Unnamed: 0 one two
date
2018-01-02 a 1.0 2
2018-01-03 b 2.0 1
2018-01-04 c 3.0 3
2018-01-05 d NaN 4
2018-01-06 NaN 1.0 3
2018-01-07 NaN 2.0 3
In [331]: pd.read_csv('test.csv',index_col='date', parse_dates=['date']) # 也可以, 指定索引列。s = _ / s.index / date是时间对象
。。。
文件没有列名的情况:
In [336]: pd.read_csv("test.csv")
Out[336]:
a 1 2 2018/1/2
0 b 2.0 1 2018/1/3
1 c 3.0 3 2018/1/4
2 d NaN 4 2018/1/5
3 NaN 1.0 3 2018/1/6
4 NaN 2.0 3 2018/1/7
In [337]: pd.read_csv("test.csv",header=None) # 自动生成列名 0 1 2 3 ,第一行不解释为name
Out[337]:
0 1 2 3
0 a 1.0 2 2018/1/2
1 b 2.0 1 2018/1/3
2 c 3.0 3 2018/1/4
3 d NaN 4 2018/1/5
4 NaN 1.0 3 2018/1/6
5 NaN 2.0 3 2018/1/7
In [342]: pd.read_csv("test.csv",names=['a','b','c','d']) # 指定生成的列名
Out[342]:
a b c d
0 a 1.0 2 2018/1/2
1 b 2.0 1 2018/1/3
2 c 3.0 3 2018/1/4
3 d NaN 4 2018/1/5
4 NaN 1.0 3 2018/1/6
5 NaN 2.0 3 2018/1/7
In [343]: pd.read_csv("test.csv",names=['a','b','c','d'],skiprows=[0,1]) # 跳过某些行
...
NaN会解释成浮点数,但None会解释成object
In [372]: pd.read_csv("test.csv",header=None,na_values=['None']) # 指定None被解释成NaN
这样缺失值,。。。都会被解释成NaN,被解释成浮点数,否则就会有object
In [431]: pd.read_*? # pandas可读取的文件
pd.read_clipboard
pd.read_csv
pd.read_excel
pd.read_feather
pd.read_fwf
pd.read_gbq
pd.read_hdf
pd.read_html
pd.read_json
pd.read_msgpack
pd.read_parquet
pd.read_pickle
pd.read_sas
pd.read_sql
pd.read_sql_query
pd.read_sql_table
pd.read_stata
pd.read_table
In [429]: df
Out[429]:
0 1 2 3
0 NaN 1.0 2 2018/1/2
1 b NaN 1 2018/1/3
2 c 3.0 3 2018/1/4
3 d NaN 4 2018/1/5
4 NaN 1.0 3 2018/1/6
5 NaN 2.0 3 2018/1/7
In [432]: df.to_*? # 可写进去的文件类型
df.to_clipboard
df.to_csv
df.to_dense
df.to_dict
df.to_excel # 需要安装 xlrd 模块
df.to_feather
df.to_gbq
df.to_hdf
df.to_html
df.to_json
df.to_latex
df.to_msgpack
df.to_panel
df.to_parquet
df.to_period
df.to_pickle
df.to_records
df.to_sparse
df.to_sql
df.to_stata
df.to_string
df.to_timestamp
df.to_xarray