# pandas-Notes1
# -*- coding: utf-8 -*-
# Notes on pandas basics: creating Series/DataFrame objects, inspecting and
# sorting them, selecting data (by label, by position, by boolean mask) and
# handling missing values (NaN).
#
# The triple-quoted blocks reproduce the output the author captured while
# writing the notes (Python 2 era: note the u'' prefixes and Int64Index).
# `df` is filled with unseeded random numbers and the captured blocks do not
# all come from the same run, so re-running prints different values.
#
# Every print call passes exactly one argument, so the script parses the
# same way on Python 2 and Python 3.
import pandas as pd
import numpy as np

# matplotlib is not used by any statement in this part of the notes, so a
# missing installation should not stop them from running.
# NOTE(review): `plt` conventionally names `matplotlib.pyplot`; the alias for
# the top-level package is kept exactly as originally written -- confirm
# against any later code that uses `plt`.
try:
    import matplotlib as plt
except ImportError:
    plt = None

# Series: like a vector, displayed vertically.
s = pd.Series([1, 2, np.nan, 3])
print(s)
'''
0    1.0
1    2.0
2    NaN
3    3.0
dtype: float64
'''

##################################################
# pd.DataFrame: like data.frame in R
# Create a DataFrame from a matrix.
# freq='D' in the output below means daily frequency.
dates = pd.date_range('20170601', periods=6)
print(dates)
'''
DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
               '2017-06-05', '2017-06-06'],
              dtype='datetime64[ns]', freq='D')
'''

# np.random.randn(d0, d1, ..., dn): here it returns a 6x4 matrix of random
# floats sampled from a univariate "normal" distribution with mean 0 and
# variance 1.
# index gives the row names; columns gives the column names.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
'''
                   A         B         C         D
2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
2017-06-02  0.267148 -0.599783  2.143011  1.211458
2017-06-03 -1.516629  1.228905  0.949323  0.127440
2017-06-04 -0.509237  0.387529  0.108155 -0.478422
2017-06-05  0.600630  0.776675  1.906076 -0.382445
2017-06-06  0.566325  1.189855  0.206210  2.334218
'''

# Create a DataFrame from a dict of objects.
df2 = pd.DataFrame({
    # float64
    'A': 1.,
    # pandas' date class, datetime64[ns]
    'B': pd.Timestamp('20170601'),
    # index gives the row names
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    # array
    'D': np.array([1] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    # Author's note: 'F': 'foo' raised an error for them, so a Series is
    # used instead.
    'F': pd.Series(['foo'] * 4, dtype='object'),
})
print(df2.dtypes)
'''
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
'''
print(df2)
'''
     A          B    C  D      E    F
0  1.0 2017-06-01  1.0  1   test  foo
1  1.0 2017-06-01  1.0  1  train  foo
2  1.0 2017-06-01  1.0  1   test  foo
3  1.0 2017-06-01  1.0  1  train  foo
'''

# View the first n rows, the last n rows, the row index and the column names.
print(df2.head(2))
print(df2.tail(3))
print(df2.index)
print(df2.columns)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
'''

# The raw values, without index and columns.
print(df2.values)

# Statistical summary of the data.
print(df.describe())
'''
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.175955  0.657275  0.854328  0.268951
std    0.817537  0.688410  0.983534  1.289192
min   -1.516629 -0.599783 -0.186808 -1.198540
25%   -0.497919  0.484815  0.132669 -0.454428
50%   -0.098408  0.868573  0.577766 -0.127502
75%    0.491531  1.132509  1.666888  0.940453
max    0.600630  1.228905  2.143011  2.334218
'''

# Transpose the data.
print(df.T)

print(df)
'''
                   A         B         C         D
2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
2017-06-02  0.267148 -0.599783  2.143011  1.211458
2017-06-03 -1.516629  1.228905  0.949323  0.127440
2017-06-04 -0.509237  0.387529  0.108155 -0.478422
2017-06-05  0.600630  0.776675  1.906076 -0.382445
2017-06-06  0.566325  1.189855  0.206210  2.334218
'''

# axis=0 sorts by the row index; axis=1 sorts by the column names.
print(df.sort_index(axis=0, ascending=False))
'''
                   A         B         C         D
2017-06-06  0.566325  1.189855  0.206210  2.334218
2017-06-05  0.600630  0.776675  1.906076 -0.382445
2017-06-04 -0.509237  0.387529  0.108155 -0.478422
2017-06-03 -1.516629  1.228905  0.949323  0.127440
2017-06-02  0.267148 -0.599783  2.143011  1.211458
2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
'''

# Sort the rows by the values in column 'B'.
print(df.sort_values(by='B'))
'''
                   A         B         C         D
2017-06-02  0.267148 -0.599783  2.143011  1.211458
2017-06-04 -0.509237  0.387529  0.108155 -0.478422
2017-06-05  0.600630  0.776675  1.906076 -0.382445
2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
2017-06-06  0.566325  1.189855  0.206210  2.334218
2017-06-03 -1.516629  1.228905  0.949323  0.127440
'''

##################################################
# Extract data from a DataFrame
##################################################

# Simple getting.
# Slice rows, either by position or by index label.
print(df[0:3])
print(df['20170601':'20170603'])
# Select one column; returns a Series.
print(df['A'])

# By label.
# Print the first row.
print(df.loc[dates[0]])
# Select some rows and some columns.
print(df.loc[:, ['A', 'B']])
# For fast access to a single scalar, use `at`.
print(df.at[dates[0], 'A'])

# By position.
# Print the first row.
print(df.iloc[0])
print(df.iloc[3:5, 0:2])
# Even faster scalar access: `iat` takes integer positions only;
# slices (`:`) are not allowed.
print(df.iat[1, 1])

# Boolean indexing.
print(df)
'''
                   A         B         C         D
2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
2017-06-02  0.267148 -0.599783  2.143011  1.211458
2017-06-03 -1.516629  1.228905  0.949323  0.127440
2017-06-04 -0.509237  0.387529  0.108155 -0.478422
2017-06-05  0.600630  0.776675  1.906076 -0.382445
2017-06-06  0.566325  1.189855  0.206210  2.334218
'''

# Print the rows where column A > 0.
print(df[df.A > 0])
'''
                   A         B         C         D
2017-06-02  0.267148 -0.599783  2.143011  1.211458
2017-06-05  0.600630  0.776675  1.906076 -0.382445
2017-06-06  0.566325  1.189855  0.206210  2.334218
'''

# Print only the positive values; the others become NaN.
print(df[df > 0])
'''
                   A         B         C         D
2017-06-01       NaN  0.960470       NaN       NaN
2017-06-02  0.267148       NaN  2.143011  1.211458
2017-06-03       NaN  1.228905  0.949323  0.127440
2017-06-04       NaN  0.387529  0.108155       NaN
2017-06-05  0.600630  0.776675  1.906076       NaN
2017-06-06  0.566325  1.189855  0.206210  2.334218
'''

# Copy a DataFrame, then add a column to the copy.
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'five']
print(df3)
'''
                   A         B         C         D      E
2017-06-01 -0.463965  0.960470 -0.186808 -1.198540    one
2017-06-02  0.267148 -0.599783  2.143011  1.211458    one
2017-06-03 -1.516629  1.228905  0.949323  0.127440    two
2017-06-04 -0.509237  0.387529  0.108155 -0.478422  three
2017-06-05  0.600630  0.776675  1.906076 -0.382445   four
2017-06-06  0.566325  1.189855  0.206210  2.334218   five
'''

# Print the rows whose E value is 'two' or 'five'.
print(df3[df3['E'].isin(['two', 'five'])])
'''
                   A         B         C         D     E
2017-06-03 -1.516629  1.228905  0.949323  0.127440   two
2017-06-06  0.566325  1.189855  0.206210  2.334218  five
'''

# Add another column (a Series could be used instead of an array).
df3.loc[:, 'F'] = np.array(['hello'] * len(df3))
print(df3)
'''
                   A         B         C         D      E      F
2017-06-01 -0.246362 -1.968794  0.596064  1.656667    one  hello
2017-06-02  0.212728  0.931468 -0.977221 -1.709449    one  hello
2017-06-03 -0.129513  1.911554  0.998007  0.867370    two  hello
2017-06-04  0.688660  0.010904 -0.391857  1.546751  three  hello
2017-06-05  0.283462  0.082037 -1.050666  1.092778   four  hello
2017-06-06 -1.084382  0.560529 -1.497804 -0.709840   five  hello
'''

##################################################
# NaN
##################################################

# `dates` was defined near the top of these notes.
# reindex: change/add/delete index entries.
df4 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# Values that were never initialised are NaN; set a single cell of 'E'.
df4.loc[dates[0], 'E'] = 1
print(df4)
'''
                   A         B         C         D    E
2017-06-01  0.142853  0.380009 -1.268463  0.463704  1.0
2017-06-02  0.831730  1.615873  0.657926  1.323841  NaN
2017-06-03 -0.739303  0.524235  0.877496  1.065300  NaN
2017-06-04  0.785783 -0.655868  0.631207  1.365685  NaN
'''

# Check for NaN: returns a DataFrame filled with True or False.
print(pd.isnull(df4))

# Drop the rows that contain any NaN.
print(df4.dropna(how='any'))
'''
                   A         B         C         D    E
2017-06-01  0.071516  0.377737  1.203327  0.711661  1.0
'''

# Fill NaN with a given number.
print(df4.fillna(value=5))