pandas-Notes2
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dates = pd.date_range('20170601', periods=6)
# make a random 6*4 matrix
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

# basic statistics. missing data is excluded by default
# mean. column means by default
print(df.mean())
'''
A   -0.640908
B   -0.216183
C    0.316962
D   -0.634263
dtype: float64
'''
# row means
print(df.mean(axis=1))

# shift moves the values down by 2 positions
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
# print(s)
'''
2017-06-01    NaN
2017-06-02    NaN
2017-06-03    1.0
2017-06-04    3.0
2017-06-05    5.0
2017-06-06    NaN
Freq: D, dtype: float64
'''

# df - s: pandas aligns the Series with the DataFrame along the index
# sub() returns a new DataFrame; df itself is not modified
print(df)
print(df.sub(s, axis='index'))

# cumulative sum along rows; the default is along columns
print(df.apply(np.cumsum, axis=1))
# apply a lambda
print(df.apply(lambda x: x.max() - x.min()))

s = pd.Series(np.random.randint(0, 7, size=10))
# there are duplicate values
# value_counts behaves like a histogram
print(s.value_counts())

# string methods
# s.str.lower() converts every element to lowercase (see the sketch at the end of these notes)

print(df)
# first 3 rows. slicing indexes rows by default
print(df[:3])

# concat takes a list of pieces
pieces = [df[:3], df[4:]]
print(pd.concat(pieces))

# join
left = pd.DataFrame({'key': ['1', '2'], 'lvar': ['leftVar1', 'leftVar2']})
right = pd.DataFrame({'key': ['1', '2'], 'rvar': ['rightVar1', 'rightVar2']})
print(left)
print(right)
# merge rows that share the same key value
print(pd.merge(left, right, on='key'))
'''
  key      lvar       rvar
0   1  leftVar1  rightVar1
1   2  leftVar2  rightVar2
'''

# append. add a row to the tail
# with ignore_index=False the row keeps its original index label; with True the result is reindexed 0...n (int)
# (DataFrame.append was removed in pandas 2.0; pd.concat([df, df.iloc[[3]]]) is the modern equivalent)
print(df.append(df.iloc[3], ignore_index=False))

# group
df1 = pd.DataFrame({'A': ['f', 'b', 'f', 'f', 'b'],
                    'B': ['1', '2', '2', '1', '2'],
                    'C': np.random.randn(5)})
# group by A and B, then aggregate with sum()
print(df1.groupby(['A', 'B']).sum())

# stack/unstack convert between the wide (matrix-like) layout and a stacked, MultiIndexed layout
# pivot_table groups by index and columns and aggregates the values with the given function
# pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
# (see the sketches at the end of these notes)

# time series (see the sketch at the end of these notes)

# categoricals
# declare as category
s1 = pd.Series(['A', 'B', 'B', 'C', 'A', 'E']).astype("category")
# rename the categories; the new names must match the number of unique levels
# (assigning to s1.cat.categories, as in older pandas, is no longer allowed)
s1 = s1.cat.rename_categories(["good", "bad", 'A', 'B'])
print(s1)
# df.sort_values(by="categoryName")
# df.groupby("categoryName").size()

# plot
df2 = pd.DataFrame(np.random.randn(1000, 4), columns=['A', 'B', 'C', 'D'])
df2 = df2.cumsum()
# four lines, four colors, with a legend
df2.plot()
# plt.show()

# file in & out
df2.to_csv("df2.csv")
df3 = pd.read_csv("df2.csv")
print(df3.head(3))
# df2.to_hdf("df2.h5", 'df')
# pd.read_hdf('df2.h5', 'df')
# needs the openpyxl module
df2.to_excel('df2.xlsx', sheet_name='sheet1')
pd.read_excel('df2.xlsx', 'sheet1', index_col=None, na_values=['NA'])
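
# ------------------------------------------------------------------
# Sketches expanding the comment-only topics above.

# string methods: a minimal sketch of the .str accessor mentioned above;
# the series `names` is made-up example data.
names = pd.Series(['Alice', 'BOB', np.nan, 'Carol'])
print(names.str.lower())                     # lowercase each element, NaN stays NaN
print(names.str.contains('o', case=False))   # boolean mask per element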
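
# stack/unstack: a minimal sketch with made-up data. stack() moves the columns
# into an inner level of the row index (giving a MultiIndexed Series); unstack() reverses it.
wide = pd.DataFrame(np.random.randn(2, 2),
                    index=['row1', 'row2'], columns=['col1', 'col2'])
stacked = wide.stack()
print(stacked)              # Series indexed by (row, column)
print(stacked.unstack())    # back to the original wide DataFrame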
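
# pivot_table: the call commented above, run against a small made-up frame so it
# actually executes. Rows are grouped by A and B, one column per value of C,
# and the cells hold the mean of D (the default aggregation).
df4 = pd.DataFrame({'A': ['one', 'one', 'two', 'two'],
                    'B': ['x', 'y', 'x', 'y'],
                    'C': ['c1', 'c2', 'c1', 'c2'],
                    'D': np.random.randn(4)})
print(pd.pivot_table(df4, values='D', index=['A', 'B'], columns=['C']))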
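
# time series: the note above has no code, so this is a minimal resample sketch
# with made-up data: 100 one-second ticks downsampled into 1-minute sums.
rng = pd.date_range('20170601', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
print(ts.resample('1min').sum())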