pandas-Notes2

#coding = utf-8
import pandas as pd
import numpy as np
import  matplotlib as plt

dates = pd.date_range('20170601', periods=6)
# make a random 6*4 matrix
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

print df

# statistic basics. exclude missing data in general
# mean. mean of cols as default
print df.mean()
'''
A   -0.640908
B   -0.216183
C    0.316962
D   -0.634263
dtype: float64
'''
# mean of rows
print df.mean(1)

# move down
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
#print s
'''
2017-06-01    NaN
2017-06-02    NaN
2017-06-03    1.0
2017-06-04    3.0
2017-06-05    5.0
2017-06-06    NaN
Freq: D, dtype: float64
'''

# df-s. pandas will make Series into DataFrame
# df will change
print df
print df.sub(s, axis='index')

# cumulate by rows. default is by cols
print df.apply(np.cumsum, axis=1)
# apply lambda
print df.apply(lambda x: x.max() - x.min())

s = pd.Series(np.random.randint(0, 7, size=10))
# there are duplicate values
# value_counts behaves like histogram
print s.value_counts()

# string methods
# s.str.lower() means to lowercase

print df
# first 3 rows. index by rows in default
print df[:3]

# concat. use list as parameter
pieces = [df[:3], df[4:]]
print pd.concat(pieces)

# join.
left = pd.DataFrame({'key':['1', '2'], 'lvar':['leftVar1', 'leftVar2']})
right = pd.DataFrame({'key':['1', '2'], 'rvar':['rightVar1', 'rightVar2']})
print left
print right
# merge by same key value
print pd.merge(left, right, on='key')
'''
  key      lvar       rvar
0   1  leftVar1  rightVar1
1   2  leftVar2  rightVar2
'''

# append. add a row to the tail
# ignore_index = False, the index will be appended too. If True, then all index will be 0...n (int)
print df.append(df.iloc[3], ignore_index=False)

# group
df1 = pd.DataFrame({'A' : ['f', 'b', 'f', 'f', 'b'],
                    'B' : ['1', '2', '2', '1', '2'],
                    'C' : np.random.randn(5)})
# use sum()
print df1.groupby(['A', 'B']).sum()

# stack unstack means transformation between matrix and DataFrame

# pivot_table means group by index and cols, use values. if there's function, execute it
# pivot_table(df, values='D', index=['A', 'B'], columnes=['C'])

# time series for time

# categoricals

# declare as category
s1 = pd.Series(['A', 'B', 'B', 'C', 'A', 'E']).astype("category")
# set category. Must same number of unique levels
s1.cat.categories = ["good", "bad", 'A', 'B']
print s1
# df.sort_values(by="categoryName")
# df.groupby("categoryName").size()

# plot
df2 = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
df2 = df2.cumsum()
# four lines, four colors. with legend.
df2.plot()
#plt.pyplot.show()

# file in & out
df2.to_csv("df2.csv")

df3 = pd.read_csv("df2.csv")
print df3.head(3)

#df2.to_hdf("df2.h5", 'df')
#pd.read_hdf('df2.h5', 'df')

# need module openpyxl...
df2.to_excel('df2.xlsx', sheet_name='sheet1')
pd.read_excel('df2.xlsx', 'sheet1', index_col=None, na_values=['NA'])

  

posted @ 2017-06-05 18:56  陆离可  阅读(145)  评论(0编辑  收藏  举报