- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
-
- ---------------numpy-----------------------
- arr = np.array([1,2,3], dtype=np.float64)
- np.zeros((3,6)) np.empty((2,3,2)) np.arange(15)
- arr.dtype arr.ndim arr.shape
- arr.astype(np.int32)
- arr * arr arr - arr 1/arr
- arr= np.arange(32).reshape((8,4))
- arr[1:3, : ]
- arr[[1,2,3]]
- arr.T arr.transpose((...)) arr.swapaxes(...)
- arr.dot
- np.sqrt(arr) np.exp(arr) np.random.randn(8) #正态分布值 np.maximum(x,y)
- np.where(cond, xarr, yarr) #当cond为真,取xarr,否则取yarr
- arr.mean() arr.mean(axis=1)
- arr.sum() arr.std() arr.var()
- arr.min() arr.max()
- arr.argmin() arr.argmax()
- arr.cumsum() arr.cumprod()
- arr.all() arr.any()
- arr.sort() arr.sort(1)
- np.unique(arr)
- np.in1d(arr1, arr2)
- np.load() np.loadtxt() np.save() np.savez() #读取、保存文件
- np.concatenate([arr, arr], axis=1) #沿axis=1(列方向)横向连接两个arr
-
-
- ---------------pandas-----------------------
- ser = Series() ser = Series([...], index=[...])
- ser.values ser.index ser.reindex([...], fill_value=0)
- ser.isnull() pd.isnull(ser) pd.notnull(ser)
- ser.name= ser.index.name=
- ser.drop('x')
- ser +ser
- ser.sort_index() ser.order() #按索引排序、按值排序
- df = DataFrame(data, columns=[...], index=[...])
- df.ix['x']
- del df['ly']
- df.T
- df.index.name df.columns.name df.values
- df.drop([...])
- df + df df1.add(df2, fill_value=0)
- df -ser
- f=lambda x: x.max()-x.min() df.apply(f)
- df.sort_index(axis=1, ascending=False)
- df.sort_index(by=['a','b'])
- ser.rank() df.rank(axis=1)
- df.sum() df.sum(axis=1)
- df.mean(axis=1, skipna=False)
- df.idxmax()
- df.cumsum()
- df.describe() ser.describe()
- ser.unique()
- ser.value_counts() df.value_counts() #返回一个series,其索引为唯一值,值为频率
- ser.isin(['x', 'y'])
- ser.dropna() ser.isnull() ser.notnull() ser.fillna(0)
- df.unstack()
- df.swaplevel('key1','key2')
- df.sortlevel(1)
- df.set_index(['c','d'], drop=False)
- read_csv read_table read_fwf
- pd.read_csv('...', nrows=5)
- pd.read_csv('...', chunksize=1000)
- pd.load()
- pd.ExcelFile('...xls').parse('Sheet1')
- df.to_csv('...csv', sep='|', index=False, header=False)
- pd.merge(df1, df2, on='key', suffixes=('_left', '_right'))
- pd.merge(df1, df2, left_on='lkey', right_on='rkey')
- pd.merge(df1, df2, how='outer')
- df1.join(df2, on='key', how='outer')
- pd.concat([ser1, ser2, ser3], axis=1)
- ser1.combine_first(ser2) df1.combine_first(df2)
- df.stack() df.unstack()
- df.pivot()
- df.duplicated() df.drop_duplicates()
- df[''].map(lambda x: abs(x))
- ser.replace(-999, np.nan)
- df.rename(index={}, columns={}, inplace=True)
- pd.cut(ser, bins)
- df[(np.abs(df)>3).any(1)]
- permutation take
- pd.get_dummies(df['key'], prefix='key')
- df[...].str.contains() df[...].str.findall(pattern, flags=re.IGNORECASE) df[...].str.match(pattern, flags=...) df[...].str.get()
-
- ----绘图
- ser.plot() df.plot()
- kind='kde'
- kind='bar' kind='barh'
- ser.hist(bins=50)
- plt.scatter(x,y)
- pd.scatter_matrix(df, diagonal='kde', color='k', alpha=0.3)
-
- ----聚合分组
- groupby() 默认在axis=0轴上分组,也可以在axis=1轴上分组;可以用for进行分组迭代
- df.groupby(df['key1'])
- df['key2'].groupby(df['key1'])
- df['key3'].groupby([df['key1'], df['key2']])
- df['key2'].groupby(df['key1']).size()
- df.groupby(df['key1'])['data1'] 等价于 df['data1'].groupby(df['key1'])
- df.groupby(df['key1'])[['data1']] 等价于 df[['data1']].groupby(df['key1'])
- df.groupby(mapping, axis=1) df.groupby(Series(mapping), axis=1)
- df.groupby(len)
- df.groupby(level='...', axis=1)
- df.groupby([], as_index=False)
- df.groupby(...).agg(['mean', 'std'])
- df.groupby(...).transform(np.mean)
- df.groupby().apply()
-
- ----透视交叉
- df.pivot_table(['',''], rows=['',''], cols='', margins=True)
- pd.crosstab(df.col1, df.col2, margins=True)
-
-
- ---------------matplotlib---------------
- fig=plt.figure() #图像所在的基对象
- ax=fig.add_subplot(2,2,1)
- fig, axes = plt.subplots(nrows, ncols, sharex, sharey)
- plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)
- ax.plot(x, y, linestyle='--', color='g')
- ax.set_xticks([...]) ax.set_xticklabels([...])
- ax.set_xlabel('...')
- ax.set_title('....') #设置图名
- ax.legend(loc='best')
- ax.text(x,y, 'hello', family='monospace', fontsize=10)
- ax.add_patch()
- plt.savefig('...png', dpi=400, bbox_inches='tight')
-
-
-
-
- ------------------------------------------
- from mpl_toolkits.basemap import Basemap
- import matplotlib.pyplot as plt
-
-
- -----------------时间序列--------------------------
- pd.to_datetime(datestrs)
- pd.date_range('1/1/2000', periods=1000)
- ts.resample('D', how='mean')
- #降采样会聚合,即将短频率(日)变成长频率(月),对应的值被聚合;
- #升采样会插值,即将长频率变为短频率,中间产生新值
- ts.shift(2, freq='D') ts.shift(-2, freq='D')
- now+Day() now+MonthEnd()
- import pytz pytz.timezone('US/Eastern')
- pd.Period('2010', freq='A-DEC') #period表示时间区间,叫做时期
- pd.PeriodIndex
- ts.to_period('M')
- pd.rolling_mean(...) pd.rolling_std(...)