python学习-数据聚合与分组运算
1、groupby
# Sample frame: two string key columns and two numeric data columns.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})

# Group data1 by key1 and compute the per-group mean.
df['data1'].groupby(df['key1']).mean()

# Pass a list of arrays to group by several keys at once.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means.unstack()

# The grouping keys can be any arrays of the right length, not only Series.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2005, 2005, 2006])
df['data1'].groupby([states, years]).mean()

# Column names (strings, numbers or other Python objects) also work as keys.
# numeric_only=True: on pandas >= 2.0, mean() would otherwise raise a
# TypeError for the non-numeric key2 column (older pandas dropped it silently).
df.groupby('key1').mean(numeric_only=True)
df.groupby(['key1', 'key2']).mean(numeric_only=True)

# size() returns a Series holding each group's row count.
df.groupby(['key1', 'key2']).size()
2、对分组进行迭代
# Iterating over a GroupBy yields 2-tuples of (group name, data chunk).
for name, group in df.groupby('key1'):
    print(name)
    print(group)
'''
a
  key1 key2     data1     data2
0    a  one  0.266786  0.430996
1    a  two  0.168590 -0.488668
4    a  one  0.585086 -0.215946
b
  key1 key2     data1     data2
2    b  one -1.239376 -0.860125
3    b  two -0.022342 -0.169070
'''
# (fixed: the sample output above was wrapped in smart-quote characters
# instead of real triple quotes, which is a syntax error)

# With several keys, the first tuple element is itself a tuple of key values.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

# Materialize the groups as a dict of name -> DataFrame.
pieces = dict(list(df.groupby('key1')))
pieces['b']
3、选取一个或者一组列
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# These two spellings are equivalent:
df.groupby('key1')['data1']           # a grouped SeriesGroupBy
df['data1'].groupby(df['key1'])

df.groupby('key1')[['data1']]         # a grouped DataFrameGroupBy
df[['data1']].groupby(df['key1'])

df.groupby(['key1', 'key2'])[['data1']].mean()

# Grouping with a dict (or Series) that maps labels to group names.
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}  # unused keys are fine

by_column = people.groupby(mapping, axis=1)
map_series = pd.Series(mapping)
people.groupby(map_series, axis=1).count()
4、通过函数进行分组
# A function passed as the key is called once per index label; the index
# holds people's names here, so rows are grouped by name length.
people.groupby(len).sum()
5、根据索引级别进行分组
# Hierarchical columns: group on one level of the column MultiIndex.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['city', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df.groupby(level='city', axis=1).count()
6、聚合
# Custom aggregation: any function that reduces an array can go to agg().
def peak_to_peak(arr):
    """Return the spread (max - min) of an array-like."""
    return arr.max() - arr.min()

# Column-oriented application of multiple functions.
tips = pd.read_csv('examples/tips.csv')
# tip_pct is not in the raw file; derive it here (fixed: the examples below
# and the head() output reference tip_pct, but it was never computed).
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()
'''
   total_bill   tip smoker  day    time  size   tip_pct
0       16.99  1.01     No  Sun  Dinner     2  0.059447
1       10.34  1.66     No  Sun  Dinner     3  0.160542
2       21.01  3.50     No  Sun  Dinner     3  0.166587
3       23.68  3.31     No  Sun  Dinner     2  0.139780
4       24.59  3.61     No  Sun  Dinner     4  0.146808
'''

# Group tips by day and smoker.
grouped = tips.groupby(["day", "smoker"])
grouped_pct = grouped["tip_pct"]
grouped_pct.agg(peak_to_peak)          # pass the custom function to agg
grouped_pct.agg('mean')                # a function may be named by string

# A list of functions: the result columns take the functions' names.
grouped_pct.agg(["mean", "std", peak_to_peak])

# (name, function) tuples: the first element becomes the column name.
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
'''
                  foo       bar
day  smoker
Fri  No      0.151650  0.028123
     Yes     0.174783  0.051293
Sat  No      0.158048  0.039767
     Yes     0.147906  0.061375
Sun  No      0.160113  0.042347
     Yes     0.187250  0.154134
Thur No      0.160298  0.038774
     Yes     0.163863  0.039389
'''

# The same set of functions applied to several columns at once.
functions = ["count", "mean", "max"]
# fixed: was `grouped1`, an undefined name.
result = grouped[["tip_pct", "total_bill"]].agg(functions)

# Tuples again supply custom column names.
ftuples = [("Average", "mean"), ("Variance", np.var)]
grouped[["tip_pct", "total_bill"]].agg(ftuples)

# Different functions for different columns.
grouped.agg({"tip": np.max, "size": "sum"})
grouped.agg({"tip_pct": ["min", "max", "mean", "std"], "size": "sum"})
7、“无索引”形式返回聚合数据
# as_index=False keeps the group keys as regular columns instead of the index.
# NOTE(review): on pandas >= 2.0 mean() raises for non-numeric columns — may
# need .mean(numeric_only=True); verify against the pandas version in use.
tips.groupby(["day", "smoker"], as_index=False).mean()
8、分组级运算和转换
# Per-group means of df, with the new columns prefixed.
k1_means = df.groupby('key1').mean().add_prefix('mean_')
'''
      mean_data1  mean_data2
key1
a       0.340154   -0.091206
b      -0.630859   -0.514597
'''
# Attach the group-level means back onto the original rows.
pd.merge(df, k1_means, left_on='key1', right_index=True)

# The merge above is one way to broadcast np.mean over the data columns;
# transform() does the same thing directly: it applies the function to each
# group and broadcasts the result back to the original shape.
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)
9、apply一般性的拆分-应用-合并
apply会将待处理的对象拆分成多个片段,然后对各片段调用传入的函数,最后尝试将各片段组合到一起
def top(df, n=5, column='tip_pct'):
    """Return the n rows with the largest values in `column`."""
    return df.sort_values(column, ascending=False)[:n]

top(tips, n=6)
tips.groupby("smoker").apply(top)

# Extra positional/keyword arguments for the applied function follow it.
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")

result = tips.groupby("smoker")["tip_pct"].describe()
result.unstack("smoker")

# Calling describe on a GroupBy is effectively shorthand for:
def f(group):
    return group.describe()

grouped.apply(f)
禁止分组键
# Pass group_keys=False to groupby to omit the group keys from the result
# index.  (fixed: the statement below was mashed onto the same physical line
# after the "#" comment, so it was commented out and never executed.)
tips.groupby("smoker", group_keys=False).apply(top)
分位数和桶分析
frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})

# cut() returns a Categorical that can be handed straight to groupby.
factor = pd.cut(frame.data1, 4)

def get_stats(group):
    """Per-column min / max / count / mean for one group."""
    return pd.DataFrame({"min": group.min(), "max": group.max(),
                         "count": group.count(), "mean": group.mean()})

grouped = frame.groupby(factor)
grouped.apply(get_stats)
grouped.agg(["min", "max", "count", "mean"])

# qcut with labels=False yields quartile numbers instead of intervals.
quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)
10、分组的值填充缺失值
s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan                 # blank out every other value
s.fillna(s.mean())              # fill with the overall mean

states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]
data = pd.Series(np.random.standard_normal(8), index=states)
data[["Vermont", "Nevada", "Idaho"]] = np.nan

def fill_mean(group):
    """Fill a group's missing values with that group's own mean."""
    return group.fillna(group.mean())

# Fill NaNs with each group's mean.
data.groupby(group_key).apply(fill_mean)

# Fill each group with a predefined value looked up by group name.
fill_values = {"East": 0.5, "West": -1}

def fill_func(group):
    return group.fillna(fill_values[group.name])

data.groupby(group_key).apply(fill_func)
11、随机采样和排列
# Build a 52-card deck as a Series: index = card name, value = point value.
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
deck = pd.Series(card_val, index=cards)

def draw(deck, n=5):
    """Randomly sample n cards from the deck."""
    return deck.sample(n)

draw(deck)

def get_suit(card):
    # The suit is the last character of the card name.
    return card[-1]

# Draw two random cards from each suit.
deck.groupby(get_suit).apply(draw, n=2)
# Same draw, but without the suit level in the result index.
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)
12、分组加权平均数及相关系数
df = pd.DataFrame({"category": ["a", "a", "a", "a", "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})
grouped = df.groupby("category")

def get_wavg(group):
    """Weighted average of `data`, weighted by `weights`."""
    return np.average(group["data"], weights=group["weights"])

grouped.apply(get_wavg)

# Yearly correlation of each stock column with SPX.
close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True, index_col=0)
close_px.info()

def spx_corr(group):
    return group.corrwith(group["SPX"])

rets = close_px.pct_change().dropna()   # percent change between rows

def get_year(x):
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

# Year-by-year correlation between two specific stocks.
def corr_aapl_msft(group):
    return group["AAPL"].corr(group["MSFT"])

by_year.apply(corr_aapl_msft)
13、面向分组的线性回归
import statsmodels.api as sm

def regress(data, yvar=None, xvars=None):
    """OLS-regress data[yvar] on data[xvars] plus an intercept.

    Returns the fitted parameter Series.
    """
    Y = data[yvar]
    X = data[xvars]
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

# Yearly linear regression of AAPL returns on SPX returns.
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])
14、透视表和交叉数据集
# pivot_table aggregates with the mean by default.
tips.pivot_table(index=["day", "smoker"])
# columns="smoker" moves smoker onto the columns axis.
tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"])
# margins=True adds "All" row/column subtotals.
tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"], margins=True)
# aggfunc=len (or "count") gives a cross-tabulation of group sizes.
tips.pivot_table(index=["time", "smoker"], columns="day",
                 values="tip_pct", aggfunc=len, margins=True)
# fill_value supplies a default for missing combinations.
tips.pivot_table(index=["time", "size", "smoker"], columns="day",
                 values="tip_pct", fill_value=0)
交叉表
from io import StringIO

# Sample table as text; read_table parses it on whitespace.
# (reconstructed: the extraction had collapsed this multiline literal onto a
# single line, which would have parsed as one row)
data = """Sample Nationality Handedness
1 USA Right-handed
2 Japan Left-handed
3 USA Right-handed
4 Japan Right-handed
5 Japan Left-handed
6 Japan Right-handed
7 USA Right-handed
8 USA Left-handed
9 Japan Right-handed
10 USA Right-handed"""
# r"\s+" — a raw string avoids the invalid "\s" escape warning.
data = pd.read_table(StringIO(data), sep=r"\s+")

# Build the cross-tabulation.
pd.crosstab(data["Nationality"], data["Handedness"], margins=True)

# The first two crosstab arguments may be arrays, Series, or lists of arrays.
# (fixed: this statement previously ended with a stray "." — a syntax error)
pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)