python学习-数据聚合与分组运算

1、groupby

df = pd.DataFrame({'key1':['a','a','b','b','a'],
                  'key2':['one','two','one','two','one'],
                  'data1':np.random.randn(5),
                  'data2':np.random.randn(5)})

df['data1'].groupby(df['key1']).mean() #按key1分组,并计算data1的均值
means =df['data1'].groupby([df['key1'],df['key2']]).mean() #传入多个数组进行分组
means.unstack()

#分组键均为Series。实际上分组键可以是任何长度适当的数组
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2005,2005,2006])
df['data1'].groupby([states,years]).mean()

#还可以将列名(可以是字符串、数字或其他python对象)用作分组键
df.groupby('key1').mean()
df.groupby(['key1','key2']).mean()

#size方法返回一个含有分组大小的Series
df.groupby(['key1','key2']).size()

2、对分组进行迭代

 1 #产生一组二元元组(由分组名和数据块组成)
 2 for name ,group in df.groupby('key1'):
 3     print(name)
 4     print(group)
 5 '''
 6 a
 7   key1 key2     data1     data2
 8 0    a  one  0.266786  0.430996
 9 1    a  two  0.168590 -0.488668
10 4    a  one  0.585086 -0.215946
11 b
12   key1 key2     data1     data2
13 2    b  one -1.239376 -0.860125
14 3    b  two -0.022342 -0.169070
15 '''
16 for (k1, k2) ,group in df.groupby(['key1','key2']):
17     print((k1,k2))
18     print(group)
19 
20 #将分组数据变为字典
21 pieces = dict(list(df.groupby('key1')))
22 pieces['b']

3、选取一个或者一组列

 1 #下面两者是等价的
 2 df.groupby('key1')['data1']  #返回的是已经分组的SeriesGroupBy
 3 df['data1'].groupby(df['key1'])
 4 
 5 df.groupby('key1')[['data1']]  #返回的是已经分组的DataFrameGroupBy
 6 df[['data1']].groupby(df['key1'])
 7 
 8 
 9 df.groupby(['key1','key2'])[['data1']].mean()
10 
11 
12 #通过字典或者Series进行分组 
13 people =pd.DataFrame(np.random.randn(5,5),columns=['a','b','c','d','e'],index=['Joe','Steve','Wes','Jim','Travis'])
14 mapping ={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
15 
16 by_column = people.groupby(mapping,axis=1)
17 map_series =pd.Series(mapping)
18 people.groupby(map_series,axis=1).count()
View Code

4、通过函数进行分组

people.groupby(len).sum()

5、根据索引级别进行分组

columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['city','tenor'])
hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df.groupby(level='city',axis=1).count()

6、聚合

#自定义聚合函数
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum value minus its minimum value."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak) #将定义的函数传入agg方法即可

#面向列的多函数应用
tips = pd.read_csv('examples/tips.csv')
tips.head()
'''
    total_bill    tip    smoker    day    time    size    tip_pct
0    16.99    1.01    No    Sun    Dinner    2    0.059447
1    10.34    1.66    No    Sun    Dinner    3    0.160542
2    21.01    3.50    No    Sun    Dinner    3    0.166587
3    23.68    3.31    No    Sun    Dinner    2    0.139780
4    24.59    3.61    No    Sun    Dinner    4    0.146808
'''
#根据day、smoker对数据tips进行分组
grouped = tips.groupby(["day", "smoker"])

grouped_pct = grouped["tip_pct"]
#将函数名以字符串形式传入
grouped_pct.agg('mean')
#传入一组函数名得到的列就以相应的函数名命名
grouped_pct.agg(["mean", "std", peak_to_peak])
#传入以(name,function)为元组组成的列表,各元组的第一个名字就被作为列名
grouped_pct.agg([('foo','mean'),('bar',np.std)])
'''
        foo    bar
day    smoker        
Fri    No    0.151650    0.028123
Yes    0.174783    0.051293
Sat    No    0.158048    0.039767
Yes    0.147906    0.061375
Sun    No    0.160113    0.042347
Yes    0.187250    0.154134
Thur    No    0.160298    0.038774
Yes    0.163863    0.039389
'''
#可以定义一组应用于全部列的函数
functions = ["count", "mean", "max"]
result = grouped[["tip_pct", "total_bill"]].agg(functions)
#可以传入带有自定义名称的元组列表
ftuples = [("Average", "mean"), ("Variance", np.var)]
grouped[["tip_pct", "total_bill"]].agg(ftuples)
#不同列应用不同函数
grouped.agg({"tip" : np.max, "size" : "sum"})
grouped.agg({"tip_pct" : ["min", "max", "mean", "std"],"size" : "sum"})

7、"无索引"形式返回聚合数据

tips.groupby(["day", "smoker"], as_index=False).mean()

8、分组级运算和转换

k1_means = df.groupby('key1').mean().add_prefix('mean_')
'''
    mean_data1    mean_data2
key1        
a    0.340154    -0.091206
b    -0.630859    -0.514597
'''
pd.merge(df,k1_means,left_on='key1',right_index=True)
#以上可以看作np.mean函数对两个数据列进行转换,使用transform方法如下
key =['one','two','one','two','one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)#transform将函数应用到各分组

9、apply一般性的拆分-应用-合并

  apply会将待处理的对象拆分成多个片段,然后对各片段调用传入的函数,最后尝试将各片段组合到一起

#根据分组选出最高5个tip_pct值
def top(df, n=5, column='tip_pct'):
    """Return the *n* rows of *df* holding the largest values in *column*."""
    ranked = df.sort_values(column, ascending=False)
    return ranked.head(n)
top(tips,n=6)

tips.groupby("smoker").apply(top)
#如果传给apply的函数能够接受其他参数或者关键字,则可以将这些内容放在函数后面一并传入
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")


result = tips.groupby("smoker")["tip_pct"].describe()
result.unstack("smoker")
#如上在groupby中调用describe之类方法,实际上应用了以下代码
def f(group):
    """Summary statistics for one group.

    Equivalent to what ``grouped.describe()`` computes per group.
    """
    summary = group.describe()
    return summary

grouped.apply(f)

禁止分组键

#group_keys=False to groupby 可以实现
tips.groupby("smoker", group_keys=False).apply(top)

分位数和桶分析

frame = pd.DataFrame({"data1": np.random.standard_normal(1000), "data2": np.random.standard_normal(1000)})

factor =pd.cut(frame.data1,4)
#对cut返回的factor对象可直接用groupby

def get_stats(group):
    """Per-column min/max/count/mean of *group*, returned as a DataFrame."""
    stats = {
        "min": group.min(),
        "max": group.max(),
        "count": group.count(),
        "mean": group.mean(),
    }
    return pd.DataFrame(stats)
grouped = frame.groupby(factor)
grouped.apply(get_stats)
grouped.agg(["min", "max", "count", "mean"])

#返回分位数编号
quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)

 10、分组的值填充缺失值

s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan #赋空值
s.fillna(s.mean())#填充空值


states = ["Ohio", "New York", "Vermont", "Florida", "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East", "East", "East", "East", "West", "West", "West", "West"]
data = pd.Series(np.random.standard_normal(8), index=states)
data[["Vermont", "Nevada", "Idaho"]] = np.nan
def fill_mean(group):
    """Replace NaNs in *group* with the group's own mean value."""
    avg = group.mean()
    return group.fillna(avg)
data.groupby(group_key).apply(fill_mean) #分组平均值填充空值

#预定义分组值,通过组名填充
fill_values = {"East": 0.5, "West": -1} 
def fill_func(group):
    """Fill NaNs in *group* with the preset value registered for its group name.

    Relies on the module-level ``fill_values`` mapping (group name -> fill value)
    and on ``group.name`` being set by GroupBy.apply.
    """
    preset = fill_values[group.name]
    return group.fillna(preset)
data.groupby(group_key).apply(fill_func)

11、随机采样和排列

#创建扑克Series
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
deck = pd.Series(card_val, index=cards)

#取样函数
def draw(deck, n=5):
    """Randomly draw *n* elements ("cards") from *deck* without replacement."""
    hand = deck.sample(n)
    return hand

draw(deck)

#从每个花色中随机抽取2张
def get_suit(card):
    """Grouping key: the suit letter, encoded as the last character of the label."""
    last_index = len(card) - 1
    return card[last_index]
deck.groupby(get_suit).apply(draw, n=2)
#或者
deck.groupby(get_suit, group_keys=False).apply(draw, n=2) #不分组展示

12、分组加权平均数及相关系数

df = pd.DataFrame({"category": ["a", "a", "a", "a","b", "b", "b", "b"],
                   "data": np.random.standard_normal(8), 
                   "weights": np.random.uniform(size=8)})
grouped = df.groupby("category")

def get_wavg(group):
    """Weighted average of the 'data' column, weighted by the 'weights' column."""
    values = group["data"]
    wts = group["weights"]
    return np.average(values, weights=wts)
grouped.apply(get_wavg)


#计算股票的相关系数
close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True, index_col=0)
close_px.info()

def spx_corr(group):
    """Correlate every column of *group* with its 'SPX' benchmark column."""
    benchmark = group["SPX"]
    return group.corrwith(benchmark)

rets = close_px.pct_change().dropna() #pct_change 
def get_year(x):
    """Grouping key: the calendar year of a datetime-like value."""
    year = x.year
    return year
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)
#计算两只股票的相关系数
def corr_aapl_msft(group):
    """Pairwise correlation between the 'AAPL' and 'MSFT' columns of *group*."""
    aapl = group["AAPL"]
    msft = group["MSFT"]
    return aapl.corr(msft)
by_year.apply(corr_aapl_msft)

13、面向分组的线性回归

import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    """OLS-regress column *yvar* of *data* on the columns listed in *xvars*.

    Parameters
    ----------
    data : DataFrame holding both the response and explanatory columns.
    yvar : name of the response column (required in practice; the None
        default would fail on ``data[None]``).
    xvars : list of explanatory column names (required in practice).

    Returns
    -------
    Series of fitted coefficients, including the added 'intercept' term.
    """
    Y = data[yvar]
    # Copy before adding the intercept column: assigning into the slice
    # returned by data[xvars] triggers pandas' chained-assignment warning
    # and can mutate the caller's DataFrame in older pandas versions.
    X = data[xvars].copy()
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

#按年执行AAPL 对SPX收益率的线性回归
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

14、透视表和交叉数据集

tips.pivot_table(index=["day", "smoker"])

tips.pivot_table(index=["time", "day"], columns="smoker",values=["tip_pct", "size"]) #columns="smoker"把smoker放到列上

tips.pivot_table(index=["time", "day"], columns="smoker", values=["tip_pct", "size"], margins=True)#margins=True 添加分项小计

tips.pivot_table(index=["time", "smoker"], columns="day",values="tip_pct", aggfunc=len, margins=True)#通过count或者len得到有关分组大小的交叉表

tips.pivot_table(index=["time", "size", "smoker"], columns="day", values="tip_pct", fill_value=0)#空值填上默认值

交叉表

from io import StringIO

data =  """Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""

data = pd.read_table(StringIO(data), sep=r"\s+")
#生成交叉表
pd.crosstab(data["Nationality"], data["Handedness"], margins=True)

#crosstab 前两个参数可以是 数组 or Series or 数组列表
pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)

 

posted @ 2022-05-27 01:31  行者无疆  阅读(299)  评论(0编辑  收藏  举报