pandas之分组
import numpy as np import pandas as pd df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'], 'data1': np.random.randn(5), 'data2': np.random.randn(5)}) print(df) # data1列数据,按照key1来进行分组 g1 = df['data1'].groupby(by=df['key1']) print(g1) print(g1.mean()) # 一次传入多个数组,按照key1,key2分组计算 g2 = df['data1'].groupby(by=[df['key1'], df['key2']]) print(g2.sum()) # 可以将列名(可以是字符串、数字或者其他python对象)用作分组 g3 = df.groupby(by='key1') print(g3.mean()) g4 = df.groupby(by=['key1', 'key2']) print(g4.mean()) 输出结果: key1 key2 data1 data2 0 a one 0.374982 1.064534 1 a two -0.485860 0.071513 2 b one -0.741052 1.868335 3 b two -0.757161 -0.536510 4 a one -0.092603 -1.647309 <pandas.core.groupby.generic.SeriesGroupBy object at 0x000001720F810BA8> key1 a -0.067827 b -0.749106 Name: data1, dtype: float64 key1 key2 a one 0.282379 two -0.485860 b one -0.741052 two -0.757161 Name: data1, dtype: float64 data1 data2 key1 a -0.067827 -0.170421 b -0.749106 0.665912 data1 data2 key1 key2 a one 0.141189 -0.291387 two -0.485860 0.071513 b one -0.741052 1.868335 two -0.757161 -0.536510
import numpy as np import pandas as pd df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'], 'data1': np.random.randn(5), 'data2': np.random.randn(5)}) print(df) states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio']) years = np.array([2005, 2005, 2006, 2005, 2006]) # 按照states, years传入数组的方式对data1列数据分组后进行平均值计算 g1 = df['data1'].groupby(by=[states, years]) print(g1.mean()) people = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'], index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) print(people) mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red'} g2 = people.groupby(mapping, axis=1).sum() print(g2) 输出结果: key1 key2 data1 data2 0 a one -0.388889 0.083999 1 a two 0.109513 2.016369 2 b one -0.423841 0.800895 3 b two -0.899408 0.039836 4 a one -0.137741 -1.382602 California 2005 0.109513 2006 -0.423841 Ohio 2005 -0.644148 2006 -0.137741 Name: data1, dtype: float64 a b c d e Joe 1.179284 0.289269 0.303566 1.493424 -1.432677 Steve -0.386514 -0.154331 -0.771970 -0.597251 0.004637 Wes -0.933249 0.300309 -1.659948 0.404844 -0.375680 Jim -2.427930 0.574752 0.504091 -0.996975 -1.547615 Travis -1.555477 -1.065392 -1.149300 1.326839 -1.530914 blue red Joe 1.796990 0.035876 Steve -1.369220 -0.536208 Wes -1.255104 -1.008621 Jim -0.492884 -3.400793 Travis 0.177539 -4.151784
import numpy as np import pandas as pd df = pd.DataFrame({ 'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4], 'C': np.random.randn(4)}) print(df) # 按照A列分组后聚合求最小值 df1 = df.groupby('A').agg('min') print(df1) # 按照A列分组后聚合求最小值和最大值 df2 = df.groupby('A').agg(['min', 'max']) print(df2) # 按照A列分组后聚合后对B列求最小值和最大值,同时对C列求和 df3 = df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) print(df3) 输出结果: A B C 0 1 1 1.630997 1 1 2 -0.238686 2 2 3 0.841689 3 2 4 -0.532344 B C A 1 1 -0.238686 2 3 -0.532344 B C min max min max A 1 1 2 -0.238686 1.630997 2 3 4 -0.532344 0.841689 B C min max sum A 1 1 2 1.392311 2 3 4 0.309345
import numpy as np import pandas as pd df = pd.DataFrame(np.random.randn(4, 5), columns=list('ABCDE')) print(df) # 计算各列数据总和并作为新列添加到末尾 df['col_sum'] = df.apply(lambda x: x.sum(), axis=1) print(df) # 计算各行数据总和并作为新行添加到末尾 df.loc['row_sum'] = df.apply(lambda x: x.sum(), axis=0) print(df) 输出结果: A B C D E 0 -0.978040 -0.820454 0.673568 1.613058 -0.203536 1 0.879338 0.561962 0.278855 -0.481307 1.097682 2 -1.244722 1.304386 0.206882 -0.715907 -1.246526 3 1.419993 0.453622 0.237463 1.292601 -0.184714 A B C D E col_sum 0 -0.978040 -0.820454 0.673568 1.613058 -0.203536 0.284595 1 0.879338 0.561962 0.278855 -0.481307 1.097682 2.336530 2 -1.244722 1.304386 0.206882 -0.715907 -1.246526 -1.695887 3 1.419993 0.453622 0.237463 1.292601 -0.184714 3.218965 A B C D E col_sum 0 -0.978040 -0.820454 0.673568 1.613058 -0.203536 0.284595 1 0.879338 0.561962 0.278855 -0.481307 1.097682 2.336530 2 -1.244722 1.304386 0.206882 -0.715907 -1.246526 -1.695887 3 1.419993 0.453622 0.237463 1.292601 -0.184714 3.218965 row_sum 0.076569 1.499516 1.396768 1.708445 -0.537094 4.144204