Pandas 数据分组

分组统计 - groupby功能

根据某些条件将数据拆分成组
对每个组独立应用函数
将结果合并到一个数据结构中

DataFrame 可以在行（axis=0）或列（axis=1）上进行分组：先对各个分组分别应用一个函数并产生新值，然后把各组的执行结果合并到最终的结果对象中。

df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

# 分组

# Demo frame: two string key columns (A, B) and two random numeric columns (C, D).
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
print(df)
print('------')

# Grouping alone only builds a GroupBy object; it is intermediate data and
# nothing has been computed yet.
print(df.groupby('A'), type(df.groupby('A')))
print('------')

# Each cell of `a` is the mean of that group's rows in that column.
# numeric_only=True keeps the string column 'B' out of the mean; without it
# recent pandas versions raise a TypeError instead of dropping the column.
a = df.groupby('A').mean(numeric_only=True)
# Both string columns are grouping keys here, so only C and D are averaged.
b = df.groupby(['A','B']).mean()
# Group by A but compute the mean of D only -> a Series.
c = df.groupby('A')['D'].mean()
print(a,type(a),'\n',a.columns)
print(b,type(b),'\n',b.columns)
print(c,type(c))
# Aggregating a grouped DataFrame yields a new DataFrame (or a Series when a
# single column is selected).
# Grouping is done along rows (axis=0) by default.
# The key may be a single column name or a list of column names.

输出结果：

   A      B         C         D
0  foo    one  0.539903 -0.291392
1  bar    one  0.243375  1.093706
2  foo    two -0.552425 -0.333666
3  bar  three  0.307315 -0.094833
4  foo    two -1.011648 -0.856448
5  bar    two  1.078264  1.590439
6  foo    one  0.550491 -0.044095
7  foo  three  0.162069  0.445236
------
<pandas.core.groupby.DataFrameGroupBy object at 0x000001DCA7B527B8> <class 'pandas.core.groupby.DataFrameGroupBy'>
------
            C         D
A                      
bar  0.542985  0.863104
foo -0.062322 -0.216073 <class 'pandas.core.frame.DataFrame'> 
 Index(['C', 'D'], dtype='object')
                  C         D
A   B                        
bar one    0.243375  1.093706
    three  0.307315 -0.094833
    two    1.078264  1.590439
foo one    0.545197 -0.167744
    three  0.162069  0.445236
    two   -0.782036 -0.595057 <class 'pandas.core.frame.DataFrame'> 
 Index(['C', 'D'], dtype='object')
A
bar    0.863104
foo   -0.216073
Name: D, dtype: float64 <class 'pandas.core.series.Series'>

# Same eight-row demo frame: string keys A/B, random numeric columns C/D.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
# Average per (A, B) pair, then turn the two index levels back into ordinary
# columns so the result is a flat table.
b = df.groupby(['A', 'B']).mean().reset_index()
print(b)

输出结果：

   A      B         C         D
0  bar    one -1.501298 -1.450693
1  bar  three -0.623903  0.832721
2  bar    two  1.264561  0.265831
3  foo    one  0.996789  0.645541
4  foo  three  1.338508 -1.213098
5  foo    two  0.934279 -1.139260

# 分组 - 可迭代对象

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))  # the GroupBy object is iterable
print('-----')

# Iterating yields one (group name, sub-DataFrame) tuple per group.
print(list(df.groupby('X')), '→ 可迭代对象，直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式显示\n')

# n is the group name, g is that group's DataFrame.
for n,g in df.groupby('X'):
    print(n)
    print(g)
    print('###')
print('-----')

# .get_group() extracts a single group. The frame is grouped by the scalar
# key 'X' (not the one-element list ['X']) so the scalar lookups below stay
# valid: newer pandas expects tuple keys when the grouper is a list.
print(df.groupby('X').get_group('A'),'\n')
print(df.groupby('X').get_group('B'),'\n')
print('-----')

# .groups maps each group name to the index labels of its rows; it can be
# indexed like a dict.
grouped = df.groupby('X')
print(grouped.groups,'\n')
print(grouped.groups['A'])  # equivalent to df.groupby('X').groups['A']
print('-----')

# .size() returns the number of rows in each group.
sz = grouped.size()
print(sz,type(sz))
print('-----')

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
# Grouping by two columns: each key of .groups is an (A, B) tuple and each
# value holds the index labels of the rows in that group.
grouped = df.groupby(['A','B']).groups
print(df,'\n')
print(grouped,'\n')
print(grouped[('foo', 'three')])

输出结果：

   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
<pandas.core.groupby.DataFrameGroupBy object at 0x000001DCA7BA6908> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A',    X  Y
0  A  1
2  A  3), ('B',    X  Y
1  B  4
3  B  2)] → 可迭代对象，直接生成list

('A',    X  Y
0  A  1
2  A  3) → 以元祖形式显示

A
   X  Y
0  A  1
2  A  3
###
B
   X  Y
1  B  4
3  B  2
###
-----
   X  Y
0  A  1
2  A  3 

   X  Y
1  B  4
3  B  2 

-----
{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')} 

Int64Index([0, 2], dtype='int64')
-----
X
A    2
B    2
dtype: int64 <class 'pandas.core.series.Series'>
-----
     A      B         C         D
0  foo    one  0.185932 -0.128426
1  bar    one  1.205172  0.860480
2  foo    two  0.965735  1.008437
3  bar  three  0.442906  0.065548
4  foo    two  0.461985 -1.591069
5  bar    two -0.917835 -0.707424
6  foo    one  0.537916 -0.031545
7  foo  three  0.838332 -0.804244 

{('foo', 'two'): Int64Index([2, 4], dtype='int64'), ('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64')} 

Int64Index([7], dtype='int64')

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))  # the GroupBy object is iterable
print('-----')

# Materialise the groups once; each element is a (group name, DataFrame) tuple.
groups = list(df.groupby('X'))
print(groups, '→ 可迭代对象，直接生成list\n')
print(groups[0], '→ 以元祖形式显示\n')
# Unpack the first tuple to show the type of each half.
tup = groups[0]
print(tup[0],type(tup[0]))
print(tup[1],type(tup[1]))

输出结果：

  X  Y
0  A  1
1  B  4
2  A  3
3  B  2
<pandas.core.groupby.DataFrameGroupBy object at 0x000001DCA7B87A20> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A',    X  Y
0  A  1
2  A  3), ('B',    X  Y
1  B  4
3  B  2)] → 可迭代对象，直接生成list

('A',    X  Y
0  A  1
2  A  3) → 以元祖形式显示

A <class 'str'>
   X  Y
0  A  1
2  A  3 <class 'pandas.core.frame.DataFrame'>

# 分组计算函数方法

s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
# level=0 groups by the index itself: rows sharing an index label form one
# group. Use `level` whenever the grouping key is the index.
grouped = s.groupby(level=0)
print(grouped)
# Each reducer below is applied per group; the trailing string labels the output.
print(grouped.first(),'→ first：非NaN的第一个值\n')
print(grouped.last(),'→ last：非NaN的最后一个值\n')
print(grouped.sum(),'→ sum：非NaN的和\n')
print(grouped.mean(),'→ mean：非NaN的平均值\n')
print(grouped.median(),'→ median：非NaN的算术中位数\n')
print(grouped.count(),'→ count：非NaN的值\n')
print(grouped.min(),'→ min、max：非NaN的最小值、最大值\n')
print(grouped.std(),'→ std，var：非NaN的标准差和方差\n')
print(grouped.prod(),'→ prod：非NaN的积\n')

输出结果：

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000002B38F39B4E0>
1    1
2    2
3    3
dtype: int64 → first：非NaN的第一个值

1    10
2    20
3    30
dtype: int64 → last：非NaN的最后一个值

1    11
2    22
3    33
dtype: int64 → sum：非NaN的和

1     5.5
2    11.0
3    16.5
dtype: float64 → mean：非NaN的平均值

1     5.5
2    11.0
3    16.5
dtype: float64 → median：非NaN的算术中位数

1    2
2    2
3    2
dtype: int64 → count：非NaN的值

1    1
2    2
3    3
dtype: int64 → min、max：非NaN的最小值、最大值

1     6.363961
2    12.727922
3    19.091883
dtype: float64 → std，var：非NaN的标准差和方差

1    10
2    40
3    90
dtype: int64 → prod：非NaN的积

# 多函数计算：agg()
df = pd.DataFrame({'a':[1,1,2,2],
                  'b':np.random.rand(4),
                  'c':np.random.rand(4),
                  'd':np.random.rand(4),})
print(df)
# Passing a list to agg() computes every listed function for every remaining
# column (b, c and d each get a mean and a sum).
multi_stats = df.groupby('a').agg(['mean','sum'])
print(multi_stats)
# Mean and sum of column b only, with custom result names. The old
# dict-of-new-names form on a grouped Series was deprecated (it produced a
# FutureWarning) and is rejected by current pandas, so aggregate with a list
# and rename the resulting columns instead.
b_stats = df.groupby('a')['b'].agg(['mean','sum']).rename(
    columns={'mean':'result1','sum':'result2'})
print(b_stats)
# Functions are given by name as strings; prefer passing them as a list.

输出结果：

 a         b         c         d
0  1  0.758848  0.375900  0.962917
1  1  0.430484  0.322437  0.402809
2  2  0.285699  0.230663  0.525483
3  2  0.676740  0.191693  0.874899
          b                   c                   d          
       mean       sum      mean       sum      mean       sum
a                                                            
1  0.594666  1.189331  0.349169  0.698337  0.682863  1.365727
2  0.481219  0.962438  0.211178  0.422356  0.700191  1.400382
    result1   result2
a                    
1  0.594666  1.189331
2  0.481219  0.962438

C:Users__main__.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version

练习题：

作业1：按要求创建Dataframe df，并通过分组得到以下结果

① 以A分组，求出D,E的分组平均值

② 以A,B分组，求出D,E的分组求和

③ 以A分组，得到所有分组，以字典显示

④ 按照数值类型分组，求和

⑤ 将C,D作为一组分出来，并计算求和

⑥ 以B分组，求出每组的均值，求和，最大值，最小值

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Exercise frame: string keys A and B, integer column C, random floats D and E.
df = pd.DataFrame({'A':['one','two','three','one','two','three','one','two'],
                   'B':list('hhhhffff'),
                   'C':list(range(10,25,2)),
                   'D':np.random.rand(8),
                   'E':np.random.rand(8)})
# (1) Group by A and take the mean of D and E.
print(df.groupby(by = 'A')[['D','E']].mean())
# (2) Group by A and B and take the sum of D and E.
print(df.groupby(by = ['A','B'])[['D','E']].sum())
# (3) Group by A and show every group (name followed by its rows).
g = df.groupby(by = 'A')
for n,i in g:
    print(n)
    print(i)
# (4) Group the columns by dtype and add each group up row-wise.
# Column-wise groupby (axis=1) is deprecated in pandas, so the columns are
# bucketed by dtype name explicitly. Plain `+` is used so that string columns
# are concatenated the same way numeric columns are added (unlike .sum(),
# this does not skip NaN; the exercise data contains none).
cols_by_dtype = {}
for col, dtype in zip(df.columns, df.dtypes):
    cols_by_dtype.setdefault(str(dtype), []).append(col)
dtype_sums = {}
for dtype_name, cols in cols_by_dtype.items():
    total = df[cols[0]]
    for col in cols[1:]:
        total = total + df[col]
    dtype_sums[dtype_name] = total
dtype_sums = pd.DataFrame(dtype_sums)
print(dtype_sums)
# (5) Split C and D out as their own frame and add a row-wise sum column.
# .copy() makes df2 independent of df, so adding a column is not an
# assignment into a slice of df.
df2 = df[['C','D']].copy()
print(df2)
df2['sum'] = df2.sum(axis = 1)
print(df2)
# (6) Group by B and compute mean, sum, max and min of each numeric column.
# The numeric columns are selected explicitly so the string column A is not
# fed to mean(); 'min' is passed by name so the result label is always 'min'.
stats = df.groupby('B')[['C','D','E']].agg(['mean','sum','max','min'])
print(stats)

输出结果：

              D         E
A                        
one    0.371521  0.524208
three  0.549758  0.513263
two    0.407525  0.511265
                D         E
A     B                    
one   f  0.505448  0.961745
      h  0.609115  0.610880
three f  0.856872  0.495862
      h  0.242644  0.530664
two   f  0.679124  1.358688
      h  0.543450  0.175106
one
     A  B   C         D         E
0  one  h  10  0.258020  0.411822
3  one  h  16  0.351095  0.199058
6  one  f  22  0.505448  0.961745
three
       A  B   C         D         E
2  three  h  14  0.242644  0.530664
5  three  f  20  0.856872  0.495862
two
     A  B   C         D         E
1  two  h  12  0.543450  0.175106
4  two  f  18  0.010620  0.919586
7  two  f  24  0.668504  0.439102
   int64   float64  object
0     10  0.669841    oneh
1     12  0.718555    twoh
2     14  0.773309  threeh
3     16  0.550153    oneh
4     18  0.930206    twof
5     20  1.352733  threef
6     22  1.467194    onef
7     24  1.107606    twof
    C         D
0  10  0.258020
1  12  0.543450
2  14  0.242644
3  16  0.351095
4  18  0.010620
5  20  0.856872
6  22  0.505448
7  24  0.668504
    C         D        sum
0  10  0.258020  10.258020
1  12  0.543450  12.543450
2  14  0.242644  14.242644
3  16  0.351095  16.351095
4  18  0.010620  18.010620
5  20  0.856872  20.856872
6  22  0.505448  22.505448
7  24  0.668504  24.668504
     C                      D                                       E  
  mean sum max amin      mean       sum       max      amin      mean   
B                                                                       
f   21  84  24   18  0.510361  2.041444  0.856872  0.010620  0.704074   
h   13  52  16   10  0.348802  1.395209  0.543450  0.242644  0.329162   

                                 
        sum       max      amin  
B                                
f  2.816295  0.961745  0.439102  
h  1.316650  0.530664  0.175106