python进行数据分析groupby基础操作

from pandas import Series,DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],'key2' : ['one', 'two', 'one', 'two', 'one'],
 'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
grouped=df['data1'].groupby(df['key1'])
grouped.mean()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
Out[10]:
key1  key2
a     one    -0.230076
      two     1.248653
b     one    -0.196613
      two     0.689761
Name: data1, dtype: float64
means.unstack(level=-1)
Out[12]:
key2       one       two
key1                    
a    -0.230076  1.248653
b    -0.196613  0.689761

means.unstack(level=0)
Out[13]:
key1         a         b
key2                    
one  -0.230076 -0.196613
two   1.248653  0.689761

mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}

people = DataFrame(np.random.randn(5, 5),columns=['a', 'b', 'c', 'd', 'e'],index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

mapping
Out[25]: {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

people
Out[26]:
               a         b         c         d         e
Joe    -0.676828  0.880429  0.386540  2.961741  1.651430
Steve   0.269119  0.916097 -0.238961  0.949983 -0.697962
Wes     0.078493 -0.120419 -1.275525 -0.289106 -0.956001
Jim    -1.225288  0.120219 -1.173347 -0.792199 -0.417982
Travis  0.728367  0.906879  0.373102 -1.151590 -1.883083

people.groupby(mapping,axis=1).sum()
Out[29]:
            blue       red
Joe     3.348280  1.855032
Steve   0.711022  0.487254
Wes    -1.564631 -0.997927
Jim    -1.965546 -1.523050
Travis -0.778488 -0.247836

people.groupby(len).sum()
Out[30]:
          a         b         c         d         e
3 -1.823623  0.880229 -2.062331  1.880435  0.277447
5  0.269119  0.916097 -0.238961  0.949983 -0.697962
6  0.728367  0.906879  0.373102 -1.151590 -1.883083

columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])

columns
Out[35]:
MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['cty', 'tenor'])

hier_df = DataFrame(np.random.randn(4, 5), columns=columns)

hier_df.groupby(level='cty',axis=1).count()
Out[37]:
cty  JP  US
0     2   3
1     2   3
2     2   3
3     2   3
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

years = np.array([2005, 2005, 2006, 2005, 2006])

df
Out[16]:
      data1     data2 key1 key2
0 -0.169761 -0.297803    a  one
1  1.248653  0.116745    a  two
2 -0.196613 -0.224198    b  one
3  0.689761  1.436277    b  two
4 -0.290392  1.292000    a  one

df['data1'].groupby([states,years]).mean()
Out[17]:
California  2005    1.248653
            2006   -0.196613
Ohio        2005    0.260000
            2006   -0.290392
Name: data1, dtype: float64

#被聚合的只有数值列
df.groupby(df['key1']).mean()
Out[19]:
         data1     data2
key1                    
a     0.262833  0.370314
b     0.246574  0.606039

df.groupby(['key1','key2']).mean()
Out[20]:
              data1     data2
key1 key2                    
a    one  -0.230076  0.497098
     two   1.248653  0.116745
b    one  -0.196613 -0.224198
     two   0.689761  1.436277

for name,group in df.groupby(['key1']):
    print (name)
    print(group)
    
a
      data1     data2 key1 key2
0 -0.169761 -0.297803    a  one
1  1.248653  0.116745    a  two
4 -0.290392  1.292000    a  one
b
      data1     data2 key1 key2
2 -0.196613 -0.224198    b  one
3  0.689761  1.436277    b  two

for (k1,k2),group in df.groupby(['key1','key2']):
    print (k1,k2)
    print(group)
    
a one
      data1     data2 key1 key2
0 -0.169761 -0.297803    a  one
4 -0.290392  1.292000    a  one
a two
      data1     data2 key1 key2
1  1.248653  0.116745    a  two
b one
      data1     data2 key1 key2
2 -0.196613 -0.224198    b  one
b two
      data1     data2 key1 key2
3  0.689761  1.436277    b  two

df.dtypes
Out[29]:
data1    float64
data2    float64
key1      object
key2      object
dtype: object

grouped=df.groupby(df.dtypes,axis=1)

dict(list(grouped))
Out[31]:
{dtype('float64'):       data1     data2
 0 -0.169761 -0.297803
 1  1.248653  0.116745
 2 -0.196613 -0.224198
 3  0.689761  1.436277
 4 -0.290392  1.292000, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

df.groupby(['key1', 'key2'])[['data2']].mean()
Out[15]:
              data2
key1 key2          
a    one   0.942926
     two  -0.671692
b    one   0.366026
     two  -0.794155
     
df.ix[2:3,['data1','data2']]=np.nan

df
Out[22]:
      data1     data2 key1 key2
0 -1.090066  0.132120    a  one
1 -0.683913 -0.671692    a  two
2       NaN       NaN    b  one
3       NaN       NaN    b  two
4 -0.003452  1.753732    a  one

原文地址:https://www.cnblogs.com/qmxj-blog/p/9373711.html