char-8

# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 22:07:13 2017
@author: zzpp220
"""
from pandas import DataFrame,Series
from numpy import nan as NaN
import pandas as pd
import numpy as np
df=DataFrame({'k1':list('aabba'),'k2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
#分组键为Ｓｅｒｉｅｓ对象
grouped=df.groupby('k1')
desc_group=grouped.describe()
grouped_1=df['data1'].groupby([df['k1'],df['k2']]).sum()##group 是一个series对象
## 有多重索引的Series和ＤａｔａＦｒａｍｅ中的相互转变
##分组键为ｇｒｏｕｐｂｙ前面对象中的列名
df.groupby(['k1','k2']).sum().unstack()
df.groupby(['k1','k2']).sum().unstack().stack([0,1])
##粉组件是任何长度适当的数组
states=np.array(['sd','gd','sd','fj','fj'])
year=np.array([2005,2006,2005,2005,2006])
df.groupby([states,year]).mean()
##GroupBy 对象的数据聚合　按照不同的的聚合函数聚合
tips=pd.read_csv('/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch08/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
gro_tips=tips.groupby(['sex','smoker'])
#对聚合后的１列应用１个聚合函数：
gro_tips['tip_pct'].mean()
gro_tips['tip_pct'].agg('mean')##同上，只不过是把函数名用字符串的形似传入
#对聚合后的１列应用多个聚合函数：
gro_tips['tip_pct'].agg(['mean','std','var'])##传入一组函数名，得到DataFrame的列就会以相应的函数命名
gro_tips['tip_pct'].agg([('avg','mean'),('vag','std'),('gav','var')])#按照(name.funciton)传入二元元祖列表,返回的表的各列名是元祖第一个元素
'''
gro_tips['tip_pct'].mean()
Out[94]: 
sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64
gro_tips['tip_pct'].agg(['mean','std','var'])
Out[98]: 
                   mean       std       var
sex    smoker                              
Female No      0.156921  0.036421  0.001327
       Yes     0.182150  0.071595  0.005126
Male   No      0.160669  0.041849  0.001751
       Yes     0.152771  0.090588  0.008206
gro_tips['tip_pct'].agg([('avg','mean'),('vag','std'),('gav','var')])
Out[99]: 
                    avg       vag       gav
sex    smoker                              
Female No      0.156921  0.036421  0.001327
       Yes     0.182150  0.071595  0.005126
Male   No      0.160669  0.041849  0.001751
       Yes     0.152771  0.090588  0.008206
'''
##设计对表中的多个列应用同样的多个函数
## 对'tip_pct','total_bill'　分别计算functions中的三个统计信息
functions=['count','mean','max']
asix=[('name1','count'),('name2','mean'),('name3','max')]
result=gro_tips['tip_pct','total_bill'].agg(functions)
asix_result=gro_tips['tip_pct','total_bill'].agg(asix)
'''
result
Out[102]: 
              tip_pct                     total_bill                  
                count      mean       max      count       mean    max
sex    smoker                                                         
Female No          54  0.156921  0.252672         54  18.105185  35.83
       Yes         33  0.182150  0.416667         33  17.977879  44.30
Male   No          97  0.160669  0.291990         97  19.791237  48.33
       Yes         60  0.152771  0.710345         60  22.284500  50.81
       
       
asix_result
Out[105]: 
              tip_pct                     total_bill                  
                name1     name2     name3      name1      name2  name3
sex    smoker                                                         
Female No          54  0.156921  0.252672         54  18.105185  35.83
       Yes         33  0.182150  0.416667         33  17.977879  44.30
Male   No          97  0.160669  0.291990         97  19.791237  48.33
       Yes         60  0.152771  0.710345         60  22.284500  50.81       
       
       如上表，result拥有层次化的列，这相当于对各列应用不同的聚合函数后，再分别进行聚合，然后用concat进行聚合。
'''
##`对多个不同的列应用不同的函数，传入列名和带应用函数名的字典
map={'tip_pct':['min','max','mean','std'],'size':'sum',}
diff_to_diff=gro_tips.agg(map)
''' diff_to_diff
Out[109]: 
                tip_pct                               size
                    min       max      mean       std  sum
sex    smoker                                             
Female No      0.056797  0.252672  0.156921  0.036421  140
       Yes     0.056433  0.416667  0.182150  0.071595   74
Male   No      0.071804  0.291990  0.160669  0.041849  263
       Yes     0.035638  0.710345  0.152771  0.090588  150
'''
#－－－－－－－－－－－－－apply 函数，返回一个pandas对象或者标量值－－－－－－－－－－－－－－－－－－－－－－－－－－－－－－－－－－－
'''定义函数得到表df按照列名为columｎ的升序排序后的结果，选出最大５个tip_pct所在的行‘'''
def top(df,n=5,column='tip_pct'):#按照
    return df.sort_index(by=column)[-n:]
    
top_col=top(tips,n=6)
tips.groupby('smoker').apply(top,n=3)##返回一个层次化索引的dataframe
tips.groupby('smoker')['tip_pct'].describe().unstack(0)
test2=DataFrame(np.arange(24).reshape(6,4),index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6'],columns=['col_1','col_2','col_3', 'col_4'])
test2['smoker']=Series(['no','yes','no','yes','yes','yes'],index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6'])
test2=test2.rename_axis({'col_1':'tip_pct'},axis=1)
test2['tip_pct']=[2,5,4,1,2,6]#
test2['col_2']=[7,8,5,-1,4,6]
test2.groupby('smoker').apply(top,n=3)##在groupby对象上调用apply,返回一个层次化索引的dataframe
test2.groupby('smoker').apply(top,n=3).unstack()
##如果不想要层次化的索引，可以把分组键禁止掉
test2.groupby('smoker',group_keys=False).apply(top,n=3)
''' 
将tips按找smoker分组，对于每一个组内都应用top 函数，应用完成后，将各个组的应用结果concat起来
返回的结果就有了一个层次化的索引，内层的索引就是tip中原来的　索引值
total_bill   tip     sex smoker   day    time  size   tip_pct
smoker                                                                   
No     51        10.29  2.60  Female     No   Sun  Dinner     2  0.252672
       149        7.51  2.00    Male     No  Thur   Lunch     2  0.266312
       232       11.61  3.39    Male     No   Sat  Dinner     2  0.291990
-------------------------------------------------------------------------------concat---------------
Yes    67         3.07  1.00  Female    Yes   Sat  Dinner     1  0.325733
       178        9.60  4.00  Female    Yes   Sun  Dinner     2  0.416667
       172        7.25  5.15    Male    Yes   Sun  Dinner     2  0.710345
       
test2.groupby('smoker').apply(top,n=3):
tip_pct  col_2  col_3  col_4 smoker
smoker                                           
no     row_1        2      7      2      3     no
       row_3        4      5     10     11     no
-----------------------------------------------------------concat-------------------------
yes    row_5        2      4     18     19    yes
       row_2        5      8      6      7    yes
       row_6        6      6     22     23    yes
       
test2.groupby('smoker',group_keys=False).apply(top,n=3):
       tip_pct  col_2  col_3  col_4 smoker
row_1        2      7      2      3     no
row_3        4      5     10     11     no
row_5        2      4     18     19    yes
row_2        5      8      6      7    yes
row_6        6      6     22     23    yes   
       
用特定于分组的值填充缺失值：       '''
s=Series(np.random.rand(6))
s[::2]=np.nan#步长为２
'''
s:
0         NaN
1    0.654905
2         NaN
3    0.475528
4         NaN
5    0.269656
dtype: float64
s.fillna(s.mean())
Out[208]: 
0    0.466696
1    0.654905
2    0.466696
3    0.475528
4    0.466696
5    0.269656
dtype: float64
'''
s.fillna(s.mean())