chart-7

# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 15:17:51 2017
@author: zzpp220
"""
'''合并（ｍｅｒｇｅ）－列或者连接（ｃｏｎｃａｔ）－行数据集'''
from pandas import DataFrame,Series
from numpy import nan as NaN
import pandas as pd
import numpy as np
import json
df1=DataFrame({'left':list('bbacaab'),'data1':range(7)})
df2=DataFrame({'right':list('abd'),'data2':range(3)})
df12=pd.merge(df1,df2,left_on='left',right_on='right') ##分为内、外连接、像mysql 一样
##重塑和轴向旋转　stack: change col to row 和unstack:change row to col
'''这个好！！该有的都由了'''
df3=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['sd','gz'],name='state'),columns=pd.Index(['one','two','thr'],name='number'))
'''多重索引的Series和DataFrame可以通过stack，unstack相互转哈,默认情况下取的都是最内层的索引'''
##将列转为行，得到一个Series,索引是一个多重索引
stack_df3=df3.stack()
#对于一个层次化索引的Series ,可以用unstack 重排为ＤａｔａＦｒａｍｅ。
stack_df3.unstack()
'''
df3.stack()
Out[19]: 
state  number
sd     one       0
       two       1
       thr       2
gz     one       3
       two       4
       thr       5
dtype: int64
stack_df3.unstack()
Out[22]: 
number  one  two  thr
state                
sd        0    1    2
gz        3    4    5
unstack and stack operate on the in-most level-index,also can operate on other level when give other para
'''
stack_df3.unstack('state')==stack_df3.unstack(0)## operate on the out-most level the same as :
''''stack_df3.unstack() 默认是按最内层也就是’number‘
Out[22]: 
number  one  two  thr
state                
sd        0    1    2
gz        3    4    5
                    stack_df3.unstack('state')==stack_df3.unstack(0)
                    Out[25]: 
                    state     sd    gz
                    number            
                    one     True  True
                    two     True  True
                    thr     True  True
'''
'''对全部的营养数据做分析'''
db=json.load(open('/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch07/foods-2011-10-03.json'))
nutrients=DataFrame(db[0]['nutrients'])
'''
nutrients[:10]
Out[36]: 
                   description        group units    value
0                      Protein  Composition     g    25.18
1            Total lipid (fat)  Composition     g    29.20
2  Carbohydrate, by difference  Composition     g     3.06
3                          Ash        Other     g     3.28
4                       Energy       Energy  kcal   376.00
5                        Water  Composition     g    39.28
6                       Energy       Energy    kJ  1573.00
7         Fiber, total dietary  Composition     g     0.00
8                  Calcium, Ca     Elements    mg   673.00
9                     Iron, Fe     Elements    mg     0.64
'''
info_keys=['description','group','id','manufacturer']
info=DataFrame(db,columns=info_keys)##仅仅导入db中的info_keys中的字段，其他的就不导入了；　
info[:5]
'''查看食物类别的分布情况'''
#info.group.value_counts()[:10]#查看group中的分布情况＝＝pd.values_count(info.group)
'''将所有的食物的营养成分整合到一个大表中'''
nutrients=[]
for rec in db:
    fnuts=DataFrame(rec['nutrients'])##将各食物的营养成分整合到一个大表
    fnuts['id']=rec['id']#并且在表中添加一个表示编号的列,用原来的表中的值赋值
    nutrients.append(fnuts)#依次将DataFrame添加到大的ｌｉｓｔ中
nutrients=pd.concat(nutrients,ignore_index=True)##用concat连接起来大表
'''查看表中有多少重复项'''
nutrients.duplicated().sum()
'''丢弃重复项'''
nutrients.drop_duplicates()
'''重命名对象中的列名(因为二者有相同的列，可能会混淆)'''
col_mapping={'description':'food','group':'fgroup'}
info=info.rename(columns=col_mapping,copy=False)
'''
info.rename(columns=col_mapping,copy=False)[:3]
Out[50]: 
              food                  fgroup    id manufacturer
0  Cheese, caraway  Dairy and Egg Products  1008             
1  Cheese, cheddar  Dairy and Egg Products  1009             
2     Cheese, edam  Dairy and Egg Products  1018 
'''
col_mapping={'description':'nutrients','group':'nutgroup'}
nutrients=nutrients.rename(columns=col_mapping,copy=False)
'''合并ｉｎｆｏ和nutrients'''
ndata=pd.merge(nutrients,info,on='id',how='outer')
'''根据营养分类，得出锌的中位值'''
result=ndata.groupby(['nutrients','fgroup'])['value'].quantile(0.5)