001_python实现数据分析

一、

# coding:utf8
# !/usr/bin/python
# import numpy as np
import pandas as pd
import np

def example2():
    """Print the summary statistics of a numeric ``Series``.

    Builds ``pd.Series([1, 2, 3])`` and prints the result of ``describe()``.

    :return: None
    """
    s = pd.Series([1, 2, 3])
    # A parenthesised single-argument print is valid on Python 2 and Python 3;
    # the bare ``print x`` statement is a SyntaxError on Python 3.
    print(s.describe())
    # Expected output:
    # count    3.0
    # mean     2.0
    # std      1.0
    # min      1.0
    # 25%      1.5
    # 50%      2.0
    # 75%      2.5
    # max      3.0
    # dtype: float64
def example3():
    """Print the summary statistics of a categorical (string) ``Series``.

    Builds ``pd.Series(['a', 'a', 'b', 'c'])`` and prints the result of
    ``describe()``.

    :return: None
    """
    s = pd.Series(['a', 'a', 'b', 'c'])
    # A parenthesised single-argument print is valid on Python 2 and Python 3;
    # the bare ``print x`` statement is a SyntaxError on Python 3.
    print(s.describe())
    # Expected output:
    # count     4
    # unique    3
    # top       a
    # freq      2
    # dtype: object
def example4():
    """Print the summary statistics of a timestamp ``Series``.

    Builds a ``Series`` from three ``numpy.datetime64`` values (one date is
    repeated) and prints the result of ``describe()``.

    :return: None
    """
    # Imported locally so this function does not depend on the module-level
    # ``np`` name, which this file binds with ``import np`` rather than
    # ``import numpy as np``.
    import numpy as np

    s = pd.Series([
        np.datetime64("2000-01-01"),
        np.datetime64("2010-01-01"),
        np.datetime64("2010-01-01"),
    ])
    # A parenthesised single-argument print is valid on Python 2 and Python 3;
    # the bare ``print x`` statement is a SyntaxError on Python 3.
    print(s.describe())
    # Output recorded by the original author:
    # count                       3
    # unique                      2
    # top       2010-01-01 00:00:00
    # freq                        2
    # first     2000-01-01 00:00:00
    # last      2010-01-01 00:00:00
    # dtype: object
    # NOTE(review): newer pandas releases appear to summarise datetime data
    # numerically (mean/min/quantiles/max) instead -- confirm against the
    # installed pandas version.
def example5():
    """Print the summary statistics of a mixed-type ``DataFrame``.

    The frame has one categorical, one numeric and one object column.
    By default ``describe()`` only returns the numeric fields.

    :return: None
    """
    df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']),
                       'numeric': [1, 2, 3],
                       'object': ['a', 'b', 'c']})
    # A parenthesised single-argument print is valid on Python 2 and Python 3;
    # the bare ``print x`` statement is a SyntaxError on Python 3.
    print(df.describe())
    # Other ways to call describe() (not executed here):
    #
    # Describing all columns of a ``DataFrame`` regardless of data type.
    #     print(df.describe(include='all'))
    # Describing a column from a ``DataFrame`` by accessing it as an attribute.
    #     print(df.numeric.describe())
    # Including only numeric columns in a ``DataFrame`` description.
    #     print(df.describe(include=[np.number]))
    # Including only string columns in a ``DataFrame`` description
    # (the original used ``np.object``; that alias is no longer available in
    # recent NumPy releases, the builtin ``object`` is the replacement).
    #     print(df.describe(include=[object]))
    # Including only categorical columns from a ``DataFrame`` description.
    #     print(df.describe(include=['category']))
    # Excluding numeric columns from a ``DataFrame`` description.
    #     print(df.describe(exclude=[np.number]))
    # Excluding object columns from a ``DataFrame`` description.
    #     print(df.describe(exclude=[object]))
def example1():
    """Print aggregate statistics computed over a ``describe()`` summary.

    Builds a two-column ``DataFrame`` from a dict of dicts, takes its
    ``describe()`` summary table, and prints the count, median, mode,
    standard deviation, variance and skewness of that summary table, each
    preceded by a (Chinese) label line.

    :return: None
    """
    dic1 = {'000': {'a': 1, 'b': 2, 'c': 3}, '001': {'d': 4, 'e': 5, 'f': 6}}
    df2 = pd.DataFrame(dic1)
    # df2.describe() looks like this:
    #        000  001
    # count  3.0  3.0
    # mean   2.0  5.0
    # std    1.0  1.0
    # min    1.0  4.0
    # 25%    1.5  4.5
    # 50%    2.0  5.0
    # 75%    2.5  5.5
    # max    3.0  6.0
    #
    # Compute the summary once; every statistic below is taken from it.
    summary = df2.describe()
    # NOTE(review): the statistics are computed over the 8-row describe()
    # table, not over the raw data in df2 (hence count == 8). The labels read
    # as if the raw data were intended -- confirm before changing, since the
    # recorded output matches the current behaviour.
    #
    # Labels, in order: number of non-NaN items, median (50th percentile),
    # mode, standard deviation (dispersion), variance, skewness (symmetry of
    # the distribution).
    # The "\n" escapes keep each literal on one physical line; a quoted
    # string that spans raw line breaks is a SyntaxError.
    print("返回非NAN数据项数量=>count()\n{count}\n".format(count=summary.count()))
    print("返回中位数,等价第50位百分位数的值=>median()\n{median}\n".format(median=summary.median()))
    print("返回数据的众值=>mode()\n{mode}\n".format(mode=summary.mode()))
    print("返回数据的标准差(描述离散度)=>std()\n{std}\n".format(std=summary.std()))
    print("返回方差=>var()\n{var}\n".format(var=summary.var()))
    print("偏态系数(skewness,表示数据分布的对称程度)=>skew()\n{skew}\n".format(skew=summary.skew()))

def main():
    """Script entry point: run the ``example1`` demonstration."""
    example1()


if __name__ == "__main__":
    main()

输出=>

返回非NAN数据项数量=>count()
000    8
001    8
dtype: int64
返回中位数,等价第50位百分位数的值=>median()
000    2.00
001    4.75
dtype: float64
返回数据的众值=>mode()
   000  001
0  1.0  5.0
1  2.0  NaN
2  3.0  NaN
返回数据的标准差(描述离散度)=>std()
000    0.801784
001    1.603567
dtype: float64
返回方差=>var()
000    0.642857
001    2.571429
dtype: float64
偏态系数(skewness,表示数据分布的对称程度)=>skew()
000    0.000000
001   -1.299187
dtype: float64