python数据分析基础—

python数据分析基础——pandas Tutorial

参考pandas官方文档:

http://pandas.pydata.org/pandas-docs/stable/10min.html#min

1.pandas中的数据类型

Series 带有索引标记的一维数组，可以存储任何数据类型

 1 #基本方法
 2 >>s =pd.Series(data, index=index)
 3 
 4 >>import pandas as pd
 5 >>import numpy as np
 6 
 7 # 使用ndarray创建
 8 >>indexs = ['a', 'b', 'c']
 9 >>s  = pd.Series(np.random.randn(3), index=indexs)
10 >>s
11 a   -1.817485
12 b    0.012912
13 c    0.866929
14 dtype: float64
15 >>s.index
16 Index(['a', 'b', 'c'], dtype='object')
17 
18 #默认索引值
19 >>s  = pd.Series(np.random.randn(3))
20 >>s
21 0    1.985833
22 1    0.467035
23 2    0.636828
24 dtype: float64
25 
26 #使用dict创建
27 #默认使用dict的索引
28 >>d = {'a' : 0., 'b' : 1., 'c' : 2.}
29 >>pd.Series(d)
30 a    0.0
31 b    1.0
32 c    2.0
33 dtype: float64
34 
35 #指明索引值
36 >>pd.Series(d, index=['b', 'c', 'd', 'a'])
37 b    1.0
38 c    2.0
39 d    NaN
40 a    0.0
41 dtype: float64
42 
43 #使用标量值创建
44 >>pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])
45 a    5.0
46 b    5.0
47 c    5.0
48 d    5.0
49 e    5.0
50 dtype: float64

Series 类似ndarray，可以使用Numpy的很多语法

>>s = pd.Series(np.random.randn(5),index=['a', 'b', 'c', 'd', 'e'])
>>s
a   -1.329486
b    0.396057
c   -1.156737
d   -1.152107
e   -0.787661
dtype: float64

# 索引
>>s[0]
-1.3294860342555725

#切片
>>s[:3]
a   -1.329486
b    0.396057
c   -1.156737
dtype: float64

# 布尔索引（按条件筛选）
>>s[s > s.median()]
b    0.396057
e   -0.787661
dtype: float64

# 按序索引
>>s[[4,3,1]]
e   -0.787661
d   -1.152107
b    0.396057
dtype: float64

>>np.exp(s)
a    0.264613
b    1.485954
c    0.314511
d    0.315970
e    0.454908
dtype: float64

Series 类似dict类型，可以操作索引值

>>s['a']
-1.3294860342555725

>>s['e']=12
>>s
a    -1.329486
b     0.396057
c    -1.156737
d    -1.152107
e    12.000000
dtype: float64

>>'e' in s
True

>>s.get('e')
12.0

>>s+s
a    -2.658972
b     0.792115
c    -2.313474
d    -2.304214
e    24.000000
dtype: float64

>>s*2
a    -2.658972
b     0.792115
c    -2.313474
d    -2.304214
e    24.000000
dtype: float64

#索引值自动对齐
#s[1:]中没有a, s[:-1]中没有e，无法对齐的标签结果为NaN
>>s[1:] + s[:-1]
a         NaN
b    0.792115
c   -2.313474
d   -2.304214
e         NaN
dtype: float64

Series的name属性，创建新对象

#注意 name属性
>>s = pd.Series(np.random.randn(5),name='sth')
>>s
0    1.338578
1    2.074678
2   -0.462777
3    0.518763
4   -0.372692
Name: sth, dtype: float64

# 使用rename方法
>>s2 = s.rename('dif')
>>s2
0    1.338578
1    2.074678
2   -0.462777
3    0.518763
4   -0.372692
Name: dif, dtype: float64

>>id(s)
2669465319632

>>id(s2)
2669465320416

#s 与 s2是不同的对象，两者尽管值相同，但地址不同

DataFrame 带索引值的二维数组，类似SQL的表，列项通常是不同的数据类型

index 行索引，columns列索引

#使用Series字典或字典创建DataFrame
>>d= {'one':pd.Series([1.,2.,3.], index=['a','b','c']),         'two':pd.Series([1.,2.,3.,4.], index=['a','b','c','d'])}
>>df = pd.DataFrame(d)
>>df
   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0

# 按序输出
>>pd.DataFrame(d, index=['d','b','a'])
   one  two
d  NaN  4.0
b  2.0  2.0
a  1.0  1.0

>>df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
>>df.columns
Index(['one', 'two'], dtype='object')

#使用ndarrays/list字典
>>d = {'one':[1.,2.,3.,4.],'two':[4.,3.,2.,1.]}
>>pd.DataFrame(d)
   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0

#指定index
>>pd.DataFrame(d,index=['a','b','c','d'])
   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0

DataFrame操作

列选择、添加、删除

>>df['one']
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

#添加 three 与 flag 列，总在尾部添加
>>df['three'] = df['one'] * df['two']
>>df['flag']=df['one']>2
>>df
   one  two  three   flag
a  1.0  1.0    1.0  False
b  2.0  2.0    4.0  False
c  3.0  3.0    9.0   True
d  NaN  4.0    NaN  False

# 删除
>>del df['two']
>>three = df.pop('three')
>>three
a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

>>df
   one   flag
a  1.0  False
b  2.0  False
c  3.0   True
d  NaN  False

#可以将列数据截断
>>df['one_trunc'] = df['one'][:2]
>>df
   one   flag  one_trunc
a  1.0  False        1.0
b  2.0  False        2.0
c  3.0   True        NaN
d  NaN  False        NaN

>>df['foo'] = 'bar'
>>df
   one   flag  one_trunc  foo
a  1.0  False        1.0  bar
b  2.0  False        2.0  bar
c  3.0   True        NaN  bar
d  NaN  False        NaN  bar

#使用insert函数可以在指定列后插入
#在第1列后插入
>>df.insert(1,'ba',df['one'])
>>df
   one   ba   flag  one_trunc  foo
a  1.0  1.0  False        1.0  bar
b  2.0  2.0  False        2.0  bar
c  3.0  3.0   True        NaN  bar
d  NaN  NaN  False        NaN  bar

索引、选择行

选择列　　 df[col] Series

按照标签选择行　 df.loc[label]　 Series

按照整数位置选择行 df.iloc[loc]　　Series

切分行　　　　　　df[5:10] DataFrame

按照布尔向量选择行 df[bool_vec] DataFrame

>>d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
>>df = pd.DataFrame(d)
>>df
   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0

#按照标签选择行
>>df.loc['b']
one    2.0
two    2.0
Name: b, dtype: float64
>>type(df.loc['b'])
pandas.core.series.Series

#按照整数位置选择行
>>df.iloc[2]
one    3.0
two    3.0
Name: c, dtype: float64

#切分行
>>df[1:3]
   one  two
b  2.0  2.0
c  3.0  3.0
>>type(df[1:3])
pandas.core.frame.DataFrame

选择列

>>df.one
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

>>df['one']
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

数据对齐与计算

对齐：列与行标签自动对齐

>>da = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
>>db = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
>>da +db
          A         B         C   D
0 -0.920370 -0.529455 -2.386419 NaN
1 -1.277148  1.292130  1.196099 NaN
2  1.182199  0.454546  0.381586 NaN
3  1.100170 -1.830894  1.105932 NaN
4  0.507649  1.291516 -2.084368 NaN
5 -1.198811 -2.180978  0.342185 NaN
6  0.667211  2.141364  0.044136 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN

#支持Numpy操作
>>np.exp(da)
>>np.asarray(da)

3维数据类型Panel，自0.20.0版本起已被弃用，并在0.25.0版本中被移除

新的类型xarray，用于支持多维数据