pandas教程

　　对pandas做最简单的介绍，针对初学者。

　　一、引入相关模块

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

　　二、对象创建

　　2.1 创建一个Series。

'''
Series(data,index,dtype,copy)
    data:array-like,dict, or scalar value
    index:array-like or index(1d)
    dtype:numpy.dtype or None
    copy:boolean, default False
'''
'''结果：
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
'''
s = pd.Series([1,3,5,np.nan,6,8])

　　2.2 创建DataFrame。数据使用numpy的array，索引index使用datetime，列名使用标签

'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
'''
dates = pd.date_range('20130101', periods=6)

'''
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
2013-01-06 -0.673690  0.113648 -1.478427  0.524988
'''
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

　　2.3 通过一个字典的数据创建一个dataFrame

'''
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
'''
df2 = pd.DataFrame({ 'A' : 1.,'B' : pd.Timestamp('20130102'),'C' : pd.Series(1,index=list(range(4)),dtype='float32'),'D' : np.array([3] * 4,dtype='int32'),'E' : pd.Categorical(["test","train","test","train"]),'F' : 'foo' })

　　三、取数据

　　3.1 取前几行或者最后几行的数据

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df.head()
df.tail()

　　3.2 显示index、column、或者数据

df.index
df.columns
df.values

　　3.3 显示数据的一些统计数据

'''
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.073711 -0.431125 -0.687758 -0.233103
std    0.843157  0.922818  0.779887  0.973118
min   -0.861849 -2.104569 -1.509059 -1.135632
25%   -0.611510 -0.600794 -1.368714 -1.076610
50%    0.022070 -0.228039 -0.767252 -0.386188
75%    0.658444  0.041933 -0.034326  0.461706
max    1.212112  0.567020  0.276232  1.071804
'''
df.describe()

　　3.4 矩阵转置

df.T

　　3.5 根据某一维度进行排序

'''
对维度1的索引做降序排序
'''
df.sort_index(axis=1, ascending=False)

　　3.6 对数据值做排序

'''
                   A         B         C         D
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-06 -0.673690  0.113648 -1.478427  0.524988
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
'''
df.sort_values(by='B')

　　四、数据选择

　　4.1、根据维度上的值取数据

　　4.1.1选择一个单独的列，返回的是Series对象

'''
2013-01-01    0.469112
2013-01-02    1.212112
2013-01-03   -0.861849
2013-01-04    0.721555
2013-01-05   -0.424972
2013-01-06   -0.673690
Freq: D, Name: A, dtype: float64
'''
print(df['A']) #等同于 df.A

　　4.1.2类似[]一样做切割的操作

'''
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
'''
df[0:3] #等同于df['2013-01-01':'2013-01-03']（按标签切片时包含结束位置）

　　4.1.3对于二维，根据纵轴上的值取数据

'''
A    0.469112
B   -0.282863
C   -1.509059
D   -1.135632
Name: 2013-01-01 00:00:00, dtype: float64
'''
df.loc['2013-01-01']

　　4.1.4[]的切割和轴上取值的综合使用

'''
A         B
2013-01-01  0.469112 -0.282863
2013-01-02  1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04  0.721555 -0.706771
2013-01-05 -0.424972  0.567020
2013-01-06 -0.673690  0.113648
'''
df.loc[:,['A','B']]

　　4.1.5[]的切割和轴上取值的综合使用的进阶版

'''
                   A         B
2013-01-02  1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04  0.721555 -0.706771
'''
df.loc['20130102':'20130104',['A','B']] #20130102会自动格式化

　　4.1.6相关的额外的用例

'''
A    1.212112
B   -0.173215
Name: 2013-01-02 00:00:00, dtype: float64
'''
df.loc['20130102',['A','B']]

'''
0.46911229990718628
'''
df.loc[dates[0],'A']  # dates[0]等价于 '20130101'

　　4.2、根据位置取数据

'''
A    0.721555
B   -0.706771
C   -1.039575
D    0.271860
Name: 2013-01-04 00:00:00, dtype: float64
'''
df.iloc[3]

'''
 　　　　　　　　A         B
2013-01-04  0.721555 -0.706771
2013-01-05 -0.424972  0.567020
'''
df.iloc[3:5,0:2]

'''
　　　　　　　　A         C
2013-01-02  1.212112  0.119209
2013-01-03 -0.861849 -0.494929
2013-01-05 -0.424972  0.276232
'''
df.iloc[[1,2,4],[0,2]]

'''
　　　　　　　　　　A         B         C         D
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
'''
df.iloc[1:3,:]

'''
　　　　　　　　B         C
2013-01-01 -0.282863 -1.509059
2013-01-02 -0.173215  0.119209
2013-01-03 -2.104569 -0.494929
2013-01-04 -0.706771 -1.039575
2013-01-05  0.567020  0.276232
2013-01-06  0.113648 -1.478427
'''
df.iloc[:,1:3]

'''
-0.17321464905330858
'''
df.iloc[1,1]

　　五、数据选择中布尔条件的筛选用法

'''
　　　　　　　　　　　A          B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
'''
df[df.A > 0]  #针对A 条件成立的数据显示，不成立的数据过滤

'''
　　　　　　　　　　A         B         C         D
2013-01-01  0.469112       NaN       NaN       NaN
2013-01-02  1.212112       NaN  0.119209       NaN
2013-01-03       NaN       NaN       NaN  1.071804
2013-01-04  0.721555       NaN       NaN  0.271860
2013-01-05       NaN  0.567020  0.276232       NaN
2013-01-06       NaN  0.113648       NaN  0.524988
'''
df[df > 0]  # 对整个数据做判断，成立的显示，不成立的显示nan

# 复制
df2 = df.copy()
# 增加一个列
df2['E'] = ['one', 'one','two','three','four','three']

'''
　　　　　　　　　　A         B         C         D      E
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632    one
2013-01-02  1.212112 -0.173215  0.119209 -1.044236    one
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804    two
2013-01-04  0.721555 -0.706771 -1.039575  0.271860  three
2013-01-05 -0.424972  0.567020  0.276232 -1.087401   four
2013-01-06 -0.673690  0.113648 -1.478427  0.524988  three
'''
print(df2)

'''
　　　　　　　　　　A         B         C         D     E
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804   two
2013-01-05 -0.424972  0.567020  0.276232 -1.087401  four
'''
df2[df2['E'].isin(['two','four'])]  # isin()方法用于过滤，显示条件成立的结果

　　六、赋值（setting）

　　6.1 增加一列

# 新列的值
s = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130101',periods=6))

'''
　　　　　　　　　　  A         B          C        D   E
2013-01-01 -0.330600 -1.326650  1.956782  0.328470  1
2013-01-02  0.173402 -0.373742 -0.121202  0.382443  2
2013-01-03 -0.579300 -0.381537 -2.955372 -0.557058  3
2013-01-04  1.358076  0.907546  0.629780 -1.579100  4
2013-01-05  2.269737  1.224567  0.591703 -1.022714  5
2013-01-06  0.966249 -0.205897 -0.003112  1.925219  6

'''
df['E'] = s  # 对新列赋值

　　6.2 增加一行，具体可使用拼接或者添加的方式（concat、append）

'''
　　　　　　　　　　A         B         C         D         E
2013-01-07  1.105365  0.027329  2.210636  1.497980  0.761118
2013-01-08  0.387425 -1.506767  0.416878 -1.479918 -0.716363

'''
su_df = pd.DataFrame(np.random.randn(2,5), index=pd.date_range('20130107',periods=2),columns=list('ABCDE'))

'''
　　　　　　　　　　A         B         C         D         E
2013-01-01 -2.476921 -0.961169  0.063422  2.010977  1.000000
2013-01-02  1.060736  0.265674  0.092731 -0.423340  2.000000
2013-01-03  0.036753  1.757448  0.987356  0.344027  3.000000
2013-01-04 -0.429803  0.783153 -0.124511 -0.678557  4.000000
2013-01-05 -0.266420 -3.515056 -0.138616  1.244520  5.000000
2013-01-06  0.217777 -0.327220  0.266039  0.672814  6.000000
2013-01-07  1.105365  0.027329  2.210636  1.497980  0.761118
2013-01-08  0.387425 -1.506767  0.416878 -1.479918 -0.716363

'''
pd.concat([df, su_df])  # DataFrame.append 已在 pandas 2.0 中移除，请改用 pd.concat

　　6.3 根据刻度赋值

df.at['20130101','A'] = 0

　　6.4 根据位置赋值

df.iat[0,1] = 0

　　6.5 为某一列赋值

df.loc[:,'E'] = np.array([5] * len(df))

　　七、其他对数据处理的函数

'''
dates = ['2013-01-01','2013-01-02','2013-01-03','2013-01-04',...]

A         B         C  D    F    E
2013-01-01  0.000000  0.000000 -1.509059  5  NaN  1.0
2013-01-02  1.212112 -0.173215  0.119209  5  1.0  1.0
2013-01-03 -0.861849 -2.104569 -0.494929  5  2.0  NaN
2013-01-04  0.721555 -0.706771 -1.039575  5  3.0  NaN

'''
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) # 可以对原有的数据进行增删改，返回一个经过增删改后新的数据
'''

A         B         C         D    E
2013-01-01 -2.476921 -0.961169  0.063422  2.010977  1.0
2013-01-02  1.060736  0.265674  0.092731 -0.423340  1.0
2013-01-03  0.036753  1.757448  0.987356  0.344027  1.0
2013-01-04 -0.429803  0.783153 -0.124511 -0.678557  NaN
2013-01-05 -0.266420 -3.515056 -0.138616  1.244520  NaN
2013-01-06  0.217777 -0.327220  0.266039  0.672814  NaN

'''
df

'''
A         B         C         D    E
2013-01-01 -2.476921 -0.961169  0.063422  2.010977  1.0
2013-01-02  1.060736  0.265674  0.092731 -0.423340  1.0
2013-01-03  0.036753  1.757448  0.987356  0.344027  1.0

'''
df.dropna(how='any')

'''
A         B         C         D      E
2013-01-01 -2.476921 -0.961169  0.063422  2.010977    1.0
2013-01-02  1.060736  0.265674  0.092731 -0.423340    1.0
2013-01-03  0.036753  1.757448  0.987356  0.344027    1.0
2013-01-04 -0.429803  0.783153 -0.124511 -0.678557  777.0
2013-01-05 -0.266420 -3.515056 -0.138616  1.244520  777.0
2013-01-06  0.217777 -0.327220  0.266039  0.672814  777.0

'''
df.fillna(value=777)  # 对为NAN的值进行填充

　　八、使用回调函数处理数据

df.apply(np.cumsum)
df.apply(lambda x : x.max() - x.min())

　　九、对key和value的处理函数

'''
0    4
1    2
2    1
3    2
4    6
5    4
6    4
7    6
8    4
9    4
'''
s = pd.Series(np.random.randint(0, 7, size=10))

'''
4    5
6    2
2    2
1    1
dtype: int64
'''
s.value_counts()  # 统计对应的值出现的次数