pandas入门：基本功能

重新索引

from pandas import Series,DataFrame

# --- Reindexing a Series ---
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
# d    4.5
# b    7.2
# a   -5.3
# c    3.6
# dtype: float64

# reindex() builds a new object arranged according to the requested labels.
# A label that does not exist in the current index gets a missing value (NaN).
wanted_labels = ['a', 'b', 'c', 'd', 'e']
obj2 = obj.reindex(index=wanted_labels)
print(obj2)
# a   -5.3
# b    7.2
# c    3.6
# d    4.5
# e    NaN
# dtype: float64

# fill_value supplies the value used for labels introduced by the reindex.
obj3 = obj.reindex(index=wanted_labels, fill_value=0)
print(obj3)
# a   -5.3
# b    7.2
# c    3.6
# d    4.5
# e    0.0
# dtype: float64

# method='ffill' propagates the previous value forward into the gaps,
# method='bfill' pulls the next value backward.
obj4 = Series(['blue', 'purpul', 'yellow'], index=[0, 1, 4])
dense_positions = range(6)

obj5 = obj4.reindex(dense_positions, method='ffill')
print(obj5)
# 0      blue
# 1    purpul
# 2    purpul
# 3    purpul
# 4    yellow
# 5    yellow
# dtype: object

# With bfill the last label (5) has no later value to borrow, so it stays NaN.
obj6 = obj4.reindex(dense_positions, method='bfill')
print(obj6)
# 0      blue
# 1    purpul
# 2    yellow
# 3    yellow
# 4    yellow
# 5       NaN
# dtype: object

from pandas import Series,DataFrame
import numpy as np

# --- Reindexing a DataFrame ---
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)
#    Ohio  Texas  California
# a     0      1           2
# c     3      4           5
# d     6      7           8

# A bare sequence reindexes the rows; the new label 'b' becomes an all-NaN row.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
#    Ohio  Texas  California
# a   0.0    1.0         2.0
# b   NaN    NaN         NaN
# c   3.0    4.0         5.0
# d   6.0    7.0         8.0

# The columns keyword reindexes the columns instead.
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns=states)
print(frame3)
#    Texas  Utah  California
# a      1   NaN           2
# c      4   NaN           5
# d      7   NaN           8

# Rows and columns can be reindexed in a single call.
frame4 = frame.reindex(
    index=['a', 'b', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California', 'Utah'],
)
print(frame4)
#    Ohio  Texas  California  Utah
# a   0.0    1.0         2.0   NaN
# b   NaN    NaN         NaN   NaN
# c   3.0    4.0         5.0   NaN
# d   6.0    7.0         8.0   NaN

# Label-based selection of rows and columns in one step.
# The original used the .ix indexer, which was removed in pandas 1.0;
# .loc is the label-based replacement. Unlike reindex(), .loc raises KeyError
# for labels that do not exist, so only existing labels are requested here.
frame5 = frame.loc[['a', 'c', 'd'], ['Ohio', 'Texas', 'California']]
print(frame5)
#    Ohio  Texas  California
# a     0      1           2
# c     3      4           5
# d     6      7           8

丢弃指定轴上的项

from pandas import Series,DataFrame
import numpy as np

# --- drop(): remove entries from an axis and return a new object ---
# NOTE: the dtype in the sample output (int32 / int64) depends on the platform.
obj = Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])

# Drop a single label.
new_obj = obj.drop('c')
print(new_obj)
# a    0
# b    1
# d    3
# e    4
# dtype: int32

# Drop several labels at once.
labels_to_remove = ['d', 'c']
new_obj = obj.drop(labels_to_remove)
print(new_obj)
# a    0
# b    1
# e    4
# dtype: int32

# On a DataFrame, labels can be removed from either axis.
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=[1, 2, 3, 4],
    columns=['one', 'two', 'three', 'four'],
)

# Default axis: the labels refer to rows.
new_data = data.drop([1, 3])
print(new_data)
#    one  two  three  four
# 2    4    5      6     7
# 4   12   13     14    15

# axis='columns' (same as axis=1): the labels refer to columns.
new_data = data.drop('two', axis='columns')
print(new_data)
#    one  three  four
# 1    0      2     3
# 2    4      6     7
# 3    8     10    11
# 4   12     14    15

new_data = data.drop(['two', 'four'], axis='columns')
print(new_data)
#    one  three
# 1    0      2
# 2    4      6
# 3    8     10
# 4   12     14

索引、选取和过滤

from pandas import Series,DataFrame

# --- Indexing, selection and filtering on a Series ---
obj = Series([9, 5, 7, 3], index=['a', 'b', 'c', 'd'])

# Label-based lookup.
print(obj['b'])  # 5

# Position-based lookup. The original wrote obj[2] / obj[[1,3]], relying on
# integer keys being treated as positions on a non-integer index; that
# fallback is deprecated in pandas, so positions go through .iloc explicitly.
print(obj.iloc[2])  # 7
print(obj.iloc[2:4])
# c    7
# d    3
# dtype: int64

# A list of labels selects (and orders) several entries.
print(obj[['b', 'a', 'd']])
# b    5
# a    9
# d    3
# dtype: int64

# A list of positions.
print(obj.iloc[[1, 3]])
# b    5
# d    3
# dtype: int64

# Boolean mask filtering.
print(obj[obj < 5])
# d    3
# dtype: int64

# Slicing with labels differs from ordinary Python slicing:
# the end label is included.
print(obj['b':'c'])
# b    5
# c    7
# dtype: int64

# Assigning through a label slice updates every selected entry.
obj['b':'c'] = 5
print(obj)
# a    9
# b    5
# c    5
# d    3
# dtype: int64

from pandas import Series,DataFrame
import numpy as np

# --- Indexing, selection and filtering on a DataFrame ---
# NOTE: the dtype in the sample output (int32 / int64) depends on the platform.
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15

# A single column name returns that column as a Series.
print(data['two'])
# Ohio         1
# Colorado     5
# Utah         9
# New York    13
# Name: two, dtype: int32

# A list of column names returns a DataFrame with those columns, in that order.
print(data[['three', 'one']])
#           three  one
# Ohio          2    0
# Colorado      6    4
# Utah         10    8
# New York     14   12

# Special cases: a slice or a boolean array selects rows, not columns.
print(data[:2])
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7

print(data['three'] > 5)
# Ohio        False
# Colorado     True
# Utah         True
# New York     True
# Name: three, dtype: bool

# Equivalent to data[[False, True, True, True]].
print(data[data['three'] > 5])
#           one  two  three  four
# Colorado    4    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15

# Combined row/column selection.
# The original used the .ix indexer, which accepted labels and positions
# interchangeably and was removed in pandas 1.0. .loc (labels) and .iloc
# (positions) replace it; where labels and positions are mixed, the
# positions are first translated to labels through data.columns.
print(data.loc['Colorado', ['two', 'three']])
# two      5
# three    6
# Name: Colorado, dtype: int32

print(data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]])
#           four  one  two
# Colorado     7    4    5
# Utah        11    8    9

# Third row by position.
print(data.iloc[2])
# one       8
# two       9
# three    10
# four     11
# Name: Utah, dtype: int32

# Label slices include the end label.
print(data.loc[:'Utah', 'two'])
# Ohio        1
# Colorado    5
# Utah        9
# Name: two, dtype: int32

# Boolean row mask combined with the first three columns.
print(data.loc[data.three > 5, data.columns[:3]])
#           one  two  three
# Colorado    4    5      6
# Utah        8    9     10
# New York   12   13     14

算术运算和数据对齐

from pandas import Series,DataFrame
import numpy as np

# --- Arithmetic and data alignment ---
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

# Operands are aligned on their index labels before the addition;
# labels present in only one operand yield NaN in the result.
print(s1 + s2)
# a    5.2
# c    1.1
# d    NaN
# e    0.0
# f    NaN
# g    NaN
# dtype: float64

abc_columns = list('ABC')
df1 = DataFrame(
    np.arange(9).reshape((3, 3)),
    columns=abc_columns,
    index=['one', 'two', 'three'],
)
df2 = DataFrame(
    np.arange(12).reshape((4, 3)),
    columns=abc_columns,
    index=['one', 'three', 'four', 'five'],
)

# For DataFrames the result index is the union of both row indexes;
# only the rows present in both ('one', 'three') hold real values.
print(df1 + df2)
#          A     B     C
# five   NaN   NaN   NaN
# four   NaN   NaN   NaN
# one    0.0   2.0   4.0
# three  9.0  11.0  13.0
# two    NaN   NaN   NaN

from pandas import Series,DataFrame
import numpy as np

# --- Fill values in the arithmetic methods ---
df1 = DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
df2 = DataFrame(np.arange(20).reshape(4, 5), columns=list('abcde'))

# The plain operator leaves NaN wherever only one operand has a value.
print(df1 + df2)
#       a     b     c     d   e
# 0   0.0   2.0   4.0   6.0 NaN
# 1   9.0  11.0  13.0  15.0 NaN
# 2  18.0  20.0  22.0  24.0 NaN
# 3   NaN   NaN   NaN   NaN NaN

# add(..., fill_value=0) treats a value missing from one operand as 0,
# so cells that exist in only one of the frames keep their value.
print(df1.add(df2, fill_value=0))
#       a     b     c     d     e
# 0   0.0   2.0   4.0   6.0   4.0
# 1   9.0  11.0  13.0  15.0   9.0
# 2  18.0  20.0  22.0  24.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0

from pandas import Series,DataFrame
import numpy as np

# --- Operations between a DataFrame and a Series ---
# NumPy analogy first: subtracting a row from a 2-D array broadcasts the
# row over every row of the array.
arr = np.arange(12.).reshape((3, 4))
print(arr)
# [[ 0.  1.  2.  3.]
#  [ 4.  5.  6.  7.]
#  [ 8.  9. 10. 11.]]
print(arr[0])
# [0. 1. 2. 3.]
print(arr - arr[0])
# [[0. 0. 0. 0.]
#  [4. 4. 4. 4.]
#  [8. 8. 8. 8.]]

frame = DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['one', 'three', 'four', 'five'],
)
print(frame)
#          b     d     e
# one    0.0   1.0   2.0
# three  3.0   4.0   5.0
# four   6.0   7.0   8.0
# five   9.0  10.0  11.0

# First row by position. The original used frame.ix[0]; the .ix indexer was
# removed in pandas 1.0 and .iloc is the positional replacement.
series = frame.iloc[0]
print(series)
# b    0.0
# d    1.0
# e    2.0
# Name: one, dtype: float64

# By default the Series index is matched against the DataFrame columns and
# the operation is broadcast down the rows.
print(frame - series)
#          b    d    e
# one    0.0  0.0  0.0
# three  3.0  3.0  3.0
# four   6.0  6.0  6.0
# five   9.0  9.0  9.0

# If a label is missing from either the DataFrame columns or the Series
# index, both operands are reindexed to the union of the labels.
series2 = Series(range(3), index=['b', 'e', 'f'])
print(series2)
# b    0
# e    1
# f    2
# dtype: int64
print(frame + series2)
#          b   d     e   f
# one    0.0 NaN   3.0 NaN
# three  3.0 NaN   6.0 NaN
# four   6.0 NaN   9.0 NaN
# five   9.0 NaN  12.0 NaN

# To match on the rows and broadcast across the columns instead, use one of
# the arithmetic methods and pass the axis to match on.
series3 = frame['d']
print(series3)
# one       1.0
# three     4.0
# four      7.0
# five     10.0
# Name: d, dtype: float64

# axis=0: match series3's index against the row labels.
print(frame.sub(series3, axis=0))
#          b    d    e
# one   -1.0  0.0  1.0
# three -1.0  0.0  1.0
# four  -1.0  0.0  1.0
# five  -1.0  0.0  1.0

函数应用和映射

from pandas import Series,DataFrame
import numpy as np

# --- Function application and mapping ---
# NumPy ufuncs can be applied directly to pandas objects.
# The frame holds random numbers, so the sample output differs on every run.
frame = DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['one', 'two', 'three', 'four'],
)

print(frame)
#               b         d         e
# one   -1.415255 -1.084419  0.724132
# two   -0.468757  0.493345  0.318408
# three  0.913162 -0.513506  0.149354
# four  -2.219956  1.166779 -0.359199
print(np.abs(frame))
#               b         d         e
# one    1.415255  1.084419  0.724132
# two    0.468757  0.493345  0.318408
# three  0.913162  0.513506  0.149354
# four   2.219956  1.166779  0.359199

# apply() calls a function on each 1-D slice (every column, or every row).
data = [[1, 2, 3],
        [5, 2, 3],
        [6, 6, 6],
        [9, 7, 1]]
frame2 = DataFrame(
    data,
    columns=list('bde'),
    index=['one', 'two', 'three', 'four'],
)
print(frame2)
#        b  d  e
# one    1  2  3
# two    5  2  3
# three  6  6  6
# four   9  7  1

# Spread (max - min) of whatever slice apply() hands over.
f = lambda x: x.max() - x.min()

# Default axis=0: the function receives each column.
print(frame2.apply(f))
# b    8
# d    5
# e    5
# dtype: int64

# axis=1: the function receives each row.
print(frame2.apply(f, axis=1))
# one      2
# two      3
# three    0
# four     8
# dtype: int64

# Element-wise application on a DataFrame.
# DataFrame.applymap was deprecated in favour of DataFrame.map in pandas 2.1,
# so use whichever method the installed pandas provides.
f = lambda x: x + 1
elementwise = frame2.map if hasattr(frame2, 'map') else frame2.applymap
print(elementwise(f))
#         b  d  e
# one     2  3  4
# two     6  3  4
# three   7  7  7
# four   10  8  2

# Series.map is the element-wise counterpart for a single column.
print(frame2['e'].map(f))
# one      4
# two      4
# three    7
# four     2
# Name: e, dtype: int64

排序和排名

from pandas import Series,DataFrame
import numpy as np

# --- Sorting ---
obj = Series([1, 4, 2, 3], index=['d', 'a', 'c', 'b'])
print(obj)
# d    1
# a    4
# c    2
# b    3
# dtype: int64

# sort_index() orders by the index labels.
print(obj.sort_index())
# a    4
# b    3
# c    2
# d    1
# dtype: int64

# sort_values() orders by the stored values.
print(obj.sort_values())
# d    1
# c    2
# b    3
# a    4
# dtype: int64

frame = DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['n', 'c'],
    columns=[0, 4, 1, 6],
)
print(frame)
#    0  4  1  6
# n  0  1  2  3
# c  4  5  6  7

# Default axis=0: sort by the row labels.
print(frame.sort_index())
#    0  4  1  6
# c  4  5  6  7
# n  0  1  2  3

# axis=1: sort by the column labels.
print(frame.sort_index(axis=1))
#    0  1  4  6
# n  0  2  1  3
# c  4  6  5  7

# Ascending order is the default; ascending=False reverses it.
print(frame.sort_index(axis=1, ascending=False))
#    6  4  1  0
# n  3  1  2  0
# c  7  5  6  4

# Sorting a Series by value. The old Series.order() method is no longer
# available in current pandas; sort_values() is used instead.
obj = Series([4, 7, -3, 2])
print(obj.sort_values())
# 2   -3
# 3    2
# 0    4
# 1    7
# dtype: int64

# Missing values are placed at the end of the sorted Series.
obj = Series([4, np.nan, 7, np.nan, -3, 2])
print(obj.sort_values())
# 4   -3.0
# 5    2.0
# 0    4.0
# 2    7.0
# 1    NaN
# 3    NaN
# dtype: float64

# NOTE: the sample output below was captured with the columns displayed as
# a, b; the displayed column order may differ between pandas versions.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
#    a  b
# 0  0  4
# 1  1  7
# 2  0 -3
# 3  1  2

# Sort by the values of one or more columns: pass the name(s) through `by`.
print(frame.sort_values(by='b'))
#    a  b
# 2  0 -3
# 3  1  2
# 0  0  4
# 1  1  7

# With several keys, later columns break ties among the earlier ones.
sort_keys = ['a', 'b']
print(frame.sort_values(by=sort_keys))
#    a  b
# 2  0 -3
# 0  0  4
# 3  1  2
# 1  1  7
# --- Ranking ---
# rank() numbers the values by their sorted position. By default tied values
# all receive the average of the ranks they span.
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
# 0    6.5
# 1    1.0
# 2    6.5
# 3    4.5
# 4    3.0
# 5    2.0
# 6    4.5
# dtype: float64

# method='first': ties are ranked in the order they appear in the data.
print(obj.rank(method='first'))
# 0    6.0
# 1    1.0
# 2    7.0
# 3    4.0
# 4    3.0
# 5    2.0
# 6    5.0
# dtype: float64

# Descending ranks; method='max' gives each tie group its highest rank.
print(obj.rank(ascending=False, method='max'))
# 0    2.0
# 1    7.0
# 2    2.0
# 3    4.0
# 4    5.0
# 5    6.0
# 6    4.0
# dtype: float64

# NOTE: the sample output below was captured with the columns displayed as
# a, b, c; the displayed column order may differ between pandas versions.
frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
print(frame)
#    a    b    c
# 0  0  4.3 -2.0
# 1  1  7.0  5.0
# 2  0 -3.0  8.0
# 3  1  2.0 -2.5

# axis=1 ranks the values within each row (across the columns).
print(frame.rank(axis=1))
#      a    b    c
# 0  2.0  3.0  1.0
# 1  1.0  3.0  2.0
# 2  2.0  1.0  3.0
# 3  2.0  3.0  1.0

rank方法的method参数可选值：

average：默认值，在相等的分组中，为各个值分配平均排名
min：使用整个组的最小排名
max：使用整个组的最大排名
first：按值在原始数据中出现的顺序分配排名

带有重复值的轴索引

from pandas import Series

# --- Axis indexes with duplicate labels ---
duplicated_labels = ['a', 'a', 'b', 'b', 'c']
obj = Series(range(5), index=duplicated_labels)
print(obj)
# a    0
# a    1
# b    2
# b    3
# c    4
# dtype: int64

# Index.is_unique reports whether every label occurs only once.
print(obj.index.is_unique)  # False

# A label that occurs several times selects a Series with all its entries...
print(obj['a'])
# a    0
# a    1
# dtype: int64

# ...while a label that occurs once selects a scalar.
print(obj['c'])  # 4