Pandas入门之十三：缺失值处理

已信任
Jupyter 服务器: 本地
Python 3: Not Started
[1]



import pandas as pd
import numpy as np
[3]



# Build a 5x3 frame of standard-normal samples; the row labels deliberately
# skip 'c' and 'd' so that a later reindex can introduce missing rows.
row_labels = ['a', 'b', 'e', 'f', 'h']
column_labels = ['one', 'two', 'three']
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=row_labels,
    columns=column_labels,
)
df
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[5]



# Reindex onto a label list that also contains 'c' and 'd'; labels that were
# not present before become rows filled with NaN.
full_row_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'h']
df = df.reindex(index=full_row_labels)
df
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    NaN    NaN    NaN
d    NaN    NaN    NaN
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[6]



# Flag the missing entries of column 'one' (True where the value is NaN).
# isna() is the same check as isnull(); the two names are aliases.
df['one'].isna()
a    False
b    False
c     True
d     True
e    False
f    False
h    False
Name: one, dtype: bool
[7]



# Flag the present entries of column 'one' (True where the value is not NaN).
# notna() is the same check as notnull(); the two names are aliases.
df['one'].notna()
a     True
b     True
c    False
d    False
e     True
f     True
h     True
Name: one, dtype: bool
[8]



# Select the rows whose 'one' value is missing, using a boolean mask.
one_is_missing = df['one'].isna()
df[one_is_missing]
one    two    three
c    NaN    NaN    NaN
d    NaN    NaN    NaN
[9]



# Select the rows whose 'one' value is present, using a boolean mask.
one_is_present = df['one'].notna()
df[one_is_present]
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[10]



# Display df again: the mask selections above were never assigned back to df,
# so it still contains the NaN rows.
df
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    NaN    NaN    NaN
d    NaN    NaN    NaN
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[11]



# Sum column 'one'; NaN entries are skipped, i.e. they contribute 0.
# NOTE: on pandas >= 0.22 a column that is entirely NaN also sums to 0, not
# NaN; pass min_count=1 to get NaN in that case.
df['one'].sum(skipna=True)
-1.9544119617918125
[12]



# Fill every NaN with 0. This returns a new frame; df itself is not modified.
df.fillna(value=0)
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    0.000000    0.000000    0.000000
d    0.000000    0.000000    0.000000
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[16]



# Display df again: the fillna(0) result was not assigned back, so df still
# holds the NaN rows.
df
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    NaN    NaN    NaN
d    NaN    NaN    NaN
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[15]



# Fill each column's NaNs with that column's own mean
# (DataFrame.mean() works column-wise and skips NaN by default).
column_means = df.mean()
df.fillna(value=column_means)
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    -0.390882    -0.090721    -0.109377
d    -0.390882    -0.090721    -0.109377
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[17]



# Forward-fill ("pad"): each NaN takes the last valid value above it in the
# same column. ffill() replaces fillna(method='pad'), whose `method` argument
# is deprecated since pandas 2.1; the result is the same.
df.ffill()
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    0.056577    -0.612873    -1.710761
d    0.056577    -0.612873    -1.710761
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[18]



# Backward-fill ("backfill"): each NaN takes the next valid value below it in
# the same column. bfill() replaces fillna(method='backfill'), whose `method`
# argument is deprecated since pandas 2.1; the result is the same.
df.bfill()
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    1.000864    -0.708675    0.690998
d    1.000864    -0.708675    0.690998
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[19]



# Drop rows that contain missing values: any row with at least one NaN is
# removed (these are the defaults, spelled out explicitly).
df.dropna(axis=0, how='any')
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[20]



# Drop columns that contain missing values. Every column has a NaN here, so
# the result keeps the index but has no columns left.
df.dropna(axis='columns')
a
b
c
d
e
f
h
[22]



# replace() substitutes arbitrary values, not only missing ones; here each
# NaN is mapped to 10.
nan_replacement = {np.nan: 10}
df.replace(nan_replacement)
one    two    three
a    -0.134914    -0.183527    1.455060
b    0.056577    -0.612873    -1.710761
c    10.000000    10.000000    10.000000
d    10.000000    10.000000    10.000000
e    1.000864    -0.708675    0.690998
f    -2.126286    0.363740    -0.151361
h    -0.750653    0.687731    -0.830824
[24]



# Add an integer column 'four' holding 1..7, aligned to the row labels a..h.
df['four'] = pd.Series(range(1, 8), index=list('abcdefh'))
df
one    two    three    four
a    -0.134914    -0.183527    1.455060    1
b    0.056577    -0.612873    -1.710761    2
c    NaN    NaN    NaN    3
d    NaN    NaN    NaN    4
e    1.000864    -0.708675    0.690998    5
f    -2.126286    0.363740    -0.151361    6
h    -0.750653    0.687731    -0.830824    7
[25]



# Several substitutions in one call: NaN becomes 10 and the value 5 becomes
# 1000, in whichever column they occur.
substitutions = {np.nan: 10, 5: 1000}
df.replace(substitutions)
one    two    three    four
a    -0.134914    -0.183527    1.455060    1
b    0.056577    -0.612873    -1.710761    2
c    10.000000    10.000000    10.000000    3
d    10.000000    10.000000    10.000000    4
e    1.000864    -0.708675    0.690998    1000
f    -2.126286    0.363740    -0.151361    6
h    -0.750653    0.687731    -0.830824    7
[-]