"""Handling missing data in pandas: detect, fill, drop and replace NaN.

Reconstructed from a Jupyter notebook transcript as a cell-per-cell script.
``# %% In [N]`` markers keep the original execution counts; a bare expression
at the end of a cell is what the notebook displayed.  The ``Out`` comments are
the values recorded in the original run -- the frame is built from
``np.random.randn``, so a re-run produces different numbers with the same
shape and NaN layout.
"""

# %% In [1]
import numpy as np
import pandas as pd

# %% In [3]
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'b', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [5]
# Reindexing with labels that do not exist yet ('c', 'd') creates all-NaN rows.
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'h'])
df
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# c       NaN       NaN       NaN
# d       NaN       NaN       NaN
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [6]
# Check which values are missing.
df['one'].isnull()
# Out:
# a    False
# b    False
# c     True
# d     True
# e    False
# f    False
# h    False
# Name: one, dtype: bool

# %% In [7]
# Check which values are present.
df['one'].notnull()
# Out:
# a     True
# b     True
# c    False
# d    False
# e     True
# f     True
# h     True
# Name: one, dtype: bool

# %% In [8]
# Select the rows where 'one' is missing.
df[df['one'].isnull()]
# Out:
#    one  two  three
# c  NaN  NaN    NaN
# d  NaN  NaN    NaN

# %% In [9]
# Select the rows where 'one' is present.
df[df['one'].notnull()]
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [10]
df
# Out: identical to Out[5] (boolean indexing returned new frames; df is unchanged).

# %% In [11]
# Sum the first column; NaN entries are skipped, i.e. they count as 0.
# An all-NaN column also sums to 0 by default; pass min_count=1 to get NaN.
df['one'].sum()
# Out: -1.9544119617918125

# %% In [12]
# Fill every NaN with 0.  fillna returns a new frame; df itself is not modified.
df.fillna(0)
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# c  0.000000  0.000000  0.000000
# d  0.000000  0.000000  0.000000
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [16]
df
# Out: identical to Out[5] (df still contains the NaN rows).

# %% In [15]
# Fill NaN with the mean of each column.
df.fillna(df.mean())
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# c -0.390882 -0.090721 -0.109377
# d -0.390882 -0.090721 -0.109377
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [17]
# Forward fill ("pad"): propagate the previous valid row into the gap.
# ffill() replaces fillna(method='pad'), whose `method` keyword is deprecated
# since pandas 2.1.
df.ffill()
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# c  0.056577 -0.612873 -1.710761
# d  0.056577 -0.612873 -1.710761
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [18]
# Backward fill ("backfill"): propagate the next valid row into the gap.
# bfill() replaces the deprecated fillna(method='backfill').
df.bfill()
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# c  1.000864 -0.708675  0.690998
# d  1.000864 -0.708675  0.690998
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [19]
# Drop every row that contains a NaN.
df.dropna()
# Out:
#         one       two     three
# a -0.134914 -0.183527  1.455060
# b  0.056577 -0.612873 -1.710761
# e  1.000864 -0.708675  0.690998
# f -2.126286  0.363740 -0.151361
# h -0.750653  0.687731 -0.830824

# %% In [20]
# Drop every column that contains a NaN.  All three columns do, so the result
# has no columns left.
df.dropna(axis=1)
# Out: no columns; only the index labels a, b, c, d, e, f, h remain.

# %% In [22]
# replace() substitutes missing/NaN values as well as ordinary values.
df.replace({np.nan: 10})
# Out:
#          one        two      three
# a  -0.134914  -0.183527   1.455060
# b   0.056577  -0.612873  -1.710761
# c  10.000000  10.000000  10.000000
# d  10.000000  10.000000  10.000000
# e   1.000864  -0.708675   0.690998
# f  -2.126286   0.363740  -0.151361
# h  -0.750653   0.687731  -0.830824

# %% In [24]
df['four'] = pd.Series(
    [1, 2, 3, 4, 5, 6, 7],
    index=['a', 'b', 'c', 'd', 'e', 'f', 'h'],
)
df
# Out:
#         one       two     three  four
# a -0.134914 -0.183527  1.455060     1
# b  0.056577 -0.612873 -1.710761     2
# c       NaN       NaN       NaN     3
# d       NaN       NaN       NaN     4
# e  1.000864 -0.708675  0.690998     5
# f -2.126286  0.363740 -0.151361     6
# h -0.750653  0.687731 -0.830824     7

# %% In [25]
# Several replacements in one call: NaN -> 10 and the value 5 -> 1000.
df.replace({np.nan: 10, 5: 1000})
# Out:
#          one        two      three  four
# a  -0.134914  -0.183527   1.455060     1
# b   0.056577  -0.612873  -1.710761     2
# c  10.000000  10.000000  10.000000     3
# d  10.000000  10.000000  10.000000     4
# e   1.000864  -0.708675   0.690998  1000
# f  -2.126286   0.363740  -0.151361     6
# h  -0.750653   0.687731  -0.830824     7