- 如何重命名dataframe的特定列
# Build a small demo frame: three rows (people) with 'age' and 'weight' columns.
df1 = pd.DataFrame(
    data=np.array([[18, 50], [19, 51], [20, 55]]),
    index=['man1', 'man2', 'man3'],
    columns=['age', 'weight'],
)
print(df1)

# Rename a specific column.
print("\nchange columns :\n")
# Method 1: rename() returns a NEW DataFrame (it is not in-place by default),
# so the result has to be assigned back or the rename is silently lost.
df1 = df1.rename(columns={'weight': 'stress'})
# Method 2: assign a rebuilt label list to .columns. Do not write into
# df1.columns.values: that mutates the Index's backing array behind pandas'
# back, which can leave label lookups stale and fails when the array is
# read-only. After method 1 this statement is a no-op; it also works on its own.
df1.columns = ['stress' if label == 'weight' else label for label in df1.columns]
print(df1)
#>       age  weight
#  man1   18      50
#  man2   19      51
#  man3   20      55
#
#  change columns :
#
#        age  stress
#  man1   18      50
#  man2   19      51
#  man3   20      55
- 如何检查dataframe中是否有缺失值
# Load the Cars93 sample CSV (the "_miss" variant used throughout these examples).
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# Evaluates to True when at least one cell in the whole frame is missing.
df.isna().to_numpy().any()
#> True

# 9. How to count the missing values in each column of a dataframe
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# Number of missing cells per column: summing the boolean mask counts the Trues.
n_missings_each_col = df.isnull().sum()
print(n_missings_each_col.head())
#> Manufacturer    4
#  Model           1
#  Type            3
#  Min.Price       7
#  Price           2
#  dtype: int64
- 如何用平均值替换相应列的缺失值
# Only the first 10 rows are read, so the means below are over those rows.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', nrows=10)
print(df[['Min.Price', 'Max.Price']].head())

# Replace the missing values of each column with that column's own mean
# (Series.mean() skips NaN by default).
df_out = df[['Min.Price', 'Max.Price']].apply(lambda col: col.fillna(col.mean()))
# Write the filled columns back into the original frame as well.
df[['Min.Price', 'Max.Price']] = df_out
print(df_out.head())
#>    Min.Price  Max.Price
#  0       12.9       18.8
#  1       29.2       38.7
#  2       25.9       32.3
#  3        NaN       44.6
#  4        NaN        NaN
#>    Min.Price  Max.Price
#  0       12.9       18.8
#  1       29.2       38.7
#  2       25.9       32.3
#  3       23.0       44.6
#  4       23.0       29.9
- 如何用全局变量作为apply函数的附加参数处理指定的列
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
print(df[['Min.Price', 'Max.Price']].head())

# Global lookup table: column name -> NaN-aware statistic used as the fill value.
d = {'Min.Price': np.nanmean, 'Max.Price': np.nanmedian}


def _fill_with_column_stat(column, d):
    """Fill the gaps in *column* with the statistic registered for its name in *d*."""
    statistic = d[column.name]
    return column.fillna(statistic(column))


# Missing 'Min.Price' values are replaced by the column mean, missing
# 'Max.Price' values by the column median. apply() forwards the entries of
# `args` to the function after the column itself, which is how the global
# table reaches the helper.
df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(_fill_with_column_stat, args=(d,))
print(df[['Min.Price', 'Max.Price']].head())
#>    Min.Price  Max.Price
#  0       12.9       18.8
#  1       29.2       38.7
#  2       25.9       32.3
#  3        NaN       44.6
#  4        NaN        NaN
#>    Min.Price  Max.Price
#  0  12.900000      18.80
#  1  29.200000      38.70
#  2  25.900000      32.30
#  3  17.118605      44.60
#  4  17.118605      19.15