pandas_format04

如何得到列中前n个最大值对应的索引

df = pd.DataFrame(np.random.randint(1, 15, 15).reshape(5,-1), columns=list('abc'))
print(df)
# Positions of the n largest values in column 'a', largest first.
# argsort() sorts ascending, so reverse it and keep the first n entries.
# The *values* of the resulting Series are the row positions; its index
# labels are just the reversed argsort slots and carry no meaning.
n = 3
top_idx = df['a'].argsort()[::-1].iloc[:n]
top_idx

#>        a   b   c
    0   5   5   2
    1  12   7   1
    2   5   2  12
    3   5  14  12
    4   1  13  13

#>    4    1
    3    3
    2    2
    Name: a, dtype: int64

如何获得dataframe行的和大于100的最末n行索引

df = pd.DataFrame(np.random.randint(10, 40, 16).reshape(-1, 4))
print(df)
# Sum of every row (vectorised; no need to go through apply)
rowsums = df.sum(axis=1)

# Positional indices of the last n rows whose sum exceeds 100.
# To fetch the rows themselves instead, use df.iloc[nline, :].
n = 2
nline = np.where(rowsums > 100)[0][-n:]
nline

#>        0   1   2   3
    0  19  34  15  12
    1  38  35  14  26
    2  39  32  18  20
    3  28  27  36  38

#>    array([2, 3], dtype=int64)

如何从series中查找异常值并赋值

ser = pd.Series(np.logspace(-2, 2, 30))

# Values below the low_perc quantile are raised to that quantile; values
# above the high_perc quantile are lowered to that quantile.
def cap_outliers(ser, low_perc, high_perc):
    """Return a copy of *ser* with its tails capped at the given quantiles.

    Parameters
    ----------
    ser : pandas.Series
        Numeric data to cap. It is left unmodified.
    low_perc, high_perc : float
        Quantile levels in the range [0, 1] (e.g. 0.05 and 0.95).

    Returns
    -------
    pandas.Series
        New Series where values below the ``low_perc`` quantile are replaced
        by that quantile and values above the ``high_perc`` quantile are
        replaced by that quantile.
    """
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    # clip() builds a new Series, so the caller's data is not overwritten.
    return ser.clip(lower=low, upper=high)

capped_ser = cap_outliers(ser, .05, .95)

#>    0.05 %ile:  0.016049294076965887 | 0.95 %ile:  63.876672220183934

如何交换dataframe的两行

df = pd.DataFrame(np.arange(9).reshape(3, 3))
print(df)


def swap_rows(df, i1, i2):
    """Exchange the rows at positions *i1* and *i2* of *df* in place.

    Both rows are copied first so that the second assignment does not read
    data already overwritten by the first. The same (mutated) frame is
    returned.
    """
    first_row = df.iloc[i1, :].copy()
    second_row = df.iloc[i2, :].copy()
    # Write each saved row into the other's position via iloc.
    df.iloc[i1, :] = second_row
    df.iloc[i2, :] = first_row
    return df


# Swap the 2nd and 3rd rows (positions 1 and 2)
print(swap_rows(df, 1, 2))

#>       0  1  2
    0  0  1  2
    1  3  4  5
    2  6  7  8

#>       0  1  2
    0  0  1  2
    1  6  7  8
    2  3  4  5

如何倒转dataframe的行

df = pd.DataFrame(np.arange(9).reshape(3, 3))
print(df)

# Approach 1: positional slice with a negative step (result not printed)
df.iloc[::-1]

# Approach 2: label-based lookup using the index in reverse order
print(df.loc[df.index[::-1]])

#>       0  1  2
    0  0  1  2
    1  3  4  5
    2  6  7  8

#>       0  1  2
    2  6  7  8
    1  3  4  5
    0  0  1  2

如何对分类变量进行one-hot编码

df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
print(df)
# One-hot encode column 'a' and keep every other column unchanged.
# dtype=int makes the indicator columns 0/1; recent pandas versions
# otherwise default to boolean True/False.
df_onehot = pd.concat([pd.get_dummies(df['a'], dtype=int), df.drop(columns='a')], axis=1)
print(df_onehot)

#>        a   b   c   d   e
    0   0   1   2   3   4
    1   5   6   7   8   9
    2  10  11  12  13  14
    3  15  16  17  18  19
    4  20  21  22  23  24

#>       0  5  10  15  20   b   c   d   e
    0  1  0   0   0   0   1   2   3   4
    1  0  1   0   0   0   6   7   8   9
    2  0  0   1   0   0  11  12  13  14
    3  0  0   0   1   0  16  17  18  19
    4  0  0   0   0   1  21  22  23  24