pandas_format04

  1. 如何得到列中前n个最大值对应的索引
    df = pd.DataFrame(np.random.randint(1, 15, 15).reshape(5,-1), columns=list('abc'))
    print(df)
    # Number of largest values to locate in column 'a'.
    n = 3
    # argsort() gives the row positions that would sort the column ascending;
    # reversing puts the largest first, so the first n entries are the positions
    # of the n largest values. The positions are the VALUES of the result -- its
    # index labels are just the reversed original labels and carry no meaning.
    df['a'].argsort()[::-1].iloc[:n]
    
    #>        a   b   c
        0   5   5   2
        1  12   7   1
        2   5   2  12
        3   5  14  12
        4   1  13  13
    
    #>    4    1
        3    3
        2    2
        Name: a, dtype: int64
  2. 如何获得dataframe行的和大于100的最末n行索引
    df = pd.DataFrame(np.random.randint(10, 40, 16).reshape(-1, 4))
    print(df)
    # Sum of every row of the dataframe.
    rowsums = df.sum(axis=1)

    # Positions of the last two rows whose sum exceeds 100.
    # (To fetch those rows themselves, index with df.iloc[nline, :].)
    above_100 = rowsums > 100
    nline = np.flatnonzero(above_100.to_numpy())[-2:]
    nline
    
    #>        0   1   2   3
        0  19  34  15  12
        1  38  35  14  26
        2  39  32  18  20
        3  28  27  36  38
    
    #>    array([2, 3], dtype=int64)
  3. 如何从series中查找异常值并赋值
    ser = pd.Series(np.logspace(-2, 2, 30))

    # Values below the low_perc quantile are raised to that quantile;
    # values above the high_perc quantile are lowered to that quantile.
    def cap_outliers(ser, low_perc, high_perc):
        """Return a copy of *ser* with its tails capped at two quantiles.

        Args:
            ser: Numeric pandas Series to cap. It is not modified.
            low_perc: Lower quantile as a fraction in [0, 1] (e.g. 0.05).
            high_perc: Upper quantile as a fraction in [0, 1] (e.g. 0.95);
                expected to be >= low_perc.

        Returns:
            A new Series in which values below the lower quantile are replaced
            by it and values above the upper quantile are replaced by it.
        """
        low, high = ser.quantile([low_perc, high_perc])
        print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
        # clip() builds a new Series, so the caller's data is left untouched
        # (boolean-mask assignment on `ser` would overwrite it in place).
        return ser.clip(lower=low, upper=high)

    capped_ser = cap_outliers(ser, .05, .95)
    
    #>    0.05 %ile:  0.016049294076965887 | 0.95 %ile:  63.876672220183934
  4. 如何交换dataframe的两行
    df = pd.DataFrame(np.arange(9).reshape(3, -1))
    print(df)

    def swap_rows(df, i1, i2):
        """Exchange the rows at positions *i1* and *i2* of *df* in place.

        The frame is modified directly and the same object is returned; the
        index labels stay where they are, only the row contents trade places.
        """
        # Snapshot both rows first so the second write still sees the
        # original contents of the first row.
        first_row = df.iloc[i1, :].copy()
        second_row = df.iloc[i2, :].copy()
        df.iloc[i1, :] = second_row
        df.iloc[i2, :] = first_row
        return df

    # Swap the 2nd and 3rd rows (positions 1 and 2).
    print(swap_rows(df, 1, 2))
    
    #>       0  1  2
        0  0  1  2
        1  3  4  5
        2  6  7  8
    
    #>       0  1  2
        0  0  1  2
        1  6  7  8
        2  3  4  5
  5. 如何倒转dataframe的行
    df = pd.DataFrame(np.arange(9).reshape(3, -1))
    print(df)

    # Method 1: positional slice with a negative step (result is not printed).
    df.iloc[::-1]

    # Method 2: label-based lookup using the index in reverse order.
    reversed_labels = df.index[::-1]
    print(df.loc[reversed_labels])
    
    #>       0  1  2
        0  0  1  2
        1  3  4  5
        2  6  7  8
    
    #>       0  1  2
        2  6  7  8
        1  3  4  5
        0  0  1  2
  6. 如何对分类变量进行one-hot编码
    df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
    print(df)
    # One-hot encode column 'a' and keep the remaining columns alongside it.
    # dtype=int forces 0/1 indicator columns on every pandas version; without
    # it pandas >= 2.0 produces bool columns that print as True/False.
    df_onehot = pd.concat([pd.get_dummies(df['a'], dtype=int), df[list('bcde')]], axis=1)
    print(df_onehot)
    
    #>        a   b   c   d   e
        0   0   1   2   3   4
        1   5   6   7   8   9
        2  10  11  12  13  14
        3  15  16  17  18  19
        4  20  21  22  23  24
    
    #>       0  5  10  15  20   b   c   d   e
        0  1  0   0   0   0   1   2   3   4
        1  0  1   0   0   0   6   7   8   9
        2  0  0   1   0   0  11  12  13  14
        3  0  0   0   1   0  16  17  18  19
        4  0  0   0   0   1  21  22  23  24
原文地址:https://www.cnblogs.com/huaobin/p/15687104.html