pandas_format05

如何获取dataframe行方向上最大值个数最多的列

# Build a 3x3 frame of random integers drawn from [1, 100).
df = pd.DataFrame(np.random.randint(1, 100, 9).reshape(3, -1))
print(df)
# For every row, find the label of the column that holds the row maximum, then
# count how many rows each column "wins". idxmax returns column labels, whereas
# np.argmax returns positions, which only match the labels when the frame has
# a default integer header.
count_series = df.idxmax(axis=1).value_counts()
print(count_series)
# value_counts sorts by descending count, so the first index entry is the
# column that contains the row maximum most often.
print('Column with highest row maxes: ', count_series.index[0])

#>        0   1   2
    0  46  31  34
    1  38  13   6
    2   1  18  15

#>统计列的最大值的个数
        0    2
        1    1
        dtype: int64
    
#>    Column with highest row maxes:  0

如何得到列之间最大的相关系数

# Build a 4x4 frame of random integers with columns p..s and rows a..d.
df = pd.DataFrame(np.random.randint(1, 100, 16).reshape(4, -1), columns=list('pqrs'), index=list('abcd'))
print(df)
# Absolute pairwise correlation between the four columns.
abs_corrmat = np.abs(df.corr())
print(abs_corrmat)
# Blank out the diagonal (each column's correlation with itself) and take the
# column-wise maximum of what is left. Unlike sorted(x)[-2], this does not rely
# on the self-correlation sorting last, and max() skips NaN entries that a
# constant column would produce.
max_corr = abs_corrmat.where(~np.eye(len(abs_corrmat), dtype=bool)).max()
# Show, per column, its strongest correlation with any other column.
print('Maximum Correlation possible for each column: ', np.round(max_corr.tolist(), 2))

#>        p   q   r   s
    a  59  99   1  34
    b  89  60  97  40
    c  43  35  14   6
    d  70  59  30  53
#>              p         q         r         s
    p  1.000000  0.200375  0.860051  0.744529
    q  0.200375  1.000000  0.236619  0.438541
    r  0.860051  0.236619  1.000000  0.341399
    s  0.744529  0.438541  0.341399  1.000000

#>    Maximum Correlation possible for each column:  [0.86 0.44 0.86 0.74]

如何创建包含每行最小值与最大值比例的列

# Random 3x3 integer frame with values in [1, 100).
df = pd.DataFrame(np.random.randint(1, 100, 9).reshape(3, -1))
print(df)
# Approach 1: hand each row (axis=1) to a lambda and divide its min by its max.
min_by_max = df.apply(lambda row: row.min() / row.max(), axis=1)

# Approach 2: reduce along the rows for the whole frame at once, then divide.
min_by_max = df.min(axis=1) / df.max(axis=1)

# Trailing expression: shown as the cell result in a notebook.
min_by_max

#>        0   1   2
    0  81  68  59
    1  45  73  23
    2  20  22  69
    
#>    0    0.728395
    1    0.315068
    2    0.289855
    dtype: float64

如何创建包含每行第二大值的列

def second_largest(row):
    """Return the second-largest distinct value in ``row``.

    Returns NaN when ``row`` holds fewer than two distinct values, because no
    second-largest value exists then (indexing [-2] would raise IndexError).
    """
    # np.unique returns the distinct values already sorted ascending.
    distinct = np.unique(row.to_numpy())
    return distinct[-2] if distinct.size > 1 else np.nan


# Random 3x3 integer frame with values in [1, 100).
df = pd.DataFrame(np.random.randint(1, 100, 9).reshape(3, -1))
print(df)
# Collect the second-largest value of every row into a Series.
out = df.apply(second_largest, axis=1)
# Attach the Series as a new column of the frame.
df['penultimate'] = out
print(df)

#>        0   1   2
    0  28  77   1
    1  43  19  69
    2  29  30  72

#>        0   1   2  penultimate
    0  28  77   1           28
    1  43  19  69           43
    2  29  30  72           30

如何归一化dataframe的所有列

# Random 8x10 integer frame with values in [1, 100).
df = pd.DataFrame(np.random.randint(1, 100, 80).reshape(8, -1))

# Z-score standardisation: centre each column on its mean and scale by its
# standard deviation (Series.std, i.e. the sample standard deviation).
out1 = df.apply(lambda x: ((x - x.mean()) / x.std()).round(2))
print('Solution Q1\n', out1)

# Min-max scaling: map each column linearly onto [0, 1] so that the column
# minimum becomes 0 and the column maximum becomes 1. The previous form,
# (max - x) / (max - min), produced the inverted scale (maximum -> 0).
out2 = df.apply(lambda x: ((x - x.min()) / (x.max() - x.min())).round(2))
print('Solution Q2\n', out2)

如何计算每一行与下一行的相关性

# Random 5x5 integer frame with values in [1, 100).
df = pd.DataFrame(np.random.randint(1, 100, 25).reshape(5, -1))

# Pair every row with the row directly below it and compute their correlation,
# rounded to two decimals. The last row has no successor, so a frame with
# n rows yields n - 1 values. Trailing expression: shown as the cell result in
# a notebook.
[
    upper.corr(lower).round(2)
    for (_, upper), (_, lower) in zip(df.iloc[:-1].iterrows(), df.iloc[1:].iterrows())
]

如何将dataframe的主对角线和副对角线上的元素置为0

# Random 5x5 integer frame with values in [1, 100).
df = pd.DataFrame(np.random.randint(1, 100, 25).reshape(5, -1))
print(df)
# Walk the columns once; in each column zero the cell on the main diagonal
# (row == column) and the cell on the anti-diagonal (row counted from the
# bottom).
size = df.shape[0]
for col, mirrored_row in zip(range(size), reversed(range(size))):
    df.iat[col, col] = 0
    df.iat[mirrored_row, col] = 0
print(df)

#>        0   1   2   3   4
    0  51  35  71  71  79
    1  78  25  71  85  44
    2  90  97  72  14   4
    3  27  91  37  25  48
    4   1  26  68  70  20

#>        0   1   2   3   4
    0   0  35  71  71   0
    1  78   0  71   0  44
    2  90  97   0  14   4
    3  27   0  37   0  48
    4   0  26  68  70   0