Numpy数据处理函数

Numpy函数介绍

import numpy as np
#sqrt 计算各元素的平方根
arr = np.arange(10)
np.sqrt(arr)
array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

# square 计算各元素的平方
arr1 = np.arange(10)
np.square(arr1)
array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81], dtype=int32)



# modf将数组的小数和整数部分以两个独立数组的形式返回
arr2 = np.array([1.22,3.55])
np.modf(arr2)
(array([0.22, 0.55]), array([1., 3.]))

利用数组进行数据处理

用NumPy数组表达式代替显式循环，通常比等价的纯Python循环快一到两个数量级，这正是矢量化计算的强大之处。

points = np.arange(-5,5,0.01) # 1000 evenly spaced points from -5 up to (but not including) 5
# meshgrid expands the 1-D points into two 2-D coordinate grids:
# xs repeats points along every row, ys repeats points down every column
xs,ys = np.meshgrid(points,points)
xs
array([[-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       ...,
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99]])
ys
array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ...,
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])


z = np.sqrt(xs ** 2 + ys ** 2)
z
array([[7.07106781, 7.06400028, 7.05693985, ..., 7.04988652, 7.05693985,
        7.06400028],
       [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
        7.05692568],
       [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
        7.04985815],
       ...,
       [7.04988652, 7.04279774, 7.03571603, ..., 7.0286414 , 7.03571603,
        7.04279774],
       [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
        7.04985815],
       [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
        7.05692568]])

import matplotlib.pyplot as plt

# Render z as a grayscale image, attach a colorbar, then title the figure.
plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()
plt.title("Image plot")

将条件逻辑表述为数组运算

xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
# Take the element from xarr where cond is True, otherwise from yarr.
# Pure-Python approach: walk the three arrays in lockstep.
result = [
    x_val if take_x else y_val
    for x_val, y_val, take_x in zip(xarr, yarr, cond)
]
result
[1.1, 2.2, 1.3, 1.4, 2.5]


# numpy做法 
result1 = np.where(cond,xarr,yarr)
result1
array([1.1, 2.2, 1.3, 1.4, 2.5])

where闪亮登场

np.where(条件，真值，假值)
传递给where的第二、第三个参数不必是与条件数组同样大小的数组，也可以是标量值（只要它们的形状能够相互广播即可）

arr3 = np.random.randn(4, 4)
arr3
array([[ 0.6498161 ,  0.35784392, -1.47023858,  1.09367264],
       [-0.62756846,  0.23898718,  1.41371883,  0.48955242],
       [-0.10017446,  0.24327529,  0.04354429,  0.80346031],
       [-0.74234979, -0.11921036, -0.11432723, -0.37912988]])


result2 = np.where(arr3>0,2,-2)
result2
array([[ 2,  2, -2,  2],
       [-2,  2,  2,  2],
       [-2,  2,  2,  2],
       [-2, -2, -2, -2]])
result3 = np.where(arr3>0,2,arr3)
result3
array([[ 2.        ,  2.        , -1.47023858,  2.        ],
       [-0.62756846,  2.        ,  2.        ,  2.        ],
       [-0.10017446,  2.        ,  2.        ,  2.        ],
       [-0.74234979, -0.11921036, -0.11432723, -0.37912988]])

cond1 = np.array([True, True, False, False])
cond2 = np.array([True, False, True, False])

# Rules in priority order: both true -> 0, cond1 true -> 1,
# cond2 true -> 2, neither -> 3. Built inside-out, one rule per step.
cond2_or_neither = np.where(cond2, 2, 3)
cond1_or_rest = np.where(cond1, 1, cond2_or_neither)
result4 = np.where(cond1 & cond2, 0, cond1_or_rest)
result4
array([0, 1, 2, 3])

数学和统计方法

sum/mean/std等聚合计算(和，平均值，标准差)

test1 = np.array([[2,2,3,4,5],[6,7,8,9,10]])
test1

array([[ 2,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])
np.mean(test1)
5.6

test1.sum()
56

test1.mean(axis=1)
array([3.2, 8. ])

test1.mean(axis=0)
array([4. , 4.5, 5.5, 6.5, 7.5])

test1.cumsum(0) # cumsum: cumulative sum, here accumulated along axis 0; cumprod is the cumulative-product counterpart
array([[ 2,  2,  3,  4,  5],
       [ 8,  9, 11, 13, 15]], dtype=int32)
- axis轴，指的是维度：对二维数组，axis=0 表示沿行方向（对每一列）计算，axis=1 表示沿列方向（对每一行）计算

test1.std(axis=0) # 标准差
array([2. , 2.5, 2.5, 2.5, 2.5])

test1.var(axis=0) # 方差
array([4.  , 6.25, 6.25, 6.25, 6.25])

test1.argmin(axis=0)
array([0, 0, 0, 0, 0], dtype=int64)

test1.argmax(axis=0)
array([1, 1, 1, 1, 1], dtype=int64)

用于布尔型数组的方法

test2 = np.random.randn(100)
test2
array([ 0.25903273,  0.13939567, -0.10597059,  0.77790221, -0.76341781,
       -0.61086865,  0.36002937, -0.1423488 ,  1.69071728,  1.63576044,
        0.39950296,  1.37198449,  0.454591  , -1.67927663,  0.73649581,
        0.88126353, -1.82159175, -0.46323513, -0.30399076,  2.16435963,
       -0.79892847,  0.05767935,  0.45429729,  0.96934967, -0.78818112,
       -0.8438922 ,  0.31373184,  0.05242094,  0.2332054 ,  0.26647064,
        1.56850088,  0.41425585, -1.62452194, -1.17165311,  0.23586585,
        0.45476575, -0.57501697,  1.42377017,  0.00666962,  1.53916711,
        0.508553  , -1.37573917,  0.51378532,  1.72682708, -0.76148258,
       -1.19819233, -1.05367328,  1.0792924 ,  0.80229908,  1.03273504,
        0.71938515,  0.28893472, -0.08472809,  1.02170717,  0.03897593,
       -0.0693723 , -0.60612239, -0.35538122, -1.09975843,  0.23485432,
       -0.4513678 , -0.8119979 , -0.53072714,  1.02247374,  0.52980399,
       -1.17365366, -0.4948684 , -0.81596822,  1.10386231, -1.10894077,
        1.33491691,  0.21015349, -0.32206128, -0.33041407, -0.06815369,
        2.27874416, -0.26642346, -0.95616127, -1.38222481, -0.89619146,
        2.70433   , -1.8758817 , -1.61408998, -0.70112051,  0.63143197,
       -0.5937125 , -0.82650637,  1.24456287, -0.61903984, -0.45140393,
        0.25139079, -0.18882441, -0.61667939,  0.84566077, -1.08506887,
       -0.45491845, -1.68915454,  0.58872177, -0.30961048, -0.43431663])

(test2 > 0).sum()
49

# any用于检测数组中是否存在True
bools = np.array([False,False,False,False])
bools.any()
False

# all用于检测数组中所有值是否都是True
bools.all()
False

排序

sort()方法直接修改数组本身

test2 = np.array([11,55,33,44,88])
test2
array([11, 55, 33, 44, 88])

test2.sort()
test2
array([11, 33, 44, 55, 88])

test4 = np.array([[82,222,1,4,5],[62,72,8,93,10]])
test4
array([[ 82, 222,   1,   4,   5],
       [ 62,  72,   8,  93,  10]])


# Sort in place along axis 1, i.e. each row is ordered independently
test4.sort(1)
test4
array([[  1,   4,   5,  82, 222],
       [  8,  10,  62,  72,  93]])

# Sort in place along axis 0, i.e. each column is ordered independently
test4.sort(0)
test4

array([[  1,   4,   5,  72,  93],
       [  8,  10,  62,  82, 222]])

唯一化以及其他的集合逻辑

np.unique找出唯一值并返回已排序的结果

names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'])
np.unique(names)
array(['Bob', 'Joe', 'Will'], dtype='<U4')

注意：下列函数名（如intersect1d、in1d）中的是数字1，不是小写字母l

方法	说明
unique(x)	计算x中的唯一元素，并返回有序结果
intersect1d(x,y)	x和y的公共元素
union1d(x,y)	计算x和y的并集
in1d(x,y)	得到一个表示"x的元素是否包含于y"的布尔型数组（NumPy 2.0起已弃用，推荐改用isin(x,y)）
setdiff1d(x,y)	集合的差，即元素在x中且不在y中
setxor1d(x,y)	集合的对称差，即只存在于其中一个数组、而不同时存在于两个数组中的元素

values = np.array([6, 0, 0, 3, 2, 5, 6])
# Element-wise membership test: True where an element of values is in [2, 3, 6].
# np.isin is the recommended replacement for np.in1d (deprecated since
# NumPy 2.0); for 1-D input both return the same boolean mask.
np.isin(values, [2, 3, 6])
array([ True, False, False,  True,  True, False,  True])