机器学习之路--Numpy

常用代码

ndarray 中所有元素的数据类型 (dtype) 必须相同
常用代码

import numpy 


#numpy读取文件 
world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",", dtype=str, skip_header=1)

#<class 'numpy.ndarray'>
print(type(world_alcohol))

#获取帮助信息
print (help(numpy.genfromtxt))

#创建一个一维数组 (4,)
vector = numpy.array([1, 2, 3, 4])


#创建一个矩阵 (3,3)
matrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])

#获取矩阵的行和列数
print(matrix.shape)
>>(3,3)

#获取第三行第三列的值 (索引从0开始, [2,2]即第3行第3列)
third_country = world_alcohol[2,2]

#创建一个矩阵 
matrix = numpy.array([
[5, 10, 15], 
[20, 25, 30],
[35, 40, 45]
])

#获取第二列的所有值 (索引从0开始, 1表示第二列)
print(matrix[:,1])
>>[10 25 40]

#获取第二行的所有值 (索引从0开始, 1表示第二行)
print(matrix[1,:])
>>[20 25 30]

#获取第0列到第1列的所有值 (切片0:2不包含索引2)
print(matrix[:,0:2])
>>[[ 5 10] [20 25] [35 40]]

#判断是否有该数
matrix = numpy.array([
                    [5, 10, 15], 
                    [20, 25, 30],
                    [35, 40, 45]
                 ])

#注意返回的是一个布尔数组 (形状与原矩阵相同)
matrix == 25
>>array([[False, False, False],
       [False,  True, False],
       [False, False, False]], dtype=bool)

#根据布尔相应条件返回值
matrix = numpy.array([
                [5, 10, 15], 
                [20, 25, 30],
                [35, 40, 45]
             ])
second_column_25 = (matrix[:,1] == 25)
print second_column_25
print(matrix[second_column_25, :])
>>[False  True False]
     [[20 25 30]]


#集合操作
vector = numpy.array([5, 10, 15, 20])
equal_to_ten_and_five = (vector == 10) & (vector == 5)
print equal_to_ten_and_five
>>[False False False False]

vector = numpy.array([5, 10, 15, 20])
equal_to_ten_or_five = (vector == 10) | (vector == 5)
print equal_to_ten_or_five
>>[ True  True False False]

vector = numpy.array([5, 10, 15, 20])
equal_to_ten_or_five = (vector == 10) | (vector == 5)
vector[equal_to_ten_or_five] = 50
print(vector)
>>[50 50 15 20]


#dtype的 转换
vector = numpy.array(["1", "2", "3"])
print (vector.dtype)
print vector
vector = vector.astype(float)
print vector.dtype
print vector
>>|S1
['1' '2' '3']
float64
[ 1.  2.  3.]


#最小值
vector = numpy.array([5, 10, 15, 20])
vector.min()

#求和 axis=1是对每一行求和 axis=0是对每一列求和
matrix = numpy.array([
                [5, 10, 15], 
                [20, 25, 30],
                [35, 40, 45]
             ])
matrix.sum(axis=1)
>>array([ 30,  75, 120])

#小案例替换文本中的nan为0 
#原始数据
a,b,ce,1
ea,b4,fc,1
a,b,c,
a3,b3,fc,1
ae,b2,c,
af,b,c,1

#replace nan value with 0
#注意: 指定dtype=float时 无法转换为浮点数的值(如字符串或空值)会被读成nan
world_alcohol = numpy.genfromtxt("test.txt", delimiter=",",dtype=float)  
print (world_alcohol)
#这里is_value_empty  返回的是一个布尔列表
is_value_empty = numpy.isnan(world_alcohol[:,3])
print (is_value_empty)
#world_alcohol 里面可以加布尔列表 
world_alcohol[is_value_empty, 3] = '0'
alcohol_consumption = world_alcohol[:,3]
alcohol_consumption = alcohol_consumption.astype(float)
total_alcohol = alcohol_consumption.sum()
average_alcohol = alcohol_consumption.mean()
print (total_alcohol)
print (average_alcohol)
>>
[[nan nan nan  1.]
 [nan nan nan  1.]
 [nan nan nan nan]
 [nan nan nan  1.]
 [nan nan nan nan]
 [nan nan nan  1.]]
[False False  True False  True False]
4.0
0.6666666666666666


#生成数组
print (np.arange(15))
a = np.arange(15).reshape(3, 5)
a
>>[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

#获取维度
a.ndim
>>2

#获取值类型
a.dtype.name
>>'int32'

#生成一个全是0的矩阵
np.zeros ((3,4)) 
>>
array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

#生成一个形状为(2,3,4)的全1数组 即两个3行4列全是1的矩阵
np.ones( (2,3,4), dtype=np.int32 )
>>
array([[[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]],

       [[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]])


#生成一个从10开始到30为止(不包含30) 并且 以5为步长的一维数组
np.arange( 10, 30, 5 )
>>array([10, 15, 20, 25])

#生成一个0到11(不包含12) 并且 以1为步长的一维数组 然后再
#reshape成为4行3列的矩阵
np.arange(12).reshape(4,3)
>>
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])


#随机生成一个2行3列的矩阵 里面的值在[0, 1)区间内均匀随机取 (不会出现负数)
np.random.random((2,3))
>>
array([[ 0.54802527,  0.13235897,  0.25751953],
       [ 0.29272435,  0.05077192,  0.31131139]])


#生成等间距的数组 前两个参数是取值范围(默认包含终点) 第三个参数是个数
np.linspace( 0, 10, 5 )
>>array([ 0. ,  2.5,  5. ,  7.5, 10. ])

#矩阵计算1
#The matrix product can be performed using the dot function or method
A = np.array( [[1,1],
               [0,1]] )
B = np.array( [[2,0],
               [3,4]] )
print (A)
print ('-------')
print (B)
print ('-------')
#A B矩阵对应元素相乘 (逐元素乘积 不是内积)
print (A*B)
print ('-------')
#A B矩阵之间的矩阵乘法 (点积)
print (A.dot(B))
print ('-------')
print (np.dot(A, B)) 
>>
[[1 1]
 [0 1]]
-------
[[2 0]
 [3 4]]
-------
[[2 0]
 [0 4]]
-------
[[5 4]
 [3 4]]
-------
[[5 4]
 [3 4]]

#矩阵计算2
#the product operator * operates elementwise in NumPy arrays
a = np.array( [20,30,40,50] )
b = np.arange( 4 )
print (a) 
print (b)
#b
c = a-b
print (c)
c = c -1
print (c)
b**2
print (b**2)
print (a<35)
>>[20 30 40 50]
[0 1 2 3]
[20 29 38 47]
[19 28 37 46]
[0 1 4 9]
[ True  True False False]

#矩阵计算3
B = np.arange(3)
print (B)
#exp是以e为底的指数函数 即e的x次方 (是ln的反函数 不是对数)
print (np.exp(B))
print (np.sqrt(B))
>>[0 1 2]
[ 1.          2.71828183  7.3890561 ]
[ 0.          1.          1.41421356]


#floor是向下取整 比如 np.floor(-1.5)  >> -2.0
a = np.floor(10*np.random.random((3,4)))
print (a)
print ('--------')
#a.shape

#ravel()是将矩阵变成一维数组
print (a.ravel())
print ('--------')
a.shape = (6, 2)
print (a) 
print ('--------')
#a.T是转置
print (a.T)
>>
[[ 6.  7.  2.  9.]
 [ 6.  0.  5.  2.]
 [ 9.  0.  9.  6.]]
--------
[ 6.  7.  2.  9.  6.  0.  5.  2.  9.  0.  9.  6.]
--------
[[ 6.  7.]
 [ 2.  9.]
 [ 6.  0.]
 [ 5.  2.]
 [ 9.  0.]
 [ 9.  6.]]
--------
[[ 6.  2.  6.  5.  9.  9.]
 [ 7.  9.  0.  2.  0.  6.]]


#合并
import numpy as np
a = np.floor(10*np.random.random((2,2)))
b = np.floor(10*np.random.random((2,2)))
print (a)
print ('---')
print (b)
print ('---')
#垂直合并
print (np.vstack((a,b)))
#水平合并
#np.hstack((a,b))
>>
[[ 3.  7.]
 [ 2.  6.]]
---
[[ 9.  6.]
 [ 0.  7.]]
---
[[ 3.  7.]
 [ 2.  6.]
 [ 9.  6.]
 [ 0.  7.]]

#切分
a = np.floor(10*np.random.random((2,12)))
print (a)
print ('---')
#hsplit按列切分第一个参数是要切分的数据集
# 第二个参数是要平均的切分几份
print (np.hsplit(a,3))
print ('---')
#里面(3,4)是指在列索引3和4处切开 即分成 [:, :3] [:, 3:4] [:, 4:] 三段
print (np.hsplit(a,(3,4)))   # Split a after the third and the fourth column
a = np.floor(10*np.random.random((12,2)))
print ('---')
print (a)
np.vsplit(a,3)
>>
[[ 8.  3.  3.  5.  9.  0.  1.  1.  6.  2.  7.  2.]
 [ 7.  1.  9.  7.  5.  2.  5.  7.  0.  3.  1.  1.]]
---
[array([[ 8.,  3.,  3.,  5.],
       [ 7.,  1.,  9.,  7.]]), array([[ 9.,  0.,  1.,  1.],
       [ 5.,  2.,  5.,  7.]]), array([[ 6.,  2.,  7.,  2.],
       [ 0.,  3.,  1.,  1.]])]
---
[array([[ 8.,  3.,  3.],
       [ 7.,  1.,  9.]]), array([[ 5.],
       [ 7.]]), array([[ 9.,  0.,  1.,  1.,  6.,  2.,  7.,  2.],
       [ 5.,  2.,  5.,  7.,  0.,  3.,  1.,  1.]])]
---
[[ 8.  1.]
 [ 3.  2.]
 [ 8.  0.]
 [ 9.  0.]
 [ 9.  0.]
 [ 5.  3.]
 [ 3.  3.]
 [ 7.  2.]
 [ 5.  7.]
 [ 9.  6.]
 [ 4.  0.]
 [ 8.  4.]]
[array([[ 8.,  1.],
        [ 3.,  2.],
        [ 8.,  0.],
        [ 9.,  0.]]), array([[ 9.,  0.],
        [ 5.,  3.],
        [ 3.,  3.],
        [ 7.,  2.]]), array([[ 5.,  7.],
        [ 9.,  6.],
        [ 4.,  0.],
        [ 8.,  4.]])]

#Python深浅拷贝
https://www.cnblogs.com/echoboy/p/9059183.html
=  数据完全共享
b=[1,2,['a','b']]
a=b 
浅拷贝 数据半共享（只把第一层复制到独立内存，嵌套的可变对象如内层列表仍然共享）
a=b.copy()
深拷贝 数据完全不共享（复制其数据完完全全放独立的一个内存，完全拷贝，数据不共享）
import copy
a=copy.deepcopy(b)