# 老唐数据分析机器学习 (Lao Tang: data analysis and machine learning notes)
# ===== numpy1 =====
import numpy
# Load the comma-separated file as text: with dtype=str every field, the
# header row included, is kept as a string.
world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",", dtype=str)
print(type(world_alcohol))
print (world_alcohol)
#print (help(numpy.genfromtxt)) # show the help documentation
'''
<class 'numpy.ndarray'>
[['Year' 'WHO region' 'Country' 'Beverage Types' 'Display Value']
['1986' 'Western Pacific' 'Viet Nam' 'Wine' '0']
['1986' 'Americas' 'Uruguay' 'Other' '0.5']
...
['1987' 'Africa' 'Malawi' 'Other' '0.75']
['1989' 'Americas' 'Bahamas' 'Wine' '1.5']
['1985' 'Africa' 'Malawi' 'Spirits' '0.31']]
'''
# numpy.array() turns a flat list into a one-dimensional array (a vector)
# and a list of lists into a two-dimensional array (a matrix).
vector = numpy.array([5, 10, 15, 20])
matrix = numpy.array([
    [5, 10, 15],
    [20, 25, 30],
    [35, 40, 45],
])
print(vector, matrix, sep="\n")
'''
[ 5 10 15 20]
[[ 5 10 15]
[20 25 30]
[35 40 45]]
'''
# ndarray.shape is a tuple holding the length of every axis:
# one entry for a vector, two entries (rows, columns) for a matrix.
vector = numpy.array([1, 2, 3, 4])
matrix = numpy.array([
    [5, 10, 15],
    [20, 25, 30],
])
print(vector.shape, matrix.shape, sep="\n")
'''
(4,)
(2, 3)
'''
#Each value in a NumPy array has to have the same data type
#NumPy will automatically figure out an appropriate data type when reading in data or converting lists to arrays.
#You can check the data type of a NumPy array using the dtype property.
numbers = numpy.array([1, 2, 3, 4])
# Bare expression: a notebook echoes the value, a plain script displays nothing.
# NOTE(review): the integer dtype chosen for a list of ints looks platform
# dependent (int32 was recorded in these notes) - confirm on your machine.
numbers.dtype
'''
dtype('int32')
'''
#When NumPy can't convert a value to a numeric data type like float or integer, it uses a special nan value that stands for Not a Number
#nan marks missing data
#1.98600000e+03 is scientific notation for 1.986 * 10 ^ 3
# NOTE(review): world_alcohol was loaded with dtype=str above, so the array
# echoed here holds strings; nan and scientific notation presumably only show
# up when the file is read with a numeric dtype - confirm.
world_alcohol
'''
array([['Year', 'WHO region', 'Country', 'Beverage Types',
'Display Value'],
['1986', 'Western Pacific', 'Viet Nam', 'Wine', '0'],
['1986', 'Americas', 'Uruguay', 'Other', '0.5'],
...,
['1987', 'Africa', 'Malawi', 'Other', '0.75'],
['1989', 'Americas', 'Bahamas', 'Wine', '1.5'],
['1985', 'Africa', 'Malawi', 'Spirits', '0.31']], dtype='<U52')
'''
# Reload the file as unicode strings of at most 75 characters ("U75");
# skip_header=1 skips the first line of the file, i.e. the column names.
world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",", dtype="U75", skip_header=1)
print(world_alcohol)
'''
[['1986' 'Western Pacific' 'Viet Nam' 'Wine' '0']
['1986' 'Americas' 'Uruguay' 'Other' '0.5']
['1985' 'Africa' "Cte d'Ivoire" 'Wine' '1.62']
...
['1987' 'Africa' 'Malawi' 'Other' '0.75']
['1989' 'Americas' 'Bahamas' 'Wine' '1.5']
['1985' 'Africa' 'Malawi' 'Spirits' '0.31']]
'''
# A single element of a 2-D array is addressed as [row, column]; both indices start at 0.
uruguay_other_1986 = world_alcohol[1,4]  # row 1, column 4
third_country = world_alcohol[2,2]  # row 2, column 2
print (uruguay_other_1986)
print (third_country)
'''
0.5
Cte d'Ivoire
'''
# Slicing: start:stop selects indices start .. stop-1, and a bare ":" selects
# the whole axis.
vector = numpy.array([5, 10, 15, 20])
print(vector[:3])
'''
[ 5 10 15]
'''
# The same 3x3 matrix is used for all of the column / sub-matrix slices below.
matrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
# Column 1 of every row.
print(matrix[:, 1])
'''
[10 25 40]
'''
# Columns 0 and 1 of every row.
print(matrix[:, :2])
'''
[[ 5 10]
[20 25]
[35 40]]
'''
# Rows 1 and 2, columns 0 and 1.
print(matrix[1:, :2])
'''
[[20 25]
[35 40]]
'''
# ===== numpy2 =====
import numpy
# Comparing an array with a scalar works element by element: every element is
# compared with the scalar on the right-hand side.
# The result is a boolean array of the same shape: True where the values are equal, False elsewhere.
# (Bare expressions: a notebook echoes the result, a plain script displays nothing.)
vector = numpy.array([5, 10, 15, 20])
vector == 10
'''
array([False, True, False, False])
'''
matrix = numpy.array([
[5, 10, 15],
[20, 25, 30],
[35, 40, 45]
])
matrix == 25
'''
array([[False, False, False],
[False, True, False],
[False, False, False]])
'''
# Build a boolean mask (True where the element equals 10) and use it as an
# index: only the positions marked True are returned.
vector = numpy.array([5, 10, 15, 20])
equal_to_ten = vector == 10
print(equal_to_ten, vector[equal_to_ten], sep="\n")
'''
[False True False False]
[10]
'''
# Select whole rows by a condition on one column: the mask has one entry per
# row and is True where column 1 equals 25.
matrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
second_column_25 = matrix[:, 1] == 25
print(second_column_25)
# Indexing with the row mask alone keeps every column of the matching rows.
print(matrix[second_column_25])
'''
[False True False]
[[20 25 30]]
'''
# Several conditions can be combined element-wise: & is "and", | is "or".
# Each comparison needs its own parentheses because & and | bind tighter than ==.
vector = numpy.array([5, 10, 15, 20])
equal_to_ten_and_five = (vector == 10) & (vector == 5)  # no element is both
print(equal_to_ten_and_five)
'''
[False False False False]
'''
equal_to_ten_or_five = (vector == 10) | (vector == 5)
print(equal_to_ten_or_five)
'''
[ True True False False]
'''
# Assigning through a boolean mask overwrites only the selected elements.
vector[equal_to_ten_or_five] = 50
print(vector)
'''
[50 50 15 20]
'''
# Overwrite a single cell chosen by a condition: in the rows whose column 1
# equals 25, column 1 is set to 10.
matrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
second_column_25 = (matrix[:, 1] == 25)
print(second_column_25)
matrix[second_column_25, 1] = 10
print(matrix)
'''
[False True False]
[[ 5 10 15]
[20 10 30]
[35 40 45]]
'''
# ndarray.astype() returns a new array converted to the requested data type;
# here an array of numeric strings becomes an array of floats.
vector = numpy.array(["1", "2", "3"])
print(vector.dtype, vector, sep="\n")
vector = vector.astype(float)
print(vector.dtype, vector, sep="\n")
'''
<U1
['1' '2' '3']
float64
[1. 2. 3.]
'''
# sum() without an axis adds up every element of the array.
# (Bare expressions below: a notebook echoes the result, a plain script displays nothing.)
vector = numpy.array([5, 10, 15, 20])
vector.sum()
'''
50
'''
# The axis dictates which dimension we perform the operation on
# axis=1 adds the values inside each row (one result per row);
# axis=0 adds the values inside each column (one result per column)
matrix = numpy.array([
[5, 10, 15],
[20, 25, 30],
[35, 40, 45]
])
matrix.sum(axis=1)
'''
array([ 30, 75, 120])
'''
matrix = numpy.array([
[5, 10, 15],
[20, 25, 30],
[35, 40, 45]
])
matrix.sum(axis=0)
'''
array([60, 75, 90])
'''
# Replace missing (nan) values in column 4 with 0, then compute the total and
# the mean of that column.
#
# skip_header=1 keeps the header line out of the data. Read with the default
# float dtype, the header text cannot be parsed and becomes a row of nan; that
# row would then be replaced by 0 below and counted as one extra zero
# observation in the mean.
# NOTE(review): the mean recorded in these notes was presumably captured
# without skip_header, so it divides by one row too many - re-run to refresh it.
world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",", skip_header=1)
is_value_empty = numpy.isnan(world_alcohol[:, 4])
# Assign a number, not the string '0': the array already holds floats.
world_alcohol[is_value_empty, 4] = 0.0
# The column is already float, so no astype(float) conversion is needed.
alcohol_consumption = world_alcohol[:, 4]
total_alcohol = alcohol_consumption.sum()
average_alcohol = alcohol_consumption.mean()
print(total_alcohol)
print(average_alcohol)
'''
1137.78
1.140060120240481
'''
# ===== numpy3 =====
import numpy as np
# arange(15) yields the integers 0..14 as a 1-D array.
print(np.arange(15))
# reshape(3, 5) arranges the same 15 values as 3 rows by 5 columns.
a = np.arange(15).reshape(3, 5)
# Bare expression: a notebook echoes the array, a plain script displays nothing.
a
'''
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
'''
# Basic attributes of the 3x5 array a (bare expressions, echoed by a notebook only).
# shape: the length of every axis
a.shape
'''
(3, 5)
'''
#the number of axes (dimensions) of the array
a.ndim
'''
2
'''
# the name of the element data type
# NOTE(review): 'int32' was recorded here; the default integer type looks platform dependent - confirm.
a.dtype.name
'''
'int32'
'''
#the total number of elements of the array
a.size
'''
15
'''
# Array creation helpers (bare expressions, echoed by a notebook only).
# zeros takes the shape as a tuple; the recorded output shows float zeros.
np.zeros ((3,4))
'''
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
'''
# ones with an explicit integer dtype and a 3-D shape (2 blocks of 3 rows by 4 columns)
np.ones( (2,3,4), dtype=np.int32 )
'''
array([[[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1]],
[[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1]]])
'''
#To create sequences of numbers
# arange(start, stop, step): stop itself is not included
np.arange( 10, 30, 5 )
'''
array([10, 15, 20, 25])
'''
# the step may be a float
np.arange( 0, 2, 0.3 )
'''
array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
'''
# 12 consecutive integers laid out as 4 rows by 3 columns
np.arange(12).reshape(4,3)
'''
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
'''
np.random.random((2,3)) # random floats drawn uniformly from the half-open interval [0.0, 1.0), not (-1, 1)
'''
array([[0.06665873, 0.92526157, 0.42866618],
[0.19151176, 0.79870056, 0.32145198]])
'''
from numpy import pi
# linspace returns 100 evenly spaced samples from 0 to 2*pi, both ends included
# (100 is the number of samples, not the spacing between them)
np.linspace( 0, 2*pi, 100 )
'''
array([0. , 0.06346652, 0.12693304, 0.19039955, 0.25386607,
0.31733259, 0.38079911, 0.44426563, 0.50773215, 0.57119866,
0.63466518, 0.6981317 , 0.76159822, 0.82506474, 0.88853126,
0.95199777, 1.01546429, 1.07893081, 1.14239733, 1.20586385,
1.26933037, 1.33279688, 1.3962634 , 1.45972992, 1.52319644,
1.58666296, 1.65012947, 1.71359599, 1.77706251, 1.84052903,
1.90399555, 1.96746207, 2.03092858, 2.0943951 , 2.15786162,
2.22132814, 2.28479466, 2.34826118, 2.41172769, 2.47519421,
2.53866073, 2.60212725, 2.66559377, 2.72906028, 2.7925268 ,
2.85599332, 2.91945984, 2.98292636, 3.04639288, 3.10985939,
3.17332591, 3.23679243, 3.30025895, 3.36372547, 3.42719199,
3.4906585 , 3.55412502, 3.61759154, 3.68105806, 3.74452458,
3.8079911 , 3.87145761, 3.93492413, 3.99839065, 4.06185717,
4.12532369, 4.1887902 , 4.25225672, 4.31572324, 4.37918976,
4.44265628, 4.5061228 , 4.56958931, 4.63305583, 4.69652235,
4.75998887, 4.82345539, 4.88692191, 4.95038842, 5.01385494,
5.07732146, 5.14078798, 5.2042545 , 5.26772102, 5.33118753,
5.39465405, 5.45812057, 5.52158709, 5.58505361, 5.64852012,
5.71198664, 5.77545316, 5.83891968, 5.9023862 , 5.96585272,
6.02931923, 6.09278575, 6.15625227, 6.21971879, 6.28318531])
'''
# np.sin is applied element by element to the 100 samples between 0 and 2*pi
np.sin(np.linspace( 0, 2*pi, 100 ))
'''
array([ 0.00000000e+00, 6.34239197e-02, 1.26592454e-01, 1.89251244e-01,
2.51147987e-01, 3.12033446e-01, 3.71662456e-01, 4.29794912e-01,
4.86196736e-01, 5.40640817e-01, 5.92907929e-01, 6.42787610e-01,
6.90079011e-01, 7.34591709e-01, 7.76146464e-01, 8.14575952e-01,
8.49725430e-01, 8.81453363e-01, 9.09631995e-01, 9.34147860e-01,
9.54902241e-01, 9.71811568e-01, 9.84807753e-01, 9.93838464e-01,
9.98867339e-01, 9.99874128e-01, 9.96854776e-01, 9.89821442e-01,
9.78802446e-01, 9.63842159e-01, 9.45000819e-01, 9.22354294e-01,
8.95993774e-01, 8.66025404e-01, 8.32569855e-01, 7.95761841e-01,
7.55749574e-01, 7.12694171e-01, 6.66769001e-01, 6.18158986e-01,
5.67059864e-01, 5.13677392e-01, 4.58226522e-01, 4.00930535e-01,
3.42020143e-01, 2.81732557e-01, 2.20310533e-01, 1.58001396e-01,
9.50560433e-02, 3.17279335e-02, -3.17279335e-02, -9.50560433e-02,
-1.58001396e-01, -2.20310533e-01, -2.81732557e-01, -3.42020143e-01,
-4.00930535e-01, -4.58226522e-01, -5.13677392e-01, -5.67059864e-01,
-6.18158986e-01, -6.66769001e-01, -7.12694171e-01, -7.55749574e-01,
-7.95761841e-01, -8.32569855e-01, -8.66025404e-01, -8.95993774e-01,
-9.22354294e-01, -9.45000819e-01, -9.63842159e-01, -9.78802446e-01,
-9.89821442e-01, -9.96854776e-01, -9.99874128e-01, -9.98867339e-01,
-9.93838464e-01, -9.84807753e-01, -9.71811568e-01, -9.54902241e-01,
-9.34147860e-01, -9.09631995e-01, -8.81453363e-01, -8.49725430e-01,
-8.14575952e-01, -7.76146464e-01, -7.34591709e-01, -6.90079011e-01,
-6.42787610e-01, -5.92907929e-01, -5.40640817e-01, -4.86196736e-01,
-4.29794912e-01, -3.71662456e-01, -3.12033446e-01, -2.51147987e-01,
-1.89251244e-01, -1.26592454e-01, -6.34239197e-02, -2.44929360e-16])
'''
# Arithmetic and comparison operators act element by element on NumPy arrays.
a = np.array([20, 30, 40, 50])
b = np.arange(4)
# Element-wise difference of the two arrays.
c = a - b
# Element-wise square; as a bare expression it is only echoed by a notebook.
b ** 2
# Element-wise comparison gives a boolean array.
print(a < 35)
'''
[ True True False False]
'''
# The mathematical matrix product is computed with dot, available both as a
# method (A.dot(B)) and as a function (np.dot(A, B)).
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
print(A, B, sep="\n")
#print (A*B) # element-wise product: values at matching positions are multiplied
print(A.dot(B))  # matrix product
print(np.dot(A, B))  # matrix product
'''
[[1 1]
[0 1]]
[[2 0]
[3 4]]
[[5 4]
[3 4]]
[[5 4]
[3 4]]
'''
# ===== numpy4 =====
import numpy as np
# Universal functions such as exp and sqrt are applied to every element.
B = np.arange(3)
print(B, np.exp(B), np.sqrt(B), sep="\n")
'''
[0 1 2]
[1. 2.71828183 7.3890561 ]
[0. 1. 1.41421356]
'''
# Reshaping demo on a random 3x4 array.
# floor() rounds down, so every value is a whole-number float.
a = np.floor(10*np.random.random((3,4)))
print(a)
print('--------------')
print(a.shape)
print('--------------')
# ravel() returns the elements flattened into one dimension
print(a.ravel())
print('--------------')
# Assigning to .shape changes the shape in place.
a.shape = (6, 2)
print(a)
print('--------------')
print(a.T)  # transpose
# resize() changes the array in place and returns None, so it is called as a
# statement and the array is printed afterwards (printing the call would show "None").
a.resize((2, 6))
print(a)
#If a dimension is given as -1 in a reshaping operation, the other dimensions are automatically calculated:
#a.reshape(3,-1) # -1 lets NumPy work out that dimension automatically
# Sample output (the values are random and differ on every run):
'''
[[1. 6. 7. 4.]
[5. 4. 1. 0.]
[2. 3. 9. 7.]]
--------------
(3, 4)
--------------
[1. 6. 7. 4. 5. 4. 1. 0. 2. 3. 9. 7.]
--------------
[[1. 6.]
[7. 4.]
[5. 4.]
[1. 0.]
[2. 3.]
[9. 7.]]
--------------
[[1. 7. 5. 1. 2. 9.]
[6. 4. 4. 0. 3. 7.]]
[[1. 6. 7. 4. 5. 4.]
[1. 0. 2. 3. 9. 7.]]
'''
# Joining two random 2x2 arrays of whole-number floats.
a = np.floor(np.random.random((2, 2)) * 10)
b = np.floor(np.random.random((2, 2)) * 10)
print(a, '---', b, '---', sep="\n")
print(np.hstack((a, b)))  # horizontal join: columns of b are appended to the right of a
print(np.vstack((a, b)))  # vertical join: rows of b are appended below a
'''
[[7. 5.]
[9. 1.]]
---
[[6. 2.]
[4. 7.]]
---
[[7. 5. 6. 2.]
[9. 1. 4. 7.]]
[[7. 5.]
[9. 1.]
[6. 2.]
[4. 7.]]
'''
# Splitting arrays into pieces.
a = np.floor(np.random.random((2, 12)) * 10)
print(a, '-------------', sep="\n")
print(np.hsplit(a, 3))  # split column-wise into three equal parts
print('-------------')
print(np.hsplit(a, (3, 4)))  # split after the third and after the fourth column
a = np.floor(np.random.random((12, 2)) * 10)
print('-------------', a, sep="\n")
# Split row-wise into three equal parts (bare expression: echoed by a notebook only).
np.vsplit(a, 3)
'''
[[0. 8. 1. 3. 4. 7. 7. 1. 9. 8. 7. 2.]
[4. 2. 7. 3. 9. 6. 9. 1. 7. 8. 3. 8.]]
-------------
[array([[0., 8., 1., 3.],
[4., 2., 7., 3.]]), array([[4., 7., 7., 1.],
[9., 6., 9., 1.]]), array([[9., 8., 7., 2.],
[7., 8., 3., 8.]])]
-------------
[array([[0., 8., 1.],
[4., 2., 7.]]), array([[3.],
[3.]]), array([[4., 7., 7., 1., 9., 8., 7., 2.],
[9., 6., 9., 1., 7., 8., 3., 8.]])]
-------------
[[9. 3.]
[3. 5.]
[1. 1.]
[0. 3.]
[6. 4.]
[5. 6.]
[9. 4.]
[1. 7.]
[6. 2.]
[1. 6.]
[1. 1.]
[8. 9.]]
[array([[9., 3.],
[3., 5.],
[1., 1.],
[0., 3.]]), array([[6., 4.],
[5., 6.],
[9., 4.],
[1., 7.]]), array([[6., 2.],
[1., 6.],
[1., 1.],
[8., 9.]])]
'''
# ===== python三种复制 (three kinds of "copy": plain assignment, view, copy) =====
# 1) Plain assignment copies nothing: both names refer to one ndarray object,
#    so reshaping through b is visible through a and both ids are equal.
a = np.arange(12)
b = a
print(b is a)
b.shape = (3, 4)
print(a.shape)
print(id(a), id(b), sep="\n")
'''
True
(3, 4)
1229965715056
1229965715056
'''
# 2) view() makes a new array object on top of the same data: the view can
#    have its own shape, but writing through it also changes a.
c = a.view()
print(c is a)
c.shape = (2, 6)
print(a.shape)
c[0, 4] = 1234
print(a)
print(id(a), id(c), sep="\n")
'''
False
(3, 4)
[[ 0 1 2 3]
[1234 5 6 7]
[ 8 9 10 11]]
1229965715056
1229965716336
'''
# 3) copy() duplicates the object and its data: changing d leaves a untouched.
d = a.copy()
print(d is a)
d[0, 0] = 9999
print(d, a, sep="\n")
'''
False
[[9999 1 2 3]
[1234 5 6 7]
[ 8 9 10 11]]
[[ 0 1 2 3]
[1234 5 6 7]
[ 8 9 10 11]]
'''
# ===== numpy5 =====
import numpy as np
# Find the largest value of every column through its row index.
data = np.sin(np.arange(20)).reshape(5, 4)
print(data)
# For each column, the row index of its maximum.
ind = np.argmax(data, axis=0)
print(ind)
print(data.shape, data.shape[1], sep="\n")
# Pair each row index with its column number to pick the maxima.
data_max = data[ind, range(data.shape[1])]
print(data_max)
# Check against max(axis=0) (bare expression: echoed by a notebook only).
all(data_max == data.max(axis=0))
'''
[[ 0. 0.84147098 0.90929743 0.14112001]
[-0.7568025 -0.95892427 -0.2794155 0.6569866 ]
[ 0.98935825 0.41211849 -0.54402111 -0.99999021]
[-0.53657292 0.42016704 0.99060736 0.65028784]
[-0.28790332 -0.96139749 -0.75098725 0.14987721]]
[2 0 3 1]
(5, 4)
4
[0.98935825 0.84147098 0.99060736 0.6569866 ]
True
'''
# tile repeats an array: (3, 5) means 3 repetitions down and 5 across.
a = np.arange(0, 40, 10)
print(a)
b = np.tile(a, (3, 5))  # expand by repetition
print(b)
'''
[ 0 10 20 30]
[[ 0 10 20 30 0 10 20 30 0 10 20 30 0 10 20 30 0 10 20 30]
[ 0 10 20 30 0 10 20 30 0 10 20 30 0 10 20 30 0 10 20 30]
[ 0 10 20 30 0 10 20 30 0 10 20 30 0 10 20 30 0 10 20 30]]
'''
# Sorting: np.sort returns a sorted copy, ndarray.sort sorts in place,
# and argsort returns the indices that would sort the array.
a = np.array([[4, 3, 5], [1, 2, 1]])
print(a, '------------', sep="\n")
b = np.sort(a, axis=1)  # sorted copy, each row sorted independently
print(b)
a.sort(axis=1)  # in place: a itself is now sorted row by row
print('------------', a, sep="\n")
a = np.array([4, 3, 1, 2])
j = a.argsort()  # indices that put a in ascending order
print('------------', j, '------------', a[j], sep="\n")
'''
[[4 3 5]
[1 2 1]]
------------
[[3 4 5]
[1 1 2]]
------------
[[3 4 5]
[1 1 2]]
------------
[2 3 1 0]
------------
[1 2 3 4]
'''