20190929 numpy模块的使用

numpy模块
matplotlib模块
pandas模块

numpy模块

numpy是一种开源的数值计算扩展库，可以用来存储和处理大型numpy数组。用来做数据分析，对numpy数组（既有行又有列）--矩阵进行科学运算。

# Element-wise product of two plain Python lists
lt1 = [1, 2, 3]  # n elements
lt2 = [4, 5, 6]
# Pair the items with zip() instead of indexing; still a single O(n) pass.
lt = [a * b for a, b in zip(lt1, lt2)]
print(lt)  # [4, 10, 18]

# The same element-wise product, vectorised with numpy arrays
import numpy as np  # "np" is the conventional alias

arr1, arr2 = np.array([1, 2, 3]), np.array([4, 5, 6])
print(np.multiply(arr1, arr2))  # [ 4 10 18]

numpy数组（可变）

一维数组（不在讨论范围）

# A one-dimensional array built from a flat list
arr = np.array([1, 2, 4])
print(type(arr), arr)
# <class 'numpy.ndarray'> [1 2 4]

二维数组（***）

# A two-dimensional array: each inner list becomes one row
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr)
# Printed output (kept as a comment so the snippet stays runnable):
# [[1 2 3]
#  [4 5 6]]

三维数组（TensorFlow）

# A three-dimensional array: two stacked 2x3 matrices -> shape (2, 2, 3)
arr3 = np.array([
    [[1, 2, 3],
     [4, 5, 6]],
    [[1, 2, 3],
     [4, 5, 6]],
])
print(arr3)  # print the 3-D array just built, not the 2-D `arr`

数组属性

# 2x3 integer array: first row 1..3, second row 4..6
arr = np.array([[1, 2, 3], [4, 5, 6]])

1.`T` 数组的转置

对二维数组而言，行列互换；对更高维的数组，`T` 会把各个轴的顺序整体反转。

# Print the array, a newline separator, then its transpose.
# The '\n' separator has to be written as an escape inside one string literal.
print(arr, '\n', arr.T)
# Transposed part of the output:
'''
[[1 4]
 [2 5]
 [3 6]]
'''

2.`dtype`数据类型

int32/float64属于numpy的

print(arr.dtype)  # element type; an array holding decimals reports float64

3.`size`数组元素的个数

print(arr.size)  # 6 -- total number of elements

4.`ndim`数组的维度

print( arr.ndim)  # 2 -- number of dimensions (axes)

5.`shape`数组维度的大小

print(arr.shape)  # size along each dimension as a tuple: (rows, columns)
# (2, 3)
print(arr.shape[0])  # first entry of the tuple: the number of rows
#  2

6.`astype`类型转换

# Convert the elements to floating point. The `np.float` alias was deprecated
# in NumPy 1.20 and removed in 1.24; it was just the builtin `float`, which
# maps to float64 here.
arr = arr.astype(float)
print(arr)
'''
[[1. 2. 3.]
 [4. 5. 6.]]'''

切片

# 2x3 integer array: first row 1..3, second row 4..6
arr = np.array([[1, 2, 3], [4, 5, 6]])

`arr[:, :]`：逗号左边是行的切片，右边是列的切片，两者用`,`隔开

print(arr[:, :])  # rows, columns -- take everything
# [[1 2 3]
# [4 5 6]]

print(arr[0, 0])   # first row, first column
# 1

print(arr[0, :])   # first row, every column
# [1 2 3]

print(arr[:, -2:])   # every row, from the second-to-last column to the end
# [[2 3]
#  [5 6]]

逻辑取值

print(arr[arr>4])  # boolean mask: keep only the values greater than 4
#  [5 6]

赋值

# Assignment on a list: slice assignment replaces all items at once
lt = [1, 2, 3]
lt[:] = [0, 0, 0]
print(lt)

# Assignment on an array
arr = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

arr[0, 0] = 0   # set the element at row 0, column 0 to 0
print(arr)
'''[[0 2 3]
 [4 5 6]]'''

arr[0, :] = 0  # row 0, every column => 0
print(arr)
'''[[0 0 0]
 [4 5 6]]'''

arr[:, :] = 0  # every row, every column => 0
print(arr)
'''[[0 0 0]
 [0 0 0]]'''

数组的合并

# Two 2x3 arrays used as inputs for the merge examples
arr1 = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

# Numbers and strings mixed in one array; the merged results printed in the
# merge examples show every element as a string
arr2 = np.array([
    [7, 8, 9],
    ['a', 'b', 'c']
])

`hstack()`水平合并

print(np.hstack((arr1, arr2)))  # takes ONE sequence of arrays (a tuple here), joined side by side
'''[['1' '2' '3' '7' '8' '9']
 ['4' '5' '6' 'a' 'b' 'c']]'''

`vstack()` 垂直合并

print(np.vstack((arr1, arr2)))  # arr2's rows are appended below arr1's rows
'''[['1' '2' '3']
 ['4' '5' '6']
 ['7' '8' '9']
 ['a' 'b' 'c']]'''

`concatenate((), axis=1)`

`axis` 默认为 0：沿行方向（垂直）拼接，效果同 `vstack`；`axis=1`：沿列方向（水平）拼接，效果同 `hstack`。

print(np.concatenate((arr1, arr2), axis=1))
# axis=1 joins side by side (horizontal, same result as hstack);
# the default axis=0 stacks the rows vertically (same result as vstack)
'''[['1' '2' '3' '7' '8' '9']
 ['4' '5' '6' 'a' 'b' 'c']]'''

通过函数创建numpy数组

`zeros`/`ones`创建指定形状的全0/全1矩阵

print(np.zeros((2, 3)))  # 2x3 array filled with 0.0
'''[[0. 0. 0.]
 [0. 0. 0.]]'''

print(np.ones((2, 3)))  # 2x3 array filled with 1.0
'''[[1. 1. 1.]
 [1. 1. 1.]]'''

`eye`创建单位矩阵

print(np.eye(3, 3))  # 3x3 identity matrix: ones on the diagonal, zeros elsewhere
'''
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]'''

`linspace()`指定范围分段

print(np.linspace(1, 100, 10))  # 10 evenly spaced values from 1 to 100, both ends included
#[  1.  12.  23.  34.  45.  56.  67.  78.  89. 100.]

`arange`按给定范围生成数组（类似内置的`range`）

print(np.arange(2, 10))  # integers from 2 up to, but not including, 10
# [2 3 4 5 6 7 8 9]

`reshape` 重构形状

arr1 = np.zeros((1, 12))  # 1 row x 12 columns of zeros
print(arr1.reshape((3, 4)))  # the same 12 elements rearranged as 3 rows x 4 columns
'''[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]'''

`numpy`数组运算

`+-*/`运算

# Arithmetic with a scalar is applied to every element
arr1 = np.ones((3, 4)) * 4
print(arr1)
'''[[4. 4. 4. 4.]
 [4. 4. 4. 4.]
 [4. 4. 4. 4.]]'''

numpy数组运算函数

print(np.sin(arr1))  # element-wise sine; every element is 4 (radians) -> -0.7568025
'''[[-0.7568025 -0.7568025 -0.7568025 -0.7568025]
 [-0.7568025 -0.7568025 -0.7568025 -0.7568025]
 [-0.7568025 -0.7568025 -0.7568025 -0.7568025]]'''

矩阵运算--点乘

# Left operand: 2 rows x 3 columns
arr1 = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

# Right operand: 3 rows x 2 columns
arr2 = np.array([
    [1, 2],
    [4, 5],
    [6, 7]
])

`(m×n) · (n×m) = (m×m)`

print(np.dot(arr1, arr2))  # matrix product of the two arrays; the result is 2x2
'''[[27 33]
 [60 75]]'''

`linalg.inv`求逆

# Inverse of a square 3x3 matrix
arr = np.array([[1, 2, 3], [4, 5, 6], [9, 8, 9]])
print(np.linalg.inv(arr))
'''[[ 0.5        -1.          0.5       ]
 [-3.          3.         -1.        ]
 [ 2.16666667 -1.66666667  0.5       ]]'''

数组数学和统计方法

print(np.sum(arr[0, :]))  # sum of the first row
# 6

`random()`生成随机数 ***

print(np.random.rand(3, 4))  # 3x4 random floats; dimensions passed as separate arguments
'''[[0.69248384 0.77295215 0.62033052 0.62397813]
 [0.8533195  0.28560153 0.7704435  0.42432567]
 [0.25479423 0.72977557 0.28070411 0.23453135]]'''
print(np.random.random((3, 4)))  # same kind of output; the shape is passed as one tuple

`random.random`

print(np.random.random((3, 4)))

# A generator object seeded with 1, so its sequence of values is repeatable
s = np.random.RandomState(1)
print(s.random((3, 4)))

arr = np.array([[1, 2, 3], [4, 5, 6], [9, 8, 9]])
np.random.shuffle(arr)  # shuffles in place (the return value is not used); rows change order
print(arr)

`choice()`针对一维

print(np.random.choice([1, 2, 3], 1))  # draw 1 item at random from a 1-D sequence
#[1]

`random.randint()`针对某一个范围

print(np.random.randint(1, 100, (3, 4)))  # 3x4 random integers from 1 (inclusive) to 100 (exclusive)
'''[[27 12  7 33]
 [77 66 40 62]
 [34 61 13  6]]'''

matplotlib模块

matplotlib是一个绘图库，它可以创建常用的统计图，包括条形图、箱型图、折线图、散点图、饼图和直方图。

条形图

import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook
%matplotlib inline
# Font loaded from a file path; presumably a CJK-capable font so the Chinese labels render
font = FontProperties(fname='/Library/Fonts/Heiti.ttc')

# Switch to the 'ggplot' style sheet
plt.style.use('ggplot')

classes = ['3班', '4班', '5班', '6班']

# Numeric x positions 0..3, one per class
classes_index = range(len(classes))
print(list(classes_index))

student_amounts = [66, 55, 45, 70]

# Canvas setup
fig = plt.figure()
# (1, 1, 1): split the canvas into 1 row x 1 column and take plot 1;
# (2, 2, 1) would split it into 2 rows x 2 columns (4 plots) and take the first (top-left)
ax1 = fig.add_subplot(1, 1, 1)
ax1.bar(classes_index, student_amounts, align='center', color='darkblue')
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')

# Replace the numeric tick positions with the class names
plt.xticks(classes_index,
           classes,
           rotation=0,
           fontsize=13,
           fontproperties=font)
plt.xlabel('班级', fontproperties=font, fontsize=15)
plt.ylabel('学生人数', fontproperties=font, fontsize=15)
plt.title('班级-学生人数', fontproperties=font, fontsize=20)
# Save the figure; bbox_inches='tight' trims the blank margin around the plot
# plt.savefig('classes_students.png', dpi=400, bbox_inches='tight')
plt.show()

直方图

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook
%matplotlib inline
# Font loaded from a file path; presumably a CJK-capable font so the Chinese titles render
font = FontProperties(fname='/Library/Fonts/Heiti.ttc')

# Switch to the 'ggplot' style sheet
plt.style.use('ggplot')

mu1, mu2, sigma = 50, 100, 10
# Normally distributed data with mean 50
x1 = mu1 + sigma * np.random.randn(10000)
print(x1)
'''[59.00855949 43.16272141 48.77109774 ... 57.94645859 54.70312714
 58.94125528]'''
# Normally distributed data with mean 100
x2 = mu2 + sigma * np.random.randn(10000)
print(x2)
'''[115.19915511  82.09208214 110.88092454 ...  95.0872103  104.21549068
 133.36025251]'''

fig = plt.figure()
# 121: 1 row x 2 columns, left plot
ax1 = fig.add_subplot(121)
# bins=50 splits the value range into 50 intervals, i.e. 50 bars
ax1.hist(x1, bins=50, color='darkgreen')

# 122: 1 row x 2 columns, right plot
ax2 = fig.add_subplot(122)
ax2.hist(x2, bins=50, color='orange')

fig.suptitle('两个正态分布', fontproperties=font, fontweight='bold', fontsize=15)
ax1.set_title('绿色的正态分布', fontproperties=font)
ax2.set_title('橙色的正态分布', fontproperties=font)
plt.show()

折线图

import numpy as np
from numpy.random import randn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook
%matplotlib inline
# Font loaded from a file path; presumably a CJK-capable font so the Chinese legend renders
font = FontProperties(fname='/Library/Fonts/Heiti.ttc')

# Switch to the 'ggplot' style sheet
plt.style.use('ggplot')

# Fixed seed so the random series are reproducible
np.random.seed(1)

# Cumulative sum of the random samples, so each series drifts over a wider
# range instead of fluctuating inside a narrow band
plot_data1 = randn(40).cumsum()
print(plot_data1)

'''[ 1.62434536  1.01258895  0.4844172  -0.58855142  0.2768562  -2.02468249
 -0.27987073 -1.04107763 -0.72203853 -0.97140891  0.49069903 -1.56944168
 -1.89185888 -2.27591324 -1.1421438  -2.24203506 -2.41446327 -3.29232169
 -3.25010794 -2.66729273 -3.76791191 -2.6231882  -1.72159748 -1.21910314
 -0.31824719 -1.00197505 -1.12486527 -2.06063471 -2.32852279 -1.79816732
 -2.48982807 -2.8865816  -3.5737543  -4.41895994 -5.09020607 -5.10287067
 -6.22018102 -5.98576532 -4.32596314 -3.58391898]'''


plot_data2 = randn(40).cumsum()
plot_data3 = randn(40).cumsum()
plot_data4 = randn(40).cumsum()

# One line per series, each with its own marker, colour and line style
plt.plot(plot_data1, marker='o', color='red', linestyle='-', label='红实线')
plt.plot(plot_data2, marker='x', color='orange', linestyle='--', label='橙虚线')
plt.plot(plot_data3, marker='*', color='yellow', linestyle='-.', label='黄点线')
plt.plot(plot_data4, marker='s', color='green', linestyle=':', label='绿点图')

# loc='best' lets matplotlib pick the legend position automatically
plt.legend(loc='best', prop=font)
plt.show()

散点图+直线图

import numpy as np
from numpy.random import randn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
%matplotlib inline
font = FontProperties(fname='/Library/Fonts/Heiti.ttc')

# 修改背景为条纹
plt.style.use('ggplot')

x = np.arange(1, 20, 1)
print(x)
#  [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

# 拟合一条水平散点线
np.random.seed(1)
y_linear = x + 10 * np.random.randn(19)
print(y_linear)
'''[ 17.24345364  -4.11756414  -2.28171752  -6.72968622  13.65407629
 -17.01538697  24.44811764   0.38793099  12.19039096   7.50629625
  25.62107937  -8.60140709   9.77582796  10.15945645  26.33769442
   5.00108733  15.27571792   9.22141582  19.42213747]'''

# 拟合一条x²的散点线
y_quad = x**2 + 10 * np.random.randn(19)
print(y_quad)
'''[  6.82815214  -7.00619177  20.4472371   25.01590721  30.02494339
  45.00855949  42.16272141  62.77109774  71.64230566  97.3211192
 126.30355467 137.08339248 165.03246473 189.128273   216.54794359
 249.28753869 288.87335401 312.82689651 363.34415698]'''

# s是散点大小
fig = plt.figure()
ax1 = fig.add_subplot(121)
plt.scatter(x, y_linear, s=30, color='r', label='蓝点')
plt.scatter(x, y_quad, s=100, color='b', label='红点')

ax2 = fig.add_subplot(122)
plt.plot(x, y_linear, color='r')
plt.plot(x, y_quad, color='b')

# 限制x轴和y轴的范围取值
plt.xlim(min(x) - 1, max(x) + 1)
plt.ylim(min(y_quad) - 10, max(y_quad) + 10)
fig.suptitle('散点图+直线图', fontproperties=font, fontsize=20)
ax1.set_title('散点图', fontproperties=font)
ax1.legend(prop=font)
ax2.set_title('直线图', fontproperties=font)
plt.show()

pandas模块

操作excel/json/sql/ini/csv配置文件

pandas基于numpy，可以看成是处理文本或表格数据的工具。pandas中有两个主要的数据结构，其中Series数据结构类似于numpy中的一维数组，DataFrame类似于多维表格数据结构。

pandas从Excel中读取 DataFrame数据类型。

import numpy as np
import pandas as pd

# Fixed seed so the random values are reproducible
np.random.seed(10)

# Six dates starting from 2019-01-01 with frequency 'M'; used as the row labels
index = pd.date_range('2019-01-01', periods=6, freq='M')
print(index)
columns = ['c1', 'c2', 'c3', 'c4']
print(columns)
# 6x4 block of random values, one row per date and one column per name
val = np.random.randn(6, 4)
print(val)

df = pd.DataFrame(index=index, columns=columns, data=val)
print(df)

# Write the DataFrame out to an Excel file
df.to_excel('date_c.xlsx')

# Read the file back, using its first column as the index
df = pd.read_excel('date_c.xlsx', index_col=[0])
print(df)

print(df.index)
print(df.columns)
print(df.values)

# Select several columns by name
print(df[['c1', 'c2']])

# Select rows by index label (.loc); the plain [] form is left disabled
# print(df['2019-01-31'])
print(df.loc['2019-01-31'])
print(df.loc['2019-01-31':'2019-05-31'])

# Select by integer position (.iloc)
print(df)
print(df.iloc[0, 0])

# Overwrite the whole first row with 0
df.iloc[0, :] = 0
print(df)

20190929 numpy模块的使用

numpy模块

numpy数组（可变）

一维数组（不在讨论范围）

二维数组（***）

三维数组（TensorFlow）

数组属性

1.T 数组的转置

2.dtype数据类型

3.size数组元素的个数

4.ndim数组的维度

5.shape数组维度的大小

6.astype类型转换

切片

逻辑取值

赋值

数组的合并

hstack()水平合并

vstack() 垂直合并

concatenate((), axis=1)

通过函数创建numpy数组

ones创建指定矩阵

eye创建单位矩阵

linspace()指定范围分段

arange数字循环成数组

reshape 重构形状

numpy数组运算

+-*/运算