Python3入门人工智能掌握机器学习深度学习提升实战能力6：模型评价与优化

过拟合和欠拟合

数据分离与混淆矩阵

模型优化

实战准备

实战一

# Generate new data: 300 evenly spaced points from 40 to 90, shaped as a
# 300x1 column, then expand them with each fitted polynomial transformer and
# predict with the matching linear model.
x_2_range = poly2.transform(np.linspace(40, 90, 300).reshape(-1, 1))
y_2_range_predict = lr2.predict(x_2_range)

# Same grid, pushed through the second (poly5 / lr5) transformer-model pair.
x_5_range = poly5.transform(np.linspace(40, 90, 300).reshape(-1, 1))
y_5_range_predict = lr5.predict(x_5_range)

实战二

# Load the raw classification data set and preview its first rows.
import numpy as np
import pandas as pd

data = pd.read_csv('data_class_raw.csv')
data.head()

# Split the frame into the feature table (every column except 'y') and the
# label column 'y', then report both shapes.
x = data.drop(columns=['y'])
y = data['y']
print(x.shape, y.shape)

 1 #visualize the data
 2 %matplotlib inline
 3 from matplotlib import pyplot as plt
 4 fig1 = plt.figure(figsize=(5,5))
 5 bad = plt.scatter(x.loc[:,'x1'][y==0],x.loc[:,'x2'][y==0])
 6 good = plt.scatter(x.loc[:,'x1'][y==1],x.loc[:,'x2'][y==1])
 7 plt.legend((good,bad),('good','bad'))
 8 plt.title('raw data')
 9 plt.xlabel('x1')
10 plt.ylabel('x2')
11 plt.show()

# Anomaly detection on the y == 0 samples only: fit an elliptic envelope with
# a 2% contamination setting, then label each of those samples
# (the printed array holds 1 / -1 values).
from sklearn.covariance import EllipticEnvelope

ad_model = EllipticEnvelope(contamination=0.02).fit(x[y == 0])
y_predict_bad = ad_model.predict(x[y == 0])
print(y_predict_bad)

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1]

 1 #visualize the data
 2 %matplotlib inline
 3 from matplotlib import pyplot as plt
 4 fig1 = plt.figure(figsize=(5,5))
 5 bad = plt.scatter(x.loc[:,'x1'][y==0],x.loc[:,'x2'][y==0])
 6 good = plt.scatter(x.loc[:,'x1'][y==1],x.loc[:,'x2'][y==1])
 7 plt.scatter(x.loc[:,'x1'][y==0][y_predict_bad==-1],x.loc[:,'x2'][y==0][y_predict_bad==-1],marker='x',s=150)
 8 plt.legend((good,bad),('good','bad'))
 9 plt.title('raw data')
10 plt.xlabel('x1')
11 plt.ylabel('x2')
12 plt.show()

# Switch to the processed data set and rebuild the feature table and label
# column from it.
data = pd.read_csv('data_class_processed.csv')
data.head()

# define x and y
x = data.drop(columns=['y'])
y = data['y']

 # PCA check: standardize the features, keep two components, and chart each component's explained-variance ratio.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

x_norm = StandardScaler().fit_transform(x)  # standardize the data
pca = PCA(n_components=2)
x_reduced = pca.fit_transform(x_norm)
var_ratio = pca.explained_variance_ratio_
print(var_ratio)

fig4 = plt.figure(figsize=(5, 5))
plt.bar(x=[1, 2], height=var_ratio)
plt.show()

# Train/test split: hold out 40% of the samples for testing, with a fixed
# seed (random_state=4) so the split is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=4
)
print(x_train.shape, x_test.shape, x.shape)

(21, 2) (14, 2) (35, 2)

 # KNN model with k = 10: fit on the training split, then score both splits.
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn_10 = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
y_train_predict = knn_10.predict(x_train)
y_test_predict = knn_10.predict(x_test)

# calculate the accuracy
accuracy_train = accuracy_score(y_train, y_train_predict)
accuracy_test = accuracy_score(y_test, y_test_predict)
print('training accuracy:',accuracy_train)
print('testing accuracy:',accuracy_test)

training accuracy: 0.9047619047619048
testing accuracy: 0.6428571428571429

# Visualize the KNN result and boundary: build a dense grid over
# [0, 10) x [0, 10) with a 0.05 step on both axes.
# (The original cell was collapsed onto a single line behind this comment's
# '#', so the statements below never executed.)
xx, yy = np.meshgrid(np.arange(0, 10, 0.05), np.arange(0, 10, 0.05))
print(yy.shape)

(200, 200) 

# Flatten both grid coordinate arrays and pair them column-wise, giving one
# (x1, x2) row per grid point for the classifier to score.
x_range = np.c_[xx.ravel(), yy.ravel()]
print(x_range.shape)

(40000, 2)
# Classify every grid point with the k = 10 model.
y_range_predict = knn_10.predict(x_range)

 # Paint the grid predictions as background regions, then draw the real samples on top.
fig4 = plt.figure(figsize=(5, 5))
knn_bad = plt.scatter(
    x_range[y_range_predict == 0, 0], x_range[y_range_predict == 0, 1]
)
knn_good = plt.scatter(
    x_range[y_range_predict == 1, 0], x_range[y_range_predict == 1, 1]
)

bad = plt.scatter(x.loc[y == 0, 'x1'], x.loc[y == 0, 'x2'])
good = plt.scatter(x.loc[y == 1, 'x1'], x.loc[y == 1, 'x2'])
plt.legend(
    [good, bad, knn_good, knn_bad],
    ['good', 'bad', 'knn_good', 'knn_bad'],
)
# NOTE(review): the title still reads 'raw data' although this figure shows
# the KNN prediction regions - confirm whether that is intended.
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# Confusion matrix of the test-split predictions (true labels vs. predicted
# labels).
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_test_predict)
print(cm)

[[4 2]
 [3 5]]
 1 TP = cm[1,1] 2 TN = cm[0,0] 3 FP = cm[0,1] 4 FN = cm[1,0] 5 print(TP,TN,FP,FN) 
5 4 2 3

 1 accuracy =(TP+TN)/(TP+TN+FP+FN)#准确率：整体样本中正确样本数的比例
 2 recall = TP/(TP+FP)#Sensitivity 灵敏度(召回率)：正样本中，预测正确的比例
 3 specificity = TN/(TN+FP)#特异度：负样本中，预测正确的比例
 4 precision = TP/(TP+FP)#精确率:预测结果为正样本中，预测正确的比例
 5 f1 = 2*precision*recall/(precision + recall)#F1 Score：综合Precision和Recall的喝一喝判断指标
 6 print('准确率:',accuracy)
 7 print('灵敏度:',recall)
 8 print('特异度:',specificity)
 9 print('精确率:',precision)
10 print('F1 Score:',f1)

准确率: 0.6428571428571429
灵敏度: 0.7142857142857143
特异度: 0.6666666666666666
精确率: 0.7142857142857143
F1 Score: 0.7142857142857143

 # Try different k (1 through 20) and calculate the train/test accuracy for each.
n = list(range(1, 21))
accuracy_train = []
accuracy_test = []
for i in n:
    # Fit a fresh classifier for this neighbour count.
    knn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
    y_train_predict = knn.predict(x_train)
    y_test_predict = knn.predict(x_test)
    accuracy_train_i = accuracy_score(y_train, y_train_predict)
    accuracy_test_i = accuracy_score(y_test, y_test_predict)
    accuracy_train.append(accuracy_train_i)
    accuracy_test.append(accuracy_test_i)
print(accuracy_train, accuracy_test)

[1.0, 1.0, 1.0, 1.0, 1.0, 0.9523809523809523, 0.9523809523809523, 0.9523809523809523, 0.9047619047619048, 0.9047619047619048, 0.9047619047619048, 0.9523809523809523, 0.9047619047619048, 0.9047619047619048, 0.9523809523809523, 0.9047619047619048, 0.9047619047619048, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714]

[0.5714285714285714, 0.5, 0.5, 0.5714285714285714, 0.7142857142857143, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.6428571428571429, 0.6428571428571429, 0.6428571428571429, 0.5714285714285714, 0.6428571428571429, 0.6428571428571429, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.42857142857142855, 0.42857142857142855, 0.42857142857142855]

 # Side-by-side curves: training accuracy (left) and testing accuracy (right) against n_neighbors.
fig5 = plt.figure(figsize=(12, 5))

# Left panel: training accuracy.
plt.subplot(1, 2, 1)
plt.plot(n, accuracy_train, marker='o')
plt.title('training accuracy vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')

# Right panel: testing accuracy.
plt.subplot(1, 2, 2)
plt.plot(n, accuracy_test, marker='o')
plt.title('testing accuracy vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')
plt.show()

Python3入门人工智能 掌握机器学习 深度学习 提升实战能力6：模型评价与优化

Python3入门人工智能掌握机器学习深度学习提升实战能力6：模型评价与优化