python机器学习之KNN预测QSAR生物浓缩类别

KNN预测QSAR生物浓缩类别

  数据来源:http://archive.ics.uci.edu/ml/datasets/QSAR+Bioconcentration+classes+dataset

import numpy
import pandas   #导入Excel文件
from sklearn.neighbors import KNeighborsClassifier   #机器学习算法库,没有深度学习算法

# Load the QSAR bioconcentration dataset into a DataFrame.
# The path previously read "D:Python代码Machine-Learn1-KNNdatashenwu.csv":
# its backslashes had been lost. The separators below are restored on a
# best-effort basis -- adjust the path to wherever your copy of the CSV lives.
shen=pandas.read_csv(r"D:\Python代码\Machine-Learn\1-KNN\data\shenwu.csv")
# Report the number of rows and columns that were read.
print("总数据条数:{};列数:{}".format(shen.shape[0],shen.shape[1]))
# Notebook-style preview of the first rows.
shen.head()
总数据条数:779;列数:14
 CAS SMILES Set nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O] Class logBCF
0 100-02-7 O=[N+](c1ccc(cc1)O)[O-] Train 0 0.0 1.49 0.14 1.35 0.72 0 1 5 1 0.74
1 100-17-4 O=[N+](c1ccc(cc1)OC)[O-] Train 0 0.0 1.47 0.14 1.70 0.88 0 1 5 1 0.93
2 100-18-5 c1cc(ccc1C(C)C)C(C)C Train 0 0.0 1.20 0.25 4.14 2.06 0 0 0 3 3.24
3 100-25-4 O=[N+]([O-])c1ccc(cc1)[N+](=O)[O-] Train 0 0.0 1.69 0.13 1.89 0.79 0 1 8 3 -0.40
4 100-40-3 C=CC1CCC=CC1 Train 0 0.0 0.52 0.25 2.65 1.31 0 0 0 1 2.24
# Split the rows into training and test subsets using the Set column.
shen_train=shen[shen.Set.isin(["Train"])]
shen_test=shen[shen.Set.isin(["Test"])]
# The "\n" escape had been replaced by a real line break, which left the
# string literal unterminated (a SyntaxError); it is restored here.
print("训练数据:{}个\n测试数据:{}个".format(shen_train.shape[0],shen_test.shape[0]))
# Notebook-style preview of the test subset.
shen_test.head()
训练数据:584个
测试数据:195个
 CAS SMILES Set nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O] Class logBCF
5 100-42-5 C=Cc1ccccc1 Test 0 0.000 1.40 0.18 2.85 0.86 0 0 0 3 1.13
12 101-53-1 Oc1ccc(cc1)Cc1ccccc1 Test 0 5.768 2.21 0.18 3.40 1.47 0 0 1 3 1.40
15 101-84-8 O(c1ccccc1)c1ccccc1 Test 0 5.614 2.21 0.16 3.40 1.31 0 0 2 1 2.57
16 102-06-7 N=C(Nc1ccccc1)Nc1ccccc1 Test 0 5.030 2.07 0.16 3.09 1.54 0 1 0 2 1.05
19 10315-98-7 O1CCN(CC1)CC(C)C Test 0 0.000 0.00 0.28 1.00 1.80 0 1 1 1 0.23

分别从训练集和测试集中提取特征数据(x)和结果数据(y)

# Targets: the last two columns of each subset (Class and logBCF).
y_train=shen_train.iloc[:,[-2,-1]]
y_test=shen_test.iloc[:,[-2,-1]]
# The "\n" escapes had been replaced by real line breaks, which left the
# string literal unterminated (a SyntaxError); they are restored here.
print("训练数据结果:\n{}\n测试数据结果:\n{}\n".format(y_train.head(),y_test.head()))

训练数据结果:

   Class  logBCF
0      1    0.74
1      1    0.93
2      3    3.24
3      3   -0.40
4      1    2.24
测试数据结果:
    Class  logBCF
5       3    1.13
12      3    1.40
15      1    2.57
16      2    1.05
19      1    0.23

# Features: columns 3..11 of each subset (nHM through F04[C-O]); the first
# three columns (CAS, SMILES, Set) and the two target columns are left out.
feature_cols=[3,4,5,6,7,8,9,10,11]
x_train=shen_train.iloc[:,feature_cols]
x_test=shen_test.iloc[:,feature_cols]
# The "\n" escapes had been replaced by real line breaks, which left the
# string literal unterminated (a SyntaxError); they are restored here.
print("训练数据:\n{}\n测试数据:\n{}\n".format(x_train.head(),x_test.head()))
 
训练数据:
   nHM  piPC09   PCD  X2Av  MLOGP  ON1V  N-072  B02[C-N]  F04[C-O]
0    0     0.0  1.49  0.14   1.35  0.72      0         1         5
1    0     0.0  1.47  0.14   1.70  0.88      0         1         5
2    0     0.0  1.20  0.25   4.14  2.06      0         0         0
3    0     0.0  1.69  0.13   1.89  0.79      0         1         8
4    0     0.0  0.52  0.25   2.65  1.31      0         0         0
测试数据:
    nHM  piPC09   PCD  X2Av  MLOGP  ON1V  N-072  B02[C-N]  F04[C-O]
5     0   0.000  1.40  0.18   2.85  0.86      0         0         0
12    0   5.768  2.21  0.18   3.40  1.47      0         0         1
15    0   5.614  2.21  0.16   3.40  1.31      0         0         2
16    0   5.030  2.07  0.16   3.09  1.54      0         1         0
19    0   0.000  0.00  0.28   1.00  1.80      0         1         1
 
# Inspect the dtypes of the training targets. Author's note: when a column is
# not of an integer type it has to be mapped to numbers before training.

y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584 entries, 0 to 776
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Class   584 non-null    int64
 1   logBCF  584 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 13.7 KB

将非int类型的数据量化为整数

def change_type(values):
    """Replace every column of ``values``, in place, with integer codes.

    Within each column the distinct values are numbered 0, 1, 2, ... in the
    order in which they first appear, and every cell is replaced by the
    number assigned to its value. The row index is left untouched.

    The codes are derived independently for each DataFrame passed in, so two
    frames encoded by separate calls do not share a mapping.

    Args:
        values: pandas DataFrame to encode; it is modified in place.

    Returns:
        None.
    """
    for col in values.columns:
        # factorize() numbers values by first appearance in a single pass,
        # which yields the same codes as searching the unique-value array for
        # every cell, without the per-cell scan. Missing values get the
        # sentinel code -1 instead of raising IndexError.
        codes, _ = pandas.factorize(values[col])
        values[col] = codes

# Encode every feature and target column as integer codes, in place.
# NOTE(review): each call derives its own value->code mapping from the frame
# it is given, so the same raw value can receive different codes in the
# training and test frames (e.g. Class 1 becomes 0 in y_train, whereas y_test
# starts with Class 3, which becomes 0 there). Confirm this is intended
# before comparing predictions against y_test.
change_type(x_train)
change_type(x_test)
change_type(y_train)
change_type(y_test)
# Notebook-style display of the encoded training targets.
y_train

584 rows × 2 columns

 Class logBCF
0 0 0
1 0 1
2 1 2
3 1 3
4 0 4
... ... ...
771 0 333
772 0 334
773 0 41
774 0 142
776 0 335
 
# Baseline model: 5 nearest neighbours, with each neighbour's vote weighted
# by the inverse of its distance.
knn=KNeighborsClassifier(n_neighbors=5,weights="distance",n_jobs=-1)

knn.fit(x_train, y_train)

y_=knn.predict(x_test)

# Column-wise fraction of exact matches; the result is a Series labelled by
# the target column names (Class, logBCF).
acc=(y_==y_test).mean()

# Look the scores up by column label: positional integer access such as
# acc[1] on a label-indexed Series is deprecated in recent pandas releases.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc["logBCF"],acc["Class"]))

预测生物富集因子准确率:0.041025641025641026;预测生物富集等级准确率:0.4153846153846154

提高算法准确率

1,修改算法参数

 
# Tuned hyper-parameters: 3 neighbours and p=1 (Manhattan distance), still
# with inverse-distance weighting.
knn=KNeighborsClassifier(n_neighbors=3,weights="distance",p=1,n_jobs=-1)

knn.fit(x_train, y_train)

y_=knn.predict(x_test)

# Column-wise fraction of exact matches; the result is a Series labelled by
# the target column names (Class, logBCF).
acc=(y_==y_test).mean()

# Look the scores up by column label: positional integer access such as
# acc[1] on a label-indexed Series is deprecated in recent pandas releases.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc["logBCF"],acc["Class"]))

预测生物富集因子准确率:0.06666666666666667;预测生物富集等级准确率:0.441025641025641

2,修改训练数据

 
# Min-max normalisation: rescale every feature with (x - min) / (max - min)
# so that features with large numeric ranges do not dominate the distance.

x_train_min=x_train.min()

x_train_max=x_train.max()

x_train_range=x_train_max-x_train_min

x2_train=(x_train-x_train_min)/x_train_range

# Scale the test set with the *training* minimum and range. Using the test
# set's own statistics (as was done before) maps the two sets onto different
# scales, so distances between test and training points become inconsistent.
# Test values outside the training range may fall slightly outside [0, 1].
x2_test=(x_test-x_train_min)/x_train_range

x2_test.head()
 nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O]
5 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.000000
12 0.0 0.008929 0.009524 0.000000 0.006849 0.007874 0.0 0.0 0.058824
15 0.0 0.017857 0.009524 0.029412 0.006849 0.015748 0.0 0.0 0.117647
16 0.0 0.026786 0.019048 0.029412 0.013699 0.023622 0.0 1.0 0.000000
19 0.0 0.000000 0.028571 0.058824 0.020548 0.031496 0.0 1.0 0.058824
 
 
 
 
 
 
 
 
 
# Same hyper-parameters as before, trained on the min-max normalised features.
knn=KNeighborsClassifier(n_neighbors=3,weights="distance",p=1,n_jobs=-1)

knn.fit(x2_train, y_train)

y_=knn.predict(x2_test)

# Column-wise fraction of exact matches; the result is a Series labelled by
# the target column names (Class, logBCF).
acc=(y_==y_test).mean()

# Look the scores up by column label: positional integer access such as
# acc[1] on a label-indexed Series is deprecated in recent pandas releases.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc["logBCF"],acc["Class"]))

预测生物富集因子准确率:0.02564102564102564;预测生物富集等级准确率:0.4358974358974359
 
# Z-score normalisation: (x - mean) / standard deviation, per feature.

# Per-feature mean of the training set.
x_train_mean=x_train.mean()

# Per-feature standard deviation of the training set (std() returns the
# standard deviation, not the variance).
x_train_std=x_train.std()

x3_train=(x_train-x_train_mean)/x_train_std

x3_train.head()

# Standardise the test set with the *training* mean and standard deviation.
# Using the test set's own statistics (as was done before) puts the two sets
# on different scales, so distances between test and training points become
# inconsistent.
x3_test=(x_test-x_train_mean)/x_train_std

x3_test.head()
 nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O]
5 -0.858971 -0.961536 -1.433213 -1.308189 -1.621471 -1.571899 -0.37945 -0.885971 -0.901314
12 -0.858971 -0.934067 -1.399267 -1.308189 -1.597619 -1.544272 -0.37945 -0.885971 -0.706463
15 -0.858971 -0.906599 -1.399267 -1.184954 -1.597619 -1.516645 -0.37945 -0.885971 -0.511611
16 -0.858971 -0.879131 -1.365321 -1.184954 -1.573767 -1.489018 -0.37945 1.122917 -0.901314
19 -0.858971 -0.961536 -1.331375 -1.061719 -1.549914 -1.461391 -0.37945 1.122917 -0.706463
 
 
 
 
 
 
 
# Same hyper-parameters as before, trained on the z-score normalised features.
knn=KNeighborsClassifier(n_neighbors=3,weights="distance",p=1,n_jobs=-1)

knn.fit(x3_train, y_train)

y_=knn.predict(x3_test)

# Column-wise fraction of exact matches; the result is a Series labelled by
# the target column names (Class, logBCF).
acc=(y_==y_test).mean()

# Look the scores up by column label: positional integer access such as
# acc[1] on a label-indexed Series is deprecated in recent pandas releases.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc["logBCF"],acc["Class"]))

预测生物富集因子准确率:0.015384615384615385;预测生物富集等级准确率:0.41025641025641024
 
# The same two normalisations using scikit-learn's ready-made scalers.

from sklearn.preprocessing import StandardScaler,MinMaxScaler

s=StandardScaler()

# fit_transform() learns the per-feature mean and standard deviation from the
# training data and returns the z-score normalised training features.
x4_train=s.fit_transform(x_train)

# transform() reuses the statistics learned from the training data. Calling
# fit_transform() on the test set (as was done before) would refit the scaler
# on the test data and put the two sets on different scales.
x4_test=s.transform(x_test)

m=MinMaxScaler()

# x5_train holds the min-max normalised training features.
x5_train=m.fit_transform(x_train)

# Again: only transform the test set, never refit on it.
x5_test=m.transform(x_test)

保存算法模型

 
# joblib is a standalone package; the sklearn.externals.joblib alias has been
# removed from current scikit-learn releases. Fall back to the old location
# only for very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the most recently fitted classifier. The compression level is set by
# `compress` (0-9; the default 0 means no compression), not by `cache_size`,
# which is a deprecated argument with no effect on compression.
joblib.dump(knn,'./model',compress=9)

# Load the model back from disk.
model=joblib.load('./model')

# knn was last fitted on the z-score features (x3_train), so the reloaded
# model must be evaluated on features prepared the same way (x3_test).
# The predictions are stored in y_ so that the accuracy below is computed
# from the loaded model rather than from a stale earlier prediction.
y_=model.predict(x3_test)

# Column-wise fraction of exact matches; the result is a Series labelled by
# the target column names (Class, logBCF).
acc=(y_==y_test).mean()

# Look the scores up by column label: positional integer access such as
# acc[1] on a label-indexed Series is deprecated in recent pandas releases.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc["logBCF"],acc["Class"]))

预测生物富集因子准确率:0.020512820512820513;预测生物富集等级准确率:0.41025641025641024

原文地址:https://www.cnblogs.com/lq13035130506/p/12543134.html