机器学习——kNN（2）示例：改进约会网站的配对效果

请通过右侧公告中的“联系邮箱（wlsandwho@foxmail.com）”联系我

勿用于学术性引用。

勿用于商业出版、商业印刷、商业引用以及其他商业用途。　　　　　　　　　　　　　　　　

本文不定期修正完善。

本文链接：http://www.cnblogs.com/wlsandwho/p/7587203.html

耻辱墙：http://www.cnblogs.com/wlsandwho/p/4206472.html

=======================================================================

这个示例实际上是对kNN的练习，区别是使用来自文件的数据。

=======================================================================

1 从文件中读取数据并格式化为指定格式。

文件为file2matrix.py

 1 from numpy import *
 2 '''
 3 def file2matrix1(filename):
 4     f=open(filename)
 5     arrlines=f.readlines()
 6     rows=len(arrlines)
 7     retmat=zeros((rows,3))
 8     vctlabels=[]
 9     index=0
10 
11     for line in arrlines:
12         line=line.strip()
13         list=line.split("	")
14         retmat[index,0:3]=list[0:3]
15         vctlabels.append(int(list[-1]))
16         index+=1
17 
18     return retmat,vctlabels
19 
20 mat1,labels1=file2matrix1("datingTestSet2.txt")
21 print(mat1)
22 print(labels1)
23 '''
def file2matrix(filename):
    """Parse a whitespace-delimited data file into features and labels.

    Every non-blank line is split on whitespace. The last field is read
    as an integer class label; the fields before it are the feature
    values. The number of feature columns is taken from the first
    non-blank line.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(retmat, labels)`` tuple. ``retmat`` is a float array with one
        row per non-blank line, ``labels`` is a list of ints of the same
        length. An empty file yields ``(zeros((0, 0)), [])``.

    Raises:
        ValueError: If a feature value or a label is not numeric.
    """
    with open(filename) as fh:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no sample, so they are dropped instead of crashing the
        # row assignment below.
        rows = [fields for fields in (line.split() for line in fh) if fields]

    if not rows:
        return zeros((0, 0)), []

    n_features = len(rows[0]) - 1
    retmat = zeros((len(rows), n_features))
    labels = []
    for index, fields in enumerate(rows):
        retmat[index, :] = [float(tok) for tok in fields[:n_features]]
        labels.append(int(fields[-1]))
    return retmat, labels
44 
def file2matrix2(filename):
    """Parse a data file whose last column is a textual class label.

    Works like a whitespace-delimited feature reader, except that the
    last field of each line is one of ``didntLike``, ``smallDoses`` or
    ``largeDoses``, encoded as 1, 2 and 3 respectively.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(retmat, labels)`` tuple. ``retmat`` is a float array with one
        row per non-blank line, ``labels`` is a list of ints (1, 2 or 3)
        of the same length. An empty file yields ``(zeros((0, 0)), [])``.

    Raises:
        ValueError: If a feature value is not numeric, or if a label is
            not one of the three known strings.
    """
    label_codes = {"didntLike": 1, "smallDoses": 2, "largeDoses": 3}

    with open(filename) as fh:
        # Blank lines carry no sample; skip them.
        rows = [fields for fields in (line.split() for line in fh) if fields]

    if not rows:
        return zeros((0, 0)), []

    n_features = len(rows[0]) - 1
    retmat = zeros((len(rows), n_features))
    labels = []
    for index, fields in enumerate(rows):
        retmat[index, :] = [float(tok) for tok in fields[:n_features]]
        # An unknown label must not be skipped silently: that would leave
        # `labels` shorter than `retmat` and shift every later label onto
        # the wrong sample.
        try:
            labels.append(label_codes[fields[-1]])
        except KeyError:
            raise ValueError(
                "unknown label %r on data row %d" % (fields[-1], index + 1)
            ) from None
    return retmat, labels

测试一下

文件为test_file2matrix.py

1 from file2matrix import *
2 
3 
# Load the sample file, then dump the feature matrix followed by the labels.
features, targets = file2matrix("datingTestSet2.txt")
for item in (features, targets):
    print(item)

结果（红框是因为我用的虚拟机，不要在意这些细节）

=======================================================================

2 做个图看看相关性/趋势。实际上这里并没有做数学上的讨论，就是画个图看看臆测一下。

文件是drawapicture1.py

 1 import matplotlib
 2 import matplotlib.pyplot as plt
 3 import numpy
 4 from file2matrix import *
 5 mat,labels=file2matrix("datingTestSet2.txt")
 6 nSizeofLabels=len(labels)
 7 
 8 fig=plt.figure()
 9 ax=fig.add_subplot(111)
10 ax.scatter(mat[:,1],mat[:,2],s=15.0*array(labels),c=15.0*array(labels))
11 plt.show()

结果

文件是drawapicture2.py

 1 import matplotlib
 2 import matplotlib.pyplot as plt
 3 import numpy
 4 from file2matrix import *
 5 
 6 mat,labels=file2matrix("datingTestSet2.txt")
 7 nSizeofLabels=len(labels)
 8 mat1x=[]
 9 mat2x=[]
10 mat3x=[]
11 mat1y=[]
12 mat2y=[]
13 mat3y=[]
14 for i in range(nSizeofLabels):
15     if  labels[i]==1:
16         mat1x.append(mat[i][0])
17         mat1y.append(mat[i][1])
18     elif labels[i]==2:
19         mat2x.append(mat[i][0])
20         mat2y.append(mat[i][1])
21     elif  labels[i]==3:
22         mat3x.append(mat[i][0])
23         mat3y.append(mat[i][1])
24 
25 fig=plt.figure()
26 ax=fig.add_subplot(111)
27 lg1=ax.scatter(mat1x,mat1y,s=20,c='red')
28 lg2=ax.scatter(mat2x,mat2y,s=20,c='green')
29 lg3=ax.scatter(mat3x,mat3y,s=20,c='blue')
30 fig.legend((lg1,lg2,lg3),('yiban','xihuan','milian'),"upper left")
31 plt.show()

结果是

=======================================================================

3 由于不同意义的数据的取值范围差异很大，所以需要归一化。

文件是matrixnormalization.py

 1 from  numpy import *
 2 
 3 def autonorm(mat):
 4     minv=mat.min(0)
 5     maxv=mat.max(0)
 6     diff=maxv-minv
 7     rows=mat.shape[0]
 8     normmat=zeros(shape(mat))
 9     normmat=mat-tile(minv,(rows,1))
10     normmat=normmat/tile(diff,(rows,1))
11     return normmat,minv,maxv,diff

测试归一化

文件是test_matrixnormalization.py

1 from matrixnormalization import *
2 
3 
# autonorm returns four values (normalized matrix, column minimum, column
# maximum, column range). Unpack them so `normmat` really is the matrix
# rather than the whole tuple, and print each part on its own.
mat = array([[1, 20, 3000], [5, 60, 7000], [2, 30, 3], [6, 60, 6000]])
normmat, minv, maxv, diff = autonorm(mat)
print(normmat)
print(minv)
print(maxv)
print(diff)

结果（不知道为何，跟书上的结果不太一样。）

=======================================================================

4测试一下这个分类器的效果

文件名test_dating.py

 1 from file2matrix import *
 2 from matrixnormalization import *
 3 from kNN import *
 4 
 5 
 6 def testdating():
 7     ratio=0.1
 8     countforerr=0
 9 
10     mat,lab=file2matrix2("datingTestSet.txt")
11 
12     normmat,minv,maxv,diff=autonorm(mat)
13 
14     allrows=normmat.shape[0]
15     rowsforTest=int(allrows*ratio)
16 
17     for i in range(rowsforTest):
18         res=classify_kNN(normmat[i,:],normmat[rowsforTest:allrows,:],lab[rowsforTest:allrows],3)
19         print('the result is',res,'the real is',lab[i])
20         if (res!=lab[i]):
21             countforerr+=1.0
22 
23     print("the error rate is", (countforerr/float(allrows)))
24 
25 
26 testdating()

结果

=======================================================================

5支持手工输入

文件名test_dating2.py

 1 from file2matrix import *
 2 from matrixnormalization import *
 3 from kNN import *
 4 
 5 
 6 def testdating():
 7     ratio=0.1
 8     countforerr=0
 9     reslist=["not at all","a little","very well"]
10     mat,lab=file2matrix2("datingTestSet.txt")
11 
12     normmat,minv,maxv,diff=autonorm(mat)
13 
14     allrows=normmat.shape[0]
15     rowsforTest=int(allrows*ratio)
16 
17     percenttats=float(input("percentage of time spent playing video games?"))
18     flymiles=float(input("frequent filer miles earned per year?"))
19     icecream=float(input("liters of ice cream consumed per year?"))
20     desarr=array([flymiles,percenttats,icecream])
21     normdesarr=(desarr-minv)/diff
22 
23     res=classify_kNN(normdesarr,normmat[rowsforTest:allrows,:],lab[rowsforTest:allrows],3)
24     print("You will probably like this person:",reslist[res-1])
25 
26 
27 
28 testdating()

结果

=======================================================================

这一节更像是用一个小算法领着学习python。