K-近邻算法学习

# -*- coding: utf-8 -*-
from numpy import *
import operator

def createDataSet():
    """Build the small toy training set used by the kNN walkthrough.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 NumPy array holding
        one sample per row, and ``labels`` is the list of class labels in
        the same row order.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classes = ['A'] * 2 + ['B'] * 2
    return array(samples), classes

def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean. Every intermediate value is printed so the
    algorithm can be followed step by step.

    Args:
        inX: feature vector to classify, one value per feature.
        dataSet: training samples, one row per sample. It is passed through
            ``asarray``, so a nested list works as well as a NumPy array.
        labels: class label for each row of ``dataSet``. Labels must be
            strings because they are concatenated into the trace output.
        k: number of nearest neighbours that vote.

    Returns:
        The label that received the most votes among the ``k`` nearest
        samples.

    Raises:
        IndexError: if ``k`` exceeds the number of samples, or if ``k`` is
            less than 1 (no votes are cast, so there is no winner).
    """
    # print(x) with a single argument behaves identically on Python 2 and 3.
    print('inX')
    print(inX)
    dataSet = asarray(dataSet)
    # Number of training samples (rows).
    dataSetSize = dataSet.shape[0]
    print('dataSetSize:')
    print(dataSetSize)

    # Repeat the input vector once per training row so the whole training
    # set can be subtracted from it in a single operation.
    tiledInput = tile(inX,(dataSetSize,1))
    print('tile(inX,(dataSetSize,1))')
    print(tiledInput)

    diffMat = tiledInput-dataSet
    print('diffMat')
    print(diffMat)

    # Square every coordinate difference.
    sqDiffMat = diffMat**2
    print('sqDiffMat')
    print(sqDiffMat)

    # Sum along each row: one squared distance per training sample.
    sqDistances = sqDiffMat.sum(axis=1)
    print('sqDistances')
    print(sqDistances)
    # The square root turns squared distances into Euclidean distances.
    distances = sqDistances**0.5
    print('distances')
    print(distances)

    # argsort gives the indices that order the distances ascending,
    # e.g. [3,1,2] -> [1,2,0]; there is one index per training row.
    sortedDistIndicies = distances.argsort()
    print('sortedDistIndicies')
    print(sortedDistIndicies)
    classCount = {}
    # Let the k closest samples vote.
    for i in range(k):
        print('I='+str(i))
        # Label of the i-th closest training sample.
        voteIlabel = labels[sortedDistIndicies[i]]
        print('voteIlabel='+voteIlabel)
        print('classCount.get(voteIlabel,0)='+str(classCount.get(voteIlabel,0)))

        # Tally the vote for this label.
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
    print('classCount')
    print(classCount)
    # Order the labels by vote count, most votes first.
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    print('sortedClassCount')
    print(sortedClassCount)
    return sortedClassCount[0][0]

# Demo driver: build the toy data set and classify the point [0.1, 0.2]
# with k=3.
# NOTE(review): `kNN` is not imported anywhere in the visible source;
# presumably the definitions above are saved as kNN.py and `import kNN`
# precedes these lines -- confirm.
group,labels=kNN.createDataSet()
# print(x) with a single argument behaves identically on Python 2 and 3.
print(group)
print(labels)
print(kNN.classify0([0.1,0.2],group,labels,3))

最终的输出结果为

[[ 1. 1.1]
[ 1. 1. ]
[ 0. 0. ]
[ 0. 0.1]]
['A', 'A', 'B', 'B']
inX
[0.1, 0.2]
dataSetSize:
4
tile(inX,(dataSetSize,1))
[[ 0.1 0.2]
[ 0.1 0.2]
[ 0.1 0.2]
[ 0.1 0.2]]
diffMat
[[-0.9 -0.9]
[-0.9 -0.8]
[ 0.1 0.2]
[ 0.1 0.1]]
sqDiffMat
[[ 0.81 0.81]
[ 0.81 0.64]
[ 0.01 0.04]
[ 0.01 0.01]]
sqDistances
[ 1.62 1.45 0.05 0.02]
distances
[ 1.27279221 1.20415946 0.2236068 0.14142136]
sortedDistIndicies
[3 2 1 0]
I=0
voteIlabel=B
classCount.get(voteIlabel,0)=0
I=1
voteIlabel=B
classCount.get(voteIlabel,0)=1
I=2
voteIlabel=A
classCount.get(voteIlabel,0)=0
classCount
{'A': 1, 'B': 2}
sortedClassCount
[('B', 2), ('A', 1)]
B

  

原文地址:https://www.cnblogs.com/kevin-h-wang/p/6589413.html