机器学习K近邻算法

from numpy import *
import operator
from os import listdir
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify, with one value per column of
            ``dataSet``.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the class of
            ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # Row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and Python 3; iteritems() is Python 2
    # only and raises AttributeError on Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly so it spans [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    all zeros instead of the NaN that ``0 / 0`` would produce.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` is
        returned unmodified, so it is 0 for constant columns.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 rather than 0; their numerator is already
    # 0, so they normalise to 0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    The first three fields of every record are stored as floats in one row of
    the matrix; the last field is parsed as an integer class label. Blank
    lines are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classlabelvector)``: an ``(n, 3)`` float array
        and a list of ``n`` integer labels, where ``n`` is the number of
        non-blank lines.

    Raises:
        ValueError: If a feature field is not a number or the last field is
            not an integer.
    """
    # The context manager closes the file even when reading fails.
    with open(filename) as fr:
        records = [line.strip() for line in fr]
    # A blank line (typically a trailing newline) holds no record.
    records = [record for record in records if record]
    returnMat = zeros((len(records), 3))
    classlabelvector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classlabelvector.append(int(listFromLine[-1]))
    return returnMat, classlabelvector

def datingClassTest():
    """Run a hold-out evaluation of the classifier on 'datingTestSet.txt'.

    The first 10% of the normalised records are classified with
    ``classify0`` (k = 3) against the remaining 90%. Every prediction is
    printed next to the true label, followed by the overall error rate.

    NOTE(review): labels are parsed with ``int()`` by ``file2matrix`` and
    printed with ``%d``, so the data file must carry integer class labels.
    """
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test vectors; the rest is training data.
        classiferResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                    datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with %d,the real answer is %d" % (classiferResult, datingLabels[i]))
        if classiferResult != datingLabels[i]:
            errorCount += 1.0
    # With fewer than 10 records no test vector exists; report 0 instead of
    # dividing by zero.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is %f" % errorRate)

def classifyPerson():
    """Interactively classify one person from three typed-in features.

    Prompts for three numbers, normalises them with the minimum and range of
    the data in 'datingTestSet2.txt', classifies them with ``classify0``
    (k = 3) and prints the matching description from ``resultList``.

    NOTE(review): the lookup ``resultList[classifierResult - 1]`` assumes the
    labels in the data file are the integers 1, 2 and 3.

    Raises:
        ValueError: If an answer cannot be converted to ``float``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of icecream cosumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order here is miles, game time, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query with the same min/range used for the training data.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("You will probably like this person %s" % resultList[classifierResult - 1])
原文地址:https://www.cnblogs.com/cherryMJY/p/8525151.html