# 重写轮子之 kNN

# !/usr/bin/python
# -*- coding:utf-8 -*-


"""
Re-implement kNN algorithm as a practice
使用该 kNN re-implement 的前提:
    train data 的标签必须转成0,1,2,...的形式
"""

# Author: 相忠良(Zhong-Liang Xiang) <ugoood@163.com>
# Finished at July 11th, 2017

import sys
from numpy import array
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, cross_validation
from sklearn import neighbors


## Euclidean distance
def euclidean(v1, v2):
    """Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted to flat float arrays, so lists, tuples,
    1-D arrays, row vectors and column vectors are all accepted.

    Plain ndarrays are used instead of ``np.mat``: the ``np.mat`` alias was
    removed in NumPy 2.0, and with a matrix product a column-vector input
    produced an outer product whose ``[0, 0]`` entry is only the first
    squared component rather than the full squared distance.

    :param v1: first vector (array-like of numbers)
    :param v2: second vector (array-like, same number of elements as ``v1``)
    :return: the distance as a NumPy float
    :raises ValueError: if the two vectors cannot be broadcast together
    """
    diff = np.asarray(v1, dtype=float).ravel() - np.asarray(v2, dtype=float).ravel()
    # sqrt(diff . diff) is the L2 norm of the difference vector.
    return np.sqrt(np.dot(diff, diff))


## Cosine of the angle between two vectors
def cosdis(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    The value is the dot product of the two vectors divided by the product
    of their norms, i.e. a cosine *similarity* (1 for parallel vectors),
    not a distance.

    :param v1: first vector (array-like of numbers)
    :param v2: second vector (array-like, same length as ``v1``)
    :return: ``dot(v1, v2) / (||v1|| * ||v2||)``
    """
    numerator = np.dot(v1, v2)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    return numerator / denominator


## Load the iris data set and split it into train / test parts
def load_data():
    """Load the iris data set and return a train/test split of it.

    A quarter of the samples (``test_size=0.25``) goes to the test part and
    the split is made reproducible with ``random_state=0``.

    :return: the result of ``train_test_split`` on the iris features and
        targets; the caller in this file unpacks it as
        ``X_train, X_test, y_train, y_test``
    """
    dataset = datasets.load_iris()
    features, labels = dataset.data, dataset.target
    return cross_validation.train_test_split(
        features, labels, test_size=0.25, random_state=0)


class MyKNeighborsClassifier(object):
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Training labels must be non-negative integers (0, 1, 2, ...) because
    the majority vote is computed with ``np.bincount``.

    Attributes:
        n_neighbors: number of neighbours that take part in the vote.
        X_train: training samples stored by ``fit`` (``None`` before fit).
        y_train: training labels stored by ``fit`` (``None`` before fit).
        predict_label: labels returned by the most recent ``predict`` call.
    """

    def __init__(self, n_neighbors=20):
        """Create a classifier that votes among ``n_neighbors`` neighbours."""
        self.n_neighbors = n_neighbors
        # All state lives on the instance; class-level mutable lists would
        # be shared by every classifier object.
        self.X_train = None
        self.y_train = None
        self.predict_label = []

    def fit(self, X, y):
        """Store the training samples ``X`` and their integer labels ``y``.

        :raises ValueError: if ``X`` and ``y`` have different lengths
        """
        samples = np.asarray(X, dtype=float)
        labels = np.asarray(y)
        if len(samples) != len(labels):
            raise ValueError(
                "X and y must have the same length, got %d and %d"
                % (len(samples), len(labels)))
        self.X_train = samples
        self.y_train = labels

    def _predict_labels(self, X):
        """Return the list of predicted labels for ``X`` without storing it."""
        if self.X_train is None or self.y_train is None:
            raise ValueError("fit() must be called before predict()")
        train = np.asarray(self.X_train, dtype=float)
        labels = np.asarray(self.y_train)
        if train.shape[0] == 0:
            raise ValueError("cannot predict from an empty training set")
        if self.n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")

        queries = np.asarray(X, dtype=float)
        # Treat 1-D input as a sequence of single-feature samples.
        if train.ndim == 1:
            train = train[:, np.newaxis]
        if queries.ndim == 1:
            queries = queries[:, np.newaxis]

        # Never ask for more neighbours than there are training samples.
        k = min(self.n_neighbors, train.shape[0])

        predictions = []
        for item in queries:
            diff = train - item
            distances = np.sqrt((diff * diff).sum(axis=1))
            # A stable sort keeps the lowest index first among equal
            # distances, matching a repeated "take the argmin" selection.
            nearest = np.argsort(distances, kind='mergesort')[:k]
            # bincount counts the votes per label; argmax picks the most
            # frequent label (the smallest label wins a tie).
            votes = np.bincount(labels[nearest])
            predictions.append(np.argmax(votes))
        return predictions

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        :param X: iterable of samples with the same features as the
            training data
        :return: list with one predicted label per sample; it is also
            stored in ``self.predict_label``, replacing earlier results
        :raises ValueError: if the classifier has not been fitted, the
            training set is empty, or ``n_neighbors`` is smaller than 1
        """
        self.predict_label = self._predict_labels(X)
        return self.predict_label

    def score(self, X, y):
        """Return the mean accuracy of the predictions for ``X`` against ``y``.

        :param X: iterable of samples to classify
        :param y: true labels, one per sample in ``X``
        :return: fraction of correctly predicted labels as a float
        :raises ValueError: if ``y`` is empty or its length does not match
            the number of samples in ``X``
        """
        expected = np.asarray(y)
        predicted = np.asarray(self._predict_labels(X))
        if expected.size == 0:
            raise ValueError("cannot compute a score for zero samples")
        if predicted.shape != expected.shape:
            raise ValueError("X and y must contain the same number of samples")
        return float(np.mean(predicted == expected))

## Test case: compare this implementation with scikit-learn's kNN
X_train, X_test, y_train, y_test = load_data()

cls = MyKNeighborsClassifier()
cls.fit(X_train, y_train)
mine = cls.predict(X_test)
# Single-argument, parenthesised prints behave the same under the Python 2
# print statement and the Python 3 print function. The two spaces after the
# colon reproduce what "print 'label: ', value" wrote.
print('my kNN:  %s' % (mine,))

cls1 = neighbors.KNeighborsClassifier(n_neighbors=20, p=2)
cls1.fit(X_train, y_train)
sklearnkNN = cls1.predict(X_test)
print('sklearn kNN:  %s' % (sklearnkNN,))
# Element-wise comparison of the two prediction sets, then against the truth.
print(mine == sklearnkNN)
print(mine == y_test)



'''
下面是编程过程中留下的经验
'''

# 重要1: np.bincount(list)
# >>> a=[1,1,2,2,4]
# >>> print np.bincount(a)
# 结果为 [0 2 2 0 1]

# 重要2: np.argmax(list)
# 返回最大值索引

# 重要3: 标识整数最大值
# >>> import sys
# >>> sys.maxint
# 注意: sys.maxint 仅存在于 Python 2; Python 3 中请使用 sys.maxsize 或 float('inf')


## kNN 小示例
# def createDataset():
#     group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
#     labels = ['A', 'A', 'B', 'B']
#     return group, labels
#
#
# dataset, labels = createDataset()
# fig = plt.figure()
# ax = fig.add_subplot(111)
# index = 0
# testdata = [0.2, 0.2]
#
# for point in dataset:
#     if labels[index] == 'A':
#         ax.scatter(point[0], point[1], c='blue', marker='o', s=300)
#     else:
#         ax.scatter(point[0], point[1], c='red', marker='^', s=300)
#     index += 1
#
# ax.scatter(testdata[0], testdata[1], c='green', marker='^', s=300)
# plt.show()
# 原文地址: https://www.cnblogs.com/ZhongliangXiang/p/7357173.html