python ML 笔记：Kmeans

kmeans算法的python实现：

参考与样本来源《Machine Learning in Action》

 1 #-*-coding:UTF-8-*-
 2 '''
 3 Created on 2015年8月19日
 4 @author: Ayumi Phoenix
 5 '''
 6 import numpy as np
 7 
 8 def distL2(a,b):
 9     """ 计算两个向量之间的L2距离 """
10     return np.sqrt(np.sum((a-b)**2))
11 
12 class Kmeans():
13     def __init__(self, dataset,k):
14         self.dataset = dataset
15         self.k = k
16         self.m, self.n = dataset.shape
17         
18     def randcent(self):
19         """ 根据输入数据集获得随机生成一组簇质心 """
20         maxn = np.max(self.dataset, 0)  # 获取每一维的最大值
21         minn = np.min(self.dataset, 0)  # 获取每一维的最小值
22         centoroid = np.random.rand(self.k,self.n) * (maxn - minn) + minn    # k x n
23         return centoroid
24 
25     def train(self, dist, iter = 1):
26         """
27         # 1. 计算每个样本与所有簇心的最近匹配距离数组 m x 1：
28             # 计算某样本与所有簇心的距离,
29             # 找到最小距离所属的下标序号 0...k-1
30         # 2. 根据当前类标的分配，重新计算平均聚类中心
31             # 按照当前分配索引样本数据
32             # 迭代次数减一
33         # 3. 返回最终的质心与分配的序号
34         """
35         centoroid = self.randcent()
36         while iter:
37             labels = np.zeros((self.m,), int)               
38             for i in range(self.m):
39                 d = [dist(self.dataset[i,:],centoroid[j])   
40                      for j in range(self.k)]   
41                 labels[i] = np.argmin(d)                    
42             for i in range(self.k):                         
43                 x = self.dataset[labels==i]                 
44                 centoroid[i] = np.mean(x, 0)
45             iter -= 1                                       
46         return centoroid, labels

读取数据与测试函数：

 1 ef loadDataSet(filename):
 2     dataMat = []
 3     with open(filename) as f:
 4         for line in f.readlines():
 5             curline = line.strip().split('	')
 6             fltline = map(np.float, curline)
 7             dataMat.append(fltline)
 8     return dataMat
 9 
10 
11 if __name__=="__main__":
12     pass
13     datMat = np.array(loadDataSet('testSet.txt'))
14     km = Kmeans(datMat,4)
15     centoroid, labels = km.train(distL2, iter=20)
16     
17     # 根据当前质心显示样本分布
18     import matplotlib.pylab as pl
19     pl.figure()
20     c = ['ro','go','bo','yo','co','ko','wo','mo']
21     for i in range(datMat.shape[0]):
22         pl.plot(datMat[i][0],datMat[i][1],c[labels[i]])
23     for cen in centoroid:
24         pl.plot(cen[0],cen[1],'mo')
25     pl.show()

结果：