K-means 不知k值自动无监督分类（注：下文代码实际实现的是基于 TF-IDF 与余弦距离的凝聚层次聚类，按距离阈值停止合并，而非 K-means）

代码：
  1 # -*- coding:UTF-8 -*-
  2 from numpy import *
  3 import jieba as jb
  4 import time
  5 # 计算权值,并存储为txt
  6 # 计算所有文本包含的总词数
  7 def wordsCount(dataSet):
  8     wordsCnt = 0
  9     for document in dataSet:
 10         wordsCnt += len(document)
 11     return wordsCnt
 12 
 13 # 创建不重复的词条列表
 14 def createVocabList(dataSet):
 15     vocabSet = set([])
 16     for document in dataSet:
 17         vocabSet = vocabSet | set(document)
 18     return list(vocabSet)
 19 
 20 # 将文本转化为词袋模型
 21 def bagOfWords2Vec(vocabList, inputSet):
 22     returnVec = [0] * len(vocabList)
 23     for word in inputSet:
 24         if word in vocabList:
 25             returnVec[vocabList.index(word)] += 1
 26         else:
 27             print("the word: %s is not in my Vocabulary!" % word)
 28     return returnVec
 29 
 30 # 计算包含某个词的文本数
 31 def wordInFileCount(word, cutWordList):
 32     fileCnt = 0
 33     for i in cutWordList:
 34         for j in i:
 35             if word == j:
 36                 fileCnt = fileCnt + 1
 37             else:
 38                 continue
 39     return fileCnt
 40 
 41 def calTFIDF(dataSet):
 42     fileCnt = len(dataSet)  # 文本数
 43     vocabList = createVocabList(dataSet)  # 词条列表
 44     tfidfSet = []
 45 
 46     for line in dataSet:
 47         wordsBag = bagOfWords2Vec(vocabList, line)  # 每行文本对应的词袋向量
 48         lineWordsCnt = 0
 49         for i in range(len(wordsBag)):
 50             lineWordsCnt += wordsBag[i]  # 计算每个文本中包含的总词数
 51         tfidfList = [0] * len(vocabList)
 52         for word in line:
 53             wordinfileCnt = wordInFileCount(word, dataSet)  # 包含该词的文本数
 54             wordCnt = wordsBag[vocabList.index(word)]  # 该词在文本中出现的次数
 55             tf = float(wordCnt) / lineWordsCnt
 56             idf = math.log(float(fileCnt) / (wordinfileCnt + 1))
 57             tfidf = tf * idf
 58             tfidfList[vocabList.index(word)] = tfidf
 59         print(tfidfList)
 60         print(map(str, tfidfList))
 61         tfidfSet.append(tfidfList)
 62 
 63     return tfidfSet
 64 
 65 # 计算余弦距离
 66 def gen_sim(A, B):
 67     num = float(dot(mat(A), mat(B).T))
 68     denum = linalg.norm(A) * linalg.norm(B)
 69     if denum == 0:
 70         denum = 1
 71     cosn = num / denum
 72     sim = 0.5 + 0.5 * cosn  # 余弦值为[-1,1],归一化为[0,1],值越大相似度越大
 73     sim = 1 - sim  # 将其转化为值越小距离越近
 74     return sim
 75 
 76 
 77 # 计算两个簇的平均距离
 78 def distAvg(dataSet1, dataSet2):
 79     avgD = 0
 80     sumD = 0
 81     m = shape(dataSet1)[0]
 82     n = shape(dataSet2)[0]
 83     for i in range(m):
 84         for j in range(n):
 85             dist = gen_sim(dataSet1[i], dataSet2[j])
 86             sumD += dist
 87     avgD = sumD / (m * n)
 88     return avgD
 89 
 90 # 找到距离最近的两个簇
 91 def findMin(M):
 92     minDist = inf
 93     m = shape(M)[0]
 94     for i in range(m):
 95         for j in range(m):
 96             if i != j and M[i, j] < minDist:
 97                 minDist = M[i, j]
 98                 minI = i
 99                 minJ = j
100     return minI, minJ, minDist
101 
102 
103 # 层次聚类算法
def hCluster(dataSet, k, dist, distMeas=distAvg):
    """Agglomerative clustering that stops at a distance threshold.

    Every sample starts as its own cluster. The two closest clusters are
    merged repeatedly, as long as their distance is below ``dist`` and more
    than one cluster remains.

    Fixes relative to the earlier version:
      * the threshold is checked BEFORE merging, so a pair whose distance is
        not below ``dist`` is no longer merged on the final iteration;
      * the loop stops when a single cluster is left instead of failing
        inside ``findMin``.

    Args:
        dataSet: 2-D numpy matrix/array with one sample per row.
        k: Unused. Kept so existing calls keep working (a cluster-count
            stopping rule was previously disabled in favour of ``dist``).
        dist: Merge threshold; merging stops once the closest pair of
            clusters is not strictly closer than this value.
        distMeas: Callable ``(rowsOfClusterA, rowsOfClusterB) -> distance``.

    Returns:
        A tuple ``(clusterAssment, performMeasure)``:
        ``clusterAssment`` is an (m, 1) matrix holding the cluster label of
        each sample, with labels running from 0 to (clusters - 1);
        ``performMeasure`` is a matrix with one row
        ``[clusters after merge, merge distance, DBI, DI]`` per merge.
        DBI and DI are placeholders and are always 0.
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 1)))
    performMeasure = []
    M = mat(zeros((m, m)))  # symmetric cluster-to-cluster distance matrix
    # Initially every sample is its own cluster, labelled by its row index.
    for ii in range(m):
        clusterAssment[ii, 0] = ii

    for i in range(m):
        dataSeti = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
        for j in range(i + 1, m):
            dataSetj = dataSet[nonzero(clusterAssment[:, 0].A == j)[0], :]
            M[i, j] = distMeas(dataSeti, dataSetj)
            M[j, i] = M[i, j]
        if mod(i, 10) == 0:
            print(i)  # progress output every 10 rows

    q = m  # current number of clusters
    while q > 1:
        i, j, minDist = findMin(M)  # closest pair of clusters
        if not minDist < dist:
            break  # nothing left that is close enough to merge
        # The relabelling below keeps label i and removes label j, which is
        # only consistent when i < j. findMin already yields that for a
        # symmetric matrix; the swap just makes the requirement explicit.
        if i > j:
            i, j = j, i

        # Merge cluster j into cluster i.
        clusterAssment[nonzero(clusterAssment[:, 0].A == j)[0], 0] = i
        # Shift the labels above j down by one so labels stay contiguous and
        # keep matching the row/column indices of M after the deletion.
        for l in range(j + 1, q):
            clusterAssment[nonzero(clusterAssment[:, 0].A == l)[0], 0] = l - 1
        M = delete(M, j, axis=0)
        M = delete(M, j, axis=1)
        q = q - 1

        # Only distances involving the merged cluster i have changed.
        dataSeti = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
        for l in range(q):
            if l == i:
                continue  # the diagonal is never read by findMin
            dataSetl = dataSet[nonzero(clusterAssment[:, 0].A == l)[0], :]
            M[i, l] = distMeas(dataSeti, dataSetl)
            M[l, i] = M[i, l]

        # Placeholders: no cluster-quality index is computed here.
        DBI = 0
        DI = 0

        performMeasure.append([q, minDist, DBI, DI])

        print(u'当前簇的个数是：', q)
        print(u'距离最小的两个簇是第%d个和第%d个,距离是%f,DBI值是%f,DI值是%f' % (
            i, j, minDist, DBI, DI))

    return clusterAssment, mat(performMeasure)
151 
def saveResult(clusterAssment):
    """Print each row of ``clusterAssment`` as a list of strings.

    Despite the name, nothing is written to disk; the rows are only printed.

    Args:
        clusterAssment: Object with a ``tolist()`` method returning a list of
            rows (for example a numpy matrix or 2-D array).
    """
    for row in clusterAssment.tolist():
        # Build a real list: printing a bare ``map`` object on Python 3 shows
        # only its repr, not the values.
        print([str(value) for value in row])
156 
157 
if __name__ == '__main__':
    # Demo run. Each string is passed as one document, so calTFIDF iterates
    # it character by character.
    docs = ["实施", "效益", "节本", "10"]
    tfidfMatrix = mat(calTFIDF(docs))
    assignments, measures = hCluster(tfidfMatrix, 0, 0.3)
    print(assignments)
    saveResult(assignments)
K-means 不知k值 自动无监督分类

K-means 不知k值自动无监督分类