利用 Python 实现《数据挖掘——概念与技术》一书中描述的 Apriori 算法

 1 from itertools import combinations
 2 
 3 data = [['I1', 'I2', 'I5'], ['I2', 'I4'], ['I2', 'I3'], ['I1', 'I2', 'I4'], ['I1', 'I3'],
 4         ['I2', 'I3'], ['I1', 'I3'], ['I1', 'I2', 'I3', 'I5'], ['I1', 'I2', 'I3']]
 5 
 6 
 7 # 候选集生成
 8 # 输入：
 9 # f_set: k-1项集, k:项集个数
10 # 输出：
11 # k_cand：k项候选集
12 def apriori_gen(f_set, k):
13     k_cand = []
14     temp = [frozenset(l) for l in combinations(f_set, k)]
15     for t in temp:
16         if has_infrequent_subset(t, f_set):
17             del t
18         else:
19             k_cand.append(t)
20     return k_cand
21 
def has_infrequent_subset(c_set, f_set):
    """Report whether a candidate contains an item missing from ``f_set``.

    Rationale: any superset of an infrequent itemset is itself infrequent.
    Only the individual members of ``c_set`` are checked; larger subsets
    are not examined.

    Args:
        c_set: iterable of items forming the candidate itemset.
        f_set: collection of items regarded as frequent.

    Returns:
        True if at least one member of ``c_set`` is not found in ``f_set``,
        otherwise False (including when ``c_set`` is empty).
    """
    return any(
        not frozenset((member,)).issubset(f_set)
        for member in c_set
    )
28 
def get_f_set(min_sup=2, transactions=None):
    """Return every frequent itemset that contains at least two items.

    The search is level-wise: frequent single items are found first, then
    candidates of size 2, 3, ... are generated, pruned and counted. A
    candidate is counted only if all of its (k-1)-item subsets were frequent
    on the previous level, and the search stops as soon as a level yields no
    frequent itemset, because no larger itemset can be frequent after that.

    Args:
        min_sup: absolute minimum support, i.e. the minimum number of
            transactions that must contain an itemset for it to be kept.
        transactions: optional iterable of transactions, each an iterable of
            hashable items. When omitted, the module-level ``data`` is used.

    Returns:
        A list of ``(itemset, count)`` tuples, where ``itemset`` is a
        frozenset and ``count`` is the number of transactions containing it.
        Entries are grouped by increasing itemset size. Single-item sets are
        not included.
    """
    if transactions is None:
        transactions = data

    # Convert every transaction to a frozenset exactly once, and count the
    # support of single items on the way. Items are recorded in order of
    # first appearance so the result order does not depend on hash order.
    baskets = []
    item_counts = {}
    for raw in transactions:
        distinct = list(dict.fromkeys(raw))
        baskets.append(frozenset(distinct))
        for item in distinct:
            item_counts[item] = item_counts.get(item, 0) + 1

    # An item below min_sup cannot be part of any frequent itemset.
    live_items = [item for item, count in item_counts.items()
                  if count >= min_sup]
    prev_level = {frozenset([item]) for item in live_items}

    all_f_set = []
    k = 2
    while prev_level and k <= len(live_items):
        current_level = set()
        for cand in apriori_gen(live_items, k):
            # Apriori pruning: skip the candidate when any of its
            # (k-1)-item subsets was not frequent on the previous level.
            if any(frozenset(sub) not in prev_level
                   for sub in combinations(cand, k - 1)):
                continue
            count = sum(1 for basket in baskets if cand.issubset(basket))
            if count >= min_sup:
                all_f_set.append((cand, count))
                current_level.add(cand)
        # Only items that still occur in some frequent k-itemset can occur
        # in a frequent (k+1)-itemset.
        survivors = frozenset().union(*current_level)
        live_items = [item for item in live_items if item in survivors]
        prev_level = current_level
        k += 1
    return all_f_set
47 
if __name__ == '__main__':
    # Mine the sample data with the default support threshold and print
    # each (itemset, count) result on its own line.
    all_frequent_set = get_f_set()
    for i in iter(all_frequent_set):
        print(i)