利用 Python 实现《数据挖掘——概念与技术》一书中描述的 Apriori 算法
from collections import Counter
from itertools import combinations

# Sample transaction database: each inner list is one transaction
# (the set of item ids bought together).
data = [
    ['I1', 'I2', 'I5'],
    ['I2', 'I4'],
    ['I2', 'I3'],
    ['I1', 'I2', 'I4'],
    ['I1', 'I3'],
    ['I2', 'I3'],
    ['I1', 'I3'],
    ['I1', 'I2', 'I3', 'I5'],
    ['I1', 'I2', 'I3'],
]


def _as_itemset(entry):
    """Return *entry* as a frozenset; a bare item becomes a 1-itemset.

    Sets, frozensets, lists and tuples are treated as itemsets; anything
    else (e.g. a string id such as 'I1') is treated as a single item.
    """
    if isinstance(entry, (frozenset, set, list, tuple)):
        return frozenset(entry)
    return frozenset([entry])


def _order_key(itemset):
    """Sort key giving itemsets a stable order without comparing raw items."""
    return sorted(map(repr, itemset))


def _misses_frequent_subset(candidate, prev_frequent):
    """Return True if dropping some single item from *candidate* yields a
    set that is not a member of *prev_frequent* (a set of frozensets)."""
    return any(candidate - {item} not in prev_frequent for item in candidate)


def apriori_gen(f_set, k):
    """Generate the candidate k-itemsets from the frequent (k-1)-itemsets.

    Args:
        f_set: iterable of frequent (k-1)-itemsets. Bare items are accepted
            and treated as 1-itemsets, so a plain collection of items can
            be passed when k == 2.
        k: size of the candidate itemsets to produce.

    Returns:
        A list of frozensets, in a deterministic order. A k-itemset is
        returned only if it is the union of two members of *f_set* and
        every one of its (k-1)-subsets is itself a member of *f_set*.
    """
    prev_frequent = {_as_itemset(entry) for entry in f_set}
    candidates = set()
    # Join step: two (k-1)-itemsets that differ in exactly one item merge
    # into a k-itemset.
    for left, right in combinations(prev_frequent, 2):
        merged = left | right
        if len(merged) != k:
            continue
        # Prune step (Apriori property): a superset of an infrequent
        # itemset cannot be frequent, so drop it before counting support.
        if not _misses_frequent_subset(merged, prev_frequent):
            candidates.add(merged)
    return sorted(candidates, key=_order_key)


def has_infrequent_subset(c_set, f_set):
    """Tell whether a candidate can be discarded by the Apriori property.

    Args:
        c_set: candidate k-itemset (any iterable of items).
        f_set: iterable of frequent (k-1)-itemsets; bare items are treated
            as 1-itemsets.

    Returns:
        True if at least one (k-1)-subset of *c_set* is missing from
        *f_set*, otherwise False.
    """
    prev_frequent = {_as_itemset(entry) for entry in f_set}
    return _misses_frequent_subset(frozenset(c_set), prev_frequent)


def get_f_set(min_sup=2, transactions=None):
    """Find all frequent itemsets of size >= 2 with the Apriori algorithm.

    Args:
        min_sup: absolute minimum support, i.e. the minimum number of
            transactions that must contain an itemset.
        transactions: iterable of transactions, each an iterable of
            hashable items. Defaults to the module-level ``data``.

    Returns:
        A list of ``(itemset, support_count)`` tuples where ``itemset`` is
        a frozenset. Entries are grouped by increasing itemset size;
        frequent 1-itemsets are not included.
    """
    if transactions is None:
        transactions = data
    # Duplicate items inside one transaction count once.
    baskets = [frozenset(transaction) for transaction in transactions]

    # Level 1: support of each single item.
    item_support = Counter(item for basket in baskets for item in basket)
    current_level = [
        frozenset([item])
        for item, support in item_support.items()
        if support >= min_sup
    ]

    all_f_set = []
    k = 2
    # Level-wise search: stop as soon as a level has no frequent itemsets,
    # because no larger itemset can then be frequent.
    while current_level:
        next_level = []
        for candidate in apriori_gen(current_level, k):
            support = sum(1 for basket in baskets if candidate <= basket)
            if support >= min_sup:
                all_f_set.append((candidate, support))
                next_level.append(candidate)
        current_level = next_level
        k += 1
    return all_f_set


if __name__ == '__main__':
    for frequent_entry in get_f_set():
        print(frequent_entry)