apriori算法

  1 from numpy import *
  2 # 加载数据集
  3 
  4 def loadDataSet():
  5     return [[1,3,4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
  6 
  7 # 创建集合 C1。即对 dataSet 进行去重,排序,放入 list 中,然后转换所有的元素为 frozenset
  8 def createC1(dataSet):
  9     """createC1(创建集合 C1)
 10     Args:
 11         dataSet 原始数据集
 12     Returns:
 13         frozenset 返回一个 frozenset 格式的 list
 14     """
 15 
 16     C1 = []
 17     for transaction in dataSet:
 18         for item in transaction:
 19             if not [item] in C1:
 20                 # 遍历所有的元素,如果不在 C1 出现过,那么就 append
 21                 C1.append([item])
 22     # 对数组进行 `从小到大` 的排序
 23     # print('sort 前=', C1)
 24     C1.sort()
 25     # frozenset 表示冻结的 set 集合,元素无改变;可以把它当字典的 key 来使用
 26     # print('sort 后=', C1)
 27     #print ('frozenset=', list(map(frozenset, C1)))
 28     #frozenset= [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
 29     return list(map(frozenset, C1))
 30 
 31 # 计算候选数据集 CK 在数据集 D 中的支持度,并返回支持度大于最小支持度(minSupport)的数据
 32 def scanD(D, Ck, minSupport):
 33     """scanD(计算候选数据集 CK 在数据集 D 中的支持度,并返回支持度大于最小支持度 minSupport 的数据)
 34     Args:
 35         D 数据集
 36         Ck 候选项集列表
 37         minSupport 最小支持度
 38     Returns:
 39         retList 频繁项集(支持度大于minSupport的集合)
 40         supportData 候选项集支持度数据
 41     """
 42 
 43     # ssCnt 临时存放选数据集 Ck 的频率. 例如: a->10, b->5, c->8
 44     ssCnt = {} #建立一个空的字典
 45     for tid in D: 
 46         for can in Ck:
 47             # s.issubset(t)  测试是否 s 中的每一个元素都在 t 中
 48             if can.issubset(tid): #返回一个bool类型的数据
 49                 if can not in ssCnt:
 50                     ssCnt[can] = 1
 51                 else:
 52                     ssCnt[can] += 1
 53     numItems = float(len(D)) # 数据集 D 的数量
 54     retList = []  # 存放频繁项集
 55     supportData = {}  # 候选项集Ck的支持度字典(key:候选项,value:支持度)
 56     for key in ssCnt: # #{1:2,2:3,3:3,4:1,5:3}
 57         # 支持度 = 候选项(key)出现的次数 / 所有数据集的数量
 58         support = ssCnt[key]/numItems
 59         if support >= minSupport:
 60             # 在 retList 的首位插入元素,只存储支持度满足频繁项集的值
 61             retList.insert(0, key)
 62         # 存储所有的候选项(key)和对应的支持度(support)
 63         supportData[key] = support
 64     return retList, supportData #返回  频繁项集  及候选支持度
 65 
 66 # 输入频繁项集列表 Lk 与返回的元素个数 k,然后输出所有可能的候选项集 Ck
 67 def aprioriGen(Lk, k):
 68     """aprioriGen(输入频繁项集列表 Lk 与返回的元素个数 k,然后输出 候选项集 Ck。
 69        例如: 以 {0},{1},{2} 为输入且 k = 2 则输出 {0,1}, {0,2}, {1,2}. 以 {0,1},{0,2},{1,2} 为输入且 k = 3 则输出 {0,1,2}
 70        仅需要计算一次,不需要将所有的结果计算出来,然后进行去重操作
 71        这是一个更高效的算法)
 72     Args:
 73         Lk 频繁项集列表
 74         k 返回的项集元素个数(若元素的前 k-2 相同,就进行合并)
 75     Returns:
 76         retList (候选频繁项集)元素两两合并的数据集
 77     """
 78     
 79     retList = [] #存放频繁项集
 80     lenLk = len(Lk)
 81     for i in range(lenLk):
 82         for j in range(i+1, lenLk):
 83             L1 = list(Lk[i])[: k-2]
 84             L2 = list(Lk[j])[: k-2]
 85             #print ('-----i=', i, k-2, Lk, Lk[i], list(Lk[i])[: k-2])
 86             #print ('-----j=', j, k-2, Lk, Lk[j], list(Lk[j])[: k-2])
 87             L1.sort()
 88             L2.sort()
 89             # 第一次 L1,L2 为空,元素直接进行合并,返回元素两两合并的数据集
 90             # if first k-2 elements are equal
 91             if L1 == L2:
 92                 # set union
 93                 # print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
 94                 retList.append(Lk[i] | Lk[j])
 95     return retList #返回合并后的候选频繁项集
 96 
 97 # 找出数据集 dataSet 中支持度 >= 最小支持度的候选项集以及它们的支持度。即我们的频繁项集。
 98 def apriori(dataSet, minSupport=0.5):
 99     """apriori(首先构建集合 C1,然后扫描数据集来判断这些只有一个元素的项集是否满足最小支持度的要求。那么满足最小支持度要求的项集构成集合 L1。然后 L1 中的元素相互组合成 C2,C2 再进一步过滤变成 L2,然后以此类推,知道 CN 的长度为 0 时结束,即可找出所有频繁项集的支持度。)
100     Args:
101         dataSet 原始数据集
102         minSupport 支持度的阈值
103     Returns:
104         L 频繁项集的全集
105         supportData 所有元素和支持度的全集
106     """
107     # C1 即对 dataSet 进行去重,排序,放入 list 中,然后转换所有的元素为 frozenset
108     C1 = createC1(dataSet)
109     #print ('C1: ', C1) 
110         # C1: [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
111     # 对每一行进行 set 转换,然后存放到集合中
112     D = list(map(set, dataSet))
113     #print ('D=', D)
114     #D= [{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
115     # 计算候选数据集 C1 在数据集 D 中的支持度,并返回支持度大于 minSupport 的数据
116     L1, supportData = scanD(D, C1, minSupport)  #L1为一项频繁项集
117     #print ("L1=", L1, "\n", "outcome: ", supportData)
118         #L1= [frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]
119         #outcome:  {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75}
120 
121     # L 加了一层 list, L 一共 2 层 list
122     L = [L1] 
123         #L为[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]]。
124         #L[k-2]=L[0]=[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
125     k = 2
126     # 判断 L 的第 k-2 项的数据长度是否 > 0。第一次执行时 L 为 [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]]。L[k-2]=L[0]=[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])],最后面 k += 1
127     while (len(L[k-2]) > 0): #第一次即长度大于0,执行循环体
128         #print('k=', k, L, L[k-2])
129         #k= 2 [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]] [frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]
130         #k= 3 [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})]] [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})]
131         #k= 4 [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})]] [frozenset({2, 3, 5})]
132         Ck = aprioriGen(L[k-2], k) # 第一次L[K-2]为频繁1项集,参数k=2,返回Ck候选频繁2项集
133         #print('Ck', Ck)
134         #Ck [frozenset({2, 5}), frozenset({3, 5}), frozenset({1, 5}), frozenset({2, 3}), frozenset({1, 2}), frozenset({1, 3})]
135         #Ck [frozenset({2, 3, 5})]
136         #Ck []
137         Lk, supK = scanD(D, Ck, minSupport) # 计算候选数据集 CK 在数据集 D 中的支持度,并返回支持度大于 minSupport 的数据(返回频繁项集)
138         # 保存所有候选项集的支持度,如果字典没有,就追加元素,如果有,就更新元素
139         supportData.update(supK)
140         if len(Lk) == 0:
141             break
142         # Lk 表示满足 频繁子项 的集合,L 元素在增加,例如: 
143         # l=[[set(1), set(2), set(3)]]
144         # l=[[set(1), set(2), set(3)], [set(1, 2), set(2, 3)]]
145         L.append(Lk)
146         #print('now_L:',L)
147         k += 1
148         #print('k=', k, len(L[k-2]))
149     return L, supportData
150 
151 # 计算可信度(confidence)
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules (freqSet - conseq) -> conseq for every conseq in H.

    The confidence of a rule a -> b is support(a | b) / support(a), which
    here is supportData[freqSet] / supportData[freqSet - conseq].

    Args:
        freqSet: a frequent itemset, e.g. frozenset([1, 3]).
        H: list of candidate consequents, e.g. [frozenset([1]), frozenset([3])].
        supportData: dict mapping itemsets to their support.
        brl: rule list; every accepted rule is appended to it in place as a
            tuple (antecedent, consequent, confidence).
        minConf: minimum confidence a rule needs to be accepted.

    Returns:
        The list of consequents from H whose rule reached minConf.

    Side effects:
        Prints each accepted rule and appends it to brl.
    """
    accepted = []
    for consequent in H:
        antecedent = freqSet - consequent
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            print(antecedent, '------>', consequent, 'conf:', confidence)
            brl.append((antecedent, consequent, confidence))
            accepted.append(consequent)
    return accepted
206 
207 
208 # 递归计算频繁项集的规则
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively derive rules of freqSet with progressively larger consequents.

    The consequents in H (all of the same size m) are joined into
    consequents of size m+1 with aprioriGen, each resulting rule is
    evaluated with calcConf, and the recursion continues with the
    consequents that passed. For freqSet = frozenset([2, 3, 5]) and
    H = [frozenset([2]), frozenset([3]), frozenset([5])], a single level of
    recursion evaluates the 2-item consequents and then stops, because a
    consequent as large as freqSet would leave an empty antecedent.

    Args:
        freqSet: a frequent itemset, e.g. frozenset([2, 3, 5]).
        H: list of equally sized consequents to grow; may be empty.
        supportData: dict mapping itemsets to their support.
        brl: rule list that calcConf appends accepted rules to.
        minConf: minimum confidence, forwarded unchanged to every level.

    Returns:
        None. Accepted rules are appended to brl.
    """
    # Nothing to grow: H is empty when no smaller consequent reached
    # minConf, and indexing H[0] would raise IndexError.
    if not H:
        return

    m = len(H[0])
    # Stop when an (m+1)-item consequent would leave no antecedent.
    if len(freqSet) <= m + 1:
        return

    # Every (m+1)-item consequent obtainable from H ...
    Hmp1 = aprioriGen(H, m + 1)
    # ... reduced to those whose rule meets minConf.
    Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)

    # At least two surviving consequents are needed to join any further.
    if len(Hmp1) > 1:
        # Forward the caller's threshold; a hard-coded 0.7 here would
        # silently override minConf on deeper levels.
        rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
250 
251 # 生成关联规则
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets in L.

    Args:
        L: list of lists of frequent itemsets as returned by apriori; L[0]
            (the first level) is skipped.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        bigRuleList: list of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    for level in range(1, len(L)):
        for freqSet in L[level]:
            # Start from every single item of the itemset as a consequent.
            singletons = [frozenset([item]) for item in freqSet]

            if level == 1:
                # First level after L[0]: only the single-item consequents
                # are evaluated.
                calcConf(freqSet, singletons, supportData, bigRuleList, minConf)
                continue

            # Later levels: evaluate the single-item consequents first,
            # then grow the ones that passed (the list may be empty) into
            # larger consequents.
            survivors = calcConf(freqSet, singletons, supportData, bigRuleList, minConf)
            rulesFromConseq(freqSet, survivors, supportData, bigRuleList, minConf)
    return bigRuleList
def testGenerateRules():
    """Run the full demo on the sample data and print every intermediate result.

    Prints the data set, the frequent itemsets and supports computed by
    apriori (minSupport=0.5), and the rules returned by generateRules
    (minConf=0.7).
    """
    transactions = loadDataSet()
    print('dataSet: ', transactions)

    # Frequent itemsets and their supports.
    frequent_sets, supports = apriori(transactions, minSupport=0.5)
    print('频繁项集L: ', frequent_sets)
    print('项集支持度supportData: ', supports)

    # Association rules derived from the frequent itemsets.
    found_rules = generateRules(frequent_sets, supports, minConf=0.7)
    print('关联规则rules: ', found_rules)
305 
def main():
    """Script entry point: run the association-rule demo."""
    testGenerateRules()


if __name__ == "__main__":
    # Only run the demo when executed as a script, not when imported.
    main()
1 dataSet:  [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
2 频繁项集L:  [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})]]
3 项集支持度supportData:  {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({1, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({1, 5}): 0.25, frozenset({1, 2}): 0.25, frozenset({2, 3, 5}): 0.5}
4 frozenset({5}) ------> frozenset({2}) conf: 1.0
5 frozenset({2}) ------> frozenset({5}) conf: 1.0
6 frozenset({1}) ------> frozenset({3}) conf: 1.0
7 frozenset({3, 5}) ------> frozenset({2}) conf: 1.0
8 frozenset({2, 3}) ------> frozenset({5}) conf: 1.0
9 关联规则rules:  [(frozenset({5}), frozenset({2}), 1.0), (frozenset({2}), frozenset({5}), 1.0), (frozenset({1}), frozenset({3}), 1.0), (frozenset({3, 5}), frozenset({2}), 1.0), (frozenset({2, 3}), frozenset({5}), 1.0)]

 

posted @ 2020-05-05 12:46  小他_W  阅读(274)  评论(0编辑  收藏  举报