apriori算法
"""Apriori frequent-itemset mining and association-rule generation."""

# NOTE(review): nothing in this module uses a numpy name; the wildcard import
# is kept only because it is the file's existing import line.
from numpy import *


def loadDataSet():
    """Return the small hard-coded transaction list used by the demo."""
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataSet):
    """Build C1, the candidate 1-itemsets.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list of single-element frozensets, one per distinct item, in
        ascending item order.  frozenset is used so the itemsets can serve
        as dictionary keys.
    """
    distinctItems = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(distinctItems)]


def scanD(D, Ck, minSupport):
    """Compute the support of each candidate and keep the frequent ones.

    Args:
        D: list of transactions (each one a set of items).
        Ck: list of candidate itemsets (frozensets).
        minSupport: minimum support, as a fraction of len(D).

    Returns:
        (retList, supportData) where retList holds the candidates whose
        support is >= minSupport, and supportData maps every candidate that
        occurs in at least one transaction to its support.
    """
    # Occurrence count per candidate; candidates that never occur get no key.
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The result is ordered last-counted first (what repeated insert(0, key)
    # would produce), without the quadratic cost of front insertion.
    retList.reverse()
    return retList, supportData


def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their k-2 smallest items are equal, e.g.
    {0},{1},{2} with k=2 gives {0,1},{0,2},{1,2}, and
    {0,1},{0,2},{1,2} with k=3 gives {0,1,2} exactly once.

    Args:
        Lk: list of frequent itemsets (frozensets), all of size k-1.
        k: size of the itemsets to generate.

    Returns:
        List of candidate itemsets (frozensets) of size k.
    """
    # Sort BEFORE slicing: a frozenset has no defined iteration order, so
    # slicing first would compare arbitrary items and could miss or
    # duplicate candidates.  Prefixes are computed once per itemset.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    for i, first in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            # For k == 2 every prefix is empty, so all pairs are merged.
            if prefixes[i] == prefixes[j]:
                retList.append(first | Lk[j])
    return retList


def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of dataSet and their supports.

    C1 is filtered into L1, L1 is joined into C2, C2 is filtered into L2 and
    so on, until a level produces no frequent itemset.

    Args:
        dataSet: list of transactions.
        minSupport: minimum support threshold (fraction of transactions).

    Returns:
        (L, supportData) where L[i] is the list of frequent itemsets of size
        i + 1, and supportData maps every counted candidate (frequent or
        not) to its support.
    """
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)

    L = [L1]
    k = 2
    # L[-1] is always the most recent level, i.e. the itemsets of size k - 1.
    while L[-1]:
        Ck = aprioriGen(L[-1], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        if not Lk:
            break
        L.append(Lk)
        k += 1
    return L, supportData


def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules (freqSet - conseq) -> conseq for each conseq in H.

    confidence(a -> b) = support(a | b) / support(a).

    Args:
        freqSet: a frequent itemset, e.g. frozenset([1, 3]).
        H: list of candidate consequents (frozensets that are subsets of
            freqSet), e.g. [frozenset([1]), frozenset([3])].
        supportData: dict mapping itemsets to their support.
        brl: rule list; each accepted rule is appended to it in place as an
            (antecedent, consequent, confidence) tuple.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        The consequents from H whose rule reached minConf.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print(antecedent, '------>', conseq, 'conf:', conf)
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively generate rules of freqSet with ever larger consequents.

    Args:
        freqSet: a frequent itemset, e.g. frozenset([2, 3, 5]).
        H: list of equally sized consequents that already produced rules,
            e.g. [frozenset([2]), frozenset([3]), frozenset([5])].
        supportData: dict mapping itemsets to their support.
        brl: rule list, extended in place by calcConf.
        minConf: minimum confidence for a rule to be accepted.
    """
    # Every consequent may have been pruned by the caller; nothing to extend.
    if not H:
        return

    m = len(H[0])
    # A consequent of size m + 1 must still leave a non-empty antecedent.
    if len(freqSet) > m + 1:
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        # At least two surviving consequents are needed to join them again.
        if len(Hmp1) > 1:
            # Forward the caller's threshold rather than a fixed value.
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset levels as returned by apriori().
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence for a rule to be kept.

    Returns:
        List of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    # Level 0 holds 1-itemsets, which cannot be split into a rule.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Itemsets with 3+ items: emit the 1-item-consequent rules,
                # then grow the surviving consequents recursively.
                H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                # 2-itemsets only allow 1-item consequents.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList


def testGenerateRules():
    """Run Apriori and rule generation on the demo data and print results."""
    dataSet = loadDataSet()
    print('dataSet: ', dataSet)

    L1, supportData1 = apriori(dataSet, minSupport=0.5)
    print('频繁项集L: ', L1)
    print('项集支持度supportData: ', supportData1)

    rules = generateRules(L1, supportData1, minConf=0.7)
    print('关联规则rules: ', rules)


def main():
    """Script entry point: run the association-rule demo."""
    testGenerateRules()


if __name__ == "__main__":
    main()
1 dataSet: [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] 2 频繁项集L: [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})]] 3 项集支持度supportData: {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({1, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({1, 5}): 0.25, frozenset({1, 2}): 0.25, frozenset({2, 3, 5}): 0.5} 4 frozenset({5}) ------> frozenset({2}) conf: 1.0 5 frozenset({2}) ------> frozenset({5}) conf: 1.0 6 frozenset({1}) ------> frozenset({3}) conf: 1.0 7 frozenset({3, 5}) ------> frozenset({2}) conf: 1.0 8 frozenset({2, 3}) ------> frozenset({5}) conf: 1.0 9 关联规则rules: [(frozenset({5}), frozenset({2}), 1.0), (frozenset({2}), frozenset({5}), 1.0), (frozenset({1}), frozenset({3}), 1.0), (frozenset({3, 5}), frozenset({2}), 1.0), (frozenset({2, 3}), frozenset({5}), 1.0)]
正是江南好风景