【原创】利用 Python 实现 Apriori 关联算法并分析挖掘数据
# -*- coding: utf-8 -*-
# Apriori frequent-itemset mining.
#
# The code below runs unchanged on Python 2 and Python 3: every helper
# returns/accepts real lists instead of relying on ``map`` returning a list.
import json


def loadDataSet():
    """Load the transactions stored in ``phone_app.json``.

    The file is read from the current working directory and is expected to
    hold a two-level JSON object (mapping -> mapping -> value).  Every
    innermost value is collected as one transaction.
    NOTE(review): each innermost value is presumably a list of app names;
    confirm against the actual data file.

    Returns:
        list: the innermost values, in the iteration order of the two
        mapping levels.
    """
    # ``with`` guarantees the file handle is closed even if parsing fails.
    with open("phone_app.json") as handle:
        phone_app = json.load(handle)

    return [
        transaction
        for group in phone_app.values()
        for transaction in group.values()
    ]


def createC1(dataSet):
    """Build C1, the list of all candidate itemsets of size one.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        list: one single-element ``frozenset`` per distinct item, ordered by
        ascending item value.
    """
    # A set gives O(1) de-duplication instead of scanning a list per item.
    unique_items = set()
    for transaction in dataSet:
        unique_items.update(transaction)
    # Return a real list (not a lazy ``map``) so callers can iterate it
    # repeatedly and take its length.
    return [frozenset([item]) for item in sorted(unique_items)]


def scanD(D, Ck, minSupport):
    """Compute the support of each candidate in ``Ck`` over transactions ``D``.

    Args:
        D: iterable of transactions (any iterables accepted by
            ``frozenset.issubset``).
        Ck: iterable of candidate itemsets (hashable sets, e.g. frozensets).
        minSupport: minimum fraction of transactions (0..1) a candidate must
            appear in to be kept.

    Returns:
        tuple: ``(retList, supportData)`` where ``retList`` is the list of
        candidates whose support is >= ``minSupport`` and ``supportData``
        maps every candidate that occurred in at least one transaction to
        its support.
    """
    # Materialise both inputs: they are traversed several times and ``D``
    # needs a length, which one-shot iterators (Python 3 ``map``) lack.
    transactions = list(D)

    # Drop repeated candidates so that no itemset is counted twice per
    # transaction (which would yield a support greater than 1).
    candidates = []
    seen = set()
    for can in Ck:
        if can not in seen:
            seen.add(can)
            candidates.append(can)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    # ``ssCnt`` is empty when there are no transactions, so the division
    # below is never reached with a zero denominator.
    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Keep the historical ordering (each hit used to be inserted at the
    # front) without paying O(n) per insertion.
    retList.reverse()
    return retList, supportData


def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in
    sorted order, are identical.

    Args:
        Lk: sequence of frequent itemsets, each of size ``k - 1``, holding
            mutually comparable items.
        k: number of items in each generated candidate.

    Returns:
        list: the unions of every joinable pair from ``Lk``.
    """
    Lk = list(Lk)
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing an
    # unsorted listing compares meaningless prefixes and can both miss valid
    # candidates and emit spurious ones.  Computed once per itemset.
    prefixes = [sorted(itemset)[: k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList


def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of ``dataSet`` with the Apriori algorithm.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.
        minSupport: minimum support (fraction of transactions) an itemset
            needs in order to be considered frequent.

    Returns:
        tuple: ``(L, suppData)``.  ``L[i]`` is the list of frequent itemsets
        of size ``i + 1``; the last entry of ``L`` is always an empty list
        (the level at which the search stopped).  ``suppData`` maps every
        counted itemset to its support.
    """
    # Materialise once so the data can be traversed on every level.
    dataSet = list(dataSet)
    # Build the initial candidate set C1.
    C1 = createC1(dataSet)
    D = [set(transaction) for transaction in dataSet]
    L1, suppData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Grow itemsets level by level until a level has no frequent itemset.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        suppData.update(supK)
        L.append(Lk)
        k += 1
    return L, suppData


if __name__ == '__main__':
    myDat = loadDataSet()
    # Select the frequent itemsets.
    L, suppData = apriori(myDat, 0.4)
    # Single-argument print call: same output on Python 2 and Python 3.
    print(u"%s %s" % (u"频繁项集L:", L))
    # Uncomment to also dump the support of every counted itemset:
    # print(u"%s %s" % (u"所有候选项集的支持度信息:", suppData))