挖掘频繁模式--FP-growth

挖掘频繁模式主要有三种挖掘算法,分别是Apriori算法、Eclat算法、FP-growth算法。Apriori算法通过不断的构造候选集、筛选候选集挖掘出频繁项集,需要多次扫描原始数据,效率较低。Eclat算法使用垂直数据格式挖掘频繁项集,只用扫描一次数据集,效率较高。FP-growth算法,通过对数据集两次扫描,对数据进行压缩处理,效率高于Apriori。

FP-growth算法构建,我主要分为五步:

  • 数据集的清洗
  • 创建FP树和链表
  • 条件模式基
  • 条件FP树
  • 构造频繁项集

FP-growth算法:

复制代码
def loadDataSet():
    dataSet = [['l1', 'l2', 'l5'],
               ['l2', 'l4'],
               ['l2', 'l3'],
               ['l1', 'l2', 'l4'],
               ['l1', 'l3'],
               ['l2', 'l3'],
               ['l1', 'l3'],
               ['l1', 'l2', 'l3', 'l5'],
               ['l1', 'l2', 'l3']
               ]
    return dataSet


def initDataset(dataSet):
    resDataSet = {}
    for items in dataSet:
        for item in items:
            if item in resDataSet:
                resDataSet[item] += 1
            else:
                resDataSet[item] = 1

    return resDataSet


def filterDataSet(resDataSet, minSupp=2):
    for key in resDataSet:
        supp = resDataSet[key]
        if supp >= minSupp:
            continue
        else:
            del resDataSet[key]
    resDataSet = sorted(resDataSet.items(), key=lambda d: d[1], reverse=True)
    left = 0
    right = 1
    while right < len(resDataSet):
        x = resDataSet[left]
        y = resDataSet[right]
        if x[1] > y[1]:
            right += 1
            left += 1
        else:
            if x[0] > y[0]:
                temp = resDataSet[left]
                resDataSet[left] = resDataSet[right]
                resDataSet[right] = temp
            else:
                right += 1
                left += 1
    resDataSet = dict(resDataSet)
    return resDataSet

# 创建PF树
#dataSet 数据集
def createFPtree(dataSet): resDataSet = initDataset(dataSet) resDataSet = filterDataSet(resDataSet) headerTable = resDataSet freqItemSet = set(headerTable.keys()) if len(freqItemSet) == 0: return None, None for i in headerTable: # 链表 headerTable[i] = [headerTable[i], None] # FP树,创建树根 # print(headerTable.items()) retTree = treeNode("Null Set", 1, None) y = {} for items in dataSet: x = initDataset([[frozenset(items)]]) # 例:frozenset({'l1', 'l5', 'l2'}): 1 for key, value in x.items(): if key in y: y[key] += value else: y[key] = value # dict(sorted(y.items(), key=lambda d: d[1])) # print(y) for itemSets, count in y.items(): sortPattern = {} for i in itemSets: if i in freqItemSet: sortPattern[i] = headerTable[i][0] # print("mcm", sortPattern) # 例:{'l1': 5, 'l5': 2, 'l2': 6} if len(sortPattern) > 0: itemSet = list(filterDataSet(sortPattern).keys()) # print('苟富贵', itemSet) updateTree(itemSet, retTree, headerTable, count) return retTree, headerTable # 更新链表
#testNode 是链表的指针,targetNode 是树节点
def updateNodeLink(testNode, targetNode): while testNode.nodeLink is not None: testNode = testNode.nodeLink testNode.nodeLink = targetNode # 更新树节点
# itemSet是每次扫描数据集的事务(经过处理的),headerTable 链表 ,count支持度
def updateTree(itemSet, FPTree, headerTable, count): if itemSet[0] in FPTree.children: FPTree.children[itemSet[0]].inc(count) else: FPTree.children[itemSet[0]] = treeNode(itemSet[0], count, FPTree) if headerTable[itemSet[0]][1] is None: headerTable[itemSet[0]][1] = FPTree.children[itemSet[0]] else: updateNodeLink(headerTable[itemSet[0]][1], FPTree.children[itemSet[0]]) if len(itemSet) > 1: updateTree(itemSet[1::], FPTree.children[itemSet[0]], headerTable, count) # 回溯 def PFTreeUpTrans(leafNode, prefixPath): if leafNode.parent is not None: prefixPath.append(leafNode.name) PFTreeUpTrans(leafNode.parent, prefixPath) # 找前缀路径 def findPrefixPath(TreeNode): conPattern = {} while TreeNode is not None: prefixPath = [] PFTreeUpTrans(TreeNode, prefixPath) if len(prefixPath) > 1: prefixPath = prefixPath[::-1] conPattern[tuple(prefixPath[:len(prefixPath) - 1])] = TreeNode.count TreeNode = TreeNode.nodeLink # print(conPattern) return conPattern def filterContact(resDataSet, minSupp=2): resDataSet1 = list(resDataSet.keys()) for key in resDataSet1: supp = resDataSet[key] # print(supp) if supp >= minSupp: continue else: del resDataSet[key] return resDataSet def initContact(dataSet, conPattern): resContact = {} for items in dataSet: for item in items: if item in resContact: resContact[item] += conPattern[items] else: resContact[item] = conPattern[items] return resContact # 构造条件FP树
# conPattern 是条件模式基
def contact(conPattern): retList = [] o = conPattern # print(o) conPattern = list(sorted(o.keys(), key=lambda d: d[0], reverse=True)) # print('fdf', conPattern) if len(conPattern) == 1: retList.append(filterContact(initContact(conPattern, o))) elif len(conPattern) > 1: left = 0 right = 1 conList1 = [conPattern[left]] conList2 = [] while right < len(conPattern): if conPattern[left][0] == conPattern[right][0]: conList1.append(conPattern[right]) right += 1 retList.append(filterContact(initContact(conList1, o))) if right < len(conPattern): if conPattern[left][0] != conPattern[right][0]: left = right conList1 = conList2 conList1.append(conPattern[left]) if right + 1 > len(conPattern): retList.append(filterContact(initContact(conList1, o))) else: retList.append(filterContact(initContact(conList1, o))) right += 1 # print('条件FP数', retList) return retList #构建一个集合的所有子集,利用二进制法 def PowerSetsBinary(items): N = len(items) b = [] for i in range(2 ** N): # 子集个数,每循环一次一个子集 combo = [] for j in range(N): # 用来判断二进制下标为j的位置数是否为1 if (i >> j) % 2: combo.append(items[j]) b.append(combo) # print(b) return b[1::] # 构造频繁项集
# item是 项 ,retList是条件FP树是一个元素为字典的列表
#通过条件PF树的非空子集+项,构建频繁项集
def frequentItems(item, retList): res = {} for iii in retList: key = [ii for ii in iii.keys()] value = [ii for ii in iii.values()] subSet = PowerSetsBinary(key) for subList in subSet: subList.append(item) # print(subSet, 'hf') # print('zf', res) for ii in range(len(subSet)): if frozenset(subSet[ii]) in res: res[frozenset(subSet[ii])] += res[frozenset(subSet[ii])] # print('f', res[frozenset(subSet[ii])]) else: if ii <= len(subSet) - 2: res[frozenset(subSet[ii])] = iii[key[ii]] else: res[frozenset(subSet[ii])] = min(value) # print('产生的频繁项集', res) return res if __name__ == '__main__': retTree, headerTable = createFPtree(loadDataSet()) items = list(headerTable.keys())[::-1] # print(items) for item in items: print("--------------------------{0}--------------------------- -".format(item)) yu = findPrefixPath(headerTable[item][1]) print("{0},".format(item), "条件模式基:", yu) yi = contact(yu) print("{0},".format(item), "条件FP树:", yi) res = frequentItems(item, yi) print("{0},".format(item), "产生的频繁项集:", res) print("-----------------------------------------------------------")
复制代码

树节点:

复制代码
class treeNode:
    def __init__(self, nameValue, num, parentNode):
        self.name = nameValue
        self.count = num
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def inc(self, num):
        self.count += num
复制代码

 

posted @   壹米  阅读(262)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
点击右上角即可分享
微信分享提示