FP-Tree Java实现(二):模板挖掘
从下往上,使用循环+递归模式识别日志模板。
package com.coshaho.fptree; import java.util.*; import java.util.stream.Collectors; /** * FP树:仅考虑算法 * * @author coshaho * @since 2020/1/5 */ public class FPTree { // FP树根节点 private FPNode root = new FPNode("Root", -1); // FP树节点线索头 private Map<String, FPNode> firstNodeTable = new HashMap<>(); // FP树节点线索尾 private Map<String, FPNode> lastNodeTable = new HashMap<>(); // 支持度 private int support = 1; // 树的单词统计列表,降序 private List<FPNode> table = new ArrayList<>(); /** * 创建FP树 * @param data 多行数据 * @param count 每行数据出现次数 * @param support 支持度 */ public FPTree(List<List<String>> data, List<Integer> count, int support) { this.support = support; if (null == count) { int size = data.size(); count = new ArrayList<>(); for (int i = 0; i < size; i++) { count.add(1); } } data = sort(data, count); // line为一行日志 int i = 0; for (List<String> line : data) { FPNode curNode = root; for (String word : line) { if (curNode.getChildren().containsKey(word)) { // 子节点存在则访问次数加一 curNode.getChildren().get(word).increase(count.get(i)); } else { // 子节点不存在则新增子节点 FPNode child = new FPNode(word, count.get(i)); curNode.getChildren().put(word, child); child.setFather(curNode); } curNode = curNode.getChildren().get(word); // 当前节点有线索指向,则不必重复建立线索 if (curNode.isVisited()) { continue; } // 创建线索 if (firstNodeTable.containsKey(word)) { lastNodeTable.get(word).setNext(curNode); } else { firstNodeTable.put(word, curNode); } lastNodeTable.put(word, curNode); curNode.setVisited(true); } i++; } } public void print() { root.print(0); } /** * 获取日志模板 * @param last 下层节点 */ public void growth(List<String> last, List<LogTemplate> templates) { if (isSingleTree(this.root)) { getSingleTreeTemplate(last, templates); } else { getMultiTreeTemplate(last, templates); } } private void getWordTable(Map<String, Integer> wordCount) { for (Map.Entry<String, Integer> entry : wordCount.entrySet()) { if (entry.getValue() >= this.support) { table.add(new FPNode(entry.getKey(), entry.getValue())); } } if (0 != table.size()) { table = table.stream().sorted(Comparator.comparing(FPNode::getCount).reversed()) .collect(Collectors.toList()); } } private Map<String, Integer> getWordCount(List<List<String>> data, List<Integer> count) { Map<String, Integer> wordCount = new HashMap<>(); // 统计单词出现的次数 int i = 0; for (List<String> line : data) { for (String word : line) { if (wordCount.containsKey(word)) { wordCount.put(word, wordCount.get(word) + count.get(i)); } else { wordCount.put(word, count.get(i)); } } i++; } return wordCount; } private List<List<String>> sortData(Map<String, Integer> wordCount, List<List<String>> data) { List<List<String>> result = new ArrayList<>(); // 单词排序 for (List<String> line : data) { List<String> newLine = line.stream() .filter(word -> wordCount.get(word) >= support) .sorted(Comparator.comparing(word -> wordCount.get(word)).reversed()) .collect(Collectors.toList()); if (0 != newLine.size()) { result.add(newLine); } } return result; } private List<List<String>> sort(List<List<String>> data, List<Integer> count) { Map<String, Integer> wordCount = getWordCount(data, count); getWordTable(wordCount); return sortData(wordCount, data); } private void getSingleTreeTemplate(List<String> last, List<LogTemplate> templates) { // 获取单树路径上所有节点 List<FPNode> wordCount = new ArrayList<>(); FPNode child = getFirstChild(root); while (null != child) { wordCount.add(child); child = getFirstChild(child); } // 获取wordCount所有非空子集 List<LogTemplate> sonTemplates = getSonSet(wordCount); for (LogTemplate template : sonTemplates) { // 子集合出现次数大于支撑度则保留为模板 if (template.getCount() >= support) { templates.add(template); template.getWords().addAll(last); } } } private void getMultiTreeTemplate(List<String> last, List<LogTemplate> templates) { // table为树包含单词集合,降序 // 此处转换为升序,从下往上计算以每个节点结尾的模板 Collections.reverse(table); for (FPNode node : table) { List<String> curWords = new ArrayList<>(); curWords.add(node.getWord()); // last为上一层递归调用计算的节点 curWords.addAll(last); // 当前节点当做一个日志模板 if(null == last || 0 == last.size()) { LogTemplate template = new LogTemplate(); template.setCount(node.getCount()); List<String> words = new ArrayList<>(); words.add(node.getWord()); template.setWords(words); templates.add(template); } FPNode link = this.firstNodeTable.get(node.getWord()); List<List<String>> data = new ArrayList<>(); List<Integer> count = new ArrayList<>(); // 一条线索上有多个节点,每个节点从下往上对应一条日志模板路径 while (null != link) { FPNode me = link; List<String> meWords = new ArrayList<>(); me = me.getFather(); // 线索上每个节点往上走 while (null != me.getFather()) { meWords.add(me.getWord()); me = me.getFather(); } count.add(link.getCount()); // 不加这一句会导致排序不稳定 Collections.reverse(meWords); data.add(meWords); link = link.getNext(); } // 以上述节点构造新树 FPTree newTree = new FPTree(data, count, this.support); newTree.growth(curWords, templates); } } private List<LogTemplate> getSonSet(List<FPNode> wordCount) { List<LogTemplate> result = new ArrayList<>(); int length = wordCount.size(); int mark; int nEnd = 1 << length; // 对于length位二进制数,每个数字对应一个子集合取法 for (mark = 0; mark < nEnd; mark++) { LogTemplate template = new LogTemplate(); // 循环查找每位是否应该放入集合 for (int i = 0; i < length; i++) { //该位有元素输出 if (((1 << i) & mark) != 0) { template.getWords().add(wordCount.get(i).getWord()); // wordCount按照count降序排列,template count取最小值 template.setCount(wordCount.get(i).getCount()); } } // 空集合舍弃 if (template.getCount() != 0) { result.add(template); } } return result; } private boolean isSingleTree(FPNode tree) { if (null == tree || null == tree.getChildren() || 0 == tree.getChildren().size()) { return true; } // 有多个子节点则不是单树 if (1 < tree.getChildren().size()) { return false; } else { return isSingleTree(getFirstChild(tree)); } } private FPNode getFirstChild(FPNode tree) { if (null == tree || null == tree.getChildren() || 0 == tree.getChildren().size()) { return null; } else { for (FPNode child : tree.getChildren().values()) { return child; } return null; } } public static void main(String[] args) { List<String> line1 = new ArrayList<>(); line1.add("C"); line1.add("A"); line1.add("B"); List<String> line2 = new ArrayList<>(); line2.add("A"); line2.add("B"); line2.add("D"); List<String> line3 = new ArrayList<>(); line3.add("A"); line3.add("B"); List<String> line4 = new ArrayList<>(); line4.add("C"); line4.add("E"); List<List<String>> data = new ArrayList<>(); data.add(line1); data.add(line2); data.add(line3); data.add(line4); FPTree tree = new FPTree(data, null, 1); tree.print(); List<LogTemplate> templates = new ArrayList<>(); tree.growth(new ArrayList<>(), templates); for (LogTemplate template : templates) { template.print(); } } }
package com.coshaho.fptree; import java.util.HashMap; import java.util.Map; /** * FP树节点:仅考虑算法 * @author coshaho * @since 2020/1/5 */ public class FPNode { // 单词 private String word; // 单词出现次数 private int count = 1; // 子节点 private Map<String, FPNode> children = new HashMap<>(); // 父节点 private FPNode father; // 线索:指向下一个相同单词节点 private FPNode next; // 是否有线索指向自己 private boolean visited = false; public FPNode(String word, int count) { this.word = word; this.count = count; } public void increase(int i) { count += i; } public void print(int n) { for(int i = 0; i < n; i++) { if(i == n - 1) { System.out.print("--"); } else { System.out.print(" "); } } System.out.println(word + ": " + count); for(FPNode child : children.values()) { child.print(n + 1); } } public String getWord() { return word; } public void setWord(String word) { this.word = word; } public int getCount() { return count; } public void setCount(int count) { this.count = count; } public Map<String, FPNode> getChildren() { return children; } public void setChildren(Map<String, FPNode> children) { this.children = children; } public FPNode getFather() { return father; } public void setFather(FPNode father) { this.father = father; } public FPNode getNext() { return next; } public void setNext(FPNode next) { this.next = next; } public boolean isVisited() { return visited; } public void setVisited(boolean visited) { this.visited = visited; } }
package com.coshaho.fptree; import java.util.ArrayList; import java.util.List; /** * 日志模板 * * @author coshaho * @since 2020/1/6 */ public class LogTemplate { private List<String> words = new ArrayList<>(); private int count; public List<String> getWords() { return words; } public void setWords(List<String> words) { this.words = words; } public int getCount() { return count; } public void setCount(int count) { this.count = count; } public void print() { System.out.println(words + ": " + count); } }