生成文本聚类java实现3
由于 Carrot2 对中文的处理效果很不理想,所以我参考了网络上的一些资料,自行实现了文本聚类,现在把全部代码贡献出来。
代码的思路就是找字或者词出现的频度,并进行打分,最后按照出现次数和重要性,找出重要的语汇。现在贴出来一些可用的代码。
ClusterBuilder.java
/**
* * @author * @version 创建时间:2011-3-8 下午02:02:36 * 聚类生成器 */ public class ClusterBuilder { private static final Log LOG; private List<DocCluster> clusters; private ICTHit[] docs; private int maxLevels; private ClusteringOptions[] options; private boolean useTagsAsTitle; private String wordsExcluded; private static short[] bit1Table; static { LOG = LogFactory.getLog(ClusterBuilder.class.getName()); bit1Table = new short[65536]; for (int n = 0; n < bit1Table.length; n++) { String s = Integer.toBinaryString(n); short m = 0; for (int k = 0; k < s.length(); k++) { if (s.charAt(k) == '1') { m = (short) (m + 1); } } bit1Table[n] = m; } } private static int getValidBitCount(long n) { int i3 = (int) (n % 65536L); n /= 65536L; int i2 = (int) (n % 65536L); n /= 65536L; int i1 = (int) (n % 65536L); n /= 65536L; int i0 = (int) (n % 65536L); return bit1Table[i0] + bit1Table[i1] + bit1Table[i2] + bit1Table[i3]; } private static int getDocHitCount(long[] hits) { assert (hits != null); if (hits == null) return 0; int n0 = 0; for (int i = 0; i < hits.length; i++) { n0 += getValidBitCount(hits[i]); } return n0; } public ClusterBuilder() { for (int n = 0; n < bit1Table.length; n++) { String s = Integer.toBinaryString(n); short m = 0; for (int k = 0; k < s.length(); k++) { if (s.getBytes()[k] == '1') { m = (short)(m + 1); } } bit1Table[n] = m; } } /** * * @param docsToCluster 要聚类的记录列表 * @param exWords 不使用的主题词列表,多个词用西文逗号分隔。这些词将不会作为主题词。 * @param maxLevels 最大聚类级数 * @param useTagsAsTitle 是否使用主题词作为类别主题词。如果不使用,则根据文档标题自动生成类别主题词。 */ public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) { this.useTagsAsTitle = useTagsAsTitle; this.wordsExcluded = exWords; this.maxLevels = maxLevels; this.docs = docsToCluster; this.options = new ClusteringOptions[3]; this.options[0] = new ClusteringOptions(); this.options[0].setDocMaxTagCount(10); this.options[0].setMinTagRelevance(60); this.options[0].setMinSameDocPercent(80); this.options[1] = new 
ClusteringOptions(); this.options[1].setDocMaxTagCount(8); this.options[1].setMinTagRelevance(85); this.options[1].setMinSameDocPercent(70); this.options[1].setTagMinDocCount(2); this.options[1].setMinSameDocs(2); this.options[2] = new ClusteringOptions(); this.options[2].setDocMaxTagCount(8); this.options[2].setMinTagRelevance(50); this.options[2].setMinSameDocPercent(70); this.options[2].setTagMinDocCount(2); this.options[2].setMinSameDocs(2); } /** * 对Docs记录列表执行聚类,结果存放于Clusters中 */ public void cluster() { this.clusters = createLevelClusters(docs, 0, options[0]); List subs = null; if (this.maxLevels <= 1) { return; } for (DocCluster dc : this.clusters) { if ((dc.getDocList().length < options[0].getMinDocsToCluster()) || (dc.getTags() == "其他")) continue; subs = createLevelClusters(dc.getDocList(), 1, options[1]); if (subs.size() > 1) dc.setSubclusters(subs); } } /** * 创建一个层级的聚类 * @param docs 文档列表 * @param level 层级号 * @param levelOpt 该层级的聚类选项 * @return */ private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) { TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount()); List clusters = new ArrayList(); int i, ValidTagCount; int DocCount = 0; // 扫描文档列表,根据每个文档的主题词列表,初始化主题词文档对照表。 for (i = 0; i < docs.length; i++) { ICTHit d = docs[i]; int validTagCount = 0; if (d.getTagList() != null) { String[] tagList = d.getTagList(); for (int tagIdx = 0; (tagIdx < tagList.length) && (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) { String tag = tagList[tagIdx].trim(); // 主题词长度大于6个字的丢弃 if ((tag.length() <= 0) || (tag.length() > 20) || ((this.wordsExcluded.length() != 0) && ((tag.contains(this.wordsExcluded)) || (this.wordsExcluded .contains(tag))))) continue; matrix.AddDocHit(tag, i); validTagCount++; } } } int maxKwDocCount = 0; List entryListToRemove = new ArrayList(); String kwWithMaxDocCount = ""; LOG.debug("有效关键词:"); for (Map.Entry entry : matrix.entrySet()) { // 统计当前主题词的命中文档数,文档数小于预设值,则该主题词将被删除 int n 
= getDocHitCount((long[]) entry.getValue()); if (n < levelOpt.getTagMinDocCount()) { entryListToRemove.add((String) entry.getKey()); } else { LOG.debug((String) entry.getKey() + "(" + n + "), "); DocCount += n; } if (n > maxKwDocCount) { maxKwDocCount = n; kwWithMaxDocCount = (String) entry.getKey(); } } LOG.debug(""); LOG.debug("被忽略的关键词:"); for (i = 0; i < entryListToRemove.size(); i++) { LOG.debug((String) entryListToRemove.get(i) + ", "); matrix.remove(entryListToRemove.get(i)); } LOG.debug(""); LOG.debug(entryListToRemove.size() + "个关键词被忽略。剩余" + matrix.size() + "个关键词。"); LOG.debug("最大文档数的关键词:" + kwWithMaxDocCount + ",文档数:" + maxKwDocCount + "。"); double docCountPerTag = matrix.size() > 0 ? DocCount / matrix.size() : 0.0D; LOG.debug("关键词平均文档数:" + docCountPerTag); levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level))); if (levelOpt.getMinSameDocs() < 1) { levelOpt.setMinSameDocs(1); } while (mergeClusters(matrix, levelOpt) > 0) { } return createResult(matrix, docs, level, levelOpt); } private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) { if (matrix.size() == 0) return 0; long[] docHitsMerged = (long[]) null; long[] maxDocHitsMerged = (long[]) null; String word1 = ""; String word2 = ""; String word1ToMerge = ""; String word2ToMerge = ""; int i,j; int sameDocs = 0; // 初始化一个相关度数组,0到100分,共101项 List rankMatrix = new ArrayList(); for (i = 0; i < 101; i++) { rankMatrix.add(new ArrayList()); } List matrix2List = new ArrayList(); matrix2List.addAll(matrix.entrySet()); // 将主题词文档映射表中的主题词两两比对 for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) { Map.Entry hits1 = (Map.Entry) matrix2List.get(i1); word1 = (String) hits1.getKey(); for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) { Map.Entry hits2 = (Map.Entry) matrix2List.get(i2); word2 = (String) hits2.getKey(); Object[] re = getWordsRelevance(mapEntry2TagHitEntry(hits1), mapEntry2TagHitEntry(hits2), docHitsMerged, sameDocs, opt, matrix.hitsItemCount); // 计算两个词的相关性,获取两词的文档汇总表,以及相同文档数 int nRank = 
((Integer) re[0]).intValue(); docHitsMerged = (long[]) re[1]; sameDocs = ((Integer) re[2]).intValue(); // 相关度小于预设阈值的忽略 if (nRank >= opt.getMinTagRelevance()) { ((List) rankMatrix.get(nRank)).add(new IdPair(i1, i2)); } } } List tagListToRemove = new ArrayList(); List entryListMerged = new ArrayList(); entryListMerged.add(new TagHitEntry("", null)); HashSet idPairTable = new HashSet(); TagHitEntry entryToMerge1; while (true) { // 找到最大相关性的两个主题词 for (i = 100; (i >= opt.getMinTagRelevance()) && (((List) rankMatrix.get(i)).size() == 0); i--){}; if (i < opt.getMinTagRelevance()) { break; } IdPair ip = (IdPair) ((List) rankMatrix.get(i)).get(0); // 合并两个类别 ((List) rankMatrix.get(i)).remove(0); entryToMerge1 = ip.Id1 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id1)) : (TagHitEntry) entryListMerged.get(-ip.Id1); TagHitEntry entryToMerge2 = ip.Id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id2)) : (TagHitEntry) entryListMerged.get(-ip.Id2); word1ToMerge = entryToMerge1.key; word2ToMerge = entryToMerge2.key; assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0)); String wordsMerged = word1ToMerge + "," + word2ToMerge; long[] lDocs0 = entryToMerge1.value; long[] lDocs1 = entryToMerge2.value; maxDocHitsMerged = new long[matrix.hitsItemCount]; for (i = 0; i < lDocs0.length; i++) { lDocs0[i] |= lDocs1[i];// 获取合并的文档集 } if (ip.Id1 >= 0) tagListToRemove.add(word1ToMerge); else entryListMerged.set(-ip.Id1, new TagHitEntry("", null)); if (ip.Id2 >= 0) tagListToRemove.add(word2ToMerge); else { entryListMerged.set(-ip.Id2, new TagHitEntry("", null)); } entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged)); // 替换与合并主题词有关联的其他相关主题词对的评分 int idMerged = -(entryListMerged.size() - 1); int id2 = 0; boolean CanDelete = false; for (i = 0; i <= 100; i++) { int ListCount = ((List) rankMatrix.get(i)).size(); if (ListCount == 0) { continue; } for (j = 0; j < ListCount; j++) { IdPair p = (IdPair) ((List) rankMatrix.get(i)).get(j); CanDelete = 
false; if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) { id2 = p.Id2; CanDelete = true; } else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) { id2 = p.Id1; CanDelete = true; } if (!CanDelete) continue; if (idMerged == id2) { continue; } ((List) rankMatrix.get(i)).remove(j); j--; ListCount--; IdPair pairMerged = new IdPair(idMerged, id2); if (idPairTable.contains(pairMerged)) { continue; } TagHitEntry e2 = id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(id2)) : (TagHitEntry) entryListMerged.get(-id2); assert ((e2.key.length() != 0) && (e2.key != wordsMerged)); Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount); int rank = ((Integer) re[0]).intValue(); docHitsMerged = (long[]) re[1]; sameDocs = ((Integer) re[2]).intValue(); if (rank <= opt.getMinTagRelevance()) continue; ((List) rankMatrix.get(rank)).add(pairMerged); idPairTable.add(pairMerged); } } } // 删除被合并的主题词 for (int m =0;m<tagListToRemove.size();m++){ matrix.remove(tagListToRemove.get(m)); } /** for (String w : tagListToRemove) matrix.remove(w); **/ // 添加合并而成的新主题词 for (int n=0;n<entryListMerged.size();n++){ TagHitEntry e = (TagHitEntry) entryListMerged.get(n); matrix.put(e.getKey(), e.getValue()); } /** for (TagHitEntry e : entryListMerged) { if (e.getKey().length() > 0) matrix.put(e.getKey(), e.getValue()); } **/ return 0; } private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) { if (matrix.size() == 0) return 0; long[] docHitsMerged = (long[]) null; long[] maxDocHitsMerged = (long[]) null; int nMaxRank = 0; String word1 = ""; String word2 = ""; String word1ToMerge = ""; String word2ToMerge = ""; int sameDocs = 0; List matrix2List = new ArrayList(); matrix2List.addAll(matrix.entrySet()); for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) { TagHitEntry hits1 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i1)); word1 = hits1.getKey(); for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) { TagHitEntry 
hits2 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i2)); word2 = hits2.getKey(); Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount); int nRank = ((Integer) re[0]).intValue(); docHitsMerged = (long[]) re[1]; sameDocs = ((Integer) re[2]).intValue(); if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance())) continue; nMaxRank = nRank; maxDocHitsMerged = docHitsMerged; word1ToMerge = word1; word2ToMerge = word2; } } if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) { return 0; } String wordsMerged = word1ToMerge + "," + word2ToMerge; if ((nMaxRank > opt.getMinTagRelevance()) && (wordsMerged != "")) { matrix.remove(word1ToMerge); matrix.remove(word2ToMerge); matrix.put(wordsMerged, maxDocHitsMerged); LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")"); return 1; } return 0; } private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged, int sameDocCount, ClusteringOptions opt, int hitsItemCount) { Object[] re = new Object[3]; docHitsMerged = new long[hitsItemCount]; sameDocCount = 0; String tag1 = entry1.getKey(); String tag2 = entry2.getKey(); assert (tag2 != tag1); long[] lDocs0 = entry1.getValue(); long[] lDocs1 = entry2.getValue(); int n0 = 0; int n1 = 0; n0 = getDocHitCount(lDocs0); n1 = getDocHitCount(lDocs1); int docCountMin = Math.min(n0, n1); int docCountMax = Math.max(n0, n1); int docCountMerged = 0; long sameDocBits = 0L; long diffDocBits = 0L; int diffDocCount = 0; for (int i = 0; i < lDocs0.length; i++) { docHitsMerged[i] = lDocs0[i] | lDocs1[i];// 获取合并的文档集 docCountMerged += getValidBitCount(docHitsMerged[i]); diffDocBits = lDocs0[i] ^ lDocs1[i];// 获取不同的文档集 diffDocCount += getValidBitCount(diffDocBits); sameDocBits = lDocs0[i] & lDocs1[i];// 获取相同的文档集 sameDocCount += getValidBitCount(sameDocBits); } boolean IsSubstring = false; // 一个主题词是另一个的子串,则得分较高 if ((tag2.contains(tag1)) || (tag1.contains(tag2))) { IsSubstring = true; docCountMin += 
opt.getTagMinDocCount(); } if ((sameDocCount == 0) && (!IsSubstring)) { re[0] = Integer.valueOf(0); re[1] = docHitsMerged; re[2] = Integer.valueOf(sameDocCount); return re; } if (docCountMin < opt.getTagMinDocCount()) { re[0] = Integer.valueOf(0); re[1] = docHitsMerged; re[2] = Integer.valueOf(sameDocCount); return re; } int samePercent = (int) Math.round(sameDocCount * 100.0D / docCountMerged); int samePercentMin = (int) Math.round(sameDocCount * 100.0D / docCountMin); int diffPercent = (int) Math.round(diffDocCount * 100.0D / docCountMerged); LOG.debug("相关性:" + tag1 + "(" + n0 + ")-(" + n1 + ")" + tag2); LOG.debug(", SamePercent=" + samePercent); LOG.debug(", SamePercentMin=" + samePercentMin); LOG.debug(", DiffPercent=" + diffPercent); int nRank; if ((sameDocCount >= opt.getMinSameDocs()) && ((docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent()))) { nRank = (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D); } else { nRank = 0; } if (IsSubstring) nRank += 80; LOG.debug(", Rank=" + nRank); re[0] = Integer.valueOf(Math.min(nRank, 100)); re[1] = docHitsMerged; re[2] = Integer.valueOf(sameDocCount); return re; } private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) { return new TagHitEntry((String) e.getKey(), (long[]) e.getValue()); } @SuppressWarnings("unchecked") private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level, ClusteringOptions opt) { int i,j; Map<String,DocValue> clsIdList = new HashMap(); List ClassTitleList = new ArrayList(); for (Map.Entry de : matrix.entrySet()) { DocValue dv = new DocValue(); clsIdList.put((String) de.getKey(), dv); } List<Integer> otherIdList = new ArrayList(); TagHitEntry maxTagHitEntry = new TagHitEntry(); int clsCount; String tag; // 确定每个文档所属的类别 for (i = 0; i < docs.length; i++) { ICTHit d = docs[i]; TagHitMatrix.ClusterDocInfo di = matrix.docs[i]; assert (docs[i] != null); int maxTagHit = 0; clsCount = 0; for (Map.Entry hits : 
matrix.entrySet()) { int tagHitCount = 0; int score = 0; String clsWordListStr = "," + (String) hits.getKey() + ","; // 那个类别包含当前文档的主题词最多,该文档就属于哪个类别 for (j = 0; j < di.TagCount; j++) { tag = di.TagList[j]; score = j < 3 ? 2 : 1; assert (tag.length() > 0); if (!clsWordListStr.contains("," + tag + ",")) continue; tagHitCount += score; clsCount++; } if (maxTagHit >= tagHitCount) continue; maxTagHit = tagHitCount; maxTagHitEntry = mapEntry2TagHitEntry(hits); } if (maxTagHit > 0) { DocValue dv = (DocValue) clsIdList.get(maxTagHitEntry.getKey()); dv.idList.add(Integer.valueOf(i)); } else { otherIdList.add(Integer.valueOf(i)); } } // 生成类别列表 List<DocCluster> clusterList = new ArrayList(); String[] TagList; Object dc; for (Map.Entry<String,DocValue> kv : clsIdList.entrySet()) { DocValue dv = (DocValue) kv.getValue(); if (dv.idList.size() <= 0) continue; if (dv.idList.size() == 1) { otherIdList.add((Integer) dv.idList.get(0)); } else { dc = new DocCluster(); ((DocCluster) dc).setDocIdList(new String[dv.idList.size()]); ((DocCluster) dc).setDocList(new ICTHit[dv.idList.size()]); for (i = 0; i < dv.idList.size(); i++) { ((DocCluster) dc).getDocIdList()[i] = docs[((Integer) dv.idList.get(i)).intValue()].getDocId(); ((DocCluster) dc).getDocList()[i] = docs[((Integer) dv.idList.get(i)).intValue()]; } ((DocCluster) dc).setLevel(level); ((DocCluster) dc).setTags((String) kv.getKey()); for (i = 0; (i < clusterList.size()) && (((DocCluster) dc).getDocIdList().length <= ((DocCluster) clusterList.get(i)).getDocIdList().length);) { i++; } clusterList.add(i, (DocCluster) dc); } } for (i = opt.getMaxClusterCount(); i < clusterList.size();) { DocCluster c = (DocCluster) clusterList.get(i); List idList = ((DocValue) clsIdList.get(c.getTags())).idList; for (dc = idList.iterator(); ((Iterator) dc).hasNext();) { int idx = ((Integer) ((Iterator) dc).next()).intValue(); otherIdList.add(Integer.valueOf(idx)); } clusterList.remove(i); } int i1; for (i = 0; i < clusterList.size(); i++) { DocCluster 
dc1 = (DocCluster) clusterList.get(i); String[] tagList = dc1.getTags().split(","); String newTags = ""; for (j = 0; j < tagList.length; j++) { i1 = dc1.getTags().indexOf(tagList[j]); int i2 = dc1.getTags().lastIndexOf(tagList[j]); if (i1 == i2) newTags = newTags + tagList[j] + ","; } if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) { newTags = newTags.substring(0, newTags.length() - 1); } dc1.setTags(newTags); dc1.setTitle(""); if (this.useTagsAsTitle) { tagList = dc1.getTags().split(","); for (j = 0; (tagList != null) && (j < tagList.length); j++) { if ((dc1.getTitle() + tagList[j]).length() > 16) break; boolean isSubstr = false; for (DocCluster c : clusterList) { if ((c.getTitle().length() <= 0) || ((!c.getTitle().contains(tagList[j])) && (!tagList[j].contains(c.getTitle())))) continue; isSubstr = true; break; } if (!isSubstr) dc1.setTitle(dc1.getTitle() + tagList[j] + ","); } if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) { dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1)); } } if (dc1.getTitle() != "") continue; dc1.setTitle(dc1.getTags()); if (dc1.getTitle().length() <= 16) continue; String s = dc1.getTitle().substring(0, 16); int li = s.lastIndexOf(','); if (li > 0) { dc1.setTitle(s.substring(0, li)); } } if (otherIdList.size() > 0) { DocCluster clusterOther = new DocCluster(); clusterOther.setDocIdList(new String[otherIdList.size()]); clusterOther.setDocList(new ICTHit[otherIdList.size()]); clusterOther.setLevel(level); clusterOther.setTitle("其他"); clusterOther.setTags("其他"); i = 0; for (int k=0;k<otherIdList.size();k++) { int idx = otherIdList.get(k); clusterOther.getDocIdList()[i] = docs[idx].getDocId(); clusterOther.getDocList()[i] = docs[idx]; i++; } clusterList.add(clusterOther); } return (List<DocCluster>) clusterList; } public List<DocCluster> getClusters() { return this.clusters; } public void setClusters(List<DocCluster> clusters) { this.clusters = clusters; } public ICTHit[] getDocs() 
{ return this.docs; } public void setDocs(ICTHit[] docs) { this.docs = docs; } public int getMaxLevels() { return this.maxLevels; } public void setMaxLevels(int maxLevels) { this.maxLevels = maxLevels; } public ClusteringOptions[] getOptions() { return this.options; } public void setOptions(ClusteringOptions[] options) { this.options = options; } public boolean isUseTagsAsTitle() { return this.useTagsAsTitle; } public void setUseTagsAsTitle(boolean useTagsAsTitle) { this.useTagsAsTitle = useTagsAsTitle; } public String getWordsExcluded() { return this.wordsExcluded; } public void setWordsExcluded(String wordsExcluded) { this.wordsExcluded = wordsExcluded; } private class DocValue { public List<Integer> idList = new ArrayList(); public String titleListStr = ""; private DocValue() { } } /** * 主题词ID对,主题词ID为该主题词在主题词文档映射表中的主键位置。 * @author * @version 创建时间:2011-3-9 下午02:52:44 */ private class IdPair { public int Id1; public int Id2; public IdPair(int id1, int id2) { assert (id1 != id2); if (id1 < id2) { this.Id1 = id1; this.Id2 = id2; } else { this.Id1 = id2; this.Id2 = id1; } } public int hashCode() { return -1; } public boolean equals(Object o) { return (((IdPair) o).Id1 == this.Id1) && (((IdPair) o).Id2 == this.Id2); } } public static class TagHitEntry { public String key; public long[] value; public TagHitEntry() { } public TagHitEntry(String k, long[] v) { this.key = k; this.value = v; } public String getKey() { return this.key; } public long[] getValue() { return this.value; } } }
ClusteringOptions.java
/**
 * Thresholds that control one level of tag-based document clustering.
 *
 * <p>Every new instance starts from the current values of the public static
 * {@code Def*} fields; changing those fields affects only instances created afterwards.
 *
 * @author
 * @version created 2011-3-8 10:23:27
 */
public class ClusteringOptions {
    /** Default for {@link #getMaxClusterCount()}. */
    public static int DefMaxClusterCount = 20;
    /** Default for {@link #getDocMaxTagCount()}. */
    public static int DefMaxKeywordCount = 6;
    /** Default for {@link #getMinTagRelevance()}. */
    public static int DefMinWordsRelevance = 10;
    /** Default for {@link #getTagMinDocCount()}. */
    public static int DefTagMinDocCount = 3;
    /** Default for {@link #getMinSameDocs()}. */
    public static int DefIgnoreSameDocs = 2;
    /** Default for {@link #getMinSameDocPercent()}. */
    public static int DefSameDocPercent = 50;
    /** Default for {@link #getMinDocsToCluster()}. */
    public static int DefMinDocsToCluster = 8;

    // Instance fields are initialised from the static defaults at construction time.
    private int docMaxTagCount = DefMaxKeywordCount;
    private int maxClusterCount = DefMaxClusterCount;
    private int minDocsToCluster = DefMinDocsToCluster;
    private int minSameDocPercent = DefSameDocPercent;
    private int minSameDocs = DefIgnoreSameDocs;
    private int minTagRelevance = DefMinWordsRelevance;
    private int tagMinDocCount = DefTagMinDocCount;

    /** Creates options populated with the current static defaults. */
    public ClusteringOptions() {
    }

    /** @return the docMaxTagCount value */
    public int getDocMaxTagCount() {
        return docMaxTagCount;
    }

    /** @param docMaxTagCount the new docMaxTagCount value */
    public void setDocMaxTagCount(int docMaxTagCount) {
        this.docMaxTagCount = docMaxTagCount;
    }

    /** @return the maxClusterCount value */
    public int getMaxClusterCount() {
        return maxClusterCount;
    }

    /** @param maxClusterCount the new maxClusterCount value */
    public void setMaxClusterCount(int maxClusterCount) {
        this.maxClusterCount = maxClusterCount;
    }

    /** @return the minDocsToCluster value */
    public int getMinDocsToCluster() {
        return minDocsToCluster;
    }

    /** @param minDocsToCluster the new minDocsToCluster value */
    public void setMinDocsToCluster(int minDocsToCluster) {
        this.minDocsToCluster = minDocsToCluster;
    }

    /** @return the minSameDocPercent value */
    public int getMinSameDocPercent() {
        return minSameDocPercent;
    }

    /** @param minSameDocPercent the new minSameDocPercent value */
    public void setMinSameDocPercent(int minSameDocPercent) {
        this.minSameDocPercent = minSameDocPercent;
    }

    /** @return the minSameDocs value */
    public int getMinSameDocs() {
        return minSameDocs;
    }

    /** @param minSameDocs the new minSameDocs value */
    public void setMinSameDocs(int minSameDocs) {
        this.minSameDocs = minSameDocs;
    }

    /** @return the minTagRelevance value */
    public int getMinTagRelevance() {
        return minTagRelevance;
    }

    /** @param minTagRelevance the new minTagRelevance value */
    public void setMinTagRelevance(int minTagRelevance) {
        this.minTagRelevance = minTagRelevance;
    }

    /** @return the tagMinDocCount value */
    public int getTagMinDocCount() {
        return tagMinDocCount;
    }

    /** @param tagMinDocCount the new tagMinDocCount value */
    public void setTagMinDocCount(int tagMinDocCount) {
        this.tagMinDocCount = tagMinDocCount;
    }
}
DocCluster.java
/**
 * One cluster of documents: the documents and their ids, the level the cluster belongs
 * to, its tags and title, and optionally its sub-clusters.
 *
 * @author
 * @version created 2011-3-8 10:23:35
 */
public class DocCluster {
    private int level;
    private String tags;
    private String title;
    private String[] docIdList;
    private ICTHit[] docList;
    private List<DocCluster> subclusters;

    /** @return the level number of this cluster */
    public int getLevel() {
        return this.level;
    }

    public void setLevel(int level) {
        this.level = level;
    }

    /** @return the tags of this cluster, exactly as last set */
    public String getTags() {
        return this.tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    /**
     * Returns the title, never {@code null}: a missing title is replaced by the empty
     * string, which is also stored in this cluster.
     */
    public String getTitle() {
        if (this.title == null) {
            this.title = "";
        }
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the stored id array itself (not a copy) */
    public String[] getDocIdList() {
        return this.docIdList;
    }

    public void setDocIdList(String[] docIdList) {
        this.docIdList = docIdList;
    }

    /** @return the stored document array itself (not a copy) */
    public ICTHit[] getDocList() {
        return this.docList;
    }

    public void setDocList(ICTHit[] docList) {
        this.docList = docList;
    }

    /** @return the sub-clusters, or {@code null} if none were set */
    public List<DocCluster> getSubclusters() {
        return this.subclusters;
    }

    public void setSubclusters(List<DocCluster> subclusters) {
        this.subclusters = subclusters;
    }
}
ICTHit.java
/**
 * A document to be clustered: its id, its title and the keywords (tags) attached to it.
 */
public class ICTHit implements Serializable {
    /** Document id. */
    private String docId;

    /** Document title. */
    private String title;

    /**
     * Keyword array. The capitalised field name is kept on purpose: for a
     * {@code Serializable} class the field name is part of the default serialized form.
     */
    private String[] TagList;

    /** @return the document id */
    public String getDocId() {
        return this.docId;
    }

    public void setDocId(String docId) {
        this.docId = docId;
    }

    /** @return the document title */
    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the stored keyword array itself (not a copy); {@code null} until set */
    public String[] getTagList() {
        return this.TagList;
    }

    public void setTagList(String[] tagList) {
        this.TagList = tagList;
    }
}
TagHitMatrix.java
/**
 * Maps each tag to the set of documents carrying it. The document set is a bit set
 * stored in a {@code long[]}: every array element holds the flags of 62 consecutive
 * document positions. In addition, the tags recorded for each document are kept in
 * {@link #docs}.
 */
public class TagHitMatrix extends LinkedHashMap<String, long[]> {
    private static final long serialVersionUID = -7511464445378974433L;

    /** Number of document positions stored per {@code long} element. */
    private static final int DOCS_PER_WORD = 62;

    public static int ii = 0;

    /** Per-document tag lists, indexed by document position. */
    public ClusterDocInfo[] docs;

    /** Length of every document bit set: {@code ceil(DocCount / 62)}. */
    public int hitsItemCount;

    /**
     * @param DocCount    number of documents (positions {@code 0..DocCount-1})
     * @param MaxTagCount maximum number of tags that can be recorded per document
     */
    public TagHitMatrix(int DocCount, int MaxTagCount) {
        // Integer ceiling division; long arithmetic avoids overflow for huge counts.
        this.hitsItemCount = (int) ((DocCount + (DOCS_PER_WORD - 1L)) / DOCS_PER_WORD);
        this.docs = new ClusterDocInfo[DocCount];
        for (int i = 0; i < this.docs.length; i++) {
            this.docs[i] = new ClusterDocInfo(MaxTagCount);
        }
    }

    /**
     * Records that the document at {@code Position} carries the (trimmed) tag
     * {@code TagStr}.
     *
     * <p>All checks happen before any state is modified, so a rejected call leaves the
     * matrix untouched.
     *
     * @param TagStr   the tag; surrounding whitespace is removed
     * @param Position document position, {@code 0..docs.length-1}
     * @throws ArrayIndexOutOfBoundsException if {@code Position} is out of range or the
     *         document already holds its maximum number of tags
     */
    public void AddDocHit(String TagStr, int Position) {
        String tag = TagStr.trim();
        // Fails for an invalid position before the map or any bit set is touched.
        ClusterDocInfo info = this.docs[Position];
        if (info.TagCount >= info.TagList.length) {
            throw new ArrayIndexOutOfBoundsException(
                    "document " + Position + " already holds " + info.TagList.length + " tags");
        }

        long[] hits = get(tag);
        if (hits == null) {
            hits = new long[this.hitsItemCount];
            put(tag, hits);
        }
        hits[Position / DOCS_PER_WORD] |= 1L << (Position % DOCS_PER_WORD);
        info.TagList[info.TagCount++] = tag;
    }

    /** Tags recorded for one document, in insertion order. */
    class ClusterDocInfo {
        /** Tag slots; only the first {@link #TagCount} entries are filled. */
        public String[] TagList;
        /** Number of tags recorded so far. */
        public int TagCount;

        public ClusterDocInfo(int MaxTagCount) {
            this.TagList = new String[MaxTagCount];
            this.TagCount = 0;
        }
    }
}
测试方法:
public void test(ICTHit[] icthits) throws IOException { ClusterBuilder clusterBuilder = new ClusterBuilder(); // 设置需要聚类的数据集合,测试中用的null。 clusterBuilder.setDocs(icthits); // 设置聚类级别,只使用1级 clusterBuilder.setMaxLevels(10); clusterBuilder.setUseTagsAsTitle(true); // 一般将检索词设置为wordsExcluded clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国"); clusterBuilder .setOptions(new ClusteringOptions[] { new ClusteringOptions(),new ClusteringOptions() }); // 开始聚类 clusterBuilder.cluster(); FileWriter fw1 = new FileWriter("c:/today-20110509-cluster.txt ", true); BufferedWriter bw1 = new BufferedWriter(fw1); // 打印结果 if (clusterBuilder.getClusters() != null) { int i = 0; for (DocCluster docCluster : clusterBuilder.getClusters()) { i++; System.out.println("tag:" + docCluster.getTags() + "(" + docCluster.getDocIdList().length + ")"); bw1.write(docCluster.getTags() + "("+ docCluster.getDocIdList().length + ")"+"\r\n "); if (docCluster.getDocList() != null && docCluster.getDocList().length > 0) { for (ICTHit co : docCluster.getDocList()) { System.out.println(" DocID: " + co.getDocId()); bw1.write("标题为: " + co.getTitle()+",ID为"+co.getDocId()+"\r\n "); for (int m = 0; m < co.getTagList().length; m++) { bw1.write("标题为: " + co.getTitle()+",ID为"+co.getDocId()+"\r\n "); System.out.println(" Key Word: " + co.getTagList()[m]); } System.out.println(""); } System.out.println(""); } else { bw1.write(" 该分类下无数据!"+"\r\n "); } bw1.write("-------------------------------------------------------------------------------\r\n"); } } bw1.close(); fw1.close(); }
以上方法可以运行,但只是示例性的代码,还没有用在生产环境当中。核心方法已经给出,大家可以引用到自己的项目当中。效果比 Carrot2 的标准方法要好很多。