Tag图是采用了Tag作为文章管理工具的网站经常需要呈现的一种视图。利用Lucene的优异性能,可以出色地完成这一功能。
生成一个Tag图,首先需要知道用户一共使用了哪些Tag,其次需要知道每个Tag被使用的次数。
对于这两个功能,都可以使用Lucene.Net.Index.IndexReader.Terms方法。这个方法返回索引目录下所有Term,以及它们在全部文档中被使用的次数(即docFreq,包含该Term的文档数)。这就为我们生成Tag图提供了必要的基础。但是Terms方法返回的TermEnum是按照FieldName、text的方式排序的,而不是按照docfreq排序的,所以还需要实现一个排序算法。
首先是索引的结构。我设计了如下的索引结构:
docurl:文档的url
contents:文档的内容,以便全文索引
doctags:文档相关的所有tags。tag以空格或逗号作为分隔,可以使用单独的Analyzer进行解析。可以参考Analyzer以及PerFieldAnalyzerWrapper两个类。
排序算法:使用一个链表作为保存Tag的形式,链表按使用次数从高到低排列。它的两个方法GetList(int top)和Top(int freq)可以帮助我们设定Tag图中需要包含的Tag:GetList(top)返回使用次数最多的前top个Tag,Top(freq)返回使用次数不低于freq的所有Tag。TermFreq是每个Tag的数据内容。TermFreq.term是Tag的内容。TermFreq.freq是被使用的次数,这样就可以设定Tag的显示样式了。链表借助一个以freq为键的SortedList作为辅助索引,用来快速找到插入位置附近的节点,以便提高排序的效率。经过测试,这个排序算法对200M的TermFreq只需要11秒的时间。
/// <summary>
/// One tag (index term) together with the number of times it is used.
/// </summary>
/// <remarks>
/// NOTE(review): the declaration line of this class was missing from the
/// listing; "public class" is assumed. It has to be a reference type because
/// the rest of the code uses it with the "as" operator.
/// </remarks>
public class TermFreq
{
    /// <summary>Text of the tag.</summary>
    public string term;

    /// <summary>Number of times the tag is used; starts at 0.</summary>
    public int freq = 0;
}
/// <summary>
/// Compares two <c>TermFreq</c> objects by their <c>freq</c> field, in
/// ascending numeric order. Entries with the same <c>freq</c> compare as
/// equal; the term text is not taken into account.
/// </summary>
internal class TermFreqCompare : System.Collections.IComparer
{
    #region IComparer Members

    /// <summary>
    /// Compares the usage counts of two <c>TermFreq</c> objects.
    /// </summary>
    /// <param name="x">First object; a <c>TermFreq</c> or null.</param>
    /// <param name="y">Second object; a <c>TermFreq</c> or null.</param>
    /// <returns>
    /// A negative value when <paramref name="x"/> has the smaller count, zero
    /// when the counts are equal, a positive value otherwise. A null reference
    /// sorts before any non-null object.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// Either argument is non-null but is not a <c>TermFreq</c>.
    /// </exception>
    public int Compare(object x, object y)
    {
        // Handle null explicitly instead of failing later with a
        // NullReferenceException on the field access.
        if (x == null)
        {
            return (y == null) ? 0 : -1;
        }
        if (y == null)
        {
            return 1;
        }

        TermFreq left = x as TermFreq;
        TermFreq right = y as TermFreq;
        if (left == null || right == null)
        {
            throw new System.ArgumentException("Both arguments must be TermFreq instances.");
        }

        return left.freq.CompareTo(right.freq);
    }

    #endregion
}
/// <summary>
/// A doubly linked list of <c>TermFreq</c> entries kept in descending order
/// (as defined by the supplied comparer) while entries are added.
/// </summary>
/// <remarks>
/// A <c>SortedList</c> keyed by <c>freq</c> remembers, for every distinct
/// frequency, the most recently inserted node with that frequency. It is used
/// only as a hint for where to start looking for the insertion point; the
/// ordering itself is always decided by the comparer.
/// </remarks>
internal class TermFreqSortedList
{
    // Head of the chain (the entry that sorts first); null while the list is empty.
    private Element root;
    private System.Collections.IComparer comparer;
    // freq -> most recently inserted Element with that freq (insertion hint only).
    private System.Collections.SortedList list;

    /// <summary>Node of the linked list.</summary>
    internal class Element
    {
        public Element prev;
        public Element next;
        public TermFreq current;
    }

    /// <summary>
    /// Creates an empty list.
    /// </summary>
    /// <param name="comparer">
    /// Comparer applied to <c>TermFreq</c> objects. An entry for which
    /// <c>Compare(entry, other)</c> is positive is placed before <c>other</c>.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="comparer"/> is null.
    /// </exception>
    public TermFreqSortedList(System.Collections.IComparer comparer)
    {
        if (comparer == null)
        {
            throw new System.ArgumentNullException("comparer");
        }

        // No placeholder node is created: an empty list really is empty, so
        // GetList and Top never return an artificial entry.
        this.comparer = comparer;
        list = new System.Collections.SortedList();
    }

    /// <summary>
    /// Finds a node close to where an entry with the given frequency belongs.
    /// </summary>
    /// <remarks>
    /// When the frequency has not been seen before, a null placeholder is
    /// registered for it so that its position among the known frequencies can
    /// be determined; <see cref="Add"/> replaces the placeholder afterwards.
    /// </remarks>
    /// <param name="freq">Frequency of the entry about to be inserted.</param>
    /// <returns>A node to start the search from, or null when no hint exists.</returns>
    private Element GetStartElement(int freq)
    {
        if (list.ContainsKey(freq))
        {
            return list[freq] as Element;
        }

        list.Add(freq, null);
        int index = list.IndexOfKey(freq);

        // GetByIndex is positional. The SortedList indexer looks up by KEY,
        // so it must not be used with a position.
        if (index > 0)
        {
            return list.GetByIndex(index - 1) as Element;
        }
        if (list.Count > 1)
        {
            return list.GetByIndex(index + 1) as Element;
        }
        return null;
    }

    /// <summary>
    /// Inserts an entry at its sorted position. An entry that compares equal
    /// to existing entries is placed in front of them.
    /// </summary>
    /// <param name="o">Entry to insert.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="o"/> is null.
    /// </exception>
    public void Add(TermFreq o)
    {
        if (o == null)
        {
            throw new System.ArgumentNullException("o");
        }

        Element node = new Element();
        node.current = o;

        Element cursor = GetStartElement(o.freq);
        if (cursor == null)
        {
            cursor = root;
        }

        if (cursor == null)
        {
            // First entry ever added.
            root = node;
        }
        else
        {
            // Step towards the head while the predecessor does not sort
            // strictly before the new entry.
            while (cursor.prev != null && comparer.Compare(cursor.prev.current, o) <= 0)
            {
                cursor = cursor.prev;
            }

            // Step towards the tail while the cursor sorts strictly before
            // the new entry.
            while (cursor.next != null && comparer.Compare(cursor.current, o) > 0)
            {
                cursor = cursor.next;
            }

            if (comparer.Compare(cursor.current, o) > 0)
            {
                // Only reachable at the tail: everything sorts before the new entry.
                LinkAfter(cursor, node);
            }
            else
            {
                LinkBefore(cursor, node);
            }
        }

        // Register the hint only after the node is linked, so the hint table
        // never refers to a node that is not part of the chain.
        list[o.freq] = node;
    }

    /// <summary>Links <paramref name="node"/> directly in front of <paramref name="anchor"/>.</summary>
    private void LinkBefore(Element anchor, Element node)
    {
        node.prev = anchor.prev;
        node.next = anchor;
        if (anchor.prev != null)
        {
            anchor.prev.next = node;
        }
        else
        {
            root = node;
        }
        anchor.prev = node;
    }

    /// <summary>Links <paramref name="node"/> directly behind <paramref name="anchor"/>.</summary>
    private void LinkAfter(Element anchor, Element node)
    {
        node.prev = anchor;
        node.next = anchor.next;
        if (anchor.next != null)
        {
            anchor.next.prev = node;
        }
        anchor.next = node;
    }

    /// <summary>
    /// Returns the leading entries of the list.
    /// </summary>
    /// <param name="top">Maximum number of entries to return.</param>
    /// <returns>
    /// At most <paramref name="top"/> <c>TermFreq</c> objects in list order;
    /// empty when the list is empty or <paramref name="top"/> is not positive.
    /// </returns>
    public System.Collections.ArrayList GetList(int top)
    {
        System.Collections.ArrayList result = new System.Collections.ArrayList();
        for (Element node = root; node != null && result.Count < top; node = node.next)
        {
            result.Add(node.current);
        }
        return result;
    }

    /// <summary>
    /// Returns the leading entries whose frequency is at least the given value.
    /// </summary>
    /// <remarks>
    /// Collection stops at the first entry with a lower frequency.
    /// </remarks>
    /// <param name="freq">Minimum frequency, inclusive.</param>
    /// <returns>The matching <c>TermFreq</c> objects in list order; possibly empty.</returns>
    public System.Collections.ArrayList Top(int freq)
    {
        System.Collections.ArrayList result = new System.Collections.ArrayList();
        for (Element node = root; node != null && node.current.freq >= freq; node = node.next)
        {
            result.Add(node.current);
        }
        return result;
    }
}
文档生成的代码:
// Builds the Lucene document for one article.
// NOTE(review): docurl, contents and reader are not declared in this snippet;
// they are expected to come from the surrounding code. The declaration of
// `doc` was missing from the listing and has been restored as a plain
// Document — confirm.
Document doc = new Document();
doc.Add(Field.Keyword("docurl", docurl));
doc.Add(Field.Text("contents", contents));
// storeTermVector == true: this lets us later use TermFreqVector to read how
// many times each tag was applied within a single document, so that a
// per-document tag cloud can be generated.
doc.Add(Field.Text("doctags", reader, true));
测试代码:
// Demo: rank the tags found in the index and print two selections.
// NOTE(review): `enu` is not declared in this snippet; presumably it is the
// TermEnum returned by IndexReader.Terms(), as described in the article —
// confirm against the full program.
TermFreqSortedList list = new TermFreqSortedList(new TermFreqCompare());

while (enu.Next())
{
    Lucene.Net.Index.Term t = enu.Term();

    // The enumeration covers the terms of every field in the index. Only the
    // tag field belongs in a tag cloud, so terms of other fields (docurl,
    // contents) are skipped.
    if (t.Field() != "doctags")
    {
        continue;
    }

    TermFreq f = new TermFreq();
    f.freq = enu.DocFreq();
    f.term = t.Text();
    list.Add(f);
}

// The five most frequently used tags.
foreach (TermFreq ff in list.GetList(5))
{
    Console.WriteLine(string.Format("Term:{0}.\t\t\tDocFreq:{1}",
        ff.term,
        ff.freq));
}

// Every tag used in at least three documents.
foreach (TermFreq ff in list.Top(3))
{
    Console.WriteLine(string.Format("Term:{0}.\t\t\tDocFreq:{1}",
        ff.term,
        ff.freq));
}