Apache Solr的索引和查询顺序

拜读了Solr的部分源码之后,急于弄明白Solr在索引和查询时各个分析步骤的执行顺序,以下是探究的结果。

 所有的配置都在solr/example/solr/conf/schema.xml当中.

 

  1 <!-- 如下是对text类型的处理 -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <!-- Index-time chain, in order:
       1 whitespace tokenizer, 2 stop-word removal, 3 word-delimiter splitting,
       4 lowercasing, 5 protected-word (keyword) marking, 6 Porter stemming.
       The synonym filter is commented out below, so it is NOT applied at index time. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- in this example, we will only use synonyms at query time
    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
    -->
    <!-- Case insensitive stop word removal.
         add enablePositionIncrements=true in both the index and query
         analyzers to leave a 'gap' for more accurate phrase queries.
    -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <!-- catenateWords/catenateNumbers are 1 here but 0 in the query analyzer below -->
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
  <!-- Query-time chain, in order:
       1 whitespace tokenizer, 2 synonym expansion, 3 stop-word removal,
       4 word-delimiter splitting, 5 lowercasing, 6 protected-word (keyword) marking,
       7 Porter stemming. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
     but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<!-- Field type "textTight" -->
<fieldType name="textTight" class="solr.TextField" positionIncrementGap="100">
  <!-- Single analyzer with no type attribute, so the same chain serves indexing and querying.
       Order: 1 whitespace tokenizer, 2 synonyms, 3 stop-word removal, 4 word-delimiter
       handling, 5 lowercasing, 6 protected-word (keyword) marking, 7 minimal English
       stemming, 8 duplicate-token removal. -->
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.EnglishMinimalStemFilterFactory"/>
    <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
         possible with WordDelimiterFilter in conjunction with stemming. -->
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
<!-- A general unstemmed text field - good if one does not know the language of the field -->
<!-- Field type "textgen" -->
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
  <!-- Index-time chain, in order:
       1 whitespace tokenizer, 2 stop-word removal, 3 word-delimiter splitting, 4 lowercasing. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <!-- Query-time chain, in order:
       1 whitespace tokenizer, 2 synonym expansion, 3 stop-word removal,
       4 word-delimiter splitting (no catenation), 5 lowercasing. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
<!-- A general unstemmed text field that indexes tokens normally and also
     reversed (via ReversedWildcardFilterFactory), to enable more efficient
     leading wildcard queries. -->
<!-- Field type "text_rev" -->
<fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
  <!-- Index-time chain, in order:
       1 whitespace tokenizer, 2 stop-word removal, 3 word-delimiter splitting,
       4 lowercasing, 5 reversed-token generation for leading wildcards
       (withOriginal="true" is set on that filter). -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
            maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
  </analyzer>
  <!-- Query-time chain, in order:
       1 whitespace tokenizer, 2 synonym expansion, 3 stop-word removal,
       4 word-delimiter splitting (no catenation), 5 lowercasing. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
<!-- charFilter + WhitespaceTokenizer -->
<!-- Disabled example: remove the surrounding comment markers to enable this field type.
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  </analyzer>
</fieldType>
-->
<!-- This is an example of using the KeywordTokenizer along
     With various TokenFilterFactories to produce a sortable field
     that does not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
  <analyzer>
    <!-- KeywordTokenizer does no actual tokenizing, so the entire
         input string is preserved as a single token
    -->
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <!-- The LowerCase TokenFilter does what you expect, which can be
         useful when you want your sorting to be case insensitive
    -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- The TrimFilter removes any leading or trailing whitespace -->
    <filter class="solr.TrimFilterFactory"/>
    <!-- The PatternReplaceFilter gives you the flexibility to use
         Java Regular expression to replace any sequence of characters
         matching a pattern with an arbitrary replacement string,
         which may include back references to portions of the original
         string matched by the pattern.

         See the Java Regular Expression documentation for more
         information on pattern and replacement string syntax.

         http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
    -->
    <!-- Runs after lowercasing: every character outside a-z is replaced with the empty string -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="([^a-z])" replacement="" replace="all"
            />
  </analyzer>
</fieldType>
<!-- Field type "phonetic": StandardTokenizer followed by the Double Metaphone filter.
     Declared indexed but not stored. -->
<fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
  </analyzer>
</fieldType>
<!-- Field type "payloads": whitespace tokenizer followed by the delimited payload filter.
     Declared indexed but not stored. -->
<fieldType name="payloads" stored="false" indexed="true" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!--
    The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
    a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
    Attributes of the DelimitedPayloadTokenFilterFactory :
     "delimiter" - a one character delimiter. Default is | (pipe)
     "encoder" - how to encode the following value into a payload
        float -> org.apache.lucene.analysis.payloads.FloatEncoder,
        integer -> o.a.l.a.p.IntegerEncoder
        identity -> o.a.l.a.p.IdentityEncoder
        Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
    -->
    <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
  </analyzer>
</fieldType>
<!-- lowercases the entire field value, keeping it as a single token. -->
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <!-- KeywordTokenizer first, then the lowercase filter -->
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>

 

大致的索引顺序会是:

1.空格..............................solr.WhitespaceTokenizerFactory

2.同义词............................solr.SynonymFilterFactory(注意:在上面text类型的索引分析器中,该过滤器已被注释掉,示例配置只在查询时使用同义词;textgen和text_rev的索引分析器中也没有这一步)

3.过滤词...........................solr.StopFilterFactory

4.拆字..............................solr.WordDelimiterFilterFactory

5.小写过滤.....................solr.LowerCaseFilterFactory

6.关键字.........................solr.KeywordMarkerFilterFactory

7.词干抽取算法............solr.PorterStemFilterFactory

 

 大致的搜索顺序是:

 

1.空格..............................solr.WhitespaceTokenizerFactory

2.同义词............................solr.SynonymFilterFactory

3.过滤词...........................solr.StopFilterFactory

4.拆字..............................solr.WordDelimiterFilterFactory

5.小写过滤.....................solr.LowerCaseFilterFactory

6.关键字.........................solr.KeywordMarkerFilterFactory

7.英文相近词(英文最小词干化)..................solr.EnglishMinimalStemFilterFactory(这是textTight类型的配置;text类型的查询分析器在这一步使用的是solr.PorterStemFilterFactory)

8.去除重复词.................solr.RemoveDuplicatesTokenFilterFactory(仅textTight类型有这一步)

 

当然了,你可以根据自己的需求,在schema.xml中调整各个过滤器的先后顺序,从而改变索引和搜索时的处理顺序。

 

posted @ 2011-10-31 18:54  爱开卷360  阅读(3924)  评论(0)  编辑  收藏  举报