[solr] - spell check
Solr 提供了拼写检查(spell check)功能,也称为建议(suggestions),可用于实现查询输入的自动补全(auto-complete)。
参考文献:
https://cwiki.apache.org/confluence/display/solr/Spell+Checking
http://www.cnblogs.com/ibook360/archive/2011/11/30/2269077.html
方法:
修改core的solrconfig.xml
将下面这段配置加入到 <config> 元素内:
<!-- Spell-check search component: declares a single dictionary named "wordbreak"
     that is backed by the Suggester class and sourced from the "content" field. -->
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
  <lst name="spellchecker">
    <str name="name">wordbreak</str>
    <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
    <str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
    <str name="field">content</str>
    <!-- NOTE(review): combineWords / breakWords / maxChanges look like
         WordBreakSolrSpellChecker options; confirm that Suggester actually reads them. -->
    <str name="combineWords">true</str>
    <str name="breakWords">true</str>
    <int name="maxChanges">10</int>
  </lst>
</searchComponent>

<!-- Request handler exposed at /spellcheck: spell checking is switched on by default,
     uses the "wordbreak" dictionary, asks for up to 20 suggestions, and runs the
     "spellcheck" component after the regular search components. -->
<requestHandler name="/spellcheck" class="org.apache.solr.handler.component.SearchHandler">
  <lst name="defaults">
    <str name="spellcheck">true</str>
    <str name="spellcheck.dictionary">wordbreak</str>
    <str name="spellcheck.count">20</str>
  </lst>
  <arr name="last-components">
    <str>spellcheck</str>
  </arr>
</requestHandler>
schema.xml配置:
<?xml version="1.0" ?>
<schema name="my core" version="1.1">

  <!-- primitive field types -->
  <fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
  <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
  <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
  <fieldtype name="binary" class="solr.BinaryField"/>

  <!-- Chinese text type.
       NOTE(review): this type declares an index analyzer, a query analyzer AND an
       untyped analyzer; verify which of them Solr actually applies to the field. -->
  <fieldType name="text_cn" class="solr.TextField">
    <analyzer type="index" class="org.wltea.analyzer.lucene.IKAnalyzer" />
    <analyzer type="query" class="org.wltea.analyzer.lucene.IKAnalyzer" />
    <analyzer>
      <tokenizer class="solr.KeywordTokenizerFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
  </fieldType>

  <!-- general -->
  <field name="id" type="long" indexed="true" stored="true" multiValued="false" required="true"/>
  <field name="subject" type="text_cn" indexed="true" stored="true" />
  <field name="content" type="text_cn" indexed="true" stored="true" />
  <field name="category_id" type="long" indexed="true" stored="true" />
  <field name="category_name" type="text_cn" indexed="true" stored="true" />
  <field name="last_update_time" type="tdate" indexed="true" stored="true" />
  <field name="_version_" type="long" indexed="true" stored="true"/>

  <!-- field to use to determine and enforce document uniqueness. -->
  <uniqueKey>id</uniqueKey>

  <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>subject</defaultSearchField>

  <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
  <solrQueryParser defaultOperator="OR"/>

</schema>
关键在于这句:
<!-- Keeps the whole field value as one token, then lower-cases it. -->
<analyzer>
  <tokenizer class="solr.KeywordTokenizerFactory"/>
  <filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
意思是:KeywordTokenizerFactory 不对内容分词,而是把整个字段值当作一个词元;LowerCaseFilterFactory 再把它转成小写。因此匹配时是按整个词组(短语)进行的。
设置完xml,重启tomcat,在浏览器中运行:
http://localhost:8899/solr/mycore/spellcheck?spellcheck.build=true
运行结果:
然后在浏览器中运行:
http://localhost:8899/solr/mycore/spellcheck?q=中央&rows=0
运行结果:
Java代码:
Java bean:
package com.my.entity;

import java.util.Date;

import org.apache.solr.client.solrj.beans.Field;

/**
 * Plain bean describing one indexed item.
 *
 * <p>Every property carries a SolrJ {@code @Field} annotation; properties whose Java
 * name differs from the index field name give the index name explicitly
 * (for example {@code category_id}).
 */
public class Item {

    /** Item identifier. */
    @Field
    private long id;

    public long getId() {
        return this.id;
    }

    public void setId(long id) {
        this.id = id;
    }

    /** Short title of the item. */
    @Field
    private String subject;

    public String getSubject() {
        return this.subject;
    }

    public void setSubject(String subject) {
        this.subject = subject;
    }

    /** Body text of the item. */
    @Field
    private String content;

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /** Identifier of the item's category, bound to {@code category_id}. */
    @Field("category_id")
    private long categoryId;

    public long getCategoryId() {
        return this.categoryId;
    }

    public void setCategoryId(long categoryId) {
        this.categoryId = categoryId;
    }

    /** Name of the item's category, bound to {@code category_name}. */
    @Field("category_name")
    private String categoryName;

    public String getCategoryName() {
        return this.categoryName;
    }

    public void setCategoryName(String categoryName) {
        this.categoryName = categoryName;
    }

    /** Last modification timestamp, bound to {@code last_update_time}. */
    @Field("last_update_time")
    private Date lastUpdateTime;

    public Date getLastUpdateTime() {
        return this.lastUpdateTime;
    }

    public void setLastUpdateTime(Date lastUpdateTime) {
        this.lastUpdateTime = lastUpdateTime;
    }
}
测试代码:
package com.my.solr; import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Map; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.impl.XMLResponseParser; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.SpellCheckResponse; import org.apache.solr.client.solrj.response.SpellCheckResponse.Collation; import org.apache.solr.client.solrj.response.SpellCheckResponse.Correction; import org.apache.solr.client.solrj.response.SpellCheckResponse.Suggestion; import com.my.entity.Item; public class TestSolr { public static void main(String[] args) throws IOException, SolrServerException { String url = "http://localhost:8899/solr/mycore"; HttpSolrServer core = new HttpSolrServer(url); core.setMaxRetries(1); core.setConnectionTimeout(5000); core.setParser(new XMLResponseParser()); // binary parser is used by default core.setSoTimeout(1000); // socket read timeout core.setDefaultMaxConnectionsPerHost(100); core.setMaxTotalConnections(100); core.setFollowRedirects(false); // defaults to false core.setAllowCompression(true); // ------------------------------------------------------ // remove all data // ------------------------------------------------------ core.deleteByQuery("*:*"); List<Item> items = new ArrayList<Item>(); items.add(makeItem(1, "cpu", "this is intel cpu", 1, "cpu-intel")); items.add(makeItem(2, "cpu AMD", "this is AMD cpu", 2, "cpu-AMD")); items.add(makeItem(3, "cpu intel", "this is intel-I7 cpu", 1, "cpu-intel")); items.add(makeItem(4, "cpu AMD", "this is AMD 5000x cpu", 2, "cpu-AMD")); items.add(makeItem(5, "cpu intel I6", "this is intel-I6 cpu", 1, "cpu-intel-I6")); items.add(makeItem(6, "处理器", "中央处理器英特儿", 1, "cpu-intel")); items.add(makeItem(7, "处理器AMD", "中央处理器AMD", 2, "cpu-AMD")); 
items.add(makeItem(8, "中央处理器", "中央处理器Intel", 1, "cpu-intel")); items.add(makeItem(9, "中央空调格力", "格力中央空调", 3, "air")); items.add(makeItem(10, "中央空调海尔", "海尔中央空调", 3, "air")); items.add(makeItem(11, "中央空调美的", "美的中央空调", 3, "air")); core.addBeans(items); // commit core.commit(); // ------------------------------------------------------ // search // ------------------------------------------------------ SolrQuery query = new SolrQuery(); String token = "中央"; query.set("qt", "/spellcheck"); query.set("q", token); query.set("spellcheck", "on"); query.set("spellcheck.build", "true"); query.set("spellcheck.onlyMorePopular", "true"); query.set("spellcheck.count", "100"); query.set("spellcheck.alternativeTermCount", "4"); query.set("spellcheck.onlyMorePopular", "true"); query.set("spellcheck.extendedResults", "true"); query.set("spellcheck.maxResultsForSuggest", "5"); query.set("spellcheck.collate", "true"); query.set("spellcheck.collateExtendedResults", "true"); query.set("spellcheck.maxCollationTries", "5"); query.set("spellcheck.maxCollations", "3"); QueryResponse response = null; try { response = core.query(query); System.out.println("查询耗时:" + response.getQTime()); } catch (SolrServerException e) { System.err.println(e.getMessage()); e.printStackTrace(); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); } finally { core.shutdown(); } SpellCheckResponse spellCheckResponse = response.getSpellCheckResponse(); if (spellCheckResponse != null) { List<Suggestion> suggestionList = spellCheckResponse.getSuggestions(); for (Suggestion suggestion : suggestionList) { System.out.println("Suggestions NumFound: " + suggestion.getNumFound()); System.out.println("Token: " + suggestion.getToken()); System.out.print("Suggested: "); List<String> suggestedWordList = suggestion.getAlternatives(); for (String word : suggestedWordList) { System.out.println(word + ", "); } System.out.println(); } System.out.println(); Map<String, Suggestion> suggestedMap = 
spellCheckResponse.getSuggestionMap(); for (Map.Entry<String, Suggestion> entry : suggestedMap.entrySet()) { System.out.println("suggestionName: " + entry.getKey()); Suggestion suggestion = entry.getValue(); System.out.println("NumFound: " + suggestion.getNumFound()); System.out.println("Token: " + suggestion.getToken()); System.out.print("suggested: "); List<String> suggestedList = suggestion.getAlternatives(); for (String suggestedWord : suggestedList) { System.out.print(suggestedWord + ", "); } System.out.println("\n\n"); } Suggestion suggestion = spellCheckResponse.getSuggestion(token); System.out.println("NumFound: " + suggestion.getNumFound()); System.out.println("Token: " + suggestion.getToken()); System.out.print("suggested: "); List<String> suggestedList = suggestion.getAlternatives(); for (String suggestedWord : suggestedList) { System.out.print(suggestedWord + ", "); } System.out.println("\n\n"); System.out.println("The First suggested word for solr is : " + spellCheckResponse.getFirstSuggestion(token)); System.out.println("\n\n"); List<Collation> collatedList = spellCheckResponse.getCollatedResults(); if (collatedList != null) { for (Collation collation : collatedList) { System.out.println("collated query String: " + collation.getCollationQueryString()); System.out.println("collation Num: " + collation.getNumberOfHits()); List<Correction> correctionList = collation.getMisspellingsAndCorrections(); for (Correction correction : correctionList) { System.out.println("original: " + correction.getOriginal()); System.out.println("correction: " + correction.getCorrection()); } System.out.println(); } } System.out.println(); System.out.println("The Collated word: " + spellCheckResponse.getCollatedResult()); System.out.println(); } System.out.println("查询耗时:" + response.getQTime()); } private static Item makeItem(long id, String subject, String content, long categoryId, String categoryName) { Item item = new Item(); item.setId(id); item.setSubject(subject); 
item.setContent(content); item.setLastUpdateTime(new Date()); item.setCategoryId(categoryId); item.setCategoryName(categoryName); return item; } }
测试结果:
这种方式适用于基于现有索引数据内容的查询拼写检查。