Lucene使用入门

参考:
https://blog.csdn.net/u014209975/article/details/50525624
https://www.cnblogs.com/hanyinglong/p/5395600.html
http://lucene.apache.org/core/4_0_0/core/overview-summary.html
https://www.jianshu.com/p/0a2bbe0f4c42

依赖:

lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar

实体类:

package com.h3c.lucence;

import java.io.Serializable;

/**
 * Indexable entity used by the Lucene demo.
 *
 * <p>Carries an id and a type, a "virtual document" (the text that is fed to
 * the full-text index), and the per-hit search data: a highlighted summary
 * fragment and the relevance score.
 */
public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    /** Opening highlight tag that {@link #getSummary()} strips before locating the fragment. */
    private static final String HIGHLIGHT_OPEN = "<SPAN style=\"color:red;\">";

    /** Closing highlight tag that {@link #getSummary()} strips before locating the fragment. */
    private static final String HIGHLIGHT_CLOSE = "</SPAN>";

    /** Marker added where the summary fragment is cut off from the full text. */
    private static final String ELLIPSIS = "...";

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /**
     * Returns the virtual document used for full-text search.
     *
     * @return the virtual document, or {@code null} if none has been set
     */
    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO build the virtual document from this entity's values: every
            // attribute with its value, for full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    /**
     * Returns the summary fragment, decorated with "..." on each side where the
     * fragment does not reach the corresponding end of the virtual document.
     *
     * <p>The highlight tags are removed only to locate the fragment inside the
     * virtual document; the returned text still contains them.
     *
     * @return {@code null} if no summary has been set; the raw summary if there
     *         is no virtual document or the fragment cannot be located in it;
     *         otherwise the summary with leading/trailing ellipses as needed
     */
    public String getSummary() {
        if (summary == null) {
            return null;
        }
        String fullText = getVirtualDoc();
        if (fullText == null) {
            // Nothing to position the fragment against.
            return summary;
        }

        String plainFragment = summary.replace(HIGHLIGHT_OPEN, "").replace(HIGHLIGHT_CLOSE, "");
        int start = fullText.indexOf(plainFragment);
        if (start < 0) {
            // Position unknown: do not guess where the text was truncated.
            return summary;
        }

        StringBuilder decorated = new StringBuilder();
        if (start > 0) {
            decorated.append(ELLIPSIS);
        }
        decorated.append(summary);
        if (start + plainFragment.length() < fullText.length()) {
            decorated.append(ELLIPSIS);
        }
        return decorated.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}

Demo类:

package com.h3c.lucence;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Small Lucene 3.6 demo: maintains an on-disk index of {@link Entity} objects
 * and runs a highlighted full-text search against it.
 *
 * <p>All state is static. A single {@link IndexWriter} is opened when the class
 * is loaded and must be released with {@link #closeIndexWriter()} when the
 * application is done indexing.
 */
public class Demo {
    /** Lucene index directory; opened lazily by {@link #getCiIndexDir()}. */
    private static Directory ciIndexDir;

    /** Name of the analyzed field that holds the entity's virtual document. */
    private static final String CI_CONTENT_FLAG = "virtualDoc";

    /**
     * Tokenizer used for both indexing and highlighting. Per the original note,
     * the standard analyzer indexes single Chinese characters and contiguous
     * English words as terms.
     */
    private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_36);

    private static final String IPV4_REGEX = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
    private static final String IPV6_REGEX = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";
    private static final Pattern VALID_IPV4_PATTERN = Pattern.compile(IPV4_REGEX, Pattern.CASE_INSENSITIVE);
    private static final Pattern VALID_IPV6_PATTERN = Pattern.compile(IPV6_REGEX, Pattern.CASE_INSENSITIVE);

    /** Shared writer; stays {@code null} if opening it failed during class initialization. */
    private static IndexWriter indexWriter;

    static {
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, ANALYZER);
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try {
            indexWriter = new IndexWriter(getCiIndexDir(), conf);
        } catch (IOException e) {
            // Best effort: searching can still work without a writer, so only report.
            e.printStackTrace();
        }
    }

    /**
     * Returns the index directory, opening it on first use.
     *
     * @return the directory, or {@code null} if it could not be opened
     */
    private static Directory getCiIndexDir() {
        if (null == ciIndexDir) {
            try {
                ciIndexDir = FSDirectory.open(new File("D://indexs"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return ciIndexDir;
    }

    /**
     * Tells whether the whole string is an IPv4 address or a full (non-abbreviated)
     * eight-group IPv6 address.
     */
    private static boolean isIpAddress(String ipAddress) {
        Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
        Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
        return m1.matches() || m2.matches();
    }

    /**
     * Tells whether the character belongs to one of the CJK ideograph blocks or
     * to the CJK / full-width / general punctuation blocks.
     */
    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Builds the search query for a free-text input.
     *
     * <p>An IP address becomes a single exact term. Anything else is lower-cased
     * and split into runs of {@code [a-z0-9]} and single other characters; each
     * piece contributes an optional (SHOULD) clause, and wildcard clauses over
     * the whole input are added on top.
     *
     * @param inputString the raw user query
     * @return the query to run against the {@code virtualDoc} field
     */
    private static BooleanQuery parseChineseCharacters(String inputString) {
        BooleanQuery query = new BooleanQuery();
        // Lower-case with a fixed locale *before* anything else: query terms must
        // not depend on the JVM default locale, and the IP term must be
        // lower-cased as well (relevant for upper-case IPv6 input).
        inputString = inputString.toLowerCase(java.util.Locale.ROOT);
        if (isIpAddress(inputString)) {
            query.add(new TermQuery(new Term(CI_CONTENT_FLAG, inputString)), BooleanClause.Occur.MUST);
            return query;
        }

        BooleanQuery booleanQuery = new BooleanQuery();
        StringBuilder tempWord = new StringBuilder();
        for (int i = 0; i < inputString.length(); i++) {
            char c = inputString.charAt(i);
            if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') {
                // Part of an English word / number: keep accumulating.
                tempWord.append(c);
                continue;
            }

            // Delimiter or Chinese character: flush the word collected so far.
            if (tempWord.length() > 0) {
                Query wordQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG, tempWord.toString()));
                booleanQuery.add(wordQuery, BooleanClause.Occur.SHOULD);
                tempWord.setLength(0);
            }

            Query charQuery = new TermQuery(new Term(CI_CONTENT_FLAG, String.valueOf(c)));
            if (isChinese(c)) {
                // Chinese character. Currently optional, same as delimiters.
                booleanQuery.add(charQuery, BooleanClause.Occur.SHOULD);
            } else {
                // Delimiter.
                booleanQuery.add(charQuery, BooleanClause.Occur.SHOULD);
            }
        }

        // Trailing word: match it as a prefix or as a suffix of an indexed term.
        if (tempWord.length() > 0) {
            String lastWord = tempWord.toString();
            booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, lastWord + "*")),
                    BooleanClause.Occur.SHOULD);
            booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + lastWord)),
                    BooleanClause.Occur.SHOULD);
        }

        // Whole-input matching: prefix, suffix and infix of an indexed term.
        booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, inputString + "*")),
                BooleanClause.Occur.SHOULD);
        booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString)),
                BooleanClause.Occur.SHOULD);
        booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString + "*")),
                BooleanClause.Occur.SHOULD);

        // Nesting kept as in the original design: query -> field query -> term clauses.
        BooleanQuery fieldQuery = new BooleanQuery();
        fieldQuery.add(new BooleanClause(booleanQuery, BooleanClause.Occur.MUST));
        query.add(new BooleanClause(fieldQuery, BooleanClause.Occur.MUST));

        return query;
    }

    /**
     * Runs a full-text search and prints the stored virtual document of every hit.
     *
     * <p>Each hit is also converted to an {@link Entity} carrying its score (and,
     * when {@code highlight} is set, a highlighted summary).
     *
     * @param queryStr  the raw user query
     * @param highlight whether to compute highlighted summaries
     * @throws Exception if highlighting fails; I/O errors are reported to stderr instead
     */
    private static void contentSearch(String queryStr, boolean highlight) throws Exception {
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            indexReader = IndexReader.open(getCiIndexDir());
            indexSearcher = new IndexSearcher(indexReader);

            // Query composition is business-specific; adapt as needed.
            Query query = parseChineseCharacters(queryStr);

            TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
            if (hits.totalHits > 0) {
                Highlighter highlighter = null;
                if (highlight) {
                    QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
                    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
                    highlighter = new Highlighter(formatter, scorer);
                    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
                }

                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = indexSearcher.doc(scoreDoc.doc);
                    System.out.println(doc.get(CI_CONTENT_FLAG));
                    Entity entity = highlight
                            ? convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter)
                            : convertToEntity(doc);
                    entity.setScore(scoreDoc.score);
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            close(indexSearcher);
            close(indexReader);
        }
    }

    /**
     * Closes any {@link Closeable}, tolerating {@code null}.
     *
     * @param object the resource to close, may be {@code null}
     */
    private static void close(Closeable object) {
        if (null != object) {
            try {
                object.close();
            } catch (IOException ignored) {
                // Best-effort cleanup: a failure to close must not mask the caller's result.
            }
        }
    }

    /**
     * Converts an entity to a Lucene document.
     *
     * <p>{@code id} and {@code type} are stored and indexed as single untokenized
     * terms; the virtual document is stored and analyzed (a single space is used
     * when it is {@code null}).
     *
     * @param entity the entity to convert
     * @return the document to index
     */
    public static Document convertToDocument(Entity entity) {
        Document doc = new Document();
        String virtualDoc = entity.getVirtualDoc();
        doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Converts a stored Lucene document back to an entity.
     *
     * @param doc the document read from the index
     * @return an entity with id, type and virtual document populated
     */
    public static Entity convertToEntity(Document doc) {
        Entity ci = new Entity();
        ci.setId(Integer.valueOf(doc.get("id")));
        ci.setType(doc.get("type"));
        ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
        return ci;
    }

    /**
     * Converts a hit to an entity and attaches a highlighted summary.
     *
     * <p>When the highlighter finds no fragment, the whole virtual document is
     * used as the summary.
     *
     * @param doc         the stored document of the hit
     * @param indexReader the reader the hit came from
     * @param docId       the Lucene document number of the hit
     * @param highlighter the configured highlighter
     * @return the entity with its summary set
     * @throws IOException                  if the token stream cannot be read
     * @throws InvalidTokenOffsetsException if token offsets do not fit the text
     */
    public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
        throws IOException, InvalidTokenOffsetsException {

        Entity entity = convertToEntity(doc);
        String virtualDoc = entity.getVirtualDoc();
        TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, ANALYZER);
        String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
        if (highlighterSummary == null) {
            highlighterSummary = virtualDoc;
        }
        entity.setSummary(highlighterSummary);

        return entity;
    }

    /** Builds the term that identifies an entity's document(s) in the index. */
    private static Term idTerm(Entity entity) {
        return new Term("id", entity.getId().toString());
    }

    /** Builds the identifying terms for every entity in the list, in order. */
    private static Term[] idTerms(List<Entity> list) {
        Term[] terms = new Term[list.size()];
        for (int i = 0; i < terms.length; i++) {
            terms[i] = idTerm(list.get(i));
        }
        return terms;
    }

    /**
     * Indexes an entity, replacing any document already indexed under the same id.
     *
     * <p>The replacement is a single atomic update followed by one commit, so
     * readers never observe the entity as missing. Failures (including a
     * {@code null} id) are reported to stderr and nothing is indexed.
     *
     * @param entity the entity to index
     */
    public static void addIndex(Entity entity) {
        try {
            indexWriter.updateDocument(idTerm(entity), convertToDocument(entity));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes a batch of entities, replacing documents already indexed under
     * the same ids.
     *
     * <p>Deletes and additions are made visible together by a single commit.
     *
     * @param list the entities to index
     */
    public static void addIndexs(List<Entity> list) {
        try {
            List<Document> docs = new ArrayList<Document>(list.size());
            for (Entity entity : list) {
                docs.add(convertToDocument(entity));
            }
            indexWriter.deleteDocuments(idTerms(list));
            indexWriter.addDocuments(docs);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Re-indexes an entity; equivalent to {@link #addIndex(Entity)}.
     *
     * @param entity the entity whose index entry should be refreshed
     */
    public static void updateIndex(Entity entity) {
        addIndex(entity);
    }

    /**
     * Removes the index entries of all entities in the list.
     *
     * @param list the entities to remove from the index
     */
    public static void deleteIndexs(List<Entity> list) {
        try {
            indexWriter.deleteDocuments(idTerms(list));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes the index entry of one entity.
     *
     * @param entity the entity to remove from the index
     */
    public static void deleteIndex(Entity entity) {
        try {
            indexWriter.deleteDocuments(idTerm(entity));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes every index entry of the given entity type.
     *
     * @param type the entity type whose documents should be deleted
     */
    public static void deleteIndexByType(String type) {
        try {
            indexWriter.deleteDocuments(new Term("type", type));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Closes the shared index writer, releasing its hold on the index directory.
     *
     * <p>Safe to call when the writer was never opened. Indexing methods must
     * not be used afterwards.
     */
    public static void closeIndexWriter() {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Kept for backward compatibility only. This class is never instantiated by
     * the demo, so cleanup must go through {@link #closeIndexWriter()}.
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            closeIndexWriter();
        } finally {
            super.finalize();
        }
    }

    public static void main(String[] args) throws Exception {
        String queryStr = "http://mail6c1.shenzhenair.com";
        try {
            contentSearch(queryStr, true);
        } finally {
            // The writer is opened at class load time; release it explicitly.
            closeIndexWriter();
        }
    }
}
posted @ 2018-10-11 17:32  发挥哥  阅读(1023)  评论(0编辑  收藏  举报