Lucene学习入门记录

前一段时间,公司让用Lucene检索文档,自己写了些代码,在这里记录一下,以免忘记了。

其实,Lucene的入门还是很简单的:它的整体构造和关系型数据库差不多,一个键对应一个值,生成索引,然后根据索引去查找文档内容,再将内容通过别的方式显示出来。

Lucene创建、增、删、改索引:

package com.haiyisoft.szgl.file.service.impl;

import java.io.File;

/**
 * 档案管理的创建索引
 *
 * @author    haojiahong
 * 
 * <p>Modification History:</p>
 * <p>Date             Author      Description</p>
 * <p>--------------------------------------------------------------</p>
 * <p>20151027        haojiahong              new</p>
 * <p>  </p>
 */
/**
 * Builds and maintains the Lucene index for archive records ({@code FileManage}).
 *
 * <p>Every public operation opens its own {@link IndexWriter} on the directory
 * {@code FileManage.searchCenterForDocPath}, performs the change, commits and
 * always closes the writer again (also on failure) so the index write lock is
 * released.</p>
 *
 * <p>NOTE(review): the {@code time} field is mutable instance state used only for
 * debug timing; it is not synchronized.</p>
 */
@Component("schDocForDocBuilderService")
public class SchDocForDocBuilderServiceImpl implements SchDocForDocBuilderService {

	@Autowired
	public DocService docService;

	@Autowired
	public FileContentService fileContentService;

	/** Timestamp (ms) of the previous {@link #showtime()} call; 0 before the first call. */
	private long time = 0;

	/**
	 * Rebuilds the whole index: makes sure the index directory exists, deletes the
	 * plain files in it, then indexes every {@code FileManage} record.
	 */
	public void creatLucene() {
		IndexWriter indexWriter = null;
		try {
			File indexDir = new File(FileManage.searchCenterForDocPath);
			creatFile(indexDir);
			delAllFile(indexDir);
			indexWriter = openIndexWriter();

			LogUtil.getAppLoger().debug("开始创建索引");

			long indexcount = this.createIndex(indexWriter);
			LogUtil.getAppLoger().debug("创建索引结束,共处理数据行数" + indexcount + "条");
			indexWriter.commit();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			// Close in finally so a failed commit does not leave the write lock held.
			closeIndexWriter(indexWriter);
		}
	}

	/**
	 * Indexes the complete data set.
	 *
	 * @param indexWriter open writer to add documents to
	 * @return number of documents added, or -1 if an unexpected exception occurred
	 */
	private long createIndex(IndexWriter indexWriter) {
		try {
			this.showtime();
			return this.initFileWithDocument(indexWriter);
		} catch (Exception e) {
			e.printStackTrace();
			return -1;
		}
	}

	/**
	 * Adds one Lucene document per {@code FileManage} record. A record that fails
	 * is reported and skipped so the remaining records are still indexed.
	 *
	 * @param indexWriter open writer to add documents to
	 * @return number of records successfully added
	 */
	private long initFileWithDocument(IndexWriter indexWriter) {
		long current = 0;
		String jpql = "select file from FileManage file where 1=1";
		@SuppressWarnings({ "unchecked", "rawtypes" })
		List<FileManage> fmLs = (List) JPAUtil.find(jpql);
		for (FileManage fm : fmLs) {
			try {
				indexWriter.addDocument(initLuceneDocument(fm));
				current++;
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		return current;
	}

	/**
	 * Incrementally adds the record with the given id to the index.
	 *
	 * @param fmUuid id of the {@code FileManage} record to index
	 */
	@Override
	public void insertLucene(String fmUuid) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = openIndexWriter();
			LogUtil.getAppLoger().debug("开始增量添加索引");
			long indexcount = this.insertIndex(indexWriter, fmUuid);
			LogUtil.getAppLoger().debug("添加索引结束,共处理数据行数" + indexcount + "条");
			indexWriter.commit();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			closeIndexWriter(indexWriter);
		}
	}

	/**
	 * Adds a single record to the index.
	 *
	 * @param indexWriter open writer
	 * @param fmUuid      id of the record to add
	 * @return number of documents added, or -1 if an unexpected exception occurred
	 */
	private long insertIndex(IndexWriter indexWriter, String fmUuid) {
		try {
			this.showtime();
			return this.insertFileWithDocument(indexWriter, fmUuid);
		} catch (Exception e) {
			e.printStackTrace();
			return -1;
		}
	}

	/**
	 * Loads the record and adds its document.
	 *
	 * @return 1 if the document was added, 0 if the record was not found or adding failed
	 */
	private long insertFileWithDocument(IndexWriter indexWriter, String fmUuid) {
		FileManage fm = JPAUtil.loadById(FileManage.class, fmUuid);
		if (fm == null) {
			// Nothing to index; avoid a NullPointerException while building the document.
			return 0;
		}
		try {
			indexWriter.addDocument(initLuceneDocument(fm));
			return 1;
		} catch (Exception e) {
			e.printStackTrace();
			return 0;
		}
	}

	/**
	 * Re-indexes the record with the given id (replaces the document whose
	 * {@code UUID} term matches).
	 *
	 * @param fmUuid id of the {@code FileManage} record to refresh
	 */
	@Override
	public void updateLucene(String fmUuid) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = openIndexWriter();
			LogUtil.getAppLoger().debug("开始更新索引");
			long indexcount = this.updateIndex(indexWriter, fmUuid);
			LogUtil.getAppLoger().debug("更新索引结束,共处理数据行数" + indexcount + "条");
			indexWriter.commit();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			closeIndexWriter(indexWriter);
		}
	}

	/**
	 * Updates a single record in the index.
	 *
	 * @param indexWriter open writer
	 * @param fmUuid      id of the record to update
	 * @return number of documents updated, or -1 if an unexpected exception occurred
	 */
	private long updateIndex(IndexWriter indexWriter, String fmUuid) {
		try {
			this.showtime();
			return this.updateFileWithDocument(indexWriter, fmUuid);
		} catch (Exception e) {
			e.printStackTrace();
			return -1;
		}
	}

	/**
	 * Loads the record and replaces its document.
	 *
	 * @return 1 if the document was replaced, 0 if the record was not found or the update failed
	 */
	private long updateFileWithDocument(IndexWriter indexWriter, String fmUuid) {
		FileManage fm = JPAUtil.loadById(FileManage.class, fmUuid);
		if (fm == null) {
			return 0;
		}
		try {
			indexWriter.updateDocument(new Term("UUID", fmUuid), initLuceneDocument(fm));
			return 1;
		} catch (Exception e) {
			e.printStackTrace();
			return 0;
		}
	}

	/**
	 * Removes the document of the record with the given id from the index.
	 *
	 * @param fmUuid id of the {@code FileManage} record to remove
	 */
	@Override
	public void deteleLucene(String fmUuid) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = openIndexWriter();
			LogUtil.getAppLoger().debug("开始删除索引");
			long indexcount = this.deteleIndex(indexWriter, fmUuid);
			LogUtil.getAppLoger().debug("删除索引结束,共处理数据行数" + indexcount + "条");
			indexWriter.commit();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			closeIndexWriter(indexWriter);
		}
	}

	/**
	 * Deletes a single record from the index.
	 *
	 * @param indexWriter open writer
	 * @param fmUuid      id of the record to delete
	 * @return number of delete requests issued, or -1 if an unexpected exception occurred
	 */
	private long deteleIndex(IndexWriter indexWriter, String fmUuid) {
		try {
			this.showtime();
			return this.deleteFileWithDocument(indexWriter, fmUuid);
		} catch (Exception e) {
			e.printStackTrace();
			return -1;
		}
	}

	/**
	 * Issues the delete for the document whose {@code UUID} term equals {@code fmUuid}.
	 *
	 * @return 1 if the delete was issued, 0 if it failed
	 */
	private long deleteFileWithDocument(IndexWriter indexWriter, String fmUuid) {
		try {
			indexWriter.deleteDocuments(new Term("UUID", fmUuid));
			return 1;
		} catch (Exception e) {
			e.printStackTrace();
			return 0;
		}
	}

	/**
	 * Removes the documents of all given records using a single writer and a
	 * single commit, instead of opening one writer per record.
	 *
	 * @param fmList records to remove; {@code null} or empty is a no-op
	 */
	@Override
	public void deteleLuceneLs(List<FileManage> fmList) {
		if (fmList == null || fmList.isEmpty()) {
			return;
		}
		IndexWriter indexWriter = null;
		try {
			indexWriter = openIndexWriter();
			LogUtil.getAppLoger().debug("开始删除索引");
			this.showtime();
			long indexcount = 0;
			for (FileManage fm : fmList) {
				if (fm != null) {
					indexcount += this.deleteFileWithDocument(indexWriter, fm.getUuid());
				}
			}
			LogUtil.getAppLoger().debug("删除索引结束,共处理数据行数" + indexcount + "条");
			indexWriter.commit();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			closeIndexWriter(indexWriter);
		}
	}

	/**
	 * Opens a writer on the archive index directory using the IK analyzer.
	 *
	 * @return a new writer; the caller must close it
	 * @throws IOException if the directory or the writer cannot be opened
	 */
	private IndexWriter openIndexWriter() throws IOException {
		Directory dir = FSDirectory.open(new File(FileManage.searchCenterForDocPath));
		Analyzer luceneAnalyzer = new IKAnalyzer();
		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
		return new IndexWriter(dir, iwc);
	}

	/**
	 * Closes the writer if it was opened, reporting (not propagating) any failure.
	 *
	 * @param indexWriter writer to close; may be {@code null}
	 */
	private void closeIndexWriter(IndexWriter indexWriter) {
		if (indexWriter == null) {
			return;
		}
		try {
			indexWriter.close();
		} catch (IOException ex) {
			ex.printStackTrace();
		}
	}

	/**
	 * Closes an attachment stream, reporting (not propagating) any failure.
	 *
	 * @param in stream to close; may be {@code null}
	 */
	private void closeQuietly(InputStream in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException ex) {
			ex.printStackTrace();
		}
	}

	/**
	 * Builds the Lucene document for one archive record (used when creating,
	 * adding and updating index entries). The text of all attached files is
	 * concatenated into the {@code CONTENT} field.
	 *
	 * @param fm record to convert; must not be {@code null}
	 * @return the document describing {@code fm}
	 */
	private Document initLuceneDocument(FileManage fm) {
		StringBuilder fileContent = new StringBuilder();
		// TODO version number still to be decided (last argument of queryFiles)
		List<EdocFileObjectRelation> fileList = docService.queryFiles(DocConstant.NO_DIR, fm.getUuid(), fm.getClass()
				.getName(), null);
		if (fileList != null) {
			for (EdocFileObjectRelation file : fileList) {
				String fileName = file.getEdocFile().getName();
				String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
				InputStream fs = docService.getFileInputStream(file.getFileId());
				if (fs == null) {
					continue;
				}
				try {
					String text = FileManageUtil.getContent(fileType, fs);
					// getContent returns null for unsupported or unreadable files;
					// skip those instead of appending the literal "null".
					if (text != null) {
						if (fileContent.length() > 0) {
							// Keep the last word of one file apart from the first word of the next.
							fileContent.append('\n');
						}
						fileContent.append(text);
					}
				} finally {
					closeQuietly(fs);
				}
			}
		}

		Document doc = new Document();
		// UUID is indexed un-analyzed so update/delete can address it with an exact Term.
		doc.add(new Field("UUID", fm.getUuid(), Store.YES, Index.NOT_ANALYZED));
		String content = fileContent.toString();
		if (!SzglCommonUtil.strIsNull(content)) {
			doc.add(new Field("CONTENT", content, Store.YES, Index.ANALYZED));// attachment text
		}
		if (!SzglCommonUtil.strIsNull(fm.getTitle())) {
			doc.add(new Field("TITLE", fm.getTitle(), Store.YES, Index.ANALYZED));// archive title
		}
		if (!SzglCommonUtil.strIsNull(fm.getDocNum())) {
			doc.add(new Field("DOCNUM", fm.getDocNum(), Store.YES, Index.ANALYZED));// document number
		}
		if (fm.getFromMan() != null) {
			doc.add(new Field("FROMMAN", fm.getFromMan().toString(), Store.YES, Index.ANALYZED));// creator
		}
		if (fm.getType() != null) {
			doc.add(new Field("TYPE", fm.getType(), Store.YES, Index.ANALYZED));// archive category
		}
		if (fm.getStatus() != null) {
			doc.add(new Field("STATUS", fm.getStatus(), Store.YES, Index.ANALYZED));// archive status
		}
		if (!SzglCommonUtil.strIsNull(fm.getIsShare())) {
			doc.add(new Field("ISHARE", fm.getIsShare(), Store.YES, Index.ANALYZED));// shared flag
		}
		if (fm.getFromTime() != null) {
			doc.add(new Field("FROMTIME", fm.getFromTime() + "", Store.YES, Index.ANALYZED));// time received
		}
		if (!SzglCommonUtil.strIsNull(fm.getBoxUuid())) {
			FileBox filebox = JPAUtil.loadById(FileBox.class, fm.getBoxUuid());
			// The referenced box may no longer exist; skip the year code in that case.
			if (filebox != null && !SzglCommonUtil.strIsNull(filebox.getYearCode())) {
				doc.add(new Field("YEARCODE", filebox.getYearCode(), Store.YES, Index.ANALYZED));// year of the archive box
			}
		}

		return doc;
	}

	/**
	 * Deletes the plain files directly inside {@code file}; sub-directories are left alone.
	 *
	 * @param file directory to empty; may be {@code null}
	 * @return {@code true} if the directory could be listed and every plain file
	 *         in it was deleted, {@code false} otherwise
	 */
	private boolean delAllFile(File file) {
		if (file == null) {
			return false;
		}
		File[] children = file.listFiles();
		if (children == null) {
			// Not a directory, or an I/O error occurred while listing it.
			return false;
		}
		boolean allDeleted = true;
		for (File child : children) {
			if (child.isFile() && !child.delete()) {
				allDeleted = false;
			}
		}
		return allDeleted;
	}

	/**
	 * Debug timing helper: logs the start time on the first call and the
	 * milliseconds elapsed since the previous call afterwards.
	 */
	private void showtime() {
		long time1 = System.currentTimeMillis();
		if (time > 0) {
			LogUtil.getAppLoger().debug("MilliSecond:" + (time1 - time));
		} else {
			LogUtil.getAppLoger().debug("Start time:" + (new Timestamp(System.currentTimeMillis())));
		}
		time = time1;
	}

	/**
	 * Creates the directory (including missing parents) if it does not exist yet.
	 *
	 * @param file directory to create
	 */
	private void creatFile(File file) {
		if (!file.exists()) {
			file.mkdirs();
		}
	}

	/**
	 * Alternative indexing strategy: one Lucene document per attached file
	 * rather than one per archive record. Not called by {@link #createIndex}
	 * at present.
	 *
	 * <p>NOTE(review): {@code UUID} is stored but not indexed here
	 * ({@code Index.NO}), so documents written by this method cannot be
	 * addressed by the {@code UUID} term that update/delete use.</p>
	 *
	 * @param indexWriter open writer to add documents to
	 * @return number of documents added
	 */
	private long initFile(IndexWriter indexWriter) {
		long current = 0;
		String jpql = "select file from FileManage file where 1=1";
		@SuppressWarnings({ "unchecked", "rawtypes" })
		List<FileManage> fmLs = (List) JPAUtil.find(jpql);
		for (FileManage fm : fmLs) {
			// TODO version number still to be decided (last argument of queryFiles)
			List<EdocFileObjectRelation> fileList = docService.queryFiles(DocConstant.NO_DIR, fm.getUuid(), fm
					.getClass().getName(), null);
			if (fileList == null) {
				continue;
			}
			for (EdocFileObjectRelation file : fileList) {
				String fileName = file.getEdocFile().getName();
				String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
				InputStream fs = docService.getFileInputStream(file.getFileId());
				String fileContent;
				try {
					fileContent = FileManageUtil.getContent(fileType, fs);
				} finally {
					closeQuietly(fs);
				}
				try {
					Document doc = new Document();
					doc.add(new Field("UUID", fm.getUuid(), Store.YES, Index.NO));
					doc.add(new Field("FILENAME", fileName, Store.YES, Index.NO));// attachment name
					if (!SzglCommonUtil.strIsNull(fileContent)) {
						doc.add(new Field("CONTENT", fileContent, Store.YES, Index.ANALYZED));// attachment text
					}
					if (!SzglCommonUtil.strIsNull(fm.getTitle())) {
						doc.add(new Field("TITLE", fm.getTitle(), Store.YES, Index.ANALYZED));// archive title
					}
					if (!SzglCommonUtil.strIsNull(fm.getDocNum())) {
						doc.add(new Field("DOCNUM", fm.getDocNum(), Store.YES, Index.ANALYZED));// document number
					}
					if (fm.getFromMan() != null) {
						doc.add(new Field("FROMMAN", fm.getFromMan().toString(), Store.YES, Index.ANALYZED));// creator
					}

					indexWriter.addDocument(doc);
					current++;
					// Progress trace every 10000 documents.
					if (current % 10000 == 0) {
						LogUtil.getAppLoger().debug("current row num:" + current);
					}
				} catch (Exception ex) {
					ex.printStackTrace();
				}
			}
		}
		return current;
	}
}

根据索引去查询,并将关键词标红

package com.haiyisoft.szgl.file.service.impl;

import java.io.File;

/**
 * 通过Lucene查询
 *
 * @author    haojiahong
 * 
 * <p>Modification History:</p>
 * <p>Date             Author      Description</p>
 * <p>--------------------------------------------------------------</p>
 * <p>20151102        haojiahong              new</p>
 * <p>  </p>
 */

/**
 * Searches the archive index built from {@code FileManage} records and
 * highlights the matched keywords in the returned title, document number and
 * content.
 */
@Component("fileSchByLuceneService")
public class FileSchByLuceneServiceImpl implements FileSchByLuceneService {

	/**
	 * Runs a paged search against the archive index. Every supplied criterion
	 * is combined with {@code MUST}.
	 *
	 * @param keyword       text to find in the attachment content ({@code CONTENT})
	 * @param titleSch      text to find in the title ({@code TITLE})
	 * @param docNumSch     text to find in the document number ({@code DOCNUM})
	 * @param typeSch       exact term to match in {@code TYPE}
	 * @param isShareSch    text to find in {@code ISHARE}
	 * @param yearCodeBegin inclusive lower bound for {@code YEARCODE}; {@code null} leaves the range open
	 * @param yearCodeEnd   inclusive upper bound for {@code YEARCODE}; {@code null} leaves the range open
	 * @param sortParamList not used by this implementation
	 * @param pageInfo      paging request; receives the total hit count. When {@code null}, all hits are returned
	 * @return the hits of the requested page; an empty or partial list if the search failed;
	 *         {@code null} when none of the five text criteria is given or the index directory
	 *         does not exist (kept for backward compatibility with existing callers)
	 */
	@Override
	public List<FileManage> retrieveByLucene(String keyword, String titleSch, String docNumSch, String typeSch,
			String isShareSch, Timestamp yearCodeBegin, Timestamp yearCodeEnd, SortParamList sortParamList,
			PageInfo pageInfo) {
		String indexDir = FileManage.searchCenterForDocPath;
		boolean noCriteria = SzglCommonUtil.strIsNull(keyword) && SzglCommonUtil.strIsNull(docNumSch)
				&& SzglCommonUtil.strIsNull(titleSch) && SzglCommonUtil.strIsNull(typeSch)
				&& SzglCommonUtil.strIsNull(isShareSch);
		if (noCriteria || !new File(indexDir).exists()) {
			if (pageInfo != null) {
				pageInfo.setAllRowNum(0);
			}
			return null;
		}

		List<FileManage> result = new ArrayList<FileManage>();
		IndexReader reader = null;
		IndexSearcher searcher = null;
		try {
			Directory dir = FSDirectory.open(new File(indexDir));
			reader = IndexReader.open(dir);
			searcher = new IndexSearcher(reader);

			// Parse each criterion once; the parsed queries are reused for highlighting below.
			Analyzer anal = new IKAnalyzer();
			Query contentQuery = parseIfPresent("CONTENT", keyword, anal);
			Query titleQuery = parseIfPresent("TITLE", titleSch, anal);
			Query docNumQuery = parseIfPresent("DOCNUM", docNumSch, anal);
			Query shareQuery = parseIfPresent("ISHARE", isShareSch, anal);

			BooleanQuery query = new BooleanQuery();
			if (contentQuery != null) {
				query.add(contentQuery, Occur.MUST);
			}
			if (titleQuery != null) {
				query.add(titleQuery, Occur.MUST);
			}
			if (docNumQuery != null) {
				query.add(docNumQuery, Occur.MUST);
			}
			if (!SzglCommonUtil.strIsNull(typeSch)) {
				// NOTE(review): TYPE is written as an analyzed field by the index builder but
				// matched here as a raw term - confirm the analyzer keeps type codes intact.
				query.add(new TermQuery(new Term("TYPE", typeSch)), Occur.MUST);
			}
			if (shareQuery != null) {
				query.add(shareQuery, Occur.MUST);
			}
			if (yearCodeBegin != null || yearCodeEnd != null) {
				// A null bound makes that side of the range open-ended, so a caller may
				// supply only one of the two timestamps.
				String lower = yearCodeBegin == null ? null : yearCodeBegin.toString();
				String upper = yearCodeEnd == null ? null : yearCodeEnd.toString();
				query.add(new TermRangeQuery("YEARCODE", lower, upper, true, true), Occur.MUST);
			}

			ScoreDoc[] hits = searcher.search(query, Integer.MAX_VALUE).scoreDocs;
			int total = hits.length;
			int start = 0;
			int end = total;
			if (pageInfo != null) {
				pageInfo.setAllRowNum(total);
				int rows = pageInfo.getRowOfPage();
				// Index of the first hit of the requested page; clamp so page numbers < 1 cannot go negative.
				start = Math.max(0, (pageInfo.getCurPageNum() - 1) * rows);
				end = Math.min(start + rows, total);
			}

			for (int i = start; i < end; i++) {
				Document doc = searcher.doc(hits[i].doc);
				FileManage fm = new FileManage();
				fm.setUuid(doc.get("UUID"));
				fm.setDocNum(docNumQuery != null ? lighterStr(anal, docNumQuery, doc.get("DOCNUM"), "DOCNUM")
						: doc.get("DOCNUM"));
				fm.setTitle(titleQuery != null ? lighterStr(anal, titleQuery, doc.get("TITLE"), "TITLE")
						: doc.get("TITLE"));
				fm.setFileContent(contentQuery != null
						? lighterStr(anal, contentQuery, doc.get("CONTENT"), "CONTENT")
						: doc.get("CONTENT"));
				// FROMMAN and FROMTIME are only stored when the record had a value;
				// skip the setter instead of failing the whole page on a missing field.
				String fromMan = doc.get("FROMMAN");
				if (fromMan != null) {
					fm.setFromMan(Long.valueOf(fromMan));
				}
				fm.setFileName(doc.get("FILENAME"));
				fm.setType(doc.get("TYPE"));
				fm.setStatus(doc.get("STATUS"));
				String fromTime = doc.get("FROMTIME");
				if (fromTime != null) {
					fm.setFromTime(Timestamp.valueOf(fromTime));
				}
				result.add(fm);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (searcher != null) {
				try {
					searcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			// Closing the searcher does not close a reader that was passed to its constructor.
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return result;
	}

	/**
	 * Parses {@code text} against {@code field}, or returns {@code null} when
	 * the criterion is blank.
	 *
	 * @param field    default field for the parser
	 * @param text     user-entered search text; may be blank
	 * @param analyzer analyzer used to tokenize the text
	 * @return the parsed query, or {@code null} if {@code text} is blank
	 * @throws Exception if the text is not a valid query expression
	 */
	private Query parseIfPresent(String field, String text, Analyzer analyzer) throws Exception {
		if (SzglCommonUtil.strIsNull(text)) {
			return null;
		}
		return new QueryParser(Version.LUCENE_36, field, analyzer).parse(text);
	}

	/**
	 * Returns the best-matching fragment of {@code txt} with the terms of
	 * {@code query} wrapped in a red {@code <span>}.
	 *
	 * @param a         analyzer used to tokenize {@code txt}
	 * @param query     query whose terms are highlighted
	 * @param txt       stored field text; may be {@code null}
	 * @param fieldname name of the field {@code txt} came from
	 * @return the highlighted fragment, or {@code txt} unchanged when it is
	 *         {@code null} or nothing in it matches
	 * @throws Exception if tokenizing or highlighting fails
	 */
	private String lighterStr(Analyzer a, Query query, String txt, String fieldname) throws Exception {
		if (txt == null) {
			return null;
		}
		QueryScorer scorer = new QueryScorer(query);
		Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
		Formatter fmt = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
		Highlighter lighter = new Highlighter(fmt, scorer);
		lighter.setTextFragmenter(fragmenter);
		String str = lighter.getBestFragment(a, fieldname, txt);
		return str == null ? txt : str;
	}

}

  一些工具类,用于读文档内容。

/* Copyright (C) 2014-2020 Yantai HaiYi Software Co.,Ltd */
package com.haiyisoft.szgl.file.util;

import java.io.BufferedReader;

/**
 * 档案管理工具类
 *
 * @author    haojiahong
 * 
 * <p>Modification History:</p>
 * <p>Date             Author      Description</p>
 * <p>--------------------------------------------------------------</p>
 * <p>20151102       haojiahong              new</p>
 * <p>  </p>
 */
public class FileManageUtil {

	/**
	 * Extracts the plain text of a document according to its file extension.
	 *
	 * <p>Supported extensions (matched case-insensitively): {@code doc},
	 * {@code docx}, {@code txt}, {@code pdf}. Extraction failures are reported
	 * via {@code printStackTrace} and yield {@code null} (or, for {@code txt},
	 * the text read so far).</p>
	 *
	 * <p>The stream is read but not closed by this method; closing it remains
	 * the caller's responsibility.</p>
	 *
	 * @param type file extension without the dot, e.g. {@code "pdf"}
	 * @param fs   stream positioned at the start of the document
	 * @return the extracted text; {@code null} if {@code type} or {@code fs} is
	 *         {@code null}, the extension is not supported, or extraction failed
	 */
	public static String getContent(String type, InputStream fs) {
		if (type == null || fs == null) {
			return null;
		}
		// Extensions such as "DOC" or "Pdf" should be treated like their lower-case form.
		String kind = type.toLowerCase(java.util.Locale.ROOT);
		String text = null;
		if ("doc".equals(kind)) {
			try {
				POITextExtractor ex = new WordExtractor(fs);
				text = ex.getText();
			} catch (Exception e) {
				e.printStackTrace();
			}
		} else if ("docx".equals(kind)) {
			try {
				OPCPackage opcPackage = OPCPackage.open(fs);
				POITextExtractor ex = new XWPFWordExtractor(opcPackage);
				text = ex.getText();
			} catch (Exception e) {
				e.printStackTrace();
			}
		} else if ("txt".equals(kind)) {
			// NOTE(review): decodes with the platform default charset, as before - confirm
			// this matches how the text files are stored.
			BufferedReader reader = new BufferedReader(new InputStreamReader(fs));
			StringBuilder buffer = new StringBuilder();
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					if (buffer.length() > 0) {
						// Keep a separator so the last word of one line does not fuse
						// with the first word of the next.
						buffer.append('\n');
					}
					buffer.append(line);
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
			text = buffer.toString();
		} else if ("pdf".equals(kind)) {
			PDDocument pdfDocument = null;
			try {
				pdfDocument = PDDocument.load(fs);
				text = new PDFTextStripper().getText(pdfDocument);
			} catch (IOException e) {
				e.printStackTrace();
			} finally {
				// A loaded PDDocument holds resources until it is closed.
				if (pdfDocument != null) {
					try {
						pdfDocument.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		}

		return text;
	}
}

  

 

posted @ 2015-11-12 16:25  Jokerone  阅读(305)  评论(0编辑  收藏  举报