
Lucene 3.0.2 Search

1. Lucene term frequency statistics

Reposted from: http://mxdxm.iteye.com/blog/989031

As part of the "in Action" series, Lucene in Action lives up to the series' practical focus. The book spends most of its pages on query parsing, result ranking, and Lucene applications, which makes it a good fit for anyone building full-text search. But Lucene is useful for far more than search engines. Had I not recently come across an article on using Lucene for term-frequency and document statistics, I might still be hunting for a suitable research tool. In fact, Lucene easily handles the classic requirements from an information-retrieval course, for example:

* Statistics: implement the following functions * (a minimal API sketch follows the list)

(1) the document frequency (DF) of a term across the whole collection;

(2) the total number of occurrences of a term across the whole collection (collection term frequency);

(3) the frequency of a term within a given document (term frequency, TF);

(4) the positions at which a term occurs within a given document;

(5) the number of documents in the collection.
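
Before the full program below, here is a minimal sketch (not from the original post) of how each statistic maps onto the Lucene 3.0 API; the field name "contents" and the term text "lucene" are illustrative assumptions:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;

public class TermStatsSketch {
    // Minimal sketch of the five statistics against the Lucene 3.0 API.
    public static void print(IndexReader reader) throws Exception {
        Term t = new Term("contents", "lucene"); // illustrative field and term
        System.out.println("(1) DF = " + reader.docFreq(t));

        long collectionTf = 0;
        TermDocs td = reader.termDocs(t);
        while (td.next())
            collectionTf += td.freq(); // sum TF over every matching document
        System.out.println("(2) collection TF = " + collectionTf);

        TermPositions tp = reader.termPositions(t);
        while (tp.next()) {
            System.out.print("(3) TF in doc " + tp.doc() + " = " + tp.freq()
                    + ", (4) positions:");
            for (int k = 0; k < tp.freq(); k++)
                System.out.print(" " + tp.nextPosition()); // positions in this doc
            System.out.println();
        }

        System.out.println("(5) numDocs = " + reader.numDocs());
    }
}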

Another reference: http://www.360doc.com/content/11/0427/03/1947337_112596569.shtml

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.util.Date;


import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;


public class Searchnum {

    //static final Log log = LogFactory.getLog(Statistic.class);

    public static void printIndex(IndexReader reader) throws Exception {
/*
        // Show the number of documents in the index
        System.out.println(new Date() + "\n");
        System.out.println(reader + "\tThis index contains " + reader.numDocs() + " documents\n");

        for (int i = 0; i < reader.numDocs(); i++) {
            System.out.println("Document " + i + ": " + reader.document(i) + "\n");
        }
*/
        // Enumerate terms and collect <document, term freq, position*> information
        TermEnum termEnum = reader.terms();
        while (termEnum.next()) {
            System.out.println("\nTerm in field \"" + termEnum.term().field()
                    + "\": " + termEnum.term().text());
            System.out.println(" Documents containing it: " + termEnum.docFreq());

            TermPositions termPositions = reader.termPositions(termEnum.term());
            int i = 0;
            while (termPositions.next()) {
                System.out.println("\n" + (i++) + "->" + "    doc id: "
                        + termPositions.doc() + ", occurrences: "
                        + termPositions.freq() + "    positions:");
                for (int j = 0; j < termPositions.freq(); j++)
                    System.out.println("[" + termPositions.nextPosition() + "]");
                System.out.println();
            }

        }

    }
/*
    public static void main(String args[]) throws Exception {
        // String index = ReadConfig.getPara("indexdir");

        IndexReader reader = IndexReader.open(index);
        printIndex(reader);

    }*/

    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            throw new IllegalArgumentException("Usage: java "
                    + Searchnum.class.getName() + " <index dir>");
        }

        String indexDir = args[0]; // 1: index directory

        Directory dir = FSDirectory.open(new File(indexDir)); // 2: open the index
        IndexSearcher search = new IndexSearcher(dir);
        IndexReader reader = search.getIndexReader();
        printIndex(reader);

    }
}

Result:

Term in field "contents": 精神
 Documents containing it: 1
0->    doc id: 0, occurrences: 1    positions:
[388]

Term in field "contents": 繁荣
 Documents containing it: 1
0->    doc id: 0, occurrences: 3    positions:
[254]
[353]
[450]

Term in field "contents": 给予
 Documents containing it: 1
0->    doc id: 0, occurrences: 1    positions:
[85]

Reference: http://hanyuanbo.iteye.com/blog/812847

 

2. Counting occurrences of a term (terms depend on the analyzer used at index time)

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumber {

    public static void main(String[] args) throws CorruptIndexException,
            IOException {
        String indexDir = args[0]; // 1: index directory
        String q = args[1]; // 2: the keyword to look up

        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            IndexReader reader = is.getIndexReader();
            int num = reader.numDocs();
            for (int i = 0; i < num; i++) {
                Document doc = reader.document(i);
                System.out.println(doc);
            }

            Term term = new Term("contents", keyword);
            TermDocs docs = reader.termDocs(term);
            while (docs.next()) {
                // System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
                System.out.print("doc num\t" + docs.doc() + "\t");
                System.out.println("frequency:\t" + docs.freq());
            }
            is.close(); // closing the searcher also closes the reader it opened
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Result:

Document<stored,indexed<filename:commentbyme.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\commentbyme.txt>>
Document<stored,indexed<filename:gettrendweek.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\gettrendweek.txt>>
Document<stored,indexed<filename:no.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\no.txt>>
Document<stored,indexed<filename:showuser.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\showuser.txt>>
Document<stored,indexed<filename:suggestionusermayinst.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\suggestionusermayinst.txt>>
doc num    0    frequency:    15
doc num    2    frequency:    2
doc num    3    frequency:    1
doc num    4    frequency:    30
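
These counts are counts of indexed terms, so they depend entirely on the analyzer used at index time. For example, StandardAnalyzer breaks Chinese text into single characters, while SmartChineseAnalyzer emits word-level tokens. A minimal comparison sketch (assuming the smartcn contrib jar is on the classpath):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class AnalyzerCompare {
    public static void main(String[] args) throws Exception {
        String text = "中国人民银行";
        for (Analyzer a : new Analyzer[] {
                new StandardAnalyzer(Version.LUCENE_30),        // one character per token
                new SmartChineseAnalyzer(Version.LUCENE_30) }) { // word-level tokens
            TokenStream ts = a.tokenStream("contents", new StringReader(text));
            TermAttribute term = ts.addAttribute(TermAttribute.class);
            while (ts.incrementToken())
                System.out.print(term.term() + "  ");
            System.out.println();
        }
    }
}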

 

3. Counting how many documents contain the keyword

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;

import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searchnumbers {

    public static void main(String[] args) throws CorruptIndexException,
            IOException {
        String indexDir = args[0]; // 1: index directory
        String q = args[1]; // 2: the query string to parse

        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                    new StandardAnalyzer(Version.LUCENE_30)); // which Document field the query is parsed against
            Query query = parser.parse(keyword);
            TopScoreDocCollector collector = TopScoreDocCollector.create(100,
                    false);

            long start = new Date().getTime();
            is.search(query, collector); // run the query; hits are gathered by the TopScoreDocCollector
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc);
                System.out.println(doc.getField("filename") + "\t"
                        + hits[i].toString()); // the filename field of the matching doc
            }
            is.close();
            long end = new Date().getTime();

            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + keyword + "'");
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

}

 

4. Counting how often a keyword occurs among the indexed terms (same as #2, here with the separator line printed)

package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumber {

    public static void main(String[] args) throws CorruptIndexException,
            IOException {
        String indexDir = args[0]; // 1: index directory
        String q = args[1]; // 2: the keyword to look up

        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            IndexReader reader = is.getIndexReader();
            int num = reader.numDocs();
            for (int i = 0; i < num; i++) {
                Document doc = reader.document(i);
                System.out.println(doc);
            }

            Term term = new Term("contents", keyword);
            TermDocs docs = reader.termDocs(term);
            while (docs.next()) {
                System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
                System.out.print("doc num\t" + docs.doc() + "\t");
                System.out.println("frequency:\t" + docs.freq());
            }
            is.close(); // closing the searcher also closes the reader it opened
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

 

Document<stored,indexed<filename:texthz.txt> stored,indexed<fullpath:E:\xdj\weibodata\text\texthz.txt>>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc num    0    frequency:    27254

 

5. Searching for a keyword and printing the names of matching files

package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

// From chapter 1

/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Searcher {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName() + " <index dir> <query>");
        }

        String indexDir = args[0]; // 1: index directory
        String q = args[1]; // 2: the query string to parse

        search(indexDir, q);
    }

    public static void search(String indexDir, String q) throws IOException,
            ParseException {
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher is = new IndexSearcher(dir); // 3
        /*
         * QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", // 4
         *         new StandardAnalyzer(Version.LUCENE_30)); // StandardAnalyzer alternative
         */
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "contents", // 4
                new SmartChineseAnalyzer(Version.LUCENE_30)); // 4
        Query query = parser.parse(q); // 4
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 10); // 5: search the index
        long end = System.currentTimeMillis();

        System.err.println("Found " + hits.totalHits + // 6: report search stats
                " document(s) (in " + (end - start) + // 6
                " milliseconds) that matched query '" + q + "':");

        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc); // 7: retrieve the matching document

            System.out.println(doc.get("fullpath") + "  " + scoreDoc.doc); // 8: display the file path
        }
        is.close(); // 9
    }
}

/*
 * #1 Parse the provided index directory  #2 Parse the provided query string
 * #3 Open the index  #4 Parse the query  #5 Search the index  #6 Write search stats
 * #7 Retrieve the matching documents  #8 Display the file name  #9 Close the IndexSearcher
 */

 

Found 1 document(s) (in 16 milliseconds) that matched query '雨钝':
E:\xdj\weibodata\text\texthz.txt  0

 

6. Sorting search results by a field

package lia.meetlucene;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
public class Searchnumbers {
    /**
     * Build the index.<br>
     * Four Documents in total, each with two Fields: text and size. text holds
     * the content; size is used for sorting.
     * 
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File("index")), new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
        Document document = new Document();
        document.add(new Field("text", "google", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "1", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "google earth apache", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "2", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "baidu earth", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "3", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "baidu earth apache", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "4", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        writer.optimize();
        writer.close();
    }
    /**
     * Lucene 3.0 no longer has methods that return Hits; search with the
     * TopDocs-returning methods instead.
     * 
     * @param keyword
     *            the keyword to search for
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ParseException
     */
    private static void searchWithTopDocs(String keyword) throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        TopDocs topDocs = searcher.search(parser.parse(keyword), 10); // top 10 results, or fewer if there aren't that many
        ScoreDoc[] scoreDocs = topDocs.scoreDocs; // the matching ScoreDocs
        System.out.println("hits:" + topDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            int docNum = scoreDoc.doc; // document number
            Document doc = searcher.doc(docNum);
            String text = doc.get("text");
            String size = doc.get("size");
            float score = scoreDoc.score; // relevance score
            System.out.println(text + " " + size + " " + score);
        }
    }
    /**
     * Search with the hits sorted on a field; this likewise returns
     * TopFieldDocs rather than Hits.
     * 
     * @param keyword
     *            the keyword to search for
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ParseException
     */
    private static void searchWithSort(String keyword) throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", new StandardAnalyzer(Version.LUCENE_30));
        Searcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        Query query = parser.parse(keyword);
        SortField sortField = new SortField("size", SortField.INT, true); // the field to sort on (descending)
        TopFieldDocs topFieldDocs = searcher.search(query, null, 10, new Sort(sortField)); // the second argument is a filter; not needed here
        ScoreDoc[] socDocs = topFieldDocs.scoreDocs;
        System.out.println("hits:" + topFieldDocs.totalHits);
        for (ScoreDoc scoreDoc : socDocs) {
            int docNum = scoreDoc.doc;
            Document doc = searcher.doc(docNum);
            String text = doc.get("text");
            String size = doc.get("size");
            float score = scoreDoc.score; // score; not tracked for sorted searches by default, so all NaN
            System.out.println(text + " " + size + " " + score);
        }
    }
    public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
        build();
        String keyword = "google";
        searchWithTopDocs(keyword);
        System.out.println("---------");
        searchWithSort(keyword);
    }
}

 

hits:2
google 1 1.287682
google earth apache 2 0.643841
---------
hits:2
google earth apache 2 NaN
google 1 NaN
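
The NaN scores in the sorted run are expected: Lucene does not track relevance scores during field-sorted searches by default. If scores are wanted alongside the sort order, IndexSearcher can be told to compute them; a minimal sketch reusing query and sortField from searchWithSort above:

// Sketch: enable score tracking for field-sorted searches (Lucene 3.0).
IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
searcher.setDefaultFieldSortScoring(true, true); // track per-hit scores and the max score
TopFieldDocs topFieldDocs = searcher.search(query, null, 10, new Sort(sortField));
// scoreDoc.score now holds a real value instead of NaN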

 

7. Highlighting keywords in results

package lia.meetlucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
public class Searchnum {
    /**
     * Lucene 3.0 dropped the old tokenization API in favor of the
     * attribute-based one.<br>
     * This method uses SmartChineseAnalyzer to show how to tokenize text and
     * read back the resulting terms.
     * 
     * @throws Exception
     */
    public static void analysis() throws Exception {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
        String string = "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?";
        StringReader reader = new StringReader(string);
        TokenStream ts = analyzer.tokenStream("", reader);
        TermAttribute termAttribute = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print(termAttribute.term() + "  ");
        }
        System.out.println();
    }
    /**
     * Build the index.<br>
     * The IndexWriter constructor now requires a Directory argument.
     * 
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {
        String path = "index";
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new SmartChineseAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
        Document document = new Document();
        document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?", Store.YES, Index.ANALYZED));
        writer.addDocument(document);
        writer.optimize();
        writer.close();
    }
    /**
     * Searching likewise no longer offers methods that return Hits.
     * 
     * @param keyword
     * @throws CorruptIndexException
     * @throws IOException
     * @throws ParseException
     * @throws InvalidTokenOffsetsException
     */
    private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer);
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        Query query = parser.parse(keyword);
        System.out.println(query);
        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        System.out.println("hits:" + topDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String text = doc.get("text");
            System.out.println(highlight(text, query, analyzer));
        }
    }
    /**
     * Highlight the keyword.
     * 
     * @param content
     *            the text to highlight
     * @param query
     *            the Query object used for the search
     * @param analyzer
     *            the analyzer
     * @return the highlighted text
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(25));
        String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content);
        return resultString + "...";
    }
    public static void main(String[] args) throws Exception {
        analysis();
        build();
        search("人民币");
    }
}
中国  人民  银行  采取  了  一  系列  措施  防止  人民币  升值  但是  很  遗憾  这些  措施  在  今天  看来  其  作用  是  微乎其微  的  难道  真  的  就  没有  什么  别的  措施  防止  人民币  再次  疯狂  升值  了  吗  
text:人民币
hits:1
中国人民银行采取了一系列措施防止<b>人民币</b>升值,但是...
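
SimpleHTMLFormatter wraps each hit in the tag pair passed to its constructor, and SimpleFragmenter(25) caps each returned fragment at roughly 25 characters. Swapping the <b> tags for a CSS-classed span is therefore a one-line change; a hypothetical variant:

// Hypothetical variant of the formatter used in highlight() above.
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"hl\">", "</span>");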

8. Printing the 1000 most frequent terms after tokenization

package lia.meetlucene;

import java.io.File;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumbers {

    // static final Log log = LogFactory.getLog(Statistic.class);

    public static void printIndex(IndexReader reader) throws Exception {
        /*
         * // Show the number of documents
         * System.out.println(new Date() + "\n");
         * System.out.println(reader + "\tThis index contains " + reader.numDocs() + " documents\n");
         * 
         * for (int i = 0; i < reader.numDocs(); i++) {
         *     System.out.println("Document " + i + ": " + reader.document(i) + "\n");
         * }
         */
        // Enumerate terms and collect <document, term freq, position*> information
        TermEnum termEnum = reader.terms();

        List<Person_Term> listA = new ArrayList<Person_Term>();

        while (termEnum.next()) {
            Person_Term pa = new Person_Term();
            pa.setterm(termEnum.term().text());

            // NOTE: only the first posting is read here, so the recorded count
            // is the term's frequency in the first matching document, not the
            // whole collection (see the fix sketched after the output below).
            TermPositions termPositions = reader.termPositions(termEnum.term());
            termPositions.next();
            pa.setfreq(termPositions.freq());

            listA.add(pa);
        }
        Collections.sort(listA, new Comparator<Person_Term>() {

            public int compare(Person_Term arg0, Person_Term arg1) {

                return arg1.getfreq().compareTo(arg0.getfreq());

            }

        });
        int i = 0;
        for (Person_Term p : listA) {
            i++;
            System.out.println(p.getterm() + "\t" + p.getfreq());
            if (i >= 1000)
                break;
        }

    }

    /*
     * public static void main(String args[]) throws Exception { // String index
     * = ReadConfig.getPara("indexdir");
     * 
     * IndexReader reader = IndexReader.open(index); printIndex(reader);
     * 
     * }
     */

    public static void main(String[] args) throws Exception {
        // The index path is hard-coded here; pass it via args[0] if preferred.
        String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin";

        Directory dir = FSDirectory.open(new File(indexDir)); // open the index
        IndexSearcher search = new IndexSearcher(dir);
        IndexReader reader = search.getIndexReader();
        printIndex(reader);

    }
}

The code for Person_Term.java:

package lia.meetlucene;

public class Person_Term implements Comparable<Person_Term> {

    private String term;
    private Integer freq;

    /** @return the term */
    public String getterm() {
        return term;
    }

    /** @param term the term to set */
    public void setterm(String term) {
        this.term = term;
    }

    /** @return the freq */
    public Integer getfreq() {
        return freq;
    }

    /** @param freq the freq to set */
    public void setfreq(Integer freq) {
        this.freq = freq;
    }

    @Override
    public int compareTo(Person_Term arg0) {
        return this.getfreq().compareTo(arg0.getfreq());
    }

}


Output (partial):

文化    191698
中国    777
先达    33
委员    3
家中    33
C:\Users\Administrator\Desktop\xdj\weibohanzi\weibo.txt    1
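
A caveat on these counts: printIndex reads only the first posting for each term (a single termPositions.next() call), so on a multi-document index each figure is the term's frequency in the first matching document rather than the whole collection. A minimal fix is to sum over every posting:

// Sketch: collection-wide frequency for the current term; this would replace
// the single termPositions.next() call inside the enumeration loop above.
TermPositions termPositions = reader.termPositions(termEnum.term());
int total = 0;
while (termPositions.next())
    total += termPositions.freq(); // add this document's occurrences
pa.setfreq(total);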

9. Combining a query with a filter (multi-condition search)

package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;

// From chapter 1

/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Searcherw {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {

        // String indexDir = args[0]; // 1: index directory
        String indexDir = "E:/xdj/tengxunsuoying";
        String q = "雨天"; // 2: the query string to parse

        search(indexDir, q);
    }

    public static void search(String indexDir, String q) throws IOException,
            ParseException {
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher is = new IndexSearcher(dir); // 3
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "context", // 4
                // new StandardAnalyzer(Version.LUCENE_30)); // 4
                // new CJKAnalyzer(Version.LUCENE_30));
                new SmartChineseAnalyzer(Version.LUCENE_30));

        // Restrict hits to October 2014; the "time" field stores yyyyMMdd strings,
        // so lexicographic range filtering works here.
        Filter filter = new RangeFilter("time", "20141001", "20141031", true, true);

        Query query = parser.parse(q); // 4

        query = new FilteredQuery(query, filter); // search with the filter applied
        long start = System.currentTimeMillis();
        // To sort by the time field instead:
        // TopDocs hits = is.search(query, 10, new Sort(new SortField("time", SortField.STRING, true)));
        TopDocs hits = is.search(query, 10); // 5: search the index
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits + // 6: report search stats
                " document(s) (in " + (end - start) + // 6
                " milliseconds) that matched query '" + q + "':");

        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc); // 7: retrieve the matching document

            System.out.println(doc.get("time") + "  " + doc.get("context")); // 8: show time and content
        }
        is.close(); // 9
    }
}

/*
 * Reference: a numeric range search. Note that DirectoryReader is the
 * Lucene 4.x API; under 3.0 open the reader with IndexReader.open(directory).
 *
public List<Document> rangeSearch() {
    List<Document> docList = new ArrayList<Document>();
    Double start = 20.0;
    Double end = 40.0;
    NumericRangeQuery rangeQuery = NumericRangeQuery.newDoubleRange("carPrice", start, end, true, true);
    try {
        directory = FSDirectory.open(new File(LuceneConstant.INDEX_PATH)); // open the index
        IndexReader reader = DirectoryReader.open(directory); // open a reader on it
        IndexSearcher search = new IndexSearcher(reader); // set up the searcher
        TopDocs td = search.search(rangeQuery, 10000); // collect the matching doc ids
        for (ScoreDoc doc : td.scoreDocs) {
            docList.add(search.doc(doc.doc));
        }
        reader.close(); // release resources
        directory.close(); // close the directory
    } catch (IOException ex) {
        Logger.getLogger(LuceneDao.class.getName()).log(Level.SEVERE, null, ex);
    }
    return docList;
}
*/
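RangeFilter compares terms as strings, which works above only because the yyyyMMdd values sort lexicographically. For genuinely numeric fields, Lucene 3.0's NumericField together with NumericRangeQuery is the safer route; a minimal sketch (indexing the "time" field as an int is an assumption, not what the post's index actually does):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.search.NumericRangeQuery;

// At index time: store the date as a trie-encoded numeric field.
Document doc = new Document();
doc.add(new NumericField("time", Field.Store.YES, true).setIntValue(20141015));

// At search time: match October 2014 with a numeric range query.
NumericRangeQuery<Integer> range =
        NumericRangeQuery.newIntRange("time", 20141001, 20141031, true, true);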

 

posted on 2015-03-14 15:51 by 雨钝风轻