☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆AnalyzerTool分词工具.非常实用!【转】

 

AnalyzerTool分词工具.非常实用!

可以查看某个字符串最终被分词器切分成什么样子,便于在查询时真正理解为什么有的内容查得到、有的却查不到。

package test.main;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/**
 * Small diagnostic tool that shows how a Lucene {@link Analyzer} splits a
 * string into tokens, which helps explain why some queries match indexed
 * text and others do not.
 */
public class AnalyzerTool {

    /**
     * Prints every token the analyzer produces for the given string on a single
     * line, each token wrapped in 【 】, followed by a line break.
     * <p>
     * I/O failures are reported on standard error via the stack trace and are
     * not propagated to the caller.
     *
     * @param str
     *            the string to tokenize
     * @param analyzer
     *            the analyzer used to tokenize {@code str}
     */
    public static void displayToken(String str, Analyzer analyzer) {
        TokenStream stream = null;
        try {
            // Build a token stream over the input string. The field name is
            // only a label here; nothing is indexed.
            stream = analyzer.tokenStream("content", new StringReader(str));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

            // reset() is mandatory before the first incrementToken() call.
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("【" + term + "】");
            }
            // end() belongs to the successful path only: it must follow the
            // incrementToken() call that returned false, so it is not placed
            // in the finally block where it would also run after a failure.
            stream.end();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the stream, whether or not tokenizing succeeded.
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Runs one sample string through four analyzers (standard, stop, simple
     * and whitespace) and prints the tokens each of them produces.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        Analyzer[] analyzers = {
            new StandardAnalyzer(Version.LUCENE_40),
            new StopAnalyzer(Version.LUCENE_40),
            new SimpleAnalyzer(Version.LUCENE_40),
            new WhitespaceAnalyzer(Version.LUCENE_40)
        };

        String str = "-LT1TT132#########LJRT1326#########LJRT1226#########)";
        // Other sample inputs to try:
        // LT1TT132#########LJRT1326#########LJRT1226#########
        // LA939VRG###AJA###LA939VRG###WSJ###
        // LA99HRD3###SYC###
        // LZ1B22EE#########
        try {
            for (Analyzer analyzer : analyzers) {
                AnalyzerTool.displayToken(str, analyzer);
            }
        } finally {
            // Analyzers hold reusable per-thread resources; release them.
            for (Analyzer analyzer : analyzers) {
                analyzer.close();
            }
        }
    }
}

 

  

 

posted @ 2015-12-01 09:22  苦涩泪滴  阅读(221)  评论(0)  编辑  收藏  举报