Compared with the previous version the approach is clearer, and it supports both bigram and dictionary segmentation (the dictionary now does longest-match, which the previous version did not). The previous version was GBK, so this version is UTF-8. It uses the latest Sogou word list, and the segmentation feels decent; this version already adds a little semantic handling, though of course that is all trivial stuff for the experts. I will keep improving it. A small usage sketch follows the source below.
I believe some of the friends using PHP + MySQL will need Lucene's full-text indexing (of course plenty of you don't, but either way I hope we can all share resources).
// Author: font
package com.tupa.search.blog.analysis.tao;
import org.apache.lucene.analysis.*;
import java.io.*;
import java.util.*;
public final class taoTokenizer extends Tokenizer {
// "spit back out what you swallowed" -- the tokenizer consumes characters and emits them back as tokens
private static TreeMap zhwords;
private static TreeSet cnumbers;
private boolean debug;
private int offset = 0;
private int bufferIndex = 0;
private int dataLen = 0;
private String tokenType = "word";
private static final int IO_BUFFER_SIZE = 256;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final static int TRAD = 0;
public final static int SIMP = 1;
public final static int BOTH = 2;
int count = 0;
private int charform = 0;
private boolean loadwordfile = true;
public taoTokenizer(Reader in) {
input = in;
debug = false;
cnumbers = new TreeSet();
if (charform == SIMP) {
loadset(cnumbers, "c:\dict\snumbers_u8.txt");
} else if (charform == TRAD) {
loadset(cnumbers, "c:\dict\tnumbers_u8.txt");
} else {
loadset(cnumbers, "c:\dict\snumbers_u8.txt");
}
}//end init
public void loadWords() {
long starttime = System.currentTimeMillis();
if (zhwords != null)return;
zhwords = new TreeMap();
try {
InputStream words = new FileInputStream("c:\dict\sogou.txt");
BufferedReader in = new BufferedReader(new InputStreamReader(words,"utf8"));
String word = null;
while ((word = in.readLine()) != null) {
if ((word.indexOf("#") == -1) && (word.length() < 5)) {
zhwords.put(word.intern(), "1");
if (word.length() == 3) {
if (!zhwords.containsKey(word.substring(0, 2).intern())) {
zhwords.put(word.substring(0, 2).intern(), "2");
}
}
if (word.length() == 4) {
if (!zhwords.containsKey(word.substring(0, 2).intern())) {
zhwords.put(word.substring(0, 2).intern(), "2");
}
if (!zhwords.containsKey(word.substring(0, 3).intern())) {
zhwords.put(word.substring(0, 3).intern(), "2");
}
}
}
}
in.close();
long endtime = System.currentTimeMillis();
System.out.println("dictionary loaded in " + (endtime - starttime) + " ms");
} catch (IOException e) {
e.printStackTrace();
}
}
private void loadset(TreeSet targetset, String sourcefile) {
String dataline = null;
try {
InputStream words = new FileInputStream(sourcefile);
BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));
while ((dataline = in.readLine()) != null) {
if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) {
continue;
}
targetset.add(dataline.intern());
}
in.close();
}
catch (Exception e) {
System.err.println("Exception loading data file" + sourcefile + " " + e.getMessage());
System.exit(1);
}
}//end loadset
public boolean isNumber(String testword) {
boolean result = true;
for (int i = 0; i < testword.length(); i++) {
if (cnumbers.contains(testword.substring(i, i+1).intern()) == false) {
result = false;
break;
}
}
if (debug) {
System.out.println(testword + " " + result);
}
return result;
}
public Token next() throws IOException {
int type = 1; // 1 = dictionary mode, 2 = bigram mode
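// the word list is loaded lazily on first use and cached in the static map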
if (type == 1) loadWords();
int start = offset;
StringBuffer currentWord = new StringBuffer();
StringBuffer currentWord2 = new StringBuffer();
while (true) {
char c;
Character.UnicodeBlock ub;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
if (currentWord.length() == 0) {
return null;
} else {
break;
}
} else {
c = (char)ioBuffer[bufferIndex++];
ub = Character.UnicodeBlock.of(c);
}
if (Character.isLetter(c)
&& ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
tokenType = "double";
if (currentWord.length() == 0) {
start = offset - 1;
currentWord.append(c);
} else {
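// a run of Chinese numerals is kept together: keep appending while c is also a numeral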
if (isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
currentWord.append(c);
}
else{
if (type == 1){
// dictionary mode
if (zhwords.containsKey(new String(currentWord.toString()+c).intern())) {
currentWord.append(c);
} else {
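// currentWord + c is not a dictionary word or prefix: push the character back
// and emit the longest match collected so far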
offset--;
bufferIndex--;
break;
}//end else
}// end if
else if (type == 2)
{
// bigram mode: emit the pair, pushing c back so that bigrams overlap
currentWord.append(c);
offset--;
bufferIndex--;
break;
}//end else if
}// end else
}
}
else
{
if (Character.isWhitespace(c) == false) { // not a whitespace character
if (Character.isDigit(c) || Character.isLetter(c))
{
currentWord2.append(c);
}
}
else
{
if (currentWord2.toString().compareTo("")!=0)
{
Token token = new Token(currentWord2.toString(), start, start + bufferIndex, tokenType);
currentWord2.setLength(0);
return token;
}
}//end else
}//end else
}
Token token = new Token(currentWord.toString(), start, start + bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}
}//end class
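For anyone who wants to try it, here is a minimal sketch of how the tokenizer might be wrapped in an Analyzer and exercised directly. It targets the older Lucene API this code is written against (Tokenizer.next() returning Token); the class name taoAnalyzer and the sample text are just illustrative, and the dictionary files under c:\dict must exist as in the source above.

package com.tupa.search.blog.analysis.tao;

import org.apache.lucene.analysis.*;
import java.io.*;

// Hypothetical wrapper: lets taoTokenizer be used wherever Lucene expects an Analyzer.
public class taoAnalyzer extends Analyzer {
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new taoTokenizer(reader);
    }

    // Quick check: print the tokens produced for a sample string.
    // Note: this version only emits a non-CJK run when it sees the following
    // whitespace, so the sample text ends with a space.
    public static void main(String[] args) throws IOException {
        Tokenizer t = new taoTokenizer(new StringReader("lucene 全文检索 123 "));
        Token tk;
        while ((tk = t.next()) != null) {
            System.out.println(tk.termText() + " [" + tk.type() + "]");
        }
        t.close();
    }
}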