关于JAVA LUCENE的分词

Posted on 2007-08-08 07:25 analyzer 阅读(1126) 评论(0) 收藏举报

很多写PHP的朋友,在需要全文索引的时候,可能都觉得很迷惑,其实PHP在全文索引方面,在很大程度上,还是依赖了数据库.
我这里使用 JAVA 的 LUCENE，使用它的朋友很多，网上教程也不少。我这里主要是重写了 LUCENE 的分词，让它支持正向最大匹配+词典+二元切分+中英文混排，当然还有其他功能，你自己看吧。你可以将它稍微改一下，改成基于语义的分词。

如果需要商业上使用,可以和我联系一下,我这里有更强大的分词,当然不收费.

msn:webmaster@good8.com

也不啰嗦了，直接看代码。其中参考了一些网上的资料，这里一并谢过。

<?php
package taojava.search.analysis.tao;

import taojava.search.analysis.Token;
import taojava.search.analysis.Tokenizer;

import java.lang.*;
import java.io.*;
import java.util.*;

public final class taoTokenizer extends Tokenizer {

    //吃了我的给我吐出来
    private int offset = 0;

    private static TreeMap zhwords; //红黑树，是一种特殊的2叉树，是有序的，所以可以排序，HASHMAP不能排序
    private static TreeSet csurname, cforeign, cnumbers, cnotname;//需要求子集，与TREEMAP配合使用
    private String debugencoding;

    private boolean debug;

    private int bufferIndex = 0;

    private int dataLen = 0;

    private String tokenType = "word";

    private static final int IO_BUFFER_SIZE = 256;

    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    // Char form
    public final static int TRAD = 0;
    public final static int SIMP = 1;
    public final static int BOTH = 2;

    int count = 0;

    private int charform = 0;

    private boolean loadwordfile = true;

    public taoTokenizer(Reader in) {

        input = in;

        debug = false;
        debugencoding = "UTF-8";

        int count = 0;

        int treelevel;



//    缓存
        csurname = new TreeSet();
        cforeign = new TreeSet();
        cnumbers = new TreeSet();
        cnotname = new TreeSet();


        if (charform == SIMP) {//简体
            loadset(cnumbers, "c:\dict\snumbers_u8.txt");//全角数字+中文数字
            //loadset(cforeign, "data/sforeign_u8.txt");//外国人名的姓
            //loadset(csurname, "data/ssurname_u8.txt");//中文姓
            //loadset(cnotname, "data/snotname_u8.txt");//过滤字符串
        } else if (charform == TRAD) {//翻体
            loadset(cnumbers, "c:\dict\tnumbers_u8.txt");
            //loadset(cdm , "c:\dict\dm.txt");
            //loadset(cforeign, "data/tforeign_u8.txt");
            //loadset(csurname, "data/tsurname_u8.txt");
            //loadset(cnotname, "data/tnotname_u8.txt");
        } else {  // 简体 + 翻体
            loadset(cnumbers, "c:\dict\snumbers_u8.txt");
            //loadset(cdm , "c:\dict\dm.txt");
            //loadset(cforeign, "data/sforeign_u8.txt");
            //loadset(csurname, "data/ssurname_u8.txt");
            //loadset(cnotname, "data/snotname_u8.txt");
            //loadset(cnumbers, "data/tnumbers_u8.txt");
            //loadset(cforeign, "data/tforeign_u8.txt");
            //loadset(csurname, "data/tsurname_u8.txt");
            //loadset(cnotname, "data/tnotname_u8.txt");
        }

        //zhwords = new Hashtable(120000);//本来准备开一个12W的HASHTABLE

        }//end init


        public void loadWords()
        {
            long starttime = System.currentTimeMillis();

            if (zhwords != null)return;

               zhwords = new TreeMap();//实例化(所有字典文件)

            if (!loadwordfile) {//如果预读字典文件则终止程序
                return;
            }

            String newword = null;
            try {
                InputStream worddata = null;
                if (charform == SIMP) {//如果是简体中文，则读取字典
                    //worddata = new FileInputStream("c:\dict\simplexu8.txt");
                    worddata = new FileInputStream("c:\simchinese.txt");

                //worddata = getClass().getResourceAsStream("c:\dict\simplexu8.txt");
                } else if (charform == TRAD) {//如果是翻体，则读取翻体字典
                    worddata = new FileInputStream("c:\dict\tradlexu8.txt");
                //worddata = getClass().getResourceAsStream("c:\dict\tradlexu8.txt");
                } else if (charform == BOTH) {//读取简体+翻体字体
                    worddata = new FileInputStream("c:\dict\bothlexu8.txt");
                //worddata = getClass().getResourceAsStream("c:\dict\bothlexu8.txt");
                }

                BufferedReader inn = new BufferedReader(new InputStreamReader(worddata, "UTF8"));//

                while ((newword = inn.readLine()) != null) {//开始读取词库，并进行筛选
                    if ((newword.indexOf("#") == -1) && (newword.length() < 5)) {//如果是#或者长度大于5的

                        zhwords.put(newword.intern(), "1");//“key , value”

                        if (newword.length() == 3) {//如果字典长度 == 3 则选前2字
                            if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) {//返回key
                                zhwords.put(newword.substring(0,2).intern(), "2");
                            }
                        }

                        if (newword.length() == 4) {//选前2 与前3
                            if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) {
                                zhwords.put(newword.substring(0,2).intern(), "2");
                            }
                            if (zhwords.containsKey(newword.substring(0, 3).intern()) == false) {
                                zhwords.put(newword.substring(0,3).intern(), "2");
                            }
                        }

                        //if (count++ % 20000 == 0) { System.err.println("wokao"+count); }//如果大于等于20000
                    }
                }
                inn.close();//关闭READBUFFER
                long endtime = System.currentTimeMillis();
                System.out.println("dict time:" + (endtime - starttime));
            }
            catch (IOException e) {
                System.err.println("IOException: "+e);
            }
        }

        private void loadset(TreeSet targetset, String sourcefile) {//这里读取字典文件
            String dataline = null;//定义一个准备读取的String

            try {
                //InputStream words = getClass().getResourceAsStream(sourcefile);//按照流数据来读取字典文件

                InputStream words = new FileInputStream(sourcefile);
                BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));

                while ((dataline = in.readLine()) != null) {//开始循环读取
                if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) {//如果不是空，或者不是注释
                    continue;
                }
                    targetset.add(dataline.intern());//向TREESET中添加
                }

                in.close();

            }
            catch (Exception e) {
                System.err.println("Exception loading data file" + sourcefile + " " + e);
            }

        }//end loadset


        public boolean isNumber(String testword) {//是否是数字
            boolean result = true;
            for (int i = 0; i < testword.length(); i++) {
                if (cnumbers.contains(testword.substring(i, i+1).intern()) == false) {//判断数字里面是否有全角数字或者是中文数字
                result = false;
                break;
                }
            }

            if (debug) {
                try {System.out.println(new String(testword.getBytes("UTF-8")) + " " + result);}
                catch (Exception a) { };
            }

            return result;//返回结果 FALSE表示是数字
        }

        public boolean isAllForeign(String testword) {//是否是外国人
            boolean result = true;
            for (int i = 0; i < testword.length(); i++) {
                if (cforeign.contains(testword.substring(i, i+1).intern()) == false) {
                result = false;
                break;
                }
            }

            return result;//同上
        }

        public boolean isNotCJK(String testword) {//是否有中文，这里用CHAR来判断
            boolean result = true;
            for (int i = 0; i < testword.length(); i++) {
                if (Character.UnicodeBlock.of(testword.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {//使用CJK同意的文件 CJK就是CHINESE JAPANESE KOREA 一个大国和2个小村的文字
                result = false;
                break;
                }
            }

            return result;//SB就是日本人，谢谢
        }


        public final Token next() throws java.io.IOException {

            loadWords();

            int length = 0;//长度　ＴＭＤ

            int start =  this.offset;//扁移量

            tokenType = "double";

            String separator = "";

            StringBuffer currentWord = new StringBuffer();
            StringBuffer currentWord2 = new StringBuffer();//专门处理英文加数字
            while (true)
            {

                char c;

                Character.UnicodeBlock ub;


                if (bufferIndex >= dataLen) {//缓冲区
                    dataLen = input.read(ioBuffer);
                    bufferIndex = 0;
                }

                if (dataLen == -1) {
                    if (currentWord.length() == 0) {
                        return null;
                    } else {
                        break;
                    }
                } else {
                    c = (char)ioBuffer[bufferIndex++];
                    ub = Character.UnicodeBlock.of(c);
                }

                if (Character.isLetter(c) && ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {//指定的字符是否为字母

                    tokenType = "double";
                    //System.out.println("a:" + c + " is:" + Character.isLetter(c));
                    //System.out.println("len:" + currentWord.length() + "char:" + currentWord);

                    if (currentWord.length() == 0) {  // start looking for next word//此行如果是空则到下一行
                        currentWord.append(c);
                    }
                    else
                    {
                        //开始搜索－主要搜索
                        if (isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
                            currentWord.append(c);
                        }
                        else if (zhwords.containsKey(new String(currentWord.toString() + c).intern()) == true && ((String)(zhwords.get(new String(currentWord.toString() + c).intern()))).equals("1") == true) {  //哈哈在2叉树里找当前字是否在词典里
                            currentWord.append(c);
                            bufferIndex--;
                            Token token = new Token(currentWord.toString(), bufferIndex
                                    - currentWord.length(), bufferIndex, tokenType);
                            currentWord.setLength(0);
                            return token;


                        }//
                        else {
                            //currentWord.setLength(0);//单字
                            //System.out.print("c "+c+" c ");
                            currentWord.append(c);
                            bufferIndex--;
                            //System.out.print("bi "+bufferIndex+" ");

                            Token token = new Token(currentWord.toString(), bufferIndex
                                    - currentWord.length(), bufferIndex, tokenType);
                            currentWord.setLength(0);
                            return token;


                        }
                        ////////////////////这里只判断了数字　与字典　人名暂时没有添加////////////////////////


                    }

                }
                else {  // Not chinese character//数字

                    //if (Character.isDigit(c)) {

                    if (Character.isWhitespace(c) == false) {//如果不是空白字

                        currentWord2.append(c);

                    }
                    else
                    {
                        if (currentWord2.toString().compareTo("")!=0)
                        {
                            Token token = new Token(currentWord2.toString(), bufferIndex
                                    - currentWord2.length(), bufferIndex, tokenType);
                            currentWord2.setLength(0);
                            return token;
                        }
                    }



                        /*
                        if (currentWord.length() > 0) {
                            //currentWord.setLength(0);
                            //tokenType = "single";


                            bufferIndex += currentWord.length();

                            if (Character.isWhitespace(c) == false) {//如果不是空白字
                                //System.out.println("tostring1: "+c);
                                bufferIndex += currentWord.length();
                            }
                            //System.out.println("not chinese2" + c);//这里都是空格
                            currentWord.setLength(0);
                            bufferIndex--;
                            break;
                        }
                        else
                        {
                            //System.out.print("len " + currentWord.length());//数字＋英文按照空格来划分
                            if (Character.isWhitespace(c) == false) {//如果不是空白字
                                bufferIndex += currentWord.length();
                                currentWord.append(c);
                            }

                            //System.out.println("tostring2: "+currentWord.toString());
                            //bufferIndex--;
                            //break;

                        }//end else

                    */
                    //}


                }//end else

            }// end while

            Token token = new Token(currentWord.toString(), bufferIndex
                    - currentWord.length(), bufferIndex, tokenType);
            currentWord.setLength(0);
            return token;


        }//end token next();





    }//end class
?>

刷新页面返回顶部

Lee's 程序人生

公告

关于JAVA LUCENE的分词