Bookmark and Share

Lee's 程序人生

HTML CSS Javascript XML AJAX ATLAS C# C++ 数据结构 软件工程 设计模式 asp.net Java 数字图象处理 Sql 数据库
  博客园  :: 首页  :: 新随笔  :: 联系 :: 管理

关于JAVA LUCENE的分词

Posted on 2007-08-08 07:25  analyzer  阅读(1117)  评论(0)  编辑  收藏  举报
很多写PHP的朋友,在需要全文索引的时候,可能都觉得很迷惑,其实PHP在全文索引方面,在很大程度上,还是依赖了数据库.
我这里使用JAVA的LUCENE,当然使用它的朋友很多,网上教程也不少.我这里主要是重写了LUCENE的分词,让它支持正向最大匹配+词典+二元切分+中英文混排,当然还有其他功能,你自己看吧.你可以将它稍微改一下,改成基于语义的分词.

如果需要商业上使用,可以和我联系一下,我这里有更强大的分词,当然不收费.
msn:webmaster@good8.com

也不啰嗦了,直接看代码.参考了一些网上的资料,在这里一并谢过.

<?php
package taojava
.search.analysis.tao;

import taojava.search.analysis.Token;
import taojava.search.analysis.Tokenizer;

import java.lang.*;
import java.io.*;
import java.util.*;

public final class 
taoTokenizer extends Tokenizer {
    
    
//吃了我的给我吐出来
    
private int offset 0;
    
    private static 
TreeMap zhwords//红黑树,是一种特殊的2叉树,是有序的,所以可以排序,HASHMAP不能排序
    
private static TreeSet csurnamecforeigncnumberscnotname;//需要求子集,与TREEMAP配合使用
    
private String debugencoding;
    
    private 
boolean debug;

    private 
int bufferIndex 0;

    private 
int dataLen 0;
    
    private 
String tokenType "word";
    
    private static final 
int IO_BUFFER_SIZE 256;
    
    private final 
char[] ioBuffer = new char[IO_BUFFER_SIZE];
    
// Char form
    
public final static int TRAD 0;
    public final static 
int SIMP 1;
    public final static 
int BOTH 2;
    
    
int count 0;
    
    private 
int charform 0;
    
    private 
boolean loadwordfile true;
    
    public 
taoTokenizer(Reader in) {
        
        
input in;
        
        
debug false;
        
debugencoding "UTF-8";

        
int count 0;

        
int treelevel;
        
     

//    缓存
        
csurname = new TreeSet();
        
cforeign = new TreeSet();
        
cnumbers = new TreeSet();
        
cnotname = new TreeSet();
        
        
        if (
charform == SIMP) {//简体
            
loadset(cnumbers"c:\dict\snumbers_u8.txt");//全角数字+中文数字
            //loadset(cforeign, "data/sforeign_u8.txt");//外国人名的姓
            //loadset(csurname, "data/ssurname_u8.txt");//中文姓
            //loadset(cnotname, "data/snotname_u8.txt");//过滤字符串
        
} else if (charform == TRAD) {//翻体
            
loadset(cnumbers"c:\dict\tnumbers_u8.txt");
            
//loadset(cdm , "c:\dict\dm.txt");
            //loadset(cforeign, "data/tforeign_u8.txt");
            //loadset(csurname, "data/tsurname_u8.txt");
            //loadset(cnotname, "data/tnotname_u8.txt");
        
} else {  // 简体 + 翻体
            
loadset(cnumbers"c:\dict\snumbers_u8.txt");
            
//loadset(cdm , "c:\dict\dm.txt");
            //loadset(cforeign, "data/sforeign_u8.txt");
            //loadset(csurname, "data/ssurname_u8.txt");
            //loadset(cnotname, "data/snotname_u8.txt");
            //loadset(cnumbers, "data/tnumbers_u8.txt");
            //loadset(cforeign, "data/tforeign_u8.txt");
            //loadset(csurname, "data/tsurname_u8.txt");
            //loadset(cnotname, "data/tnotname_u8.txt");
        
}
        
        
//zhwords = new Hashtable(120000);//本来准备开一个12W的HASHTABLE
 

        
}//end init
        

        
public void loadWords()
        {
            
long starttime System.currentTimeMillis();
            
            if (
zhwords != null)return;
            
               
zhwords = new TreeMap();//实例化(所有字典文件)
            
            
if (!loadwordfile) {//如果预读字典文件则终止程序
                
return;
            }
            
            
String newword null;
            try {
                
InputStream worddata null;
                if (
charform == SIMP) {//如果是简体中文,则读取字典
                    //worddata = new FileInputStream("c:\dict\simplexu8.txt");
                    
worddata = new FileInputStream("c:\simchinese.txt");
                    
                
//worddata = getClass().getResourceAsStream("c:\dict\simplexu8.txt");
                
} else if (charform == TRAD) {//如果是翻体,则读取翻体字典
                    
worddata = new FileInputStream("c:\dict\tradlexu8.txt");
                
//worddata = getClass().getResourceAsStream("c:\dict\tradlexu8.txt");
                
} else if (charform == BOTH) {//读取简体+翻体字体
                    
worddata = new FileInputStream("c:\dict\bothlexu8.txt");
                
//worddata = getClass().getResourceAsStream("c:\dict\bothlexu8.txt");
                
}

                
BufferedReader inn = new BufferedReader(new InputStreamReader(worddata"UTF8"));//
                
                
while ((newword inn.readLine()) != null) {//开始读取词库,并进行筛选
                    
if ((newword.indexOf("#") == -1) && (newword.length() < 5)) {//如果是#或者长度大于5的
        
                        
zhwords.put(newword.intern(), "1");//“key , value”
        
                        
if (newword.length() == 3) {//如果字典长度 == 3 则选前2字
                            
if (zhwords.containsKey(newword.substring(02).intern()) == false) {//返回key
                                
zhwords.put(newword.substring(0,2).intern(), "2");
                            }
                        }
        
                        if (
newword.length() == 4) {//选前2 与前3
                            
if (zhwords.containsKey(newword.substring(02).intern()) == false) {
                                
zhwords.put(newword.substring(0,2).intern(), "2");
                            }
                            if (
zhwords.containsKey(newword.substring(03).intern()) == false) {
                                
zhwords.put(newword.substring(0,3).intern(), "2");
                            }
                        }
                        
                        
//if (count++ % 20000 == 0) { System.err.println("wokao"+count); }//如果大于等于20000
                    
}
                } 
                
inn.close();//关闭READBUFFER
                
long endtime System.currentTimeMillis();
                
System.out.println("dict time:" + (endtime starttime));
            }
            catch (
IOException e) {
                
System.err.println("IOException: "+e);
            }
        }
    
        private 
void loadset(TreeSet targetsetString sourcefile) {//这里读取字典文件
            
String dataline null;//定义一个准备读取的String

            
try {                
                
//InputStream words = getClass().getResourceAsStream(sourcefile);//按照流数据来读取字典文件
    
                
InputStream words = new FileInputStream(sourcefile);
                
BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));
               
                while ((
dataline in.readLine()) != null) {//开始循环读取
                
if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) {//如果不是空,或者不是注释
                    
continue;
                }
                    
targetset.add(dataline.intern());//向TREESET中添加
                
}
                
                
in.close();
                
            }
            catch (
Exception e) {
                
System.err.println("Exception loading data file" sourcefile " " e);
            }

        }
//end loadset


        
        
public boolean isNumber(String testword) {//是否是数字
            
boolean result true;
            for (
int i 0testword.length(); i++) {
                if (
cnumbers.contains(testword.substring(ii+1).intern()) == false) {//判断数字里面是否有全角数字或者是中文数字
                
result false;
                break;
                }
            }

            if (
debug) {
                try {
System.out.println(new String(testword.getBytes("UTF-8")) + " " result);} 
                catch (
Exception a) { };
            }

            return 
result;//返回结果 FALSE表示是数字
        
}

        public 
boolean isAllForeign(String testword) {//是否是外国人
            
boolean result true;
            for (
int i 0testword.length(); i++) {
                if (
cforeign.contains(testword.substring(ii+1).intern()) == false) {
                
result false;
                break;
                }
            }

            return 
result;//同上
        
}

        public 
boolean isNotCJK(String testword) {//是否有中文,这里用CHAR来判断
            
boolean result true;
            for (
int i 0testword.length(); i++) {
                if (
Character.UnicodeBlock.of(testword.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {//使用CJK同意的文件 CJK就是CHINESE JAPANESE KOREA 一个大国和2个小村的文字
                
result false;
                break;
                }
            }

            return 
result;//SB就是日本人,谢谢
        
}        
        
        
        public final 
Token next() throws java.io.IOException {
       
            
loadWords();
            
            
int length 0;//长度 TMD
            
            
int start =  this.offset;//扁移量
            
            
tokenType "double";
            
            
String separator "";
            
            
StringBuffer currentWord = new StringBuffer();
            
StringBuffer currentWord2 = new StringBuffer();//专门处理英文加数字
            
while (true)
            {

                
char c;
                
                
Character.UnicodeBlock ub;
                

                if (
bufferIndex >= dataLen) {//缓冲区
                    
dataLen input.read(ioBuffer);
                    
bufferIndex 0;
                }
                
                if (
dataLen == -1) {
                    if (
currentWord.length() == 0) {
                        return 
null;
                    } else {
                        break;
                    }
                } else {
                    
= (char)ioBuffer[bufferIndex++];                
                    
ub Character.UnicodeBlock.of(c);
                }

                if (
Character.isLetter(c) && ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {//指定的字符是否为字母
                    
                    
tokenType "double";
                    
//System.out.println("a:" + c + " is:" + Character.isLetter(c));
                    //System.out.println("len:" + currentWord.length() + "char:" + currentWord);
                    
                    
if (currentWord.length() == 0) {  // start looking for next word//此行如果是空 则到下一行
                        
currentWord.append(c);
                    }
                    else
                    {
                        
//开始搜索-主要搜索
                        
if (isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
                            
currentWord.append(c);                    
                        }
                        else if (
zhwords.containsKey(new String(currentWord.toString() + c).intern()) == true && ((String)(zhwords.get(new String(currentWord.toString() + c).intern()))).equals("1") == true) {  //哈哈 在2叉树里找当前字是否在词典里
                            
currentWord.append(c);
                            
bufferIndex--;
                            
Token token = new Token(currentWord.toString(), bufferIndex
                                    
currentWord.length(), bufferIndextokenType);
                            
currentWord.setLength(0);
                            return 
token;
                            
                            
                        }
//
                        
else {
                            
//currentWord.setLength(0);//单字
                            //System.out.print("c "+c+" c ");
                            
currentWord.append(c);
                            
bufferIndex--;
                            
//System.out.print("bi "+bufferIndex+" ");
                            
                            
Token token = new Token(currentWord.toString(), bufferIndex
                                    
currentWord.length(), bufferIndextokenType);
                            
currentWord.setLength(0);
                            return 
token;
                            
                            
                        }
                        
////////////////////这里只判断了数字 与字典 人名暂时没有添加////////////////////////

                        
                    
}
                    
                }
                else {  
// Not chinese character//数字
                    
                    //if (Character.isDigit(c)) {
                    
                    
if (Character.isWhitespace(c) == false) {//如果不是空白字

                        
currentWord2.append(c);    

                    }
                    else
                    {
                        if (
currentWord2.toString().compareTo("")!=0)
                        {
                            
Token token = new Token(currentWord2.toString(), bufferIndex
                                    
currentWord2.length(), bufferIndextokenType);
                            
currentWord2.setLength(0);
                            return 
token;
                        }
                    }
                        
                        

                        
/*
                        if (currentWord.length() > 0) {
                            //currentWord.setLength(0);
                            //tokenType = "single";
                            
                            
                            bufferIndex += currentWord.length();
                            
                            if (Character.isWhitespace(c) == false) {//如果不是空白字
                                //System.out.println("tostring1: "+c);
                                bufferIndex += currentWord.length(); 
                            }
                            //System.out.println("not chinese2" + c);//这里都是空格
                            currentWord.setLength(0);
                            bufferIndex--;
                            break;
                        }
                        else
                        {
                            //System.out.print("len " + currentWord.length());//数字+英文按照空格来划分
                            if (Character.isWhitespace(c) == false) {//如果不是空白字
                                bufferIndex += currentWord.length(); 
                                currentWord.append(c);
                            }
                            
                            //System.out.println("tostring2: "+currentWord.toString());
                            //bufferIndex--;
                            //break;
                            
                        }//end else
                    
                    */
                    //}
                    
                    
                
}//end else

            
}// end while
            
            
Token token = new Token(currentWord.toString(), bufferIndex
                    
currentWord.length(), bufferIndextokenType);
            
currentWord.setLength(0);
            return 
token;
            
        
        }
//end token next();
        
        
        
        
        
    
}//end class
?>
我要啦免费统计