Bookmark and Share

Lee's 程序人生

HTML CSS Javascript XML AJAX ATLAS C# C++ 数据结构 软件工程 设计模式 asp.net Java 数字图象处理 Sql 数据库
  博客园  :: 首页  :: 新随笔  :: 联系 :: 管理

lucene分词2.0

Posted on 2007-08-08 07:26  analyzer  阅读(652)  评论(0)  编辑  收藏  举报
比起上一个版本,思路更清楚,同时支持2元和字典两种切分方式(字典可以按长匹配,上个版本中不支持)。上个版本是GBK,这个版本改为UTF8。使用最新SOGOU的词库,感觉切词还行,此版本中已经加了点语义,当然对于大虾来说都是小事。以后我会不断完善。

相信使用PHP+MYSQL的朋友,有的需要使用到LUCENE的全文索引(当然也有很多哥们不用,反正希望大家资源共享)

<?php
//作者:font 
package com.tupa.search.blog.analysis.tao;

import org.apache.lucene.analysis.*;

import java.io.*;
import java.util.*;

public final class 
taoTokenizer extends Tokenizer {
    
    
//吃了我的再给我吐出来
    
    
private static TreeMap zhwords
    private static 
TreeSet cnumbers;
    
    private 
boolean debug;

    private 
int offset 0;
    
    private 
int bufferIndex 0;

    private 
int dataLen 0;
    
    private 
String tokenType "word";
    
    private static final 
int IO_BUFFER_SIZE 256;
    
    private final 
char[] ioBuffer = new char[IO_BUFFER_SIZE];
    
    public final static 
int TRAD 0;
    public final static 
int SIMP 1;
    public final static 
int BOTH 2;
    
    
int count 0;
    
    private 
int charform 0;
    
    private 
boolean loadwordfile true;
    
    public 
taoTokenizer(Reader in) {
        
        
input in;
        
debug false;
        
cnumbers = new TreeSet();
    
        if (
charform == SIMP) {
            
loadset(cnumbers"c:\dict\snumbers_u8.txt");
        } else if (
charform == TRAD) {
            
loadset(cnumbers"c:\dict\tnumbers_u8.txt");
        } else {  
            
loadset(cnumbers"c:\dict\snumbers_u8.txt");
        }
        
        }
//end init
    
    
public void loadWords() {
        
long starttime System.currentTimeMillis();
        if (
zhwords != null)return;
        
zhwords = new TreeMap();
        
        try {
            
InputStream words = new FileInputStream("c:\dict\sogou.txt");
            
BufferedReader in = new BufferedReader(new InputStreamReader(words,"utf8"));
            
String word null;

            while ((
word in.readLine()) != null) {
                if ((
word.indexOf("#") == -1) && (word.length() < 5)) {
                    
zhwords.put(word.intern(), "1");
                    if (
word.length() == 3) {
                        if (!
zhwords.containsKey(word.substring(02).intern())) {
                            
zhwords.put(word.substring(02).intern(), "2");
                        }
                        
                    }
                    if (
word.length() == 4) {
                        if (!
zhwords.containsKey(word.substring(02).intern())) {
                            
zhwords.put(word.substring(02).intern(), "2");
                        }
                        if (!
zhwords.containsKey(word.substring(03).intern())) {
                            
zhwords.put(word.substring(03).intern(), "2");
                        }
                        

                    }
                }
            }
            
in.close();
            
long endtime System.currentTimeMillis();
            
System.out.println(endtime starttime);
        } catch (
IOException e) {
            
e.printStackTrace();
        }
    }   
    
    
    
        private 
void loadset(TreeSet targetsetString sourcefile) {
            
String dataline null;

            try {                

                
InputStream words = new FileInputStream(sourcefile);
                
BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));
               
                while ((
dataline in.readLine()) != null) {
                if ((
dataline.indexOf("#") > -1) || (dataline.length() == 0)) {
                    continue;
                }
                    
targetset.add(dataline.intern());
                }
                
                
in.close();
                
            }
            catch (
Exception e) {
                
System.err.println("Exception loading data file" sourcefile " " e.getMessage());
                
System.exit(1);
            }

        }
//end loadset
        
        
public boolean isNumber(String testword) {
            
boolean result true;
            for (
int i 0testword.length(); i++) {
                if (
cnumbers.contains(testword.substring(ii+1).intern()) == false) {
                    
result false;
                    break;
                }
            }

            if (
debug) {
                try {
System.out.println(new String(testword.getBytes("UTF-8")) + " " result);} 
                catch (
Exception a) { };
            }

            return 
result;
        }

        public 
Token next() throws IOException {
            
int type 1;//1表示词典 2表示2元

            
if (type == 1loadWords();
            
int start offset;
            
StringBuffer currentWord = new StringBuffer();
            
StringBuffer currentWord2 = new StringBuffer();
            while (
true) {
                
char c;
                
Character.UnicodeBlock ub;
                
                
offset++;
                
                if (
bufferIndex >= dataLen) {
                    
dataLen input.read(ioBuffer);
                    
bufferIndex 0;
                }
                
                if (
dataLen == -1) {
                    if (
currentWord.length() == 0) {
                        return 
null;
                    } else {
                        break;
                    }
                } else {
                    
= (char)ioBuffer[bufferIndex++];                
                    
ub Character.UnicodeBlock.of(c);
                }
                
                    if (
Character.isLetter(c)
                            && 
ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                        
tokenType "double";
                        if (
currentWord.length() == 0) {
                            
start offset 1;
                            
currentWord.append(c);                    
                        } else {
                    
                            
                            if (
isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
                                
currentWord.append(c);                    
                            }
                            else{
                                if (
type == 1){
                                    
//词典
                                    
if (zhwords.containsKey(new String(currentWord.toString()+c).intern())) {
                                        
currentWord.append(c);                        
                                    } else {
                                        
offset--;
                                        
bufferIndex--;
                                        break;
                                    }
//end else
                                
}// end if
                                
else if (type == 2)
                                {
                                    
//2元
                                    
currentWord.append(c);
                                    
offset--;
                                    
bufferIndex--;
                                    break;
                                }
//end else if
                                            
                            
}// end else
                        
}
                    }
                    else
                    {
                        if (
Character.isWhitespace(c) == false) {//如果不是空白字
                            
if ((Character.isDigit(c)==true) || (Character.isLetter(c)))
                            {
                                
currentWord2.append(c);
                            }
                        }
                        else
                        {
                            if (
currentWord2.toString().compareTo("")!=0)
                            {
                                                                
Token token = new Token(currentWord2.toString(), startstart bufferIndextokenType);
                                
currentWord2.setLength(0);
                                return 
token;
                            }
                        }
//end else
                    
}//end else
                
}
            
            
            
Token token = new Token(currentWord.toString(), startstart bufferIndextokenType);
            
currentWord.setLength(0);
            return 
token;
            
        }    
            
        
}
//end class
?>
我要啦免费统计