Compared with the previous version the approach is clearer, and it supports both bigram and dictionary segmentation (the dictionary now does longest-match, which the previous version did not). The previous version was GBK, so this version is UTF-8. It uses the latest Sogou word list, and the segmentation feels decent; this version already adds a little semantic handling, though of course that is all trivial stuff for the experts. I will keep improving it. A small usage sketch follows the source below.
I believe some of the friends using PHP + MySQL will need Lucene's full-text indexing (of course plenty of you don't, but either way I hope we can all share resources).
// Author: font
package com.tupa.search.blog.analysis.tao;
import org.apache.lucene.analysis.*;
import java.io.*;
import java.util.*;
public final class taoTokenizer extends Tokenizer {
// "spit back out what you swallowed" -- the tokenizer consumes characters and emits them back as tokens
private static TreeMap zhwords;
private static TreeSet cnumbers;
private boolean debug;
private int offset = 0;
private int bufferIndex = 0;
private int dataLen = 0;
private String tokenType = "word";
private static final int IO_BUFFER_SIZE = 256;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final static int TRAD = 0;
public final static int SIMP = 1;
public final static int BOTH = 2;
int count = 0;
private int charform = 0;
private boolean loadwordfile = true;
public taoTokenizer(Reader in) {
input = in;
debug = false;
cnumbers = new TreeSet();
if (charform == SIMP) {
loadset(cnumbers, "c:\dict\snumbers_u8.txt");
} else if (charform == TRAD) {
loadset(cnumbers, "c:\dict\tnumbers_u8.txt");
} else {
loadset(cnumbers, "c:\dict\snumbers_u8.txt");
}
}//end init
public void loadWords() {
long starttime = System.currentTimeMillis();
if (zhwords != null)return;
zhwords = new TreeMap();
try {
InputStream words = new FileInputStream("c:\dict\sogou.txt");
BufferedReader in = new BufferedReader(new InputStreamReader(words,"utf8"));
String word = null;
while ((word = in.readLine()) != null) {
if ((word.indexOf("#") == -1) && (word.length() < 5)) {
zhwords.put(word.intern(), "1");
if (word.length() == 3) {
if (!zhwords.containsKey(word.substring(0, 2).intern())) {
zhwords.put(word.substring(0, 2).intern(), "2");
}
}
if (word.length() == 4) {
if (!zhwords.containsKey(word.substring(0, 2).intern())) {
zhwords.put(word.substring(0, 2).intern(), "2");
}
if (!zhwords.containsKey(word.substring(0, 3).intern())) {
zhwords.put(word.substring(0, 3).intern(), "2");
}
}
}
}
in.close();
long endtime = System.currentTimeMillis();
System.out.println("dictionary loaded in " + (endtime - starttime) + " ms");
} catch (IOException e) {
e.printStackTrace();
}
}
private void loadset(TreeSet targetset, String sourcefile) {
String dataline = null;
try {
InputStream words = new FileInputStream(sourcefile);
BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));
while ((dataline = in.readLine()) != null) {
if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) {
continue;
}
targetset.add(dataline.intern());
}
in.close();
}
catch (Exception e) {
System.err.println("Exception loading data file" + sourcefile + " " + e.getMessage());
System.exit(1);
}
}//end loadset
public boolean isNumber(String testword) {
boolean result = true;
for (int i = 0; i < testword.length(); i++) {
if (cnumbers.contains(testword.substring(i, i+1).intern()) == false) {
result = false;
break;
}
}
if (debug) {
System.out.println(testword + " " + result);
}
return result;
}
public Token next() throws IOException {
int type = 1; // 1 = dictionary mode, 2 = bigram mode
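// the word list is loaded lazily on first use and cached in the static map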
if (type == 1) loadWords();
int start = offset;
StringBuffer currentWord = new StringBuffer();
StringBuffer currentWord2 = new StringBuffer();
while (true) {
char c;
Character.UnicodeBlock ub;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
if (currentWord.length() == 0) {
return null;
} else {
break;
}
} else {
c = (char)ioBuffer[bufferIndex++];
ub = Character.UnicodeBlock.of(c);
}
if (Character.isLetter(c)
&& ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
tokenType = "double";
if (currentWord.length() == 0) {
start = offset - 1;
currentWord.append(c);
} else {
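// a run of Chinese numerals is kept together: keep appending while c is also a numeral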
if (isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
currentWord.append(c);
}
else{
if (type == 1){
// dictionary mode
if (zhwords.containsKey(new String(currentWord.toString()+c).intern())) {
currentWord.append(c);
} else {
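// currentWord + c is not a dictionary word or prefix: push the character back
// and emit the longest match collected so far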
offset--;
bufferIndex--;
break;
}//end else
}// end if
else if (type == 2)
{
// bigram mode: emit the pair, pushing c back so that bigrams overlap
currentWord.append(c);
offset--;
bufferIndex--;
break;
}//end else if
}// end else
}
}
else
{
if (Character.isWhitespace(c) == false) { // not a whitespace character
if (Character.isDigit(c) || Character.isLetter(c))
{
currentWord2.append(c);
}
}
else
{
if (currentWord2.toString().compareTo("")!=0)
{
Token token = new Token(currentWord2.toString(), start, start + bufferIndex, tokenType);
currentWord2.setLength(0);
return token;
}
}//end else
}//end else
}
Token token = new Token(currentWord.toString(), start, start + bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}
}//end class
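For anyone who wants to try it, here is a minimal sketch of how the tokenizer might be wrapped in an Analyzer and exercised directly. It targets the older Lucene API this code is written against (Tokenizer.next() returning Token); the class name taoAnalyzer and the sample text are just illustrative, and the dictionary files under c:\dict must exist as in the source above.

package com.tupa.search.blog.analysis.tao;

import org.apache.lucene.analysis.*;
import java.io.*;

// Hypothetical wrapper: lets taoTokenizer be used wherever Lucene expects an Analyzer.
public class taoAnalyzer extends Analyzer {
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new taoTokenizer(reader);
    }

    // Quick check: print the tokens produced for a sample string.
    // Note: this version only emits a non-CJK run when it sees the following
    // whitespace, so the sample text ends with a space.
    public static void main(String[] args) throws IOException {
        Tokenizer t = new taoTokenizer(new StringReader("lucene 全文检索 123 "));
        Token tk;
        while ((tk = t.next()) != null) {
            System.out.println(tk.termText() + " [" + tk.type() + "]");
        }
        t.close();
    }
}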