很多写PHP的朋友,在需要全文索引的时候,可能都觉得很迷惑,其实PHP在全文索引方面,在很大程度上,还是依赖了数据库.
我这里使用JAVA的LUCENE,使用它的朋友很多,网上教程也不少.我这里主要是重写了LUCENE的分词,让它支持正向最大匹配+词典+2元+中英文混排,当然还有其他功能,你自己看吧.你可以将它稍微改一下,改成语义分词.
如果需要商业上使用,可以和我联系一下,我这里有更强大的分词,当然不收费.
msn:webmaster@good8.com
也不啰嗦了,直接看代码.参考了一些网上的资料,这里就一并谢过.
我这里使用JAVA的LUCENE,使用它的朋友很多,网上教程也不少.我这里主要是重写了LUCENE的分词,让它支持正向最大匹配+词典+2元+中英文混排,当然还有其他功能,你自己看吧.你可以将它稍微改一下,改成语义分词.
如果需要商业上使用,可以和我联系一下,我这里有更强大的分词,当然不收费.
msn:webmaster@good8.com
也不啰嗦了,直接看代码.参考了一些网上的资料,这里就一并谢过.
<?php
package taojava.search.analysis.tao;
import taojava.search.analysis.Token;
import taojava.search.analysis.Tokenizer;
import java.lang.*;
import java.io.*;
import java.util.*;
public final class taoTokenizer extends Tokenizer {
// ---- Per-instance scanning state and dictionaries shared by all instances ----
// Never modified in this class; next() only copies it into a local variable.
private int offset = 0;
private static TreeMap zhwords; // shared dictionary filled by loadWords(): word -> "1" (complete word) or "2" (prefix of a longer word). TreeMap is a red-black tree, so keys stay sorted (a HashMap would not keep them ordered).
private static TreeSet csurname, cforeign, cnumbers, cnotname;// shared character sets created by the constructor; only cnumbers is filled from a file there
// Set to "UTF-8" by the constructor; not read anywhere else in this class.
private String debugencoding;
// When true, isNumber() prints each tested word and its result.
private boolean debug;
// Index of the next unread character in ioBuffer.
private int bufferIndex = 0;
// Number of characters placed in ioBuffer by the last read, or -1 at end of input.
private int dataLen = 0;
// Type label passed to every Token; next() overwrites it with "double".
private String tokenType = "word";
private static final int IO_BUFFER_SIZE = 256;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
// Character forms selectable through charform: traditional, simplified, or both.
public final static int TRAD = 0;
public final static int SIMP = 1;
public final static int BOTH = 2;
// Only referenced from commented-out progress logging in loadWords().
int count = 0;
// Selects which dictionary files are loaded; the initial value 0 equals TRAD.
private int charform = 0;
// When false, loadWords() leaves the word dictionary empty.
private boolean loadwordfile = true;
/**
 * Creates a tokenizer that reads its text from {@code in}.
 *
 * <p>The character sets are static and therefore shared by every instance, so they are
 * built by the first constructor call only instead of being re-read from disk for each
 * new tokenizer. Only the numeral set is loaded from a file; the surname, foreign-name
 * and not-a-name sets are created empty.
 *
 * @param in source of the characters to tokenize
 */
public taoTokenizer(Reader in) {
    input = in;
    debug = false;
    debugencoding = "UTF-8";

    if (cnumbers == null) {
        csurname = new TreeSet();
        cforeign = new TreeSet();
        cnotname = new TreeSet();

        TreeSet numbers = new TreeSet();
        // Backslashes in Windows paths must be escaped: "\d" is not a legal Java escape,
        // and "\t" would silently turn into a tab character inside the file name.
        if (charform == TRAD) {
            // Traditional: full-width digits plus Chinese numerals.
            loadset(numbers, "c:\\dict\\tnumbers_u8.txt");
        } else {
            // Simplified, and simplified + traditional, both use the simplified numerals file.
            loadset(numbers, "c:\\dict\\snumbers_u8.txt");
        }
        // Assigned last because cnumbers doubles as the "already loaded" marker.
        cnumbers = numbers;
    }
}//end init
public void loadWords()
{
long starttime = System.currentTimeMillis();
if (zhwords != null)return;
zhwords = new TreeMap();//实例化(所有字典文件)
if (!loadwordfile) {//如果预读字典文件则终止程序
return;
}
String newword = null;
try {
InputStream worddata = null;
if (charform == SIMP) {//如果是简体中文,则读取字典
//worddata = new FileInputStream("c:\dict\simplexu8.txt");
worddata = new FileInputStream("c:\simchinese.txt");
//worddata = getClass().getResourceAsStream("c:\dict\simplexu8.txt");
} else if (charform == TRAD) {//如果是翻体,则读取翻体字典
worddata = new FileInputStream("c:\dict\tradlexu8.txt");
//worddata = getClass().getResourceAsStream("c:\dict\tradlexu8.txt");
} else if (charform == BOTH) {//读取简体+翻体字体
worddata = new FileInputStream("c:\dict\bothlexu8.txt");
//worddata = getClass().getResourceAsStream("c:\dict\bothlexu8.txt");
}
BufferedReader inn = new BufferedReader(new InputStreamReader(worddata, "UTF8"));//
while ((newword = inn.readLine()) != null) {//开始读取词库,并进行筛选
if ((newword.indexOf("#") == -1) && (newword.length() < 5)) {//如果是#或者长度大于5的
zhwords.put(newword.intern(), "1");//“key , value”
if (newword.length() == 3) {//如果字典长度 == 3 则选前2字
if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) {//返回key
zhwords.put(newword.substring(0,2).intern(), "2");
}
}
if (newword.length() == 4) {//选前2 与前3
if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) {
zhwords.put(newword.substring(0,2).intern(), "2");
}
if (zhwords.containsKey(newword.substring(0, 3).intern()) == false) {
zhwords.put(newword.substring(0,3).intern(), "2");
}
}
//if (count++ % 20000 == 0) { System.err.println("wokao"+count); }//如果大于等于20000
}
}
inn.close();//关闭READBUFFER
long endtime = System.currentTimeMillis();
System.out.println("dict time:" + (endtime - starttime));
}
catch (IOException e) {
System.err.println("IOException: "+e);
}
}
/**
 * Reads a UTF-8 text file line by line and adds every usable line to {@code targetset}.
 * Empty lines and lines containing {@code '#'} are skipped. Failures are reported on
 * {@code System.err}; entries read before the failure stay in the set.
 *
 * @param targetset  set that receives the entries
 * @param sourcefile path of the file to read
 */
private void loadset(TreeSet targetset, String sourcefile) {
    InputStream words = null;
    try {
        words = new FileInputStream(sourcefile);
        BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));
        String dataline;
        while ((dataline = in.readLine()) != null) {
            if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
                continue;
            }
            targetset.add(dataline);
        }
    }
    catch (Exception e) {
        System.err.println("Exception loading data file" + sourcefile + " " + e);
    }
    finally {
        // Close the file even when reading failed, so the handle is not leaked.
        if (words != null) {
            try {
                words.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
}//end loadset
/**
 * Tells whether every character of {@code testword} is contained in the numeral set
 * {@code cnumbers}.
 *
 * @param testword text to check
 * @return {@code true} when all characters are numerals (also for an empty string),
 *         {@code false} as soon as one character is not
 */
public boolean isNumber(String testword) {
    boolean allNumerals = true;
    int pos = 0;
    while (allNumerals && pos < testword.length()) {
        String single = testword.substring(pos, pos + 1).intern();
        allNumerals = cnumbers.contains(single);
        pos++;
    }
    if (debug) {
        try {
            System.out.println(new String(testword.getBytes("UTF-8")) + " " + allNumerals);
        } catch (Exception a) {
            // Debug output only; failures are ignored.
        }
    }
    return allNumerals;
}
/**
 * Tells whether every character of {@code testword} is contained in the foreign-name
 * character set {@code cforeign}.
 *
 * @param testword text to check
 * @return {@code true} when all characters are in the set (also for an empty string)
 */
public boolean isAllForeign(String testword) {
    int total = testword.length();
    int pos = 0;
    while (pos < total) {
        String single = testword.substring(pos, pos + 1).intern();
        if (!cforeign.contains(single)) {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Tells whether {@code testword} is free of characters from the CJK Unified Ideographs
 * Unicode block. The check is done one {@code char} at a time.
 *
 * @param testword text to check
 * @return {@code false} if at least one character is a CJK unified ideograph,
 *         {@code true} otherwise (also for an empty string)
 */
public boolean isNotCJK(String testword) {
    char[] chars = testword.toCharArray();
    for (int idx = 0; idx < chars.length; idx++) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(chars[idx]);
        if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
            return false;
        }
    }
    return true;
}
/**
 * Returns the next token from the input, or null at end of input.
 *
 * Ideographs (letters in the CJK Unified Ideographs block) are collected in currentWord:
 * a run of numeral characters keeps growing, otherwise the token ends as soon as a second
 * ideograph has been appended. In that case bufferIndex is stepped back by one, so the
 * last ideograph is read again by the following call and consecutive tokens overlap by
 * one character. All other non-whitespace characters are collected in currentWord2 and
 * returned when a whitespace character is read.
 *
 * NOTE(review): token offsets are computed from bufferIndex, which is reset to 0 on every
 * buffer refill, so they are positions inside ioBuffer rather than in the whole input.
 * NOTE(review): currentWord2 is a local that is only returned on whitespace; non-CJK text
 * that is followed directly by an ideograph or by end of input appears to be dropped.
 *
 * @return the next Token, or null when the input is exhausted
 * @throws java.io.IOException if reading from the input fails
 */
public final Token next() throws java.io.IOException {
// Make sure the shared dictionary is available (no-op after the first call).
loadWords();
int length = 0;// not used below
int start = this.offset;// offset snapshot; not used below
tokenType = "double";
String separator = "";
StringBuffer currentWord = new StringBuffer();
StringBuffer currentWord2 = new StringBuffer();// collects non-CJK text such as Latin letters and digits
while (true)
{
char c;
Character.UnicodeBlock ub;
if (bufferIndex >= dataLen) {// buffer exhausted: refill it from the input
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
// End of input: emit the pending ideograph token if there is one.
if (currentWord.length() == 0) {
return null;
} else {
break;
}
} else {
c = (char)ioBuffer[bufferIndex++];
ub = Character.UnicodeBlock.of(c);
}
if (Character.isLetter(c) && ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {// c is a letter in the CJK Unified Ideographs block
tokenType = "double";
if (currentWord.length() == 0) { // first ideograph of a new token
currentWord.append(c);
}
else
{
// Main lookup: numerals first, then the dictionary.
if (isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
// Still inside a run of numeral characters: keep extending it.
currentWord.append(c);
}
else if (zhwords.containsKey(new String(currentWord.toString() + c).intern()) == true && ((String)(zhwords.get(new String(currentWord.toString() + c).intern()))).equals("1") == true) { // currentWord + c is a complete dictionary word (value "1")
currentWord.append(c);
bufferIndex--;
Token token = new Token(currentWord.toString(), bufferIndex
- currentWord.length(), bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}
else {
// Not a dictionary word: same handling as the branch above, so the
// pair of ideographs is still returned as one token.
currentWord.append(c);
bufferIndex--;
Token token = new Token(currentWord.toString(), bufferIndex
- currentWord.length(), bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}
// Only numerals and the dictionary are consulted here; personal-name detection is not implemented yet.
}
}
else { // not a CJK ideograph
if (Character.isWhitespace(c) == false) {// non-whitespace: extend the non-CJK token
currentWord2.append(c);
}
else
{
// Whitespace ends a non-CJK token, if one has been collected.
if (currentWord2.toString().compareTo("")!=0)
{
Token token = new Token(currentWord2.toString(), bufferIndex
- currentWord2.length(), bufferIndex, tokenType);
currentWord2.setLength(0);
return token;
}
}
/* (disabled earlier handling of non-CJK characters omitted) */
}//end else
}// end while
// Reached only via the break at end of input with a pending ideograph token.
Token token = new Token(currentWord.toString(), bufferIndex
- currentWord.length(), bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}//end token next();
}//end class
?>
package taojava.search.analysis.tao;
import taojava.search.analysis.Token;
import taojava.search.analysis.Tokenizer;
import java.lang.*;
import java.io.*;
import java.util.*;
public final class taoTokenizer extends Tokenizer {
// ---- Per-instance scanning state and dictionaries shared by all instances ----
// Never modified in this class; next() only copies it into a local variable.
private int offset = 0;
private static TreeMap zhwords; // shared dictionary filled by loadWords(): word -> "1" (complete word) or "2" (prefix of a longer word). TreeMap is a red-black tree, so keys stay sorted (a HashMap would not keep them ordered).
private static TreeSet csurname, cforeign, cnumbers, cnotname;// shared character sets created by the constructor; only cnumbers is filled from a file there
// Set to "UTF-8" by the constructor; not read anywhere else in this class.
private String debugencoding;
// When true, isNumber() prints each tested word and its result.
private boolean debug;
// Index of the next unread character in ioBuffer.
private int bufferIndex = 0;
// Number of characters placed in ioBuffer by the last read, or -1 at end of input.
private int dataLen = 0;
// Type label passed to every Token; next() overwrites it with "double".
private String tokenType = "word";
private static final int IO_BUFFER_SIZE = 256;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
// Character forms selectable through charform: traditional, simplified, or both.
public final static int TRAD = 0;
public final static int SIMP = 1;
public final static int BOTH = 2;
// Only referenced from commented-out progress logging in loadWords().
int count = 0;
// Selects which dictionary files are loaded; the initial value 0 equals TRAD.
private int charform = 0;
// When false, loadWords() leaves the word dictionary empty.
private boolean loadwordfile = true;
/**
 * Creates a tokenizer that reads its text from {@code in}.
 *
 * <p>The character sets are static and therefore shared by every instance, so they are
 * built by the first constructor call only instead of being re-read from disk for each
 * new tokenizer. Only the numeral set is loaded from a file; the surname, foreign-name
 * and not-a-name sets are created empty.
 *
 * @param in source of the characters to tokenize
 */
public taoTokenizer(Reader in) {
    input = in;
    debug = false;
    debugencoding = "UTF-8";

    if (cnumbers == null) {
        csurname = new TreeSet();
        cforeign = new TreeSet();
        cnotname = new TreeSet();

        TreeSet numbers = new TreeSet();
        // Backslashes in Windows paths must be escaped: "\d" is not a legal Java escape,
        // and "\t" would silently turn into a tab character inside the file name.
        if (charform == TRAD) {
            // Traditional: full-width digits plus Chinese numerals.
            loadset(numbers, "c:\\dict\\tnumbers_u8.txt");
        } else {
            // Simplified, and simplified + traditional, both use the simplified numerals file.
            loadset(numbers, "c:\\dict\\snumbers_u8.txt");
        }
        // Assigned last because cnumbers doubles as the "already loaded" marker.
        cnumbers = numbers;
    }
}//end init
public void loadWords()
{
long starttime = System.currentTimeMillis();
if (zhwords != null)return;
zhwords = new TreeMap();//实例化(所有字典文件)
if (!loadwordfile) {//如果预读字典文件则终止程序
return;
}
String newword = null;
try {
InputStream worddata = null;
if (charform == SIMP) {//如果是简体中文,则读取字典
//worddata = new FileInputStream("c:\dict\simplexu8.txt");
worddata = new FileInputStream("c:\simchinese.txt");
//worddata = getClass().getResourceAsStream("c:\dict\simplexu8.txt");
} else if (charform == TRAD) {//如果是翻体,则读取翻体字典
worddata = new FileInputStream("c:\dict\tradlexu8.txt");
//worddata = getClass().getResourceAsStream("c:\dict\tradlexu8.txt");
} else if (charform == BOTH) {//读取简体+翻体字体
worddata = new FileInputStream("c:\dict\bothlexu8.txt");
//worddata = getClass().getResourceAsStream("c:\dict\bothlexu8.txt");
}
BufferedReader inn = new BufferedReader(new InputStreamReader(worddata, "UTF8"));//
while ((newword = inn.readLine()) != null) {//开始读取词库,并进行筛选
if ((newword.indexOf("#") == -1) && (newword.length() < 5)) {//如果是#或者长度大于5的
zhwords.put(newword.intern(), "1");//“key , value”
if (newword.length() == 3) {//如果字典长度 == 3 则选前2字
if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) {//返回key
zhwords.put(newword.substring(0,2).intern(), "2");
}
}
if (newword.length() == 4) {//选前2 与前3
if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) {
zhwords.put(newword.substring(0,2).intern(), "2");
}
if (zhwords.containsKey(newword.substring(0, 3).intern()) == false) {
zhwords.put(newword.substring(0,3).intern(), "2");
}
}
//if (count++ % 20000 == 0) { System.err.println("wokao"+count); }//如果大于等于20000
}
}
inn.close();//关闭READBUFFER
long endtime = System.currentTimeMillis();
System.out.println("dict time:" + (endtime - starttime));
}
catch (IOException e) {
System.err.println("IOException: "+e);
}
}
/**
 * Reads a UTF-8 text file line by line and adds every usable line to {@code targetset}.
 * Empty lines and lines containing {@code '#'} are skipped. Failures are reported on
 * {@code System.err}; entries read before the failure stay in the set.
 *
 * @param targetset  set that receives the entries
 * @param sourcefile path of the file to read
 */
private void loadset(TreeSet targetset, String sourcefile) {
    InputStream words = null;
    try {
        words = new FileInputStream(sourcefile);
        BufferedReader in = new BufferedReader(new InputStreamReader(words,"UTF-8"));
        String dataline;
        while ((dataline = in.readLine()) != null) {
            if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
                continue;
            }
            targetset.add(dataline);
        }
    }
    catch (Exception e) {
        System.err.println("Exception loading data file" + sourcefile + " " + e);
    }
    finally {
        // Close the file even when reading failed, so the handle is not leaked.
        if (words != null) {
            try {
                words.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
}//end loadset
/**
 * Tells whether every character of {@code testword} is contained in the numeral set
 * {@code cnumbers}.
 *
 * @param testword text to check
 * @return {@code true} when all characters are numerals (also for an empty string),
 *         {@code false} as soon as one character is not
 */
public boolean isNumber(String testword) {
    boolean allNumerals = true;
    int pos = 0;
    while (allNumerals && pos < testword.length()) {
        String single = testword.substring(pos, pos + 1).intern();
        allNumerals = cnumbers.contains(single);
        pos++;
    }
    if (debug) {
        try {
            System.out.println(new String(testword.getBytes("UTF-8")) + " " + allNumerals);
        } catch (Exception a) {
            // Debug output only; failures are ignored.
        }
    }
    return allNumerals;
}
/**
 * Tells whether every character of {@code testword} is contained in the foreign-name
 * character set {@code cforeign}.
 *
 * @param testword text to check
 * @return {@code true} when all characters are in the set (also for an empty string)
 */
public boolean isAllForeign(String testword) {
    int total = testword.length();
    int pos = 0;
    while (pos < total) {
        String single = testword.substring(pos, pos + 1).intern();
        if (!cforeign.contains(single)) {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Tells whether {@code testword} is free of characters from the CJK Unified Ideographs
 * Unicode block. The check is done one {@code char} at a time.
 *
 * @param testword text to check
 * @return {@code false} if at least one character is a CJK unified ideograph,
 *         {@code true} otherwise (also for an empty string)
 */
public boolean isNotCJK(String testword) {
    char[] chars = testword.toCharArray();
    for (int idx = 0; idx < chars.length; idx++) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(chars[idx]);
        if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
            return false;
        }
    }
    return true;
}
/**
 * Returns the next token from the input, or null at end of input.
 *
 * Ideographs (letters in the CJK Unified Ideographs block) are collected in currentWord:
 * a run of numeral characters keeps growing, otherwise the token ends as soon as a second
 * ideograph has been appended. In that case bufferIndex is stepped back by one, so the
 * last ideograph is read again by the following call and consecutive tokens overlap by
 * one character. All other non-whitespace characters are collected in currentWord2 and
 * returned when a whitespace character is read.
 *
 * NOTE(review): token offsets are computed from bufferIndex, which is reset to 0 on every
 * buffer refill, so they are positions inside ioBuffer rather than in the whole input.
 * NOTE(review): currentWord2 is a local that is only returned on whitespace; non-CJK text
 * that is followed directly by an ideograph or by end of input appears to be dropped.
 *
 * @return the next Token, or null when the input is exhausted
 * @throws java.io.IOException if reading from the input fails
 */
public final Token next() throws java.io.IOException {
// Make sure the shared dictionary is available (no-op after the first call).
loadWords();
int length = 0;// not used below
int start = this.offset;// offset snapshot; not used below
tokenType = "double";
String separator = "";
StringBuffer currentWord = new StringBuffer();
StringBuffer currentWord2 = new StringBuffer();// collects non-CJK text such as Latin letters and digits
while (true)
{
char c;
Character.UnicodeBlock ub;
if (bufferIndex >= dataLen) {// buffer exhausted: refill it from the input
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
// End of input: emit the pending ideograph token if there is one.
if (currentWord.length() == 0) {
return null;
} else {
break;
}
} else {
c = (char)ioBuffer[bufferIndex++];
ub = Character.UnicodeBlock.of(c);
}
if (Character.isLetter(c) && ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {// c is a letter in the CJK Unified Ideographs block
tokenType = "double";
if (currentWord.length() == 0) { // first ideograph of a new token
currentWord.append(c);
}
else
{
// Main lookup: numerals first, then the dictionary.
if (isNumber(currentWord.toString()) && cnumbers.contains(new String(new char[] {c}).intern())) {
// Still inside a run of numeral characters: keep extending it.
currentWord.append(c);
}
else if (zhwords.containsKey(new String(currentWord.toString() + c).intern()) == true && ((String)(zhwords.get(new String(currentWord.toString() + c).intern()))).equals("1") == true) { // currentWord + c is a complete dictionary word (value "1")
currentWord.append(c);
bufferIndex--;
Token token = new Token(currentWord.toString(), bufferIndex
- currentWord.length(), bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}
else {
// Not a dictionary word: same handling as the branch above, so the
// pair of ideographs is still returned as one token.
currentWord.append(c);
bufferIndex--;
Token token = new Token(currentWord.toString(), bufferIndex
- currentWord.length(), bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}
// Only numerals and the dictionary are consulted here; personal-name detection is not implemented yet.
}
}
else { // not a CJK ideograph
if (Character.isWhitespace(c) == false) {// non-whitespace: extend the non-CJK token
currentWord2.append(c);
}
else
{
// Whitespace ends a non-CJK token, if one has been collected.
if (currentWord2.toString().compareTo("")!=0)
{
Token token = new Token(currentWord2.toString(), bufferIndex
- currentWord2.length(), bufferIndex, tokenType);
currentWord2.setLength(0);
return token;
}
}
/* (disabled earlier handling of non-CJK characters omitted) */
}//end else
}// end while
// Reached only via the break at end of input with a pending ideograph token.
Token token = new Token(currentWord.toString(), bufferIndex
- currentWord.length(), bufferIndex, tokenType);
currentWord.setLength(0);
return token;
}//end token next();
}//end class
?>