zf本身没有提供中文分词算法,具体应用中要自己写。我这里使用简单的二元分词算法(只在utf-8下工作正常,对于其他字符集,请修改程序)。

第一步、如何测试分词算法的输出。
在zf 的手册中没有提到,我这里简单给个例子:


<?php
// Demo: print every token that the default Zend_Search_Lucene analyzer
// produces for a sample string.
$analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

$value = 'this is a test!';
$analyzer->setInput($value, 'utf-8');

// Start from an empty list so print_r() has a defined variable to show
// even when the analyzer yields no token at all.
$tokens = array();

// nextToken() returns null once the input is exhausted.
while (($token = $analyzer->nextToken()) !== null) {
    $tokens[] = $token;
}

print_r($tokens);
?>


这里使用的是zf默认的分词算法Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive(即在Zend_Search_Lucene_Analysis_Analyzer_Common_Text的基础上加了小写转换)。另外你可以加上一个过滤方法,比如过滤掉"is"、"a"之类的单词。

第二步、自定义自己的分词算法,可以参考手册,或者自己看Zend_Search_Lucene_Analysis_Analyzer_Common_Text类的实现。
其中要注意的是过滤这一点。由于我们的分词是二元分词,如果要过滤一些比如“的”、“啊”之类的单词,是无法使用内置的Tokens Filtering的。我们需要在分词前先把它们过滤掉,这个可以在reset()里面实现。
例子如下。


<?php
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

/**
 * Bigram analyzer for UTF-8 text that mixes ASCII words with multi-byte
 * (typically Chinese) characters.
 *
 * - A run of ASCII letters/digits becomes one token.
 * - A run of multi-byte characters becomes overlapping two-character tokens
 *   ("ABC" yields "AB" and "BC").
 * - A multi-byte character with no multi-byte neighbour to pair with yields
 *   no token.
 *
 * Token offsets are byte offsets into the input as rewritten by reset(),
 * not into the string originally handed to the analyzer.
 *
 * Only UTF-8 input is supported.
 */
class Phpbean_Lucene_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Punctuation and whitespace that reset() replaces with a plain space.
     * The first group is ASCII, the second group is fullwidth / CJK
     * punctuation.
     *
     * @var array
     */
    private static $_separators = array(
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')', '?',
        '-', "\t", "\n", "'", '<', '>', "\r", '$', '&', '%', '#', '@', '+',
        '=', '{', '}', '[', ']',
        ':', ')', '(', '.', '。', ',', '!', ';', '?', '“', '”', '‘', '’',
        '[', ']', '、', '—', '《', '》', '-', '…', '【', '】',
        "\xE3\x80\x80", // U+3000 ideographic space
    );

    /**
     * Byte offset of the next unread byte of $this->_input.
     *
     * @var integer
     */
    private $_position = 0;

    /**
     * Strings removed from the input before tokenization.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the stop words that reset() strips from the raw input.
     *
     * Bigram tokens span two characters, so a single-character stop word
     * cannot be removed by a token filter afterwards; it has to be cut out
     * of the text before tokenization. The list only takes effect the next
     * time reset() runs.
     *
     * @param array $cnStopWords UTF-8 strings to remove
     * @return void
     */
    public function setCnStopWords($cnStopWords)
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds to the start of the input and replaces separators and stop
     * words with spaces. Replacing (rather than deleting) keeps the
     * characters on either side of a removed span from being paired into
     * one bigram.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input === null) {
            return;
        }

        $this->_input = str_replace(self::$_separators, ' ', $this->_input);

        if (!empty($this->_cnStopWords)) {
            $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $inputLength = strlen($this->_input);

        while ($this->_position < $inputLength) {
            // Skip every byte that cannot start a token: spaces and any
            // other ASCII byte that is not a letter or digit.
            while ($this->_position < $inputLength
                && !$this->_isTokenByte($this->_input[$this->_position])
            ) {
                $this->_position++;
            }
            if ($this->_position >= $inputLength) {
                return null;
            }

            $termStartPosition = $this->_position;
            $isCnWord          = false;
            $lastCharLength    = 0;

            if (ord($this->_input[$this->_position]) > 127) {
                // Multi-byte run: consume at most two characters.
                $charCount = 0;
                while ($this->_position < $inputLength
                    && ord($this->_input[$this->_position]) > 127
                ) {
                    $lastCharLength   = $this->_utf8CharLength($this->_position);
                    $this->_position += $lastCharLength;
                    $charCount++;
                    if ($charCount == 2) {
                        $isCnWord = true;
                        break;
                    }
                }
                if (!$isCnWord) {
                    // Only one character was available: no bigram to emit.
                    continue;
                }
            } else {
                // ASCII word: consume consecutive letters and digits.
                while ($this->_position < $inputLength
                    && $this->_isAsciiAlnum($this->_input[$this->_position])
                ) {
                    $this->_position++;
                }
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                substr(
                    $this->_input,
                    $termStartPosition,
                    $this->_position - $termStartPosition
                ),
                $termStartPosition,
                $this->_position
            );
            $token = $this->normalize($token);

            if ($isCnWord) {
                // Step back over the second character so that it also
                // starts the next bigram.
                $this->_position -= $lastCharLength;
            }

            // normalize() yields null when a filter dropped the token;
            // in that case keep scanning.
            if ($token !== null) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Tell whether a byte is an ASCII letter or digit.
     *
     * @param string $byte single-byte string
     * @return boolean
     */
    private function _isAsciiAlnum($byte)
    {
        return ord($byte) <= 127 && ctype_alnum($byte);
    }

    /**
     * Tell whether a byte can start a token: either an ASCII letter/digit
     * or any non-ASCII byte.
     *
     * @param string $byte single-byte string
     * @return boolean
     */
    private function _isTokenByte($byte)
    {
        return ord($byte) > 127 || ctype_alnum($byte);
    }

    /**
     * Byte length of the UTF-8 character starting at $offset.
     *
     * The length announced by the lead byte is capped by the number of
     * continuation bytes (10xxxxxx) actually present, so malformed or
     * truncated input never makes the position overshoot.
     *
     * @param integer $offset byte offset of the character's first byte
     * @return integer between 1 and 4
     */
    private function _utf8CharLength($offset)
    {
        $leadByte = ord($this->_input[$offset]);

        if ($leadByte >= 0xF0) {
            $expected = 4;
        } elseif ($leadByte >= 0xE0) {
            $expected = 3;
        } elseif ($leadByte >= 0xC0) {
            $expected = 2;
        } else {
            $expected = 1;
        }

        $inputLength = strlen($this->_input);
        $length      = 1;
        while ($length < $expected
            && $offset + $length < $inputLength
            && (ord($this->_input[$offset + $length]) & 0xC0) === 0x80
        ) {
            $length++;
        }

        return $length;
    }
}
?>


测试分词输出demo


<?php
// Demo: tokenize mixed English/Chinese text with Phpbean_Lucene_Analyzer.

// English stop words are dropped by the built-in token filter.
$stopWords       = array('a', 'an', 'at', 'the', 'and', 'or', 'is', 'am');
$stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWords);

$analyzer = new Phpbean_Lucene_Analyzer();

// Chinese stop words are stripped from the raw text by the analyzer's
// reset(), so register them before supplying the input.
$cnStopWords = array('的');
$analyzer->setCnStopWords($cnStopWords);
$analyzer->addFilter($stopWordsFilter);

$value = 'this is " a test【中文】的测试';
$analyzer->setInput($value, 'utf-8');

// Start from an empty list so print_r() has a defined variable to show
// even when the analyzer yields no token at all.
$tokens = array();

// nextToken() returns null once the input is exhausted.
while (($token = $analyzer->nextToken()) !== null) {
    $tokens[] = $token;
}

print_r($tokens);
?>



比如上面的输出就是"this"、"test"、"中文"、"测试"四个结果,符合我们的需要。

posted on 2010-09-20 14:27  Dufe王彬  阅读(958)  评论(0)  编辑  收藏  举报