纯php分词封装的类
分享一个纯php分词封装的类
<?php
/*
 * Low-level helper class (not a finished plugin) that keeps a character trie
 * of dictionary words. It can be used wherever word segmentation, word
 * filtering or synonym replacement is needed.
 *
 * External requirements visible in this class: a global `$dc` object that
 * offers get($key) / set($key, $value, null, 100000), and a DT_ROOT constant
 * pointing at the directory that contains `api/dtapicom/trie/*.txt`.
 */
class trie
{
    /** @var array Root node of the word trie: char => child node, 'end' => hit counter. */
    protected $dict;

    /** @var string|null Path of the dictionary file, one entry per line. */
    protected $dictFile;

    /** @var string Common specification symbols (not read anywhere in this class). */
    protected $specSymbol;

    /** @var array Root node of the synonym trie: char => child node, 'end' => replacement. */
    protected $ty_dict;

    /**
     * Start with empty tries; dictionaries are loaded lazily by the other methods.
     */
    public function __construct()
    {
        $this->dict = [];
        $this->ty_dict = [];
        $this->specSymbol = "*|M|m|φ|Φ|st|ST";
    }

    /**
     * Load the word dictionary referenced by $this->dictFile.
     *
     * With caching enabled the trie is taken from the global `$dc` cache when
     * available; otherwise it is rebuilt from the file and stored back.
     * With caching disabled the file's words are added to the current trie.
     *
     * @param bool $cache Whether to use the global cache.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function loadData($cache = true)
    {
        global $dc;

        $cacheKey = __CLASS__ . "_" . md5((string) $this->dictFile);
        if ($cache) {
            $cached = $dc->get($cacheKey);
            // Only accept a real trie; never overwrite the property with the
            // cache's "miss" value.
            if (is_array($cached)) {
                $this->dict = $cached;
                return;
            }
            // The cached entry must contain this file's words only.
            $this->dict = [];
        }

        $this->loadDataFromFile();

        if ($cache) {
            $dc->set($cacheKey, $this->dict, null, 100000);
        }
    }

    /**
     * Read the dictionary file and add every non-blank line to the word trie.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function loadDataFromFile()
    {
        foreach ($this->readDictLines($this->dictFile) as $line) {
            $this->addWords($line);
        }
    }

    /**
     * Read a dictionary file into a list of trimmed, non-blank lines.
     *
     * @param string $file Path of the file to read.
     *
     * @return string[]
     *
     * @throws InvalidArgumentException When the file does not exist.
     * @throws RuntimeException         When the file cannot be opened.
     */
    private function readDictLines($file)
    {
        if (!is_string($file) || $file === '' || !file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }
        $handle = @fopen($file, "r");
        if (!is_resource($handle)) {
            throw new RuntimeException("字典文件无法打开");
        }

        $lines = [];
        while (false !== ($line = fgets($handle))) {
            $line = trim($line);
            // Blank lines must not reach the trie: an empty word would flag
            // the root node itself as a complete word.
            if ($line !== '') {
                $lines[] = $line;
            }
        }
        fclose($handle);

        return $lines;
    }

    /**
     * Split text into single UTF-8 characters.
     *
     * @param string $str
     *
     * @return string[] Empty when the input is empty or is not valid UTF-8.
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when the subject is not valid UTF-8.
        return $chars === false ? [] : $chars;
    }

    /**
     * Add one word to the word trie.
     *
     * @param string $words The word to add; an empty word is ignored.
     */
    protected function addWords($words)
    {
        $wordArr = $this->splitStr($words);
        if (empty($wordArr)) {
            return;
        }
        if (!is_array($this->dict)) {
            $this->dict = [];
        }

        $curNode = &$this->dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char]) || !is_array($curNode[$char])) {
                $curNode[$char] = [];
            }
            $curNode = &$curNode[$char];
        }

        // Mark the path to this node as a complete word and count how many
        // times the word was added.
        if (isset($curNode['end'])) {
            $curNode['end']++;
        } else {
            $curNode['end'] = 1;
        }
    }

    /**
     * Replace every dictionary word found in the text.
     *
     * At each position the longest dictionary word is replaced, character by
     * character, with $replace.
     *
     * @param string $str          Original text.
     * @param string $replace      Replacement for each matched character.
     * @param int    $skipDistance Strictness: how many non-matching characters
     *                             may be skipped inside one word.
     *
     * @return string The filtered text; the input is returned unchanged when
     *                it cannot be split into UTF-8 characters.
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $maxDistance = max($skipDistance, 0) + 1;
        $strArr = $this->splitStr($str);
        $length = count($strArr);
        if ($length === 0) {
            return (string) $str;
        }

        for ($i = 0; $i < $length; $i++) {
            $char = $strArr[$i];
            if (!isset($this->dict[$char])) {
                continue;
            }

            $curNode = $this->dict[$char];
            $dist = 0;
            $matchIndex = [$i];
            // Positions of the longest complete word seen so far. Remembering
            // it lets a shorter word win when a longer candidate ("abcd")
            // fails after the shorter one ("ab") has already matched.
            $matched = isset($curNode['end']) ? $matchIndex : null;

            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                $next = $strArr[$j];
                if (!isset($curNode[$next])) {
                    $dist++;
                    continue;
                }
                $matchIndex[] = $j;
                $curNode = $curNode[$next];
                if (isset($curNode['end'])) {
                    $matched = $matchIndex;
                }
            }

            if ($matched !== null) {
                foreach ($matched as $index) {
                    $strArr[$index] = $replace;
                }
                // Resume scanning after the last replaced character.
                $i = max($matched);
            }
        }

        return implode('', $strArr);
    }

    /**
     * Look up a whole word in the word trie.
     *
     * @param string|string[] $strArr The word, or its characters.
     *
     * @return bool|mixed The value stored at the word's end marker (the number
     *                    of times it was added), or false when it is unknown.
     */
    public function isMatch($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);

        $curNode = $this->dict;
        foreach ($chars as $char) {
            if (!is_array($curNode) || !isset($curNode[$char])) {
                return false;
            }
            $curNode = $curNode[$char];
        }

        return (is_array($curNode) && isset($curNode['end'])) ? $curNode['end'] : false;
    }

    /**
     * Tell whether a word exists in the loaded dictionary.
     *
     * @param string|string[] $word
     * @param string          $filename Unused; kept for compatibility.
     *
     * @return bool|mixed Same result as isMatch().
     */
    public function isType($word, $filename = 'word')
    {
        return $this->isMatch($word);
    }

    /**
     * Segment a keyword string coming from the front end.
     *
     * Steps: split on whitespace and commas; keep tokens that are dictionary
     * words; extract dictionary words from the remaining tokens; finally, when
     * a synonym file is given, run what is left through synonym replacement.
     *
     * NOTE(review): $filename and $ty_file are concatenated into a file path;
     * callers should not pass untrusted values.
     *
     * @param string $kw       Keyword string.
     * @param string $filename Dictionary file name (without ".txt").
     * @param string $ty_file  Synonym file name (without ".txt"); leftovers are
     *                         dropped when it is empty.
     *
     * @return array The resulting words.
     *
     * @throws InvalidArgumentException When a dictionary file does not exist.
     * @throws RuntimeException         When a dictionary file cannot be opened.
     */
    public function split_kw($kw, $filename = 'word', $ty_file = '')
    {
        $this->dictFile = DT_ROOT . '/api/dtapicom/trie/' . $filename . '.txt';
        $this->loadData();

        // Step 1: split on whitespace, ASCII commas and full-width commas.
        // The "u" modifier keeps multi-byte characters intact.
        $data = preg_split('/[\s,\x{FF0C}]+/u', $kw, -1, PREG_SPLIT_NO_EMPTY);
        if ($data === false) {
            // The pattern could not be applied (e.g. invalid UTF-8): fall back
            // to treating the whole input as one token.
            $data = [$kw];
        }

        $word = []; // words accepted so far

        // Step 2: tokens that are dictionary words are accepted as they are.
        foreach ($data as $k => $v) {
            if (false !== $this->isMatch($v)) {
                $word[] = $v;
                unset($data[$k]);
            }
        }

        // Step 3: pull dictionary words out of the tokens that did not match.
        foreach ($data as $k => $v) {
            $found = $this->split_word($v);
            if (empty($found)) {
                continue;
            }
            foreach ($found as $str) {
                $word[] = $str;
                $v = str_replace($str, '', $v);
            }
            if (trim($v) == '') {
                // Nothing left of this token.
                unset($data[$k]);
            } else {
                $data[$k] = $v;
            }
        }

        // Step 4: replace what is left with synonyms.
        if (!empty($data) && $ty_file) {
            $this->load_tongyi($ty_file);
            foreach ($data as $v) {
                $word[] = $this->lookupSynonym($v);
            }
        }

        return $word;
    }

    /**
     * Fine-grained segmentation: collect every dictionary word that occurs in
     * the text, scanning from each character position.
     *
     * @param string|string[] $strArr The text, or its characters.
     *
     * @return string[] Words in order of their start position.
     */
    public function split_word($strArr)
    {
        $chars = is_array($strArr) ? array_values($strArr) : $this->splitStr($strArr);

        return $this->collectWords($chars, $this->dict);
    }

    /**
     * Collect every word of the given trie that occurs in a character list.
     *
     * @param string[] $chars Characters of the text (list-indexed).
     * @param array    $dict  Root node of the trie to search.
     *
     * @return string[]
     */
    private function collectWords($chars, $dict)
    {
        $words = [];
        $len = count($chars);

        for ($start = 0; $start < $len; $start++) {
            $node = $dict;
            $word = '';
            for ($i = $start; $i < $len; $i++) {
                $char = $chars[$i];
                if (!is_array($node) || !isset($node[$char])) {
                    // No dictionary word continues with this character.
                    break;
                }
                $word .= $char;
                $node = $node[$char];
                if (isset($node['end'])) {
                    $words[] = $word;
                }
            }
        }

        return $words;
    }

    /**
     * Load (compile) the synonym dictionary into the synonym trie.
     *
     * Each line of the file has the form "word=replacement".
     *
     * @param string $filename Synonym file name (without ".txt").
     * @param bool   $cache    Whether to use the global cache.
     *
     * @throws InvalidArgumentException When the file does not exist.
     * @throws RuntimeException         When the file cannot be opened.
     */
    public function load_tongyi($filename = 'tongyi', $cache = true)
    {
        global $dc;

        $file = DT_ROOT . '/api/dtapicom/trie/' . $filename . '.txt';
        $cacheKey = __CLASS__ . "_" . md5($file);
        if ($cache) {
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->ty_dict = $cached;
                return;
            }
            // The cached entry must contain this file's synonyms only.
            $this->ty_dict = [];
        }

        foreach ($this->readDictLines($file) as $line) {
            $this->addTongyi($line);
        }

        if ($cache) {
            $dc->set($cacheKey, $this->ty_dict, null, 100000);
        }
    }

    /**
     * Add one "word=replacement" entry to the synonym trie.
     *
     * @param string $str The entry; entries without "=" or without a word are ignored.
     */
    protected function addTongyi($str)
    {
        $arr = explode('=', $str);
        if (count($arr) < 2) {
            return;
        }
        $wordArr = $this->splitStr($arr[0]);
        if (empty($wordArr)) {
            return;
        }
        if (!is_array($this->ty_dict)) {
            $this->ty_dict = [];
        }

        $curNode = &$this->ty_dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char]) || !is_array($curNode[$char])) {
                $curNode[$char] = [];
            }
            $curNode = &$curNode[$char];
        }

        // The end marker of a synonym entry holds the replacement text.
        $curNode['end'] = $arr[1];
    }

    /**
     * Replace a word with its synonym.
     *
     * @param string|string[] $strArr  The word, or its characters.
     * @param string          $ty_file Synonym file name (without ".txt").
     *
     * @return mixed The replacement, or the input unchanged when there is none.
     *
     * @throws InvalidArgumentException When the synonym file does not exist.
     * @throws RuntimeException         When the synonym file cannot be opened.
     */
    public function tyReplace($strArr, $ty_file = 'tongyi')
    {
        $this->load_tongyi($ty_file);

        return $this->lookupSynonym($strArr);
    }

    /**
     * Look a word up in the already loaded synonym trie.
     *
     * @param string|string[] $strArr The word, or its characters.
     *
     * @return mixed The replacement, or the input unchanged when there is none.
     */
    private function lookupSynonym($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);

        $node = $this->ty_dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return $strArr;
            }
            $node = $node[$char];
        }

        // Compare against '' instead of testing truthiness so that a
        // replacement such as "0" is honoured.
        if (is_array($node) && isset($node['end']) && $node['end'] !== '') {
            return $node['end'];
        }

        return $strArr;
    }

    /**
     * Replace the synonym-dictionary words that occur in a text.
     *
     * The word trie ($this->dict) is left untouched.
     *
     * NOTE(review): candidates are taken from the raw text and replaced with
     * str_replace() over the whole text, so markup (tag names, attributes) is
     * affected as well — confirm this is intended for HTML input.
     *
     * @param string $text     The text to process.
     * @param string $filename Synonym file name (without ".txt").
     *
     * @return string The text with synonyms applied.
     *
     * @throws InvalidArgumentException When the synonym file does not exist.
     * @throws RuntimeException         When the synonym file cannot be opened.
     */
    public function contentReplace($text, $filename = 'tongyi')
    {
        // Nothing to do when no word-like run is found or matching fails.
        if (!preg_match_all('/([\w\x{4e00}-\x{9fa5}]+)/u', $text, $arr)) {
            return $text;
        }

        $this->load_tongyi($filename);

        foreach ($arr[0] as $v) {
            // Segment each run with the synonym trie itself.
            $found = $this->collectWords($this->splitStr($v), $this->ty_dict);
            foreach ($found as $t) {
                $text = str_replace($t, $this->lookupSynonym($t), $text);
            }
        }

        return $text;
    }
}
千行代码,Bug何处藏。 纵使上线又怎样,朝令改,夕断肠。