纯php分词封装的类
分享一个纯 PHP 实现的分词封装类
<?php
/*
 * This is not a finished plugin: it is a low-level helper class that can be
 * reused wherever word segmentation or synonym replacement is needed.
 *
 * Both dictionaries are stored as nested arrays (a trie): every key is one
 * UTF-8 character and the reserved key 'end' marks the node that completes
 * an entry. In the word dictionary 'end' holds the number of times the word
 * was added; in the synonym dictionary it holds the replacement text.
 */
class trie
{
    /** @var array Word dictionary trie. */
    protected $dict;

    /** @var string Path of the word dictionary file (one entry per line). */
    protected $dictFile;

    /** @var string Common specification symbols (not read anywhere in this class). */
    protected $specSymbol;

    /** @var array Synonym dictionary trie. */
    protected $ty_dict;

    /**
     * @param string $dictFile Optional path of the dictionary file, one entry per line.
     *                         split_kw() overwrites it with the file it is asked to use.
     */
    public function __construct($dictFile = '')
    {
        $this->dict = [];
        $this->ty_dict = [];
        $this->dictFile = $dictFile;
        $this->specSymbol = "*|M|m|φ|Φ|st|ST";
    }

    /**
     * Load the word dictionary, going through the global cache object $dc when asked to.
     *
     * @param bool $cache Whether to read from / write to the cache.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function loadData($cache = true)
    {
        global $dc;

        $cacheKey = __CLASS__ . "_" . md5((string) $this->dictFile);
        if ($cache) {
            // Read into a local first so a cache miss never leaves a non-array in $this->dict.
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->dict = $cached;

                return;
            }
            // Cache miss: rebuild the trie from scratch.
            $this->dict = [];
        }

        $this->loadDataFromFile();

        if ($cache) {
            $dc->set($cacheKey, $this->dict, null, 100000);
        }
    }

    /**
     * Read the dictionary file and add every non-blank line to the word trie.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function loadDataFromFile()
    {
        $file = (string) $this->dictFile;
        if (!file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }

        // The warning is suppressed because the failure is reported as an exception right below.
        $handle = @fopen($file, "r");
        if (!is_resource($handle)) {
            throw new RuntimeException("字典文件无法打开");
        }

        while (($line = fgets($handle)) !== false) {
            // Trim before testing so blank lines are skipped and a literal "0" entry is kept.
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            $this->addWords($line);
        }

        fclose($handle);
    }

    /**
     * Split text into single UTF-8 characters (ASCII is one byte, other characters are multi-byte).
     *
     * @param string $str
     *
     * @return string[] The characters in order; an empty array when the text is not valid UTF-8.
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", (string) $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when the subject is not valid UTF-8.
        return $chars === false ? [] : $chars;
    }

    /**
     * Return a reference to the trie node reached by following $chars from $root,
     * creating every missing node on the way.
     *
     * @param array    $root  Trie to extend (modified in place).
     * @param string[] $chars Characters of the entry.
     *
     * @return array Reference to the node for the last character.
     */
    private function &openPath(array &$root, array $chars)
    {
        $node = &$root;
        foreach ($chars as $char) {
            if (!isset($node[$char]) || !is_array($node[$char])) {
                $node[$char] = [];
            }
            $node = &$node[$char];
        }

        return $node;
    }

    /**
     * Add one entry to the word trie.
     *
     * @param string $words The entry; it is split into characters internally.
     */
    protected function addWords($words)
    {
        $chars = $this->splitStr($words);
        if (empty($chars)) {
            // An empty entry would mark the trie root itself as a complete word.
            return;
        }

        $leaf = &$this->openPath($this->dict, $chars);
        // 'end' marks the path to this node as a complete entry and counts how often it was added.
        $leaf['end'] = isset($leaf['end']) ? $leaf['end'] + 1 : 1;
        unset($leaf);
    }

    /**
     * Replace every dictionary word found in the text.
     *
     * Scanning starts at each character that begins a dictionary entry and follows
     * the trie as far as possible; the longest complete word found on that walk is
     * the one that gets replaced.
     *
     * @param string $str          Original text.
     * @param string $replace      Replacement written over each matched character.
     * @param int    $skipDistance Strictness: how many non-matching characters may be
     *                             skipped while matching a single word.
     *
     * @return string The filtered text.
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $maxDistance = max($skipDistance, 0) + 1;
        $strArr = $this->splitStr($str);
        $length = count($strArr);

        for ($i = 0; $i < $length; $i++) {
            $char = $strArr[$i];
            if (!isset($this->dict[$char])) {
                continue;
            }

            $curNode = $this->dict[$char];
            $dist = 0;
            $matchIndex = [$i];
            // Indexes of the longest complete word seen so far on this walk (null = none yet).
            $lastMatch = isset($curNode['end']) ? $matchIndex : null;

            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                $next = $strArr[$j];
                if (!isset($curNode[$next])) {
                    $dist++;
                    continue;
                }
                $matchIndex[] = $j;
                $curNode = $curNode[$next];
                if (isset($curNode['end'])) {
                    $lastMatch = $matchIndex;
                }
            }

            if ($lastMatch !== null) {
                foreach ($lastMatch as $index) {
                    $strArr[$index] = $replace;
                }
                // Resume scanning after the replaced word.
                $i = max($lastMatch);
            }
        }

        return implode('', $strArr);
    }

    /**
     * Look an exact entry up in the word trie.
     *
     * @param string|string[] $strArr The text, or its characters.
     *
     * @return bool|mixed The 'end' value stored for the entry, or false when it is not an entry.
     */
    public function isMatch($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);

        $node = $this->dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return false;
            }
            $node = $node[$char];
        }

        return (is_array($node) && isset($node['end'])) ? $node['end'] : false;
    }

    /**
     * Check whether a word exists in the currently loaded dictionary.
     *
     * @param string $word
     * @param string $filename Accepted for interface compatibility; not used.
     *
     * @return bool|mixed Same result as isMatch().
     */
    public function isType($word, $filename = 'word')
    {
        return $this->isMatch($word);
    }

    /**
     * Segment the keyword string sent by the front end and return the words found.
     *
     * @param string $kw       Keyword string sent by the front end.
     * @param string $filename Dictionary file name (without the .txt extension).
     * @param string $ty_file  Synonym dictionary file name; leftovers are only kept
     *                         (after synonym replacement) when this is non-empty.
     *
     * @return string[] Matched words, followed by the synonym-replaced leftovers.
     *
     * @throws InvalidArgumentException When a dictionary file does not exist.
     * @throws RuntimeException         When a dictionary file cannot be opened.
     */
    public function split_kw($kw, $filename = 'word', $ty_file = '')
    {
        $this->dictFile = DT_ROOT . '/api/dtapicom/trie/' . $filename . '.txt';
        $this->loadData();

        // Step 1: split on whitespace and commas. The `u` modifier keeps multi-byte text intact.
        // NOTE(review): the full-width comma (,) is presumed to be an intended separator - confirm.
        $parts = preg_split('/[\s,,]+/u', $kw);
        $data = [];
        if (!empty($parts)) {
            foreach ($parts as $part) {
                if ($part !== '') {
                    $data[] = $part;
                }
            }
        } else {
            $data[] = $kw;
        }

        $word = []; // words that matched the dictionary

        // Step 2: look each rough token up as a whole.
        foreach ($data as $k => $v) {
            if ($this->isMatch($v)) {
                $word[] = $v;
                unset($data[$k]); // matched tokens need no further processing
            }
        }

        // Step 3: search the tokens that did not match as a whole for dictionary words inside them.
        foreach ($data as $k => $v) {
            $found = $this->split_word($v);
            if (empty($found)) {
                continue;
            }
            foreach ($found as $str) {
                $word[] = $str;
                $v = str_replace($str, '', $v);
            }
            if (trim($v) === '') {
                // Nothing is left of this token.
                unset($data[$k]);
            } else {
                $data[$k] = $v;
            }
        }

        // Step 4: run whatever is left through the synonym dictionary.
        if (!empty($data) && $ty_file) {
            foreach ($data as $v) {
                $word[] = $this->tyReplace($v, $ty_file);
            }
        }

        return $word;
    }

    /**
     * Fine-grained segmentation: collect every dictionary word that occurs anywhere in the text.
     *
     * @param string|string[] $strArr The text, or its characters.
     *
     * @return string[] Every match, ordered by start position and then by length.
     */
    public function split_word($strArr)
    {
        $chars = is_array($strArr) ? array_values($strArr) : $this->splitStr($strArr);

        return $this->scanWords($chars, is_array($this->dict) ? $this->dict : []);
    }

    /**
     * Collect every entry of $dict that occurs in $chars, trying each position as a start.
     *
     * @param string[] $chars Characters of the text (list-indexed).
     * @param array    $dict  Trie to search.
     *
     * @return string[]
     */
    private function scanWords(array $chars, array $dict)
    {
        $words = [];
        $len = count($chars);

        for ($start = 0; $start < $len; $start++) {
            $node = $dict;
            $word = '';
            for ($i = $start; $i < $len; $i++) {
                $char = $chars[$i];
                if (!is_array($node) || !isset($node[$char])) {
                    // The trie has no continuation, so no longer word can start here.
                    break;
                }
                $word .= $char;
                $node = $node[$char];
                if (isset($node['end'])) {
                    $words[] = $word;
                }
            }
        }

        return $words;
    }

    /**
     * Build the synonym trie from a file, going through the global cache object $dc when asked to.
     *
     * @param string $filename Synonym file name (without the .txt extension).
     * @param bool   $cache    Whether to read from / write to the cache.
     *
     * @throws InvalidArgumentException When the synonym file does not exist.
     * @throws RuntimeException         When the synonym file cannot be opened.
     */
    public function load_tongyi($filename = 'tongyi', $cache = true)
    {
        global $dc;

        $file = DT_ROOT . '/api/dtapicom/trie/' . $filename . '.txt';
        $cacheKey = __CLASS__ . "_" . md5($file);
        if ($cache) {
            // Read into a local first so a cache miss never leaves a non-array in $this->ty_dict.
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->ty_dict = $cached;

                return;
            }
            // Cache miss: rebuild the trie from scratch.
            $this->ty_dict = [];
        }

        if (!file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }

        // The warning is suppressed because the failure is reported as an exception right below.
        $handle = @fopen($file, "r");
        if (!is_resource($handle)) {
            throw new RuntimeException("字典文件无法打开");
        }

        while (($line = fgets($handle)) !== false) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            $this->addTongyi($line);
        }

        fclose($handle);

        if ($cache) {
            $dc->set($cacheKey, $this->ty_dict, null, 100000);
        }
    }

    /**
     * Add one "word=replacement" line to the synonym trie.
     *
     * Lines without '=' or with an empty left-hand side are ignored.
     *
     * @param string $str A single line of the synonym file.
     */
    protected function addTongyi($str)
    {
        $arr = explode('=', $str);
        if (count($arr) < 2) {
            return;
        }

        $chars = $this->splitStr($arr[0]);
        if (empty($chars)) {
            return;
        }

        $leaf = &$this->openPath($this->ty_dict, $chars);
        // In the synonym trie 'end' stores the replacement text rather than a counter.
        $leaf['end'] = $arr[1];
        unset($leaf);
    }

    /**
     * Replace a word by its synonym.
     *
     * @param string|string[] $strArr  The word, or its characters.
     * @param string          $ty_file Synonym file name (without the .txt extension).
     *
     * @return mixed The replacement text, or the argument unchanged when the word has
     *               no entry or the entry's replacement is empty.
     *
     * @throws InvalidArgumentException When the synonym file does not exist.
     * @throws RuntimeException         When the synonym file cannot be opened.
     */
    public function tyReplace($strArr, $ty_file = 'tongyi')
    {
        $this->load_tongyi($ty_file);

        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        $node = $this->ty_dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return $strArr;
            }
            $node = $node[$char];
        }

        if (is_array($node) && isset($node['end']) && $node['end'] !== '') {
            return $node['end'];
        }

        return $strArr;
    }

    /**
     * Replace the synonym-dictionary words that occur in a text.
     *
     * Words are located in, and replaced throughout, the raw text, so markup
     * contained in $text is searched as well. The word dictionary held by this
     * object is left untouched.
     *
     * @param string $text     Text to process.
     * @param string $filename Synonym file name (without the .txt extension).
     *
     * @return string The text after replacement.
     *
     * @throws InvalidArgumentException When the synonym file does not exist.
     * @throws RuntimeException         When the synonym file cannot be opened.
     */
    public function contentReplace($text, $filename = 'tongyi')
    {
        $arr = [];
        preg_match_all('/([\w\x{4e00}-\x{9fa5}]+)/u', $text, $arr);
        if (empty($arr[0])) {
            return $text;
        }

        $this->load_tongyi($filename);

        // Segment each run of word characters using the synonym trie itself.
        foreach ($arr[0] as $v) {
            $tyDict = is_array($this->ty_dict) ? $this->ty_dict : [];
            foreach ($this->scanWords($this->splitStr($v), $tyDict) as $t) {
                $tyc = $this->tyReplace($t, $filename);
                $text = str_replace($t, $tyc, $text);
            }
        }

        return $text;
    }
}
千行代码,Bug何处藏。 纵使上线又怎样,朝令改,夕断肠。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具