纯php分词封装的类

  分享一个纯php分词封装的类

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
<?php
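/*
 * This is not a finished plugin. It is a low-level helper class that can be used
 * wherever word segmentation or synonym replacement is needed.
 */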
 
/**
 * Character-level trie used for dictionary word matching, keyword segmentation,
 * sensitive-word filtering and synonym replacement.
 *
 * Two tries are kept:
 *  - $dict    : word dictionary; a node's 'end' key holds how many times the word was added.
 *  - $ty_dict : synonym dictionary; a node's 'end' key holds the replacement string.
 *
 * Every node is a nested array keyed by a single UTF-8 character. The terminal
 * marker uses the three-character key 'end', which cannot collide with a
 * single-character key produced by splitStr().
 */
class trie
{
    /** @var array Word trie (see class comment for the node layout). */
    protected $dict;
    /** @var string|null Path of the word dictionary file, one entry per line. */
    protected $dictFile;
    /** @var string Common specification symbols; not read anywhere inside this class. */
    protected $specSymbol;
    /** @var array Synonym trie (see class comment for the node layout). */
    protected $ty_dict;

    /**
     * @param string|null $dictFile Optional dictionary file path, one entry per line.
     *                              split_kw() overwrites it with its own path.
     */
    public function __construct($dictFile = null)
    {
        $this->dict = [];
        $this->ty_dict = [];
        $this->dictFile = $dictFile;
        $this->specSymbol = "*|M|m|φ|Φ|st|ST";
    }

    /**
     * Load the word trie for $this->dictFile, using the global cache object $dc when available.
     *
     * @param bool $cache When true, try the cache first and store the freshly built trie back.
     *                    When false, entries read from the file are added to the current trie.
     *
     * @throws InvalidArgumentException when the dictionary file does not exist
     * @throws RuntimeException         when the dictionary file cannot be opened
     */
    public function loadData($cache = true)
    {
        global $dc;

        // Without a usable cache object we fall back to reading the file directly.
        $useCache = $cache && is_object($dc);
        $cacheKey = __CLASS__ . "_" . md5((string) $this->dictFile);

        if ($useCache) {
            // Fetch into a local first so a cache miss never clobbers the property with a non-array.
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->dict = $cached;
                return;
            }
        }

        if ($cache) {
            // A cached load always describes exactly one file, so rebuild from scratch.
            $this->dict = [];
        }

        $this->loadDataFromFile();

        if ($useCache) {
            $dc->set($cacheKey, $this->dict, null, 100000);
        }
    }

    /**
     * Read $this->dictFile line by line and insert every non-blank line into the word trie.
     *
     * @throws InvalidArgumentException when the dictionary file does not exist
     * @throws RuntimeException         when the dictionary file cannot be opened
     */
    public function loadDataFromFile()
    {
        $handle = $this->openDictFile($this->dictFile);

        // fgets() returns false at EOF or on a read error, which ends the loop in both cases.
        while (($line = fgets($handle)) !== false) {
            $line = trim($line);
            if ($line !== '') {
                $this->addWords($line);
            }
        }

        fclose($handle);
    }

    /**
     * Open a dictionary file for reading after validating that it exists.
     *
     * @param string|null $file
     *
     * @return resource
     *
     * @throws InvalidArgumentException when the file does not exist
     * @throws RuntimeException         when the file cannot be opened
     */
    private function openDictFile($file)
    {
        $file = (string) $file;
        if ($file === '' || !file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }

        // Warnings are suppressed because the failure is reported through the exception below.
        $handle = @fopen($file, "r");
        if (!is_resource($handle)) {
            throw new RuntimeException("字典文件无法打开");
        }

        return $handle;
    }

    /**
     * Split text into an array of single UTF-8 characters.
     *
     * @param string $str
     *
     * @return string[] Empty when $str is empty or is not valid UTF-8.
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() yields false on malformed UTF-8; callers expect an array.
        return $chars === false ? [] : $chars;
    }

    /**
     * Insert one word into the word trie and increase its occurrence counter.
     *
     * @param string $words The word to insert; empty input is ignored.
     */
    protected function addWords($words)
    {
        $wordArr = $this->splitStr($words);
        if (empty($wordArr)) {
            // Nothing to insert; also prevents marking the root itself as a word.
            return;
        }

        if (!is_array($this->dict)) {
            $this->dict = [];
        }

        $curNode = &$this->dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char])) {
                $curNode[$char] = [];
            }

            $curNode = &$curNode[$char];
        }

        // Mark the path ending at this node as a complete word.
        $curNode['end'] = isset($curNode['end']) ? $curNode['end'] + 1 : 1;
    }

    /**
     * Replace every dictionary word found in the text.
     *
     * Scanning is greedy: from each start position the longest dictionary word is
     * replaced. If the walk continues past a complete word but ends on an incomplete
     * path, the last complete word seen is still replaced.
     *
     * @param string $str          Original text
     * @param string $replace      Replacement used for each matched character
     * @param int    $skipDistance Strictness: how many non-matching characters may be skipped
     *                             in total while matching one word (skipped characters are kept)
     *
     * @return string The filtered text; the input is returned unchanged if it is not valid UTF-8
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $strArr = $this->splitStr($str);
        if (empty($strArr)) {
            return (string) $str;
        }

        $maxDistance = max($skipDistance, 0) + 1;
        $length = count($strArr);
        for ($i = 0; $i < $length; $i++) {
            $char = $strArr[$i];

            if (!isset($this->dict[$char])) {
                continue;
            }

            // Plain copies (copy-on-write) are enough here; the trie is only read.
            $curNode = $this->dict[$char];
            $dist = 0;
            $matchIndex = [$i];
            // Indexes of the longest complete word seen so far from this start position.
            $hitIndex = isset($curNode['end']) ? $matchIndex : [];
            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                if (!isset($curNode[$strArr[$j]])) {
                    $dist++;
                    continue;
                }

                $matchIndex[] = $j;
                $curNode = $curNode[$strArr[$j]];
                if (isset($curNode['end'])) {
                    $hitIndex = $matchIndex;
                }
            }

            if (!empty($hitIndex)) {
                foreach ($hitIndex as $index) {
                    $strArr[$index] = $replace;
                }
                // Resume scanning after the replaced word.
                $i = max($hitIndex);
            }
        }

        return implode('', $strArr);
    }

    /**
     * Look up an exact word in the word trie.
     *
     * @param string|string[] $strArr Word, or the word already split into characters
     *
     * @return bool|mixed The node's 'end' value (occurrence count) when the word exists, otherwise false
     */
    public function isMatch($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        if (empty($chars)) {
            return false;
        }

        $curNode = $this->dict;
        foreach ($chars as $char) {
            if (!isset($curNode[$char])) {
                return false;
            }
            $curNode = $curNode[$char];
        }

        return isset($curNode['end']) ? $curNode['end'] : false;
    }

    /**
     * Tell whether a word exists in the currently loaded word dictionary.
     *
     * @param string|string[] $word
     * @param string          $filename Unused; kept for backward compatibility
     *
     * @return bool|mixed Same result as isMatch()
     */
    public function isType($word, $filename = 'word')
    {
        return $this->isMatch($word);
    }

    /**
     * Segment a keyword string coming from the front end and return the recognised words.
     *
     * Steps: split on whitespace/commas, keep tokens that are dictionary words,
     * run fine-grained segmentation on the rest, and finally (only when $ty_file
     * is given) map what is left through the synonym dictionary.
     *
     * @param string $kw       Keyword string from the front end
     * @param string $filename Word dictionary file name (without extension)
     * @param string $ty_file  Synonym dictionary file name; leftovers are dropped when empty
     *
     * @return array Matched words, followed by synonym-mapped leftovers
     */
    public function split_kw($kw, $filename = 'word', $ty_file = '')
    {
        $this->dictFile = DT_ROOT.'/api/dtapicom/trie/'.$filename.'.txt';
        $this->loadData();

        // Step 1: split on whitespace and commas. The full-width comma (U+FF0C) is
        // normalised first with a byte-safe replace so the pattern can stay byte-oriented.
        $normalized = str_replace("\xEF\xBC\x8C", ',', $kw);
        $parts = preg_split("/[\s,]+/", $normalized);
        $data = [];
        if (!empty($parts)) {
            foreach ($parts as $v) {
                // Compare against '' so the token "0" is not discarded.
                if ($v !== '') {
                    $data[] = $v;
                }
            }
        } else {
            $data[] = $kw;
        }

        $word = []; // words matched against the dictionary

        // Step 2: tokens that are dictionary words as a whole.
        foreach ($data as $k => $v) {
            if ($this->isMatch($v)) {
                $word[] = $v;
                unset($data[$k]);
            }
        }

        // Step 3: fine-grained segmentation of the remaining tokens.
        foreach ($data as $k => $v) {
            $found = $this->split_word($v);
            if (empty($found)) {
                continue;
            }
            foreach ($found as $str) {
                $word[] = $str;
                $v = str_replace($str, '', $v);
            }
            if (trim($v) == '') {
                // Token fully consumed by dictionary words.
                unset($data[$k]);
            } else {
                $data[$k] = $v;
            }
        }

        // Step 4: map whatever is left through the synonym dictionary.
        if (!empty($data) && $ty_file) {
            foreach ($data as $v) {
                $word[] = $this->tyReplace($v, $ty_file);
            }
        }

        return $word;
    }

    /**
     * Fine-grained segmentation: return every dictionary word that occurs in the input,
     * including overlapping and nested words.
     *
     * @param string|string[] $strArr Text, or the text already split into characters
     *
     * @return string[] Words in order of start position, shorter before longer
     */
    public function split_word($strArr)
    {
        $chars = is_array($strArr) ? array_values($strArr) : $this->splitStr($strArr);

        return $this->collectWords($chars, $this->dict);
    }

    /**
     * Collect every word of the given trie that starts at any position of $chars.
     *
     * @param string[] $chars Characters of the text (list)
     * @param array    $dict  Trie root to walk
     *
     * @return string[]
     */
    private function collectWords(array $chars, $dict)
    {
        $words = [];
        $len = count($chars);
        for ($start = 0; $start < $len; $start++) {
            $node = $dict;
            $word = '';
            for ($i = $start; $i < $len; $i++) {
                $char = $chars[$i];
                if (!is_array($node) || !isset($node[$char])) {
                    // Path broken: no longer word can start at $start.
                    break;
                }
                $word .= $char;
                $node = $node[$char];
                if (isset($node['end'])) {
                    $words[] = $word;
                }
            }
        }

        return $words;
    }

    /**
     * Build the synonym trie from a file, using the global cache object $dc when available.
     *
     * Each line has the form "word=replacement".
     *
     * @param string $filename Synonym dictionary file name (without extension)
     * @param bool   $cache    When true, try the cache first and store the built trie back.
     *                         When false, entries are added to the current synonym trie.
     *
     * @throws InvalidArgumentException when the dictionary file does not exist
     * @throws RuntimeException         when the dictionary file cannot be opened
     */
    public function load_tongyi($filename = 'tongyi', $cache = true)
    {
        global $dc;

        $file = DT_ROOT.'/api/dtapicom/trie/'.$filename.'.txt';
        $cacheKey = __CLASS__ . "_" . md5($file);
        $useCache = $cache && is_object($dc);

        if ($useCache) {
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->ty_dict = $cached;
                return;
            }
        }

        if ($cache) {
            // A cached load always describes exactly one file, so rebuild from scratch.
            $this->ty_dict = [];
        }

        $handle = $this->openDictFile($file);
        while (($line = fgets($handle)) !== false) {
            $line = trim($line);
            if ($line !== '') {
                $this->addTongyi($line);
            }
        }
        fclose($handle);

        if ($useCache) {
            $dc->set($cacheKey, $this->ty_dict, null, 100000);
        }
    }

    /**
     * Insert one "word=replacement" entry into the synonym trie.
     *
     * @param string $str Dictionary line; lines without '=' or with an empty word are ignored
     */
    protected function addTongyi($str)
    {
        $arr = explode('=', $str);
        if (!isset($arr[1])) {
            return;
        }

        $wordArr = $this->splitStr($arr[0]);
        if (empty($wordArr)) {
            return;
        }
        $oldword = $arr[1];

        if (!is_array($this->ty_dict)) {
            $this->ty_dict = [];
        }

        $curNode = &$this->ty_dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char])) {
                $curNode[$char] = [];
            }

            $curNode = &$curNode[$char];
        }

        // The terminal marker stores the replacement text for this word.
        $curNode['end'] = $oldword;
    }

    /**
     * Replace a word by its synonym.
     *
     * @param string|string[] $strArr  Word, or the word already split into characters
     * @param string          $ty_file Synonym dictionary file name (without extension)
     *
     * @return mixed The replacement string, or the input unchanged when no synonym is defined
     */
    public function tyReplace($strArr, $ty_file = 'tongyi')
    {
        $this->load_tongyi($ty_file);

        return $this->lookupSynonym($strArr);
    }

    /**
     * Look a word up in the already loaded synonym trie.
     *
     * @param string|string[] $strArr
     *
     * @return mixed The replacement string, or the input unchanged when there is none
     */
    private function lookupSynonym($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        if (empty($chars)) {
            return $strArr;
        }

        $node = $this->ty_dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return $strArr;
            }
            $node = $node[$char];
        }

        // Compare against '' rather than relying on truthiness so a replacement of "0" is honoured.
        if (isset($node['end']) && (string) $node['end'] !== '') {
            return $node['end'];
        }

        return $strArr;
    }

    /**
     * Replace every synonym-dictionary word that occurs in a text.
     *
     * The word dictionary ($this->dict) is left untouched.
     *
     * NOTE(review): tokens are extracted from the raw text, so words inside HTML tag
     * names/attributes are replaced as well - confirm whether that is intended.
     *
     * @param string $text     Text to process
     * @param string $filename Synonym dictionary file name (without extension)
     *
     * @return string The text with synonyms applied; unchanged when nothing matches
     */
    public function contentReplace($text, $filename = 'tongyi')
    {
        // Load once up front instead of once per replaced word.
        $this->load_tongyi($filename);

        $matched = preg_match_all('/([\w\x{4e00}-\x{9fa5}]+)/u', $text, $arr);
        if (!$matched || empty($arr[0])) {
            // No tokens, or the text is not valid UTF-8.
            return $text;
        }

        // Segment each token with the synonym trie, then substitute every hit.
        foreach ($arr[0] as $v) {
            $found = $this->collectWords($this->splitStr($v), $this->ty_dict);
            foreach ($found as $t) {
                $text = str_replace($t, $this->lookupSynonym($t), $text);
            }
        }

        return $text;
    }
}

  

posted @   圆柱模板  阅读(217)  评论(0)  编辑  收藏  举报
编辑推荐:
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具
点击右上角即可分享
微信分享提示