1. 二元分词
1 <?php
/**
 * Split UTF-8 text into search tokens ("2-base cut").
 *
 * Runs of single-byte characters become one token per word; a word ends at a
 * space (' '), at the next multi-byte character, or at the end of the text.
 * Every pair of directly adjacent multi-byte characters becomes one
 * overlapping two-character token.
 *
 * @param string $text UTF-8 encoded text.
 * @return array Map of byte offset (start of the token in $text) => token.
 */
function _tokenizer ($text)
{
    $len = strlen ($text);
    $last_mbc = '';     // previous multi-byte character, '' if the previous unit was single-byte
    $tmp = '';          // single-byte word collected so far
    $tokens = array ();

    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];
        $v = ord ($c);

        // Sequence length taken from the UTF-8 lead byte. 0xE0 itself starts a
        // 3-byte sequence and 0xF0+ starts a 4-byte sequence; stray
        // continuation bytes (0x80-0xBF) and the invalid leads 0xC0/0xC1 are
        // handled like single-byte characters.
        if ($v >= 0xf0) {
            $width = 4;
        }
        elseif ($v >= 0xe0) {
            $width = 3;
        }
        elseif ($v >= 0xc2) {
            $width = 2;
        }
        else {
            $width = 1;
        }

        if ($width > 1) {
            // A multi-byte character ends the pending single-byte word; emit
            // it instead of discarding it.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
                $tmp = '';
            }

            // Incomplete sequence at the end of the input: stop instead of
            // reading past the end of the string.
            if ($i + $width > $len) {
                break;
            }

            $mbc = substr ($text, $i, $width);
            if ($last_mbc !== '') {
                // The bigram is keyed by the offset of its first character.
                $tokens[$i - strlen ($last_mbc)] = $last_mbc . $mbc;
            }

            $last_mbc = $mbc;
            $i += $width - 1;
            continue;
        }

        // Single-byte character: it breaks any chain of multi-byte characters.
        $last_mbc = '';
        if ($c === ' ') {
            // Strict comparison so that the word "0" is not treated as empty.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
            }

            $tmp = '';
        }
        else {
            $tmp .= $c;
        }
    }

    // Emit a word that reaches the end of the text without a trailing space.
    // ($tmp is always empty when the loop was left through the break above.)
    if ($tmp !== '') {
        $tokens[$len - strlen ($tmp)] = $tmp;
    }

    return $tokens;
}
60
/**
 * Split UTF-8 text into search tokens using dictionary-based segmentation.
 *
 * Runs of single-byte characters become one token per word; a word ends at a
 * space (' '), at the next multi-byte character, or at the end of the text.
 * Each run of adjacent multi-byte characters is segmented left to right: at
 * every position the longest candidate of 4, 3 or 2 characters for which
 * $this->dict->find () returns a true value is taken as a word; if none
 * matches, a single character is consumed.
 *
 * NOTE(review): the body reads $this->dict, so this is presumably a method of
 * a class that holds a dictionary object - confirm against the enclosing class.
 *
 * @param string $text     UTF-8 encoded text.
 * @param bool   $non_word When true, single characters that are not part of a
 *                         dictionary word are emitted as tokens too; when
 *                         false only words of two or more characters are.
 * @return array Map of byte offset (start of the token in $text) => token.
 */
function _tokenizer_dict ($text, $non_word = false)
{
    $len = strlen ($text);
    $mbc_str = array ();    // pending run of multi-byte characters
    $tmp = '';              // single-byte word collected so far
    $tokens = array ();

    // The extra iteration at $i == $len acts as a virtual terminator so that
    // a word or multi-byte run reaching the end of the text is still emitted.
    for ($i = 0; $i <= $len; $i++) {
        $at_end = ($i >= $len);
        $width = 1;
        $c = '';

        if (!$at_end) {
            $c = $text[$i];
            $v = ord ($c);

            // Sequence length taken from the UTF-8 lead byte (0xE0 starts a
            // 3-byte sequence, 0xF0+ a 4-byte one). Stray continuation bytes
            // and the invalid leads 0xC0/0xC1 count as single-byte characters.
            if ($v >= 0xf0) {
                $width = 4;
            }
            elseif ($v >= 0xe0) {
                $width = 3;
            }
            elseif ($v >= 0xc2) {
                $width = 2;
            }

            if ($width > 1 && $i + $width > $len) {
                // Incomplete sequence at the end of the input: treat this
                // position as the end of the text instead of reading past it.
                $len = $i;
                $at_end = true;
                $width = 1;
            }
        }

        if ($width > 1) {
            // A multi-byte character ends the pending single-byte word; emit
            // it instead of discarding it.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
                $tmp = '';
            }

            $mbc_str[] = substr ($text, $i, $width);
            $i += $width - 1;
            continue;
        }

        // Single-byte character or end of text.
        if ($at_end || $c === ' ') {
            // Strict comparison so that the word "0" is not treated as empty.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
            }

            $tmp = '';
        }
        else {
            $tmp .= $c;
        }

        // The multi-byte run (if any) ended just before offset $i: segment it.
        $total = count ($mbc_str);
        if ($total > 0) {
            $start_offset = $i - strlen (implode ('', $mbc_str));
            $pos = 0;

            while ($pos < $total) {
                // Default: consume a single character.
                $word = $mbc_str[$pos];
                $used = 1;

                // Try the longest candidate first (at most 4 characters).
                for ($j = min (4, $total - $pos); $j > 1; $j--) {
                    $test = implode ('', array_slice ($mbc_str, $pos, $j));
                    if ($this->dict->find ($test)) {
                        $word = $test;
                        $used = $j;
                        break;
                    }
                }

                if ($used > 1 || $non_word) {
                    $tokens[$start_offset] = $word;
                }

                $start_offset += strlen ($word);
                $pos += $used;
            }

            $mbc_str = array ();
        }
    }

    return $tokens;
}
166 ?>
/**
 * Split UTF-8 text into search tokens ("2-base cut").
 *
 * Runs of single-byte characters become one token per word; a word ends at a
 * space (' '), at the next multi-byte character, or at the end of the text.
 * Every pair of directly adjacent multi-byte characters becomes one
 * overlapping two-character token.
 *
 * @param string $text UTF-8 encoded text.
 * @return array Map of byte offset (start of the token in $text) => token.
 */
function _tokenizer ($text)
{
    $len = strlen ($text);
    $last_mbc = '';     // previous multi-byte character, '' if the previous unit was single-byte
    $tmp = '';          // single-byte word collected so far
    $tokens = array ();

    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];
        $v = ord ($c);

        // Sequence length taken from the UTF-8 lead byte. 0xE0 itself starts a
        // 3-byte sequence and 0xF0+ starts a 4-byte sequence; stray
        // continuation bytes (0x80-0xBF) and the invalid leads 0xC0/0xC1 are
        // handled like single-byte characters.
        if ($v >= 0xf0) {
            $width = 4;
        }
        elseif ($v >= 0xe0) {
            $width = 3;
        }
        elseif ($v >= 0xc2) {
            $width = 2;
        }
        else {
            $width = 1;
        }

        if ($width > 1) {
            // A multi-byte character ends the pending single-byte word; emit
            // it instead of discarding it.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
                $tmp = '';
            }

            // Incomplete sequence at the end of the input: stop instead of
            // reading past the end of the string.
            if ($i + $width > $len) {
                break;
            }

            $mbc = substr ($text, $i, $width);
            if ($last_mbc !== '') {
                // The bigram is keyed by the offset of its first character.
                $tokens[$i - strlen ($last_mbc)] = $last_mbc . $mbc;
            }

            $last_mbc = $mbc;
            $i += $width - 1;
            continue;
        }

        // Single-byte character: it breaks any chain of multi-byte characters.
        $last_mbc = '';
        if ($c === ' ') {
            // Strict comparison so that the word "0" is not treated as empty.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
            }

            $tmp = '';
        }
        else {
            $tmp .= $c;
        }
    }

    // Emit a word that reaches the end of the text without a trailing space.
    // ($tmp is always empty when the loop was left through the break above.)
    if ($tmp !== '') {
        $tokens[$len - strlen ($tmp)] = $tmp;
    }

    return $tokens;
}
60
/**
 * Split UTF-8 text into search tokens using dictionary-based segmentation.
 *
 * Runs of single-byte characters become one token per word; a word ends at a
 * space (' '), at the next multi-byte character, or at the end of the text.
 * Each run of adjacent multi-byte characters is segmented left to right: at
 * every position the longest candidate of 4, 3 or 2 characters for which
 * $this->dict->find () returns a true value is taken as a word; if none
 * matches, a single character is consumed.
 *
 * NOTE(review): the body reads $this->dict, so this is presumably a method of
 * a class that holds a dictionary object - confirm against the enclosing class.
 *
 * @param string $text     UTF-8 encoded text.
 * @param bool   $non_word When true, single characters that are not part of a
 *                         dictionary word are emitted as tokens too; when
 *                         false only words of two or more characters are.
 * @return array Map of byte offset (start of the token in $text) => token.
 */
function _tokenizer_dict ($text, $non_word = false)
{
    $len = strlen ($text);
    $mbc_str = array ();    // pending run of multi-byte characters
    $tmp = '';              // single-byte word collected so far
    $tokens = array ();

    // The extra iteration at $i == $len acts as a virtual terminator so that
    // a word or multi-byte run reaching the end of the text is still emitted.
    for ($i = 0; $i <= $len; $i++) {
        $at_end = ($i >= $len);
        $width = 1;
        $c = '';

        if (!$at_end) {
            $c = $text[$i];
            $v = ord ($c);

            // Sequence length taken from the UTF-8 lead byte (0xE0 starts a
            // 3-byte sequence, 0xF0+ a 4-byte one). Stray continuation bytes
            // and the invalid leads 0xC0/0xC1 count as single-byte characters.
            if ($v >= 0xf0) {
                $width = 4;
            }
            elseif ($v >= 0xe0) {
                $width = 3;
            }
            elseif ($v >= 0xc2) {
                $width = 2;
            }

            if ($width > 1 && $i + $width > $len) {
                // Incomplete sequence at the end of the input: treat this
                // position as the end of the text instead of reading past it.
                $len = $i;
                $at_end = true;
                $width = 1;
            }
        }

        if ($width > 1) {
            // A multi-byte character ends the pending single-byte word; emit
            // it instead of discarding it.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
                $tmp = '';
            }

            $mbc_str[] = substr ($text, $i, $width);
            $i += $width - 1;
            continue;
        }

        // Single-byte character or end of text.
        if ($at_end || $c === ' ') {
            // Strict comparison so that the word "0" is not treated as empty.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
            }

            $tmp = '';
        }
        else {
            $tmp .= $c;
        }

        // The multi-byte run (if any) ended just before offset $i: segment it.
        $total = count ($mbc_str);
        if ($total > 0) {
            $start_offset = $i - strlen (implode ('', $mbc_str));
            $pos = 0;

            while ($pos < $total) {
                // Default: consume a single character.
                $word = $mbc_str[$pos];
                $used = 1;

                // Try the longest candidate first (at most 4 characters).
                for ($j = min (4, $total - $pos); $j > 1; $j--) {
                    $test = implode ('', array_slice ($mbc_str, $pos, $j));
                    if ($this->dict->find ($test)) {
                        $word = $test;
                        $used = $j;
                        break;
                    }
                }

                if ($used > 1 || $non_word) {
                    $tokens[$start_offset] = $word;
                }

                $start_offset += strlen ($word);
                $pos += $used;
            }

            $mbc_str = array ();
        }
    }

    return $tokens;
}
166 ?>
可以看到,被注释掉的是使用 mb_ 函数的部分。我去掉它们,一方面是为了便于迁移,另一方面是因为 mb_ 函数很慢。我偷懒地使用了不完整的 UTF-8 切字,只判断 2 个字节和 3 个字节的字符,其实实际用到的只有 3 字节的情况(“UTF3”),呵呵……以后再说。
1 <?php
/**
 * Pre-process raw text before tokenizing.
 *
 * Punctuation and line breaks are replaced by spaces so that the text falls
 * apart into "sentences", a space is inserted after a few common Chinese
 * particles, and runs of whitespace are collapsed.
 *
 * @param string $text UTF-8 encoded text.
 * @return string The cleaned text, trimmed and terminated by exactly one space.
 */
function _normalize_text ($text)
{
    // ASCII punctuation -> space.
    $symbol = preg_quote ('`~!@#$%^&*()_+=|{}[]:;"<>,.?', '/');
    $ret = preg_replace ('/[' . $symbol . ']/', ' ', $text);

    // Line breaks and tabs -> space. The character class needs the backslash
    // escapes; a bare [rnt] would blank out the letters r, n and t instead.
    $ret = preg_replace ('/[\r\n\t]/', ' ', $ret);

    // Chinese / full-width punctuation -> space.
    // NOTE(review): the listing showed ASCII forms for ! ? , ( ) # ; : and the
    // space here, which the ASCII step above already handles; presumably the
    // full-width forms were intended - confirm against the original source.
    $cjk_punct = array (
        '“', '”', '‘', '’',
        '!', '?', '。', ',', '、', '·',
        '(', ')', '#', '《', '》', ';', ':',
        '……', "\xE3\x80\x80", '——',
    );
    $ret = str_replace ($cjk_punct, ' ', $ret);

    // Cut words: force a break after common particles.
    foreach (array ('的', '是', '吗', '吧', '呀') as $particle) {
        $ret = str_replace ($particle, $particle . ' ', $ret);
    }

    // Collapse whitespace. An explicit ASCII class is used rather than \s so
    // that no byte of a multi-byte UTF-8 character can ever be matched.
    $ret = preg_replace ('/[ \t\n\r\f\x0B]+/', ' ', $ret);

    return (trim ($ret) . ' ');
}
42 ?>
上面这个函数对文字做了一些简单的预处理,扔掉了一些标点符号,主要就是为了把文章先分割成“句子”,实验性函数……
2 function _normalize_text ($text)
3 {
4 $symbol = '`~!@#$%^&*()_+=|{}[]:;"<>,.?';
5 $symbol = preg_quote ($symbol);
6 $ret = preg_replace ("/[$symbol]/", ' ', $text);
7 $ret = preg_replace ("/[rnt]/", ' ', $ret);
8
9 // For Chinese
10 $ret = str_replace ('“', ' ', $ret);
11 $ret = str_replace ('”', ' ', $ret);
12 $ret = str_replace ('‘', ' ', $ret);
13 $ret = str_replace ('’', ' ', $ret);
14 $ret = str_replace ('!', ' ', $ret);
15 $ret = str_replace ('?', ' ', $ret);
16 $ret = str_replace ('。', ' ', $ret);
17 $ret = str_replace (',', ' ', $ret);
18 $ret = str_replace ('、', ' ', $ret);
19 $ret = str_replace ('·', ' ', $ret);
20 $ret = str_replace ('(', ' ', $ret);
21 $ret = str_replace (')', ' ', $ret);
22 $ret = str_replace ('#', ' ', $ret);
23 $ret = str_replace ('《', ' ', $ret);
24 $ret = str_replace ('》', ' ', $ret);
25 $ret = str_replace (';', ' ', $ret);
26 $ret = str_replace (':', ' ', $ret);
27 $ret = str_replace ('……', ' ', $ret);
28 $ret = str_replace (' ', ' ', $ret);
29 $ret = str_replace ('——', ' ', $ret);
30
31 // Cut Words
32 $ret = str_replace ('的', '的 ', $ret);
33 $ret = str_replace ('是', '是 ', $ret);
34 $ret = str_replace ('吗', '吗 ', $ret);
35 $ret = str_replace ('吧', '吧 ', $ret);
36 $ret = str_replace ('呀', '呀 ', $ret);
37
38 $ret = preg_replace ("/s+/", ' ', $ret);
39
40 return (trim ($ret) . ' ');
41 }
42 ?>
我的词典是保存在内存中的,依靠memcached来维护,每一个词保存的就是一个名字为word_key,值为“t”的内存变量。memcached对这个词典进行了有效的散列。下面是词典class:
1 <?php
/**
 * Word dictionary kept in memcached.
 *
 * Every dictionary word is stored as one cache entry whose key is derived
 * from the word (see _gen_mem_key ()) and whose value is the string 't'.
 */
class BsmSearchDictMemcached
{
    /** Cache client handle created in the constructor. */
    public $mc;

    /**
     * Connect to the dictionary cache server named by the globals
     * $dict_memcached_host and $dict_memcached_port.
     */
    function __construct ()
    {
        global $dict_memcached_host, $dict_memcached_port;

        // NOTE(review): memcache () and add_server () are not defined in this
        // listing; presumably they come from a project-level client wrapper -
        // confirm, they are kept exactly as in the original.
        $this->mc = memcache ();
        $this->mc->add_server ($dict_memcached_host, $dict_memcached_port);
    }

    /**
     * PHP4-style constructor name, kept so that explicit calls keep working.
     * PHP 8 no longer runs a method named after the class on `new`, which is
     * why the set-up lives in __construct ().
     */
    function BsmSearchDictMemcached ()
    {
        $this->__construct ();

        return $this->mc;
    }

    /**
     * Load every non-empty line of the file named by the global
     * $dict_source_file into the cache, one word per line.
     */
    function make_mem_dict ()
    {
        global $dict_source_file;

        $fp = fopen ($dict_source_file, 'rb');
        if ($fp === false) {
            // fopen () has already raised a warning; nothing to load.
            return;
        }

        // Compare against false explicitly: a last line holding just "0"
        // would otherwise end the loop early.
        while (($line = fgets ($fp)) !== false) {
            $word = trim ($line);
            if ($word === '') {
                // Blank lines would store the shared 'NO_KEY' entry.
                continue;
            }

            $this->mc->set ($this->_gen_mem_key ($word), 't');
        }

        fclose ($fp);
    }

    /**
     * Tell whether a word is in the dictionary.
     *
     * @param string $word Word to look up.
     * @return bool True when the cache holds the marker 't' for the word.
     */
    function find ($word)
    {
        return ($this->mc->get ($this->_gen_mem_key ($word)) === 't');
    }

    /**
     * Build the cache key for a word: 'dict_' followed by hex digits 0-3 and
     * 16-23 of the word's MD5 hash, or 'NO_KEY' for an empty word.
     *
     * @param string $word Word to hash.
     * @return string Cache key.
     */
    function _gen_mem_key ($word)
    {
        // Strict emptiness test so that the word "0" still gets a real key.
        if ($word === null || $word === false || (string) $word === '') {
            return 'NO_KEY';
        }

        $md5_word = md5 ($word);

        return 'dict_' . substr ($md5_word, 0, 4) . substr ($md5_word, 16, 8);
    }
}
56 ?>
一些参数是在BSM的配置文件中定义的,make_mem_dict是生成内存词典的方法,它从原始词典dict.dat中导出数据插入到内存中。
2 class BsmSearchDictMemcached
3 {
4 var $mc;
5
6 function BsmSearchDictMemcached ()
7 {
8 global $dict_memcached_host, $dict_memcached_port;
9
10 $this->mc = memcache ();
11 $this->mc->add_server ($dict_memcached_host, $dict_memcached_port);
12
13 return $this->mc;
14 }
15
16 function make_mem_dict ()
17 {
18 global $dict_source_file;
19
20 $fp = fopen ($dict_source_file, 'rb');
21
22 while ($word = fgets ($fp)) {
23 $word = trim ($word);
24 $key = $this->_gen_mem_key ($word);
25 $this->mc->set ($key, 't');
26 }
27
28 fclose ($fp);
29 }
30
31 function find ($word)
32 {
33 $key = $this->_gen_mem_key ($word);
34
35 if ($this->mc->get ($key) == 't')
36 return true;
37
38 else
39 return false;
40 }
41
42 function _gen_mem_key ($word)
43 {
44 if ($word) {
45 $md5_word = md5 ($word);
46 $key = substr ($md5_word, 0, 4) . substr ($md5_word, 16, 8);
47 $key = 'dict_' . $key;
48 }
49
50 else
51 $key = 'NO_KEY';
52
53 return $key;
54 }
55 }
56 ?>
一个使用实例:
1 <?php
// Usage example: store one string, index it, search for a word in it and
// print how long the whole round trip took.
define ('IN_BSM', true);
$phpEx = 'php';
error_reporting (2047);

// NOTE(review): $include_root and $db are not defined in this listing;
// presumably common.inc sets them up - confirm.
require ('../include/kernel/common.inc.' . $phpEx);
require ($include_root . 'search/search.inc.' . $phpEx);

$search = new BsmSearch ('search/');
$str = '我是大傻瓜';
$start_time = array_sum (explode (' ', microtime ()));

// The closing double quote of the SQL string was missing, which made the whole
// script a parse error. $str is a fixed literal here; any real input must be
// escaped or bound as a parameter before it is placed in the statement.
$db->sql_query ("INSERT INTO `data` SET `text` = '$str'");
$id = $db->sql_nextid ();

$search->add_text ($id, $str);
print_r ($search->search ('傻瓜'));

$end_time = array_sum (explode (' ', microtime ()));
$time = $end_time - $start_time;
echo ('<br>Spend Time: ' . $time . ' secs');
17 ?>
18
// Usage example: store one string, index it, search for a word in it and
// print how long the whole round trip took.
define ('IN_BSM', true);
$phpEx = 'php';
error_reporting (2047);

// NOTE(review): $include_root and $db are not defined in this listing;
// presumably common.inc sets them up - confirm.
require ('../include/kernel/common.inc.' . $phpEx);
require ($include_root . 'search/search.inc.' . $phpEx);

$search = new BsmSearch ('search/');
$str = '我是大傻瓜';
$start_time = array_sum (explode (' ', microtime ()));

// The closing double quote of the SQL string was missing, which made the whole
// script a parse error. $str is a fixed literal here; any real input must be
// escaped or bound as a parameter before it is placed in the statement.
$db->sql_query ("INSERT INTO `data` SET `text` = '$str'");
$id = $db->sql_nextid ();

$search->add_text ($id, $str);
print_r ($search->search ('傻瓜'));

$end_time = array_sum (explode (' ', microtime ()));
$time = $end_time - $start_time;
echo ('<br>Spend Time: ' . $time . ' secs');
17 ?>
18