[PHP]汉语分词
最近在写一个PHP的汉语转拼音的翻译程序,所以做了这样一个汉语分词,里面包含一个还存在一些小问题的句子识别。
问题就是如果句子中包含词汇中维护的词,则会将句子继续拆分。
这个问题,我也写好了一个解决方案,但是因为是给客户的,所以我会在后期更新至博客。
// Chinese word segmentation.
//
// Loads the word table (`cizu`) and the sentence table (`jvzi`) and then walks
// $Word from left to right, emitting into $Result either a known sentence, a
// known word, or a single element of $Word when nothing matches.
//
// Names this snippet expects from the surrounding script (not defined here):
//   $MySql_Conn - a mysqli connection object
//   $Word       - the indexed list to segment; presumably one character per
//                 element (TODO confirm against the caller)
//   Data        - a class that accepts Chinese / Pinyin properties
//
// Known limitation (kept on purpose): the FIRST, i.e. shortest, dictionary hit
// of two or more elements wins, so a sentence that starts with a known word is
// split at that word instead of being matched as a whole.
$AllCiZuData = array();
$AllJvZiData = array();
$Result = array();

// Hash sets keyed by the Chinese text, so each candidate is checked in O(1)
// instead of rescanning both dictionaries for every candidate.
$CiZuLookup = array();
$JvZiLookup = array();

$SqlCiZu = "Select * From cizu";
$SqlJvZi = "Select * From jvzi";

// Load the word dictionary.
$result = $MySql_Conn->query($SqlCiZu);
if ($result === false) {
    // Fail loudly instead of silently segmenting with an empty dictionary.
    die("查询失败,失败原因: " . $MySql_Conn->error);
}
while ($row = $result->fetch_assoc()) {
    $Temp = new Data;
    $Temp->Chinese = $row['Chinese'];
    array_push($AllCiZuData, $Temp);
    $CiZuLookup[(string) $row['Chinese']] = true;
}

// Load the sentence dictionary (the `yu` column is stored as the pinyin).
$result = $MySql_Conn->query($SqlJvZi);
if ($result === false) {
    die("查询失败,失败原因: " . $MySql_Conn->error);
}
while ($row = $result->fetch_assoc()) {
    $Temp = new Data;
    $Temp->Chinese = $row['Chinese'];
    $Temp->Pinyin = $row['yu'];
    array_push($AllJvZiData, $Temp);
    $JvZiLookup[(string) $row['Chinese']] = true;
}

$WordCount = count($Word);
for ($i = 0; $i < $WordCount; $i++) {
    $Temp = $Word[$i];
    $sign = 0; // becomes 1 once a dictionary entry starting at $i is found

    // Grow the candidate one element at a time: $Word[$i..$j].
    for ($j = $i + 1; $j < $WordCount; $j++) {
        $Temp = $Temp . $Word[$j];

        // Sentences and words are both accepted. The original pushed
        // $AllJvZiData[$t] for a word-table hit, which indexed the wrong
        // list; the matched text itself is the correct segment.
        if (isset($JvZiLookup[$Temp]) || isset($CiZuLookup[$Temp])) {
            array_push($Result, $Temp);
            $sign = 1;
            $i = $j; // resume after the matched span (the outer loop adds 1)
            break;
        }
    }

    // Nothing starting at $i matched: emit the single element as-is.
    if ($sign == 0) {
        array_push($Result, $Word[$i]);
    }
}

print_r($Result);
下面是我用来批量导入常用词汇的PHP脚本:只需下载网上的常用词汇表,将其中的空格全部替换为反斜杠(\,即代码中 explode 使用的分隔符),保存为上一级目录下的 cizudata.txt,然后访问这个PHP页面,即可批量导入。
<?php
// Bulk import of a word list into the word table.
//
// Reads ../cizudata.txt, splits it on backslashes into words, builds a
// dash-separated pinyin string for each word via Pinyin::getPinyin(), and
// inserts (running number, word, pinyin) into the table named by
// $Database_CizuTableName, echoing one success/failure line per word.
//
// Pinyin and the $Database_* variables are not defined in this script;
// presumably they come from the two includes below (TODO confirm).
include("Pinyin_Word_Dictionary.php");
include("Config.php");

// Forward slash works on Windows as well as on Unix-like systems; the
// original "..\" form only resolved to the parent directory on Windows.
$AllCiZuData = file_get_contents("../cizudata.txt");
if ($AllCiZuData === false) {
    die("打开文件失败");
}

$MySql_Conn = new Mysqli($Database_ServerIP, $Database_UserName, $Database_Password, $Database_DBName);
if ($MySql_Conn->connect_error) {
    die("数据库连接失败,失败原因: " . $MySql_Conn->connect_error);
}

// Words in the data file are separated by a single backslash.
$AllCiZuArr = explode("\\", $AllCiZuData);

// Prepared statement: the words come from a downloaded file, so they must not
// be concatenated into the SQL text (a quote in a word would break or inject
// into the statement). The table name cannot be a bound parameter; it is taken
// from the configuration variable exactly as before.
$InsertStmt = $MySql_Conn->prepare("insert into " . $Database_CizuTableName . " values(?,?,?)");
if ($InsertStmt === false) {
    die("SQL预处理失败,失败原因: " . $MySql_Conn->error);
}

$Nmber = 0;
$Values = "";
$Pinyin = "";
// bind_param binds by reference, so assigning the variables inside the loop
// is enough before each execute().
$InsertStmt->bind_param("iss", $Nmber, $Values, $Pinyin);

foreach ($AllCiZuArr as $RawValue) {
    // Drop surrounding whitespace/newlines and skip empty pieces (for example
    // the one produced by a trailing separator) instead of inserting blank rows.
    $Values = trim($RawValue);
    if ($Values === "") {
        continue;
    }

    // Build the pinyin one character at a time, joined with "-".
    $Pinyin = "";
    $Len = mb_strlen($Values, 'utf8');
    for ($start = 0; $start < $Len; $start++) {
        $Temp = mb_substr($Values, $start, 1, 'utf8');
        if ($Pinyin == "") {
            $Pinyin = Pinyin::getPinyin($Temp);
        } else {
            $Pinyin = $Pinyin . "-" . Pinyin::getPinyin($Temp);
        }
    }

    $result = $InsertStmt->execute();

    // The page is rendered as HTML, so escape the file-derived text.
    $SafeValues = htmlspecialchars($Values, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $SafePinyin = htmlspecialchars((string) $Pinyin, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    if ($result === TRUE) {
        echo $Nmber . " " . $SafeValues . " " . $SafePinyin . "添加成功!<br>";
    } else {
        echo $Nmber . " " . $SafeValues . " " . $SafePinyin . "添加失败!<br>";
    }
    $Nmber++;
}

$InsertStmt->close();
?>