汉字转拼音问题
这是一个非常经典的问题,同时,也是一个很难解决的问题。经典是因为,这个问题非常常见,而且有很多解决方案。
难以解决是因为,存在多音字。可以发现很多股票软件都有这问题,比如zgyh 出来的不是中国银行,而是找不到,
这是因为,中国银行的 “行” 查出来读的是 "xing" 。我篇文章我只是提供了一个原始的转换方法,就是查表。
和你以前看过的算法可能不一样的是,我加了一个二分查找,我看到的一个开源的解决方案是顺序查找的。
关于,多音字问题,我只有一个小规模的多音字表,从所有股票名称里面纠正过来的。解决方案是这样的:在查找拼音的时候,
查找一下这个字是否在多音字表里面,如果是,那么查找前后组成的词,来决定读什么。如果有谁有完整的多音字表,或者有更好的
解决方案,请给我留言。我一定拜谢。
代码
<?php
/**
* 初始化
*
* @param string $file 拼音代码对应表
* @return array 把这个对应表映射成array的格式
*/
function pinyin_init($file)
{
$pinyin = file($file);
if (empty($pinyin)) {
throw new Exception("load pinyin table error.");
}
foreach ($pinyin as $k => $item)
{
$item = explode("\t", $item);
$item[0] = trim($item[0]);
$item[1] = (int)trim($item[1]);
$pinyin[$k] = $item;
}
$pinyin[] = array('', -10248); //最后一个拼音的妈是 -10247
return $pinyin;
}
/**
* 查找一个字符串中,中文字符部分的拼音
*
* @param array $pingyin
* @param string $str 字符串
* @param string $in_encode 字符串的编码
* @return array
*/
function pingyin_get($pingyin, $str, $in_encode = 'gb2312')
{
if (strtolower($in_encode) != 'gb2312') {
$str = iconv($in_encode, 'gb2312', $str);
if (!$str) {
throw new Exception('invalid input string.');
}
}
$pinyin_arr = array();
$len = strlen($str);
for ($i = 0; $i < $len; $i++)
{
if ($i == $len -1) break;
if (($code = pinyin_getcode_($str[$i], $str[$i + 1])) != false) {
$i++;
if (($py = pinyin_find_($pingyin, $code)) !== false) {
$pinyin_arr[substr($str, $i - 1 , 2)] = $py;
}
}
}
return $pinyin_arr;
}
/**
* 内部函数,通过拼音码查找汉字
*
* @param array $pingyin
* @param int $code
* @return string/false
*/
function pinyin_find_($pingyin, $code)
{
$start = 0;
$count = count($pingyin) - 1;
$end = $count;
if ($code < $pingyin[0][1] || $code >= $pingyin[$end][1]) { //no found.
return false;
}
while ($start <= $end)
{
$mid = (int)($start + ($end - $start)/2);
if ($mid == $count) { //正常情况下,应该是永远都不会执行到这里,但是为了逻辑上清晰点,还是决定写上
return $pingyin[$mid - 1][0];
} else if ($code >= $pingyin[$mid][1] && $code < $pingyin[$mid + 1][1]) {
return $pingyin[$mid][0];
} else if ($code >= $pingyin[$mid + 1][1]) {
$start = $mid + 1;
} else {
$end = $mid - 1;
}
}
return false;
}
/**
* 内部函数,通过汉字的第一个字节 和 第二个字节,得到查找的code
*
* @param char $ch1
* @param char $ch2
* @return int
*/
function pinyin_getcode_($ch1, $ch2)
{
$num1 = ord($ch1);
$num2 = ord($ch2);
if ($num1 > 159 && $num1 < 248) {
return $num1 * 256 + $num2 - 65536;
}
return false;
}
/**
* 基本配置信息
*/
define("CURRENT_PATH", dirname(__FILE__));
$pingin_file = CURRENT_PATH . "/gb-pinyin.table";
$pinyin = pinyin_init($pingin_file);
$py = pingyin_get($pinyin, "中国银行", 'utf-8');
echo iconv('gb2312', 'utf-8', print_r($py, true));
echo implode("'", $py);
?>
/**
* 初始化
*
* @param string $file 拼音代码对应表
* @return array 把这个对应表映射成array的格式
*/
function pinyin_init($file)
{
$pinyin = file($file);
if (empty($pinyin)) {
throw new Exception("load pinyin table error.");
}
foreach ($pinyin as $k => $item)
{
$item = explode("\t", $item);
$item[0] = trim($item[0]);
$item[1] = (int)trim($item[1]);
$pinyin[$k] = $item;
}
$pinyin[] = array('', -10248); //最后一个拼音的妈是 -10247
return $pinyin;
}
/**
* 查找一个字符串中,中文字符部分的拼音
*
* @param array $pingyin
* @param string $str 字符串
* @param string $in_encode 字符串的编码
* @return array
*/
function pingyin_get($pingyin, $str, $in_encode = 'gb2312')
{
if (strtolower($in_encode) != 'gb2312') {
$str = iconv($in_encode, 'gb2312', $str);
if (!$str) {
throw new Exception('invalid input string.');
}
}
$pinyin_arr = array();
$len = strlen($str);
for ($i = 0; $i < $len; $i++)
{
if ($i == $len -1) break;
if (($code = pinyin_getcode_($str[$i], $str[$i + 1])) != false) {
$i++;
if (($py = pinyin_find_($pingyin, $code)) !== false) {
$pinyin_arr[substr($str, $i - 1 , 2)] = $py;
}
}
}
return $pinyin_arr;
}
/**
* 内部函数,通过拼音码查找汉字
*
* @param array $pingyin
* @param int $code
* @return string/false
*/
function pinyin_find_($pingyin, $code)
{
$start = 0;
$count = count($pingyin) - 1;
$end = $count;
if ($code < $pingyin[0][1] || $code >= $pingyin[$end][1]) { //no found.
return false;
}
while ($start <= $end)
{
$mid = (int)($start + ($end - $start)/2);
if ($mid == $count) { //正常情况下,应该是永远都不会执行到这里,但是为了逻辑上清晰点,还是决定写上
return $pingyin[$mid - 1][0];
} else if ($code >= $pingyin[$mid][1] && $code < $pingyin[$mid + 1][1]) {
return $pingyin[$mid][0];
} else if ($code >= $pingyin[$mid + 1][1]) {
$start = $mid + 1;
} else {
$end = $mid - 1;
}
}
return false;
}
/**
* 内部函数,通过汉字的第一个字节 和 第二个字节,得到查找的code
*
* @param char $ch1
* @param char $ch2
* @return int
*/
function pinyin_getcode_($ch1, $ch2)
{
$num1 = ord($ch1);
$num2 = ord($ch2);
if ($num1 > 159 && $num1 < 248) {
return $num1 * 256 + $num2 - 65536;
}
return false;
}
/**
* 基本配置信息
*/
define("CURRENT_PATH", dirname(__FILE__));
$pingin_file = CURRENT_PATH . "/gb-pinyin.table";
$pinyin = pinyin_init($pingin_file);
$py = pingyin_get($pinyin, "中国银行", 'utf-8');
echo iconv('gb2312', 'utf-8', print_r($py, true));
echo implode("'", $py);
?>
拼音表如下:
代码
a -20319
ai -20317
an -20304
ang -20295
ao -20292
ba -20283
bai -20265
ban -20257
bang -20242
bao -20230
bei -20051
ben -20036
beng -20032
bi -20026
bian -20002
biao -19990
bie -19986
bin -19982
bing -19976
bo -19805
bu -19784
ca -19775
cai -19774
can -19763
cang -19756
cao -19751
ce -19746
ceng -19741
cha -19739
chai -19728
chan -19725
chang -19715
chao -19540
che -19531
chen -19525
cheng -19515
chi -19500
chong -19484
chou -19479
chu -19467
chuai -19289
chuan -19288
chuang -19281
chui -19275
chun -19270
chuo -19263
ci -19261
cong -19249
cou -19243
cu -19242
cuan -19238
cui -19235
cun -19227
cuo -19224
da -19218
dai -19212
dan -19038
dang -19023
dao -19018
de -19006
deng -19003
di -18996
dian -18977
diao -18961
die -18952
ding -18783
diu -18774
dong -18773
dou -18763
du -18756
duan -18741
dui -18735
dun -18731
duo -18722
e -18710
en -18697
er -18696
fa -18526
fan -18518
fang -18501
fei -18490
fen -18478
feng -18463
fo -18448
fou -18447
fu -18446
ga -18239
gai -18237
gan -18231
gang -18220
gao -18211
ge -18201
gei -18184
gen -18183
geng -18181
gong -18012
gou -17997
gu -17988
gua -17970
guai -17964
guan -17961
guang -17950
gui -17947
gun -17931
guo -17928
ha -17922
hai -17759
han -17752
hang -17733
hao -17730
he -17721
hei -17703
hen -17701
heng -17697
hong -17692
hou -17683
hu -17676
hua -17496
huai -17487
huan -17482
huang -17468
hui -17454
hun -17433
huo -17427
ji -17417
jia -17202
jian -17185
jiang -16983
jiao -16970
jie -16942
jin -16915
jing -16733
jiong -16708
jiu -16706
ju -16689
juan -16664
jue -16657
jun -16647
ka -16474
kai -16470
kan -16465
kang -16459
kao -16452
ke -16448
ken -16433
keng -16429
kong -16427
kou -16423
ku -16419
kua -16412
kuai -16407
kuan -16403
kuang -16401
kui -16393
kun -16220
kuo -16216
la -16212
lai -16205
lan -16202
lang -16187
lao -16180
le -16171
lei -16169
leng -16158
li -16155
lia -15959
lian -15958
liang -15944
liao -15933
lie -15920
lin -15915
ling -15903
liu -15889
long -15878
lou -15707
lu -15701
lv -15681
luan -15667
lue -15661
lun -15659
luo -15652
ma -15640
mai -15631
man -15625
mang -15454
mao -15448
me -15436
mei -15435
men -15419
meng -15416
mi -15408
mian -15394
miao -15385
mie -15377
min -15375
ming -15369
miu -15363
mo -15362
mou -15183
mu -15180
na -15165
nai -15158
nan -15153
nang -15150
nao -15149
ne -15144
nei -15143
nen -15141
neng -15140
ni -15139
nian -15128
niang -15121
niao -15119
nie -15117
nin -15110
ning -15109
niu -14941
nong -14937
nu -14933
nv -14930
nuan -14929
nue -14928
nuo -14926
o -14922
ou -14921
pa -14914
pai -14908
pan -14902
pang -14894
pao -14889
pei -14882
pen -14873
peng -14871
pi -14857
pian -14678
piao -14674
pie -14670
pin -14668
ping -14663
po -14654
pu -14645
qi -14630
qia -14594
qian -14429
qiang -14407
qiao -14399
qie -14384
qin -14379
qing -14368
qiong -14355
qiu -14353
qu -14345
quan -14170
que -14159
qun -14151
ran -14149
rang -14145
rao -14140
re -14137
ren -14135
reng -14125
ri -14123
rong -14122
rou -14112
ru -14109
ruan -14099
rui -14097
run -14094
ruo -14092
sa -14090
sai -14087
san -14083
sang -13917
sao -13914
se -13910
sen -13907
seng -13906
sha -13905
shai -13896
shan -13894
shang -13878
shao -13870
she -13859
shen -13847
sheng -13831
shi -13658
shou -13611
shu -13601
shua -13406
shuai -13404
shuan -13400
shuang -13398
shui -13395
shun -13391
shuo -13387
si -13383
song -13367
sou -13359
su -13356
suan -13343
sui -13340
sun -13329
suo -13326
ta -13318
tai -13147
tan -13138
tang -13120
tao -13107
te -13096
teng -13095
ti -13091
tian -13076
tiao -13068
tie -13063
ting -13060
tong -12888
tou -12875
tu -12871
tuan -12860
tui -12858
tun -12852
tuo -12849
wa -12838
wai -12831
wan -12829
wang -12812
wei -12802
wen -12607
weng -12597
wo -12594
wu -12585
xi -12556
xia -12359
xian -12346
xiang -12320
xiao -12300
xie -12120
xin -12099
xing -12089
xiong -12074
xiu -12067
xu -12058
xuan -12039
xue -11867
xun -11861
ya -11847
yan -11831
yang -11798
yao -11781
ye -11604
yi -11589
yin -11536
ying -11358
yo -11340
yong -11339
you -11324
yu -11303
yuan -11097
yue -11077
yun -11067
za -11055
zai -11052
zan -11045
zang -11041
zao -11038
ze -11024
zei -11020
zen -11019
zeng -11018
zha -11014
zhai -10838
zhan -10832
zhang -10815
zhao -10800
zhe -10790
zhen -10780
zheng -10764
zhi -10587
zhong -10544
zhou -10533
zhu -10519
zhua -10331
zhuai -10329
zhuan -10328
zhuang -10322
zhui -10315
zhun -10309
zhuo -10307
zi -10296
zong -10281
zou -10274
zu -10270
zuan -10262
zui -10260
zun -10256
zuo -10254
ai -20317
an -20304
ang -20295
ao -20292
ba -20283
bai -20265
ban -20257
bang -20242
bao -20230
bei -20051
ben -20036
beng -20032
bi -20026
bian -20002
biao -19990
bie -19986
bin -19982
bing -19976
bo -19805
bu -19784
ca -19775
cai -19774
can -19763
cang -19756
cao -19751
ce -19746
ceng -19741
cha -19739
chai -19728
chan -19725
chang -19715
chao -19540
che -19531
chen -19525
cheng -19515
chi -19500
chong -19484
chou -19479
chu -19467
chuai -19289
chuan -19288
chuang -19281
chui -19275
chun -19270
chuo -19263
ci -19261
cong -19249
cou -19243
cu -19242
cuan -19238
cui -19235
cun -19227
cuo -19224
da -19218
dai -19212
dan -19038
dang -19023
dao -19018
de -19006
deng -19003
di -18996
dian -18977
diao -18961
die -18952
ding -18783
diu -18774
dong -18773
dou -18763
du -18756
duan -18741
dui -18735
dun -18731
duo -18722
e -18710
en -18697
er -18696
fa -18526
fan -18518
fang -18501
fei -18490
fen -18478
feng -18463
fo -18448
fou -18447
fu -18446
ga -18239
gai -18237
gan -18231
gang -18220
gao -18211
ge -18201
gei -18184
gen -18183
geng -18181
gong -18012
gou -17997
gu -17988
gua -17970
guai -17964
guan -17961
guang -17950
gui -17947
gun -17931
guo -17928
ha -17922
hai -17759
han -17752
hang -17733
hao -17730
he -17721
hei -17703
hen -17701
heng -17697
hong -17692
hou -17683
hu -17676
hua -17496
huai -17487
huan -17482
huang -17468
hui -17454
hun -17433
huo -17427
ji -17417
jia -17202
jian -17185
jiang -16983
jiao -16970
jie -16942
jin -16915
jing -16733
jiong -16708
jiu -16706
ju -16689
juan -16664
jue -16657
jun -16647
ka -16474
kai -16470
kan -16465
kang -16459
kao -16452
ke -16448
ken -16433
keng -16429
kong -16427
kou -16423
ku -16419
kua -16412
kuai -16407
kuan -16403
kuang -16401
kui -16393
kun -16220
kuo -16216
la -16212
lai -16205
lan -16202
lang -16187
lao -16180
le -16171
lei -16169
leng -16158
li -16155
lia -15959
lian -15958
liang -15944
liao -15933
lie -15920
lin -15915
ling -15903
liu -15889
long -15878
lou -15707
lu -15701
lv -15681
luan -15667
lue -15661
lun -15659
luo -15652
ma -15640
mai -15631
man -15625
mang -15454
mao -15448
me -15436
mei -15435
men -15419
meng -15416
mi -15408
mian -15394
miao -15385
mie -15377
min -15375
ming -15369
miu -15363
mo -15362
mou -15183
mu -15180
na -15165
nai -15158
nan -15153
nang -15150
nao -15149
ne -15144
nei -15143
nen -15141
neng -15140
ni -15139
nian -15128
niang -15121
niao -15119
nie -15117
nin -15110
ning -15109
niu -14941
nong -14937
nu -14933
nv -14930
nuan -14929
nue -14928
nuo -14926
o -14922
ou -14921
pa -14914
pai -14908
pan -14902
pang -14894
pao -14889
pei -14882
pen -14873
peng -14871
pi -14857
pian -14678
piao -14674
pie -14670
pin -14668
ping -14663
po -14654
pu -14645
qi -14630
qia -14594
qian -14429
qiang -14407
qiao -14399
qie -14384
qin -14379
qing -14368
qiong -14355
qiu -14353
qu -14345
quan -14170
que -14159
qun -14151
ran -14149
rang -14145
rao -14140
re -14137
ren -14135
reng -14125
ri -14123
rong -14122
rou -14112
ru -14109
ruan -14099
rui -14097
run -14094
ruo -14092
sa -14090
sai -14087
san -14083
sang -13917
sao -13914
se -13910
sen -13907
seng -13906
sha -13905
shai -13896
shan -13894
shang -13878
shao -13870
she -13859
shen -13847
sheng -13831
shi -13658
shou -13611
shu -13601
shua -13406
shuai -13404
shuan -13400
shuang -13398
shui -13395
shun -13391
shuo -13387
si -13383
song -13367
sou -13359
su -13356
suan -13343
sui -13340
sun -13329
suo -13326
ta -13318
tai -13147
tan -13138
tang -13120
tao -13107
te -13096
teng -13095
ti -13091
tian -13076
tiao -13068
tie -13063
ting -13060
tong -12888
tou -12875
tu -12871
tuan -12860
tui -12858
tun -12852
tuo -12849
wa -12838
wai -12831
wan -12829
wang -12812
wei -12802
wen -12607
weng -12597
wo -12594
wu -12585
xi -12556
xia -12359
xian -12346
xiang -12320
xiao -12300
xie -12120
xin -12099
xing -12089
xiong -12074
xiu -12067
xu -12058
xuan -12039
xue -11867
xun -11861
ya -11847
yan -11831
yang -11798
yao -11781
ye -11604
yi -11589
yin -11536
ying -11358
yo -11340
yong -11339
you -11324
yu -11303
yuan -11097
yue -11077
yun -11067
za -11055
zai -11052
zan -11045
zang -11041
zao -11038
ze -11024
zei -11020
zen -11019
zeng -11018
zha -11014
zhai -10838
zhan -10832
zhang -10815
zhao -10800
zhe -10790
zhen -10780
zheng -10764
zhi -10587
zhong -10544
zhou -10533
zhu -10519
zhua -10331
zhuai -10329
zhuan -10328
zhuang -10322
zhui -10315
zhun -10309
zhuo -10307
zi -10296
zong -10281
zou -10274
zu -10270
zuan -10262
zui -10260
zun -10256
zuo -10254