改进的函数：判断字符串是否为 UTF-8 编码（适用于 PHP 4.3）

// Score the input string. utf8_probability() already declares its parameter
// by reference, so the call site must not repeat the `&`: call-time
// pass-by-reference is a fatal error since PHP 5.4.
// NOTE(review): $fstr is not defined in the visible code -- it is presumably
// populated before this point; confirm against the full script.
$utf8score = utf8_probability($fstr);

// A score in the 90..100 range is treated as "already UTF-8": report and stop.
if (90 <= $utf8score && $utf8score <= 100) {
    echo "It is encoded with utf-8 already ";
    die;
}

// Otherwise print the raw score.
echo $utf8score;

/**
 * Estimate how likely a byte string is UTF-8 encoded.
 *
 * The string is scanned byte by byte. Pure ASCII bytes are counted separately
 * and excluded from the ratio (ASCII is a subset of UTF-8 and would otherwise
 * dilute or inflate the score). Every well-formed multi-byte sequence
 * (lead byte followed by the required number of 0x80..0xBF continuation bytes)
 * adds its length to the "good" byte count. The score is the percentage of
 * non-ASCII bytes that belong to such sequences.
 *
 * To avoid coincidental matches, a low score is collapsed to 0:
 *   - score > 98                     -> score is returned
 *   - score > 95 and > 30 good bytes -> score is returned (a few bad sequences tolerated)
 *   - anything else                  -> 0
 *
 * @param string $rawtextstr Raw bytes to inspect. Taken by reference for
 *                           backward compatibility; it is never modified.
 * @return int 0 when the text is empty, pure ASCII, or unlikely to be UTF-8;
 *             otherwise the score (96..100).
 */
function utf8_probability(&$rawtextstr) {
    // Work on a string copy so non-string input is coerced once, up front.
    $bytes = (string) $rawtextstr;
    $rawtextlen = strlen($bytes);

    $goodbytes = 0;  // bytes that are part of a well-formed multi-byte sequence
    $asciibytes = 0; // bytes below 0x80

    for ($i = 0; $i < $rawtextlen; $i++) {
        $lead = ord($bytes[$i]);

        if ($lead < 0x80) {
            // Ignore ASCII, it would throw off the ratio.
            $asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        if (0xC0 <= $lead && $lead <= 0xDF) {
            $trailing = 1; // two-byte sequence
        } else if (0xE0 <= $lead && $lead <= 0xEF) {
            $trailing = 2; // three-byte sequence
        } else if (0xF0 <= $lead && $lead <= 0xF4) {
            $trailing = 3; // four-byte sequence (supplementary planes)
        } else {
            // Stray continuation byte or invalid lead byte: counts as bad.
            continue;
        }

        // Sequence would run past the end of the input: counts as bad.
        if ($i + $trailing >= $rawtextlen) {
            continue;
        }

        $wellformed = true;
        for ($k = 1; $k <= $trailing; $k++) {
            $next = ord($bytes[$i + $k]);
            if ($next < 0x80 || $next > 0xBF) {
                $wellformed = false;
                break;
            }
        }

        if ($wellformed) {
            $goodbytes += $trailing + 1;
            $i += $trailing; // skip the continuation bytes just validated
        }
    }

    // Empty or pure-ASCII input carries no evidence either way (and would
    // otherwise divide by zero below).
    if ($asciibytes == $rawtextlen) {
        return 0;
    }

    $score = (int) (100 * ($goodbytes / ($rawtextlen - $asciibytes)));

    // If not above 98, reduce to zero to prevent coincidental matches.
    if ($score > 98) {
        return $score;
    }

    // Allow a few badly formed sequences when there is enough good evidence.
    if ($score > 95 && $goodbytes > 30) {
        return $score;
    }

    return 0;
}

Posted on 2005-06-06 10:32 古代阅读(625) 评论(0) 编辑收藏举报

刷新页面返回顶部