utf8转gbk

<?php
/*
utf8 2 gbk
bailing 2006-08-27
*/
// Stops the script here: nothing below runs while this line is present
// (the function declared further down is still defined).
// NOTE(review): presumably a deliberate safety guard, because the loop below
// overwrites files in place - confirm before removing it.
return;

require_once("batchFile.class.php");
require_once("echo.php");

// Collect the candidate files.
// NOTE(review): batchFile lives in batchFile.class.php, which is not visible
// here; the property meanings below come from the original inline comments.
$fl = new batchFile();
$fl->extname = ";php;html;js;css;htm;"; // file types to process, format is [;php;html;]

$fl->deepLimit = 0;
$fl->path = 'D:\web\youtube';           // directory to operate on
$fl->filterPath = '_vti_cnf';           // directory to ignore
$fl->start();

// Convert every file that is detected as UTF-8 with high confidence to GBK,
// rewriting it in place and printing one HTML line of progress per file.
foreach ($fl->file as $file) {
    $target = $fl->path . $file;

    $data = file_get_contents($target);
    if ($data === false) {
        // Unreadable file: report it and move on instead of processing `false`.
        echo "-- $file -- read failed<br>";
        continue;
    }

    $code = mb_detect_encoding($data);
    echo $code . "-- $file --";

    if ($code === "UTF-8") {
        // Second opinion: byte-level heuristic, returns 0 or a score in 96..100.
        $score = utf8_probability($data);
        echo $score;

        if ($score > 90) {
            $converted = iconv("UTF-8", "gbk", $data);
            if ($converted === false) {
                // iconv() returns false when the input cannot be converted.
                // Writing that result back would truncate the file to zero
                // bytes, so leave the original untouched.
                echo " iconv failed, file left unchanged";
            } elseif (file_put_contents($target, $converted) === false) {
                echo " write failed";
            }
        }
    }

    echo "<br>";
}

/**
 * Heuristically score how likely a byte string is to be UTF-8 encoded text.
 *
 * The string is scanned byte by byte:
 *  - bytes below 0x80 are counted as ASCII and excluded from the ratio;
 *  - a lead byte 0xC0-0xDF followed by one continuation byte (0x80-0xBF)
 *    counts as 2 good bytes;
 *  - a lead byte 0xE0-0xEF followed by two continuation bytes counts as 3
 *    good bytes;
 *  - every other byte >= 0x80 counts against the score. This includes lead
 *    bytes of 4-byte sequences (0xF0 and above), which are not recognised.
 *
 * The score is the truncated percentage of good bytes among all non-ASCII
 * bytes. Low scores are collapsed to 0 to avoid coincidental matches.
 *
 * Bytes are read directly from the string rather than from intermediate
 * per-byte arrays, so memory use stays flat for large files.
 *
 * @param string $rawtextstr Raw bytes to inspect. Taken by reference, but never modified.
 * @return int 0 if the input is empty, pure ASCII, or not convincingly UTF-8;
 *             otherwise 99-100, or 96-98 when more than 30 good bytes were seen.
 */
function utf8_probability(&$rawtextstr)
{
    $rawtextlen = strlen($rawtextstr);
    $goodbytes = 0;
    $asciibytes = 0;

    for ($i = 0; $i < $rawtextlen; $i++) {
        $lead = ord($rawtextstr[$i]);

        if ($lead < 0x80) {
            // Plain ASCII is valid in many encodings, so it proves nothing.
            $asciibytes++;
            continue;
        }

        // How many continuation bytes this lead byte announces.
        if ($lead >= 0xC0 && $lead <= 0xDF) {
            $trail = 1;
        } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
            $trail = 2;
        } else {
            // Stray continuation byte or unsupported lead byte: counts as bad.
            continue;
        }

        if ($i + $trail >= $rawtextlen) {
            // Sequence is cut off by the end of the input.
            continue;
        }

        $wellFormed = true;
        for ($k = 1; $k <= $trail; $k++) {
            $next = ord($rawtextstr[$i + $k]);
            if ($next < 0x80 || $next > 0xBF) {
                $wellFormed = false;
                break;
            }
        }

        if ($wellFormed) {
            $goodbytes += $trail + 1;
            $i += $trail; // skip the continuation bytes just consumed
        }
    }

    // ASCII is a subset of UTF-8: with no high bytes there is no evidence
    // either way (this also covers the empty string and avoids dividing by 0).
    $nonAscii = $rawtextlen - $asciibytes;
    if ($nonAscii == 0) {
        return 0;
    }

    $score = (int)(100 * ($goodbytes / $nonAscii));

    if ($score > 98) {
        return $score;
    }
    if ($score > 95 && $goodbytes > 30) {
        // Tolerate a few malformed sequences when there is enough evidence.
        return $score;
    }

    // Anything lower is treated as a coincidental match.
    return 0;
}

?>

Posted on 2006-08-27 15:38 古代 阅读(2514) 评论(0) 编辑 收藏 举报

刷新页面 返回顶部