<?php
/*
utf8 2 gbk
bailing 2006-08-27
*/
// Hard stop: nothing below this line runs while the return is in place.
// NOTE(review): presumably a deliberate safety switch, because the loop below
// overwrites files in place -- confirm before removing it.
return;

require_once("batchFile.class.php");
require_once("echo.php");

// Configure the directory walker and collect the candidate files.
$fl = new batchFile();
$fl->extname = ";php;html;js;css;htm;"; // file types to process, format is [;php;html;]
$fl->deepLimit = 0;                     // NOTE(review): meaning of 0 is defined by batchFile -- confirm
$fl->path = 'D:\web\youtube';           // directory to process
$fl->filterPath = '_vti_cnf';           // directory to ignore
$fl->start();

// Convert every file that is confidently UTF-8 to GBK, in place.
foreach ($fl->file as $file) {
    // NOTE(review): assumes each entry is relative to $fl->path and already
    // starts with a directory separator -- confirm against batchFile.
    $target = $fl->path . $file;

    $data = file_get_contents($target);
    if ($data === false) {
        // Unreadable file: report it and move on instead of processing "false".
        echo "-- $file -- read failed<br>";
        continue;
    }

    $code = mb_detect_encoding($data);
    echo $code . "-- $file --";

    if ($code === "UTF-8") {
        // Second opinion: mb_detect_encoding alone is not trusted for the rewrite.
        $score = utf8_probability($data);
        echo $score;

        if ($score > 90) {
            $converted = iconv("UTF-8", "gbk", $data);
            if ($converted === false) {
                // iconv() returns false when the text cannot be converted. Writing
                // that result back would truncate the file, so leave it untouched.
                echo " -- iconv failed, file left unchanged";
            } elseif (file_put_contents($target, $converted) === false) {
                echo " -- write failed";
            }
        }
    }

    echo "<br>";
}
/**
 * Estimates how likely a byte string is UTF-8 encoded text.
 *
 * Walks the bytes once, counting ASCII bytes and the bytes that belong to
 * well-formed 2-, 3- and 4-byte UTF-8 sequences, then scores the well-formed
 * share of the non-ASCII bytes on a 0-100 scale. Only lead/continuation byte
 * ranges are checked; overlong or surrogate encodings are not rejected.
 *
 * @param string $rawtextstr Raw bytes to inspect. Taken by reference for
 *                           compatibility with existing callers; it is never
 *                           modified here.
 * @return int 0 when the input is empty, pure ASCII, or does not look like
 *             UTF-8; otherwise the score: 99-100, or 96-98 when more than
 *             30 well-formed multi-byte bytes were seen.
 */
function utf8_probability(&$rawtextstr)
{
    $rawtextlen = strlen($rawtextstr);
    $goodbytes = 0;
    $asciibytes = 0;

    // Index the string directly: no per-byte arrays are needed.
    for ($i = 0; $i < $rawtextlen; $i++) {
        $lead = ord($rawtextstr[$i]);

        if ($lead < 0x80) {
            // ASCII is valid in almost every encoding, so it is kept out of the ratio.
            $asciibytes++;
            continue;
        }

        // Number of continuation bytes announced by the lead byte.
        if (0xC0 <= $lead && $lead <= 0xDF) {
            $tail = 1;
        } elseif (0xE0 <= $lead && $lead <= 0xEF) {
            $tail = 2;
        } elseif (0xF0 <= $lead && $lead <= 0xF4) {
            // Supplementary-plane characters (e.g. emoji).
            $tail = 3;
        } else {
            // Stray continuation byte or invalid lead: counts as a bad byte.
            continue;
        }

        if ($i + $tail >= $rawtextlen) {
            // Sequence is cut off by the end of the input.
            continue;
        }

        $wellFormed = true;
        for ($k = 1; $k <= $tail; $k++) {
            $next = ord($rawtextstr[$i + $k]);
            if ($next < 0x80 || $next > 0xBF) {
                $wellFormed = false;
                break;
            }
        }

        if ($wellFormed) {
            $goodbytes += $tail + 1;
            $i += $tail; // skip the continuation bytes just consumed
        }
    }

    // Pure ASCII (or empty) input: nothing to judge, and avoids dividing by zero.
    if ($asciibytes == $rawtextlen) {
        return 0;
    }

    $score = (int)(100 * ($goodbytes / ($rawtextlen - $asciibytes)));

    // Anything not clearly above the thresholds is reduced to zero to avoid
    // coincidental matches.
    if ($score > 98) {
        return $score;
    }
    if ($score > 95 && $goodbytes > 30) {
        // Tolerates a few malformed sequences in an otherwise long UTF-8 text.
        return $score;
    }
    return 0;
}
?>
/*
utf8 2 gbk
bailing 2006-08-27
*/
// Hard stop: nothing below this line runs while the return is in place.
// NOTE(review): presumably a deliberate safety switch, because the loop below
// overwrites files in place -- confirm before removing it.
return;

require_once("batchFile.class.php");
require_once("echo.php");

// Configure the directory walker and collect the candidate files.
$fl = new batchFile();
$fl->extname = ";php;html;js;css;htm;"; // file types to process, format is [;php;html;]
$fl->deepLimit = 0;                     // NOTE(review): meaning of 0 is defined by batchFile -- confirm
$fl->path = 'D:\web\youtube';           // directory to process
$fl->filterPath = '_vti_cnf';           // directory to ignore
$fl->start();

// Convert every file that is confidently UTF-8 to GBK, in place.
foreach ($fl->file as $file) {
    // NOTE(review): assumes each entry is relative to $fl->path and already
    // starts with a directory separator -- confirm against batchFile.
    $target = $fl->path . $file;

    $data = file_get_contents($target);
    if ($data === false) {
        // Unreadable file: report it and move on instead of processing "false".
        echo "-- $file -- read failed<br>";
        continue;
    }

    $code = mb_detect_encoding($data);
    echo $code . "-- $file --";

    if ($code === "UTF-8") {
        // Second opinion: mb_detect_encoding alone is not trusted for the rewrite.
        $score = utf8_probability($data);
        echo $score;

        if ($score > 90) {
            $converted = iconv("UTF-8", "gbk", $data);
            if ($converted === false) {
                // iconv() returns false when the text cannot be converted. Writing
                // that result back would truncate the file, so leave it untouched.
                echo " -- iconv failed, file left unchanged";
            } elseif (file_put_contents($target, $converted) === false) {
                echo " -- write failed";
            }
        }
    }

    echo "<br>";
}
/**
 * Estimates how likely a byte string is UTF-8 encoded text.
 *
 * Walks the bytes once, counting ASCII bytes and the bytes that belong to
 * well-formed 2-, 3- and 4-byte UTF-8 sequences, then scores the well-formed
 * share of the non-ASCII bytes on a 0-100 scale. Only lead/continuation byte
 * ranges are checked; overlong or surrogate encodings are not rejected.
 *
 * @param string $rawtextstr Raw bytes to inspect. Taken by reference for
 *                           compatibility with existing callers; it is never
 *                           modified here.
 * @return int 0 when the input is empty, pure ASCII, or does not look like
 *             UTF-8; otherwise the score: 99-100, or 96-98 when more than
 *             30 well-formed multi-byte bytes were seen.
 */
function utf8_probability(&$rawtextstr)
{
    $rawtextlen = strlen($rawtextstr);
    $goodbytes = 0;
    $asciibytes = 0;

    // Index the string directly: no per-byte arrays are needed.
    for ($i = 0; $i < $rawtextlen; $i++) {
        $lead = ord($rawtextstr[$i]);

        if ($lead < 0x80) {
            // ASCII is valid in almost every encoding, so it is kept out of the ratio.
            $asciibytes++;
            continue;
        }

        // Number of continuation bytes announced by the lead byte.
        if (0xC0 <= $lead && $lead <= 0xDF) {
            $tail = 1;
        } elseif (0xE0 <= $lead && $lead <= 0xEF) {
            $tail = 2;
        } elseif (0xF0 <= $lead && $lead <= 0xF4) {
            // Supplementary-plane characters (e.g. emoji).
            $tail = 3;
        } else {
            // Stray continuation byte or invalid lead: counts as a bad byte.
            continue;
        }

        if ($i + $tail >= $rawtextlen) {
            // Sequence is cut off by the end of the input.
            continue;
        }

        $wellFormed = true;
        for ($k = 1; $k <= $tail; $k++) {
            $next = ord($rawtextstr[$i + $k]);
            if ($next < 0x80 || $next > 0xBF) {
                $wellFormed = false;
                break;
            }
        }

        if ($wellFormed) {
            $goodbytes += $tail + 1;
            $i += $tail; // skip the continuation bytes just consumed
        }
    }

    // Pure ASCII (or empty) input: nothing to judge, and avoids dividing by zero.
    if ($asciibytes == $rawtextlen) {
        return 0;
    }

    $score = (int)(100 * ($goodbytes / ($rawtextlen - $asciibytes)));

    // Anything not clearly above the thresholds is reduced to zero to avoid
    // coincidental matches.
    if ($score > 98) {
        return $score;
    }
    if ($score > 95 && $goodbytes > 30) {
        // Tolerates a few malformed sequences in an otherwise long UTF-8 text.
        return $score;
    }
    return 0;
}
?>