<?php
//header('Content-Type: text/html; charset=utf-8');
// Driver: convert every text file under the hard-coded root directory to
// UTF-8 and print a short HTML summary.
//
// microtime(true) returns the current time as a float number of seconds.
// Without the argument it returns a "usec sec" string, and subtracting two
// such strings only subtracts their leading fractions, which yields a
// meaningless (sometimes negative) duration.
$tm1=microtime(true);
echo "<BR>";
// Semicolon-separated list of file extensions treated as text files.
// Read by listDirTree() through `global`.
$extname=".php;.xml;html;.htm;.css;.txt;.js;";
// Counters updated by listDirTree() through `global`.
$utf8File=0;      // files detected as already being UTF-8
$convertedFile=0; // files rewritten as UTF-8
$notTextFile=0;   // files skipped because of their extension
listDirTree("d:/usr/www/html/mambog");
// Elapsed wall-clock time in seconds (the label means "seconds").
echo (microtime(true)-$tm1)."秒<BR>";
echo "<BR>utf8文件:".$utf8File;
echo "<BR>被转换的文件总数:".$convertedFile;
echo "<BR>非文本文件数目:".$notTextFile;
/**
 * Recursively walks a directory and converts GB2312 text files to UTF-8 in place.
 *
 * A file is treated as text when its extension (case-insensitive) appears in
 * the global $extname list, a semicolon-separated string whose entries may be
 * written with or without a leading dot. For each text file:
 *   - if utf8_probability() scores it between 90 and 100, it is counted in
 *     $utf8File and left alone;
 *   - otherwise it is converted from GB2312 to UTF-8 with iconv(). If the
 *     conversion changes nothing (e.g. a pure ASCII file) the file is not
 *     rewritten and is counted in $utf8File; if it changes the content the
 *     file is overwritten and counted in $convertedFile.
 * Files with any other extension are counted in $notTextFile.
 *
 * The conversion is done in memory BEFORE the file is opened for writing, so
 * a failed read or a failed iconv() can never truncate the original file;
 * such files are reported with an E_USER_WARNING and skipped.
 *
 * Globals read:    $extname
 * Globals updated: $utf8File, $convertedFile, $notTextFile
 *
 * @param string|null $dirName Directory to process.
 * @return void Nothing is returned. The script is terminated with exit() when
 *              $dirName is missing, is not a directory, or cannot be opened.
 */
function listDirTree( $dirName = null )
{
    global $extname, $utf8File, $convertedFile, $notTextFile;

    // Explicit comparison instead of empty(): a directory named "0" is valid.
    if ( $dirName === null || $dirName === "" || $dirName === false )
        exit( "IBFileSystem: directory is empty." );
    if ( !is_dir( $dirName ) )
        exit( "IBFileSystem: $dirName is not a directory.");

    $dh = opendir( $dirName );
    if ( !$dh )
        exit( "IBFileSystem: can not open directory $dirName.");

    // Turn ".php;.xml;html;..." into a lookup table keyed by the bare,
    // lower-cased extension ("php", "xml", "html", ...).
    $textExtensions = array();
    foreach ( explode( ";", strtolower( (string) $extname ) ) as $entry )
    {
        $entry = ltrim( trim( $entry ), "." );
        if ( $entry !== "" )
            $textExtensions[$entry] = true;
    }

    while ( ( $file = readdir( $dh ) ) !== false )
    {
        if ( $file === "." || $file === ".." )
            continue;

        $filePath = $dirName . "/" . $file;

        // Sub-directory: recurse.
        if ( is_dir( $filePath ) )
        {
            listDirTree( $filePath );
            continue;
        }

        // Decide by the real extension rather than by the last four bytes of
        // the name, so short extensions such as ".js" are recognised and
        // file names containing regex metacharacters cannot break anything.
        $extension = strtolower( (string) pathinfo( $file, PATHINFO_EXTENSION ) );
        if ( $extension === "" || !isset( $textExtensions[$extension] ) )
        {
            $notTextFile++;
            continue;
        }

        $fstr = file_get_contents( $filePath );
        if ( $fstr === false )
        {
            trigger_error( "IBFileSystem: can not read file $filePath.", E_USER_WARNING );
            continue;
        }

        // Pass the variable itself: the callee declares the parameter by
        // reference, and call-time "&$var" is a fatal error since PHP 5.4.
        $utf8score = utf8_probability( $fstr );
        if ( 90 <= $utf8score && $utf8score <= 100 )
        {
            $utf8File++;
            continue;
        }

        // Convert first, write afterwards: iconv() returns false on invalid
        // input and the original file must stay intact in that case.
        $foutstr = iconv( "GB2312", "utf-8", $fstr );
        if ( $foutstr === false )
        {
            trigger_error( "IBFileSystem: can not convert file $filePath.", E_USER_WARNING );
            continue;
        }

        // Nothing changed (no double-byte characters): the bytes are already
        // valid UTF-8, so do not rewrite the file.
        if ( $foutstr === $fstr )
        {
            $utf8File++;
            continue;
        }

        if ( file_put_contents( $filePath, $foutstr ) === false )
        {
            trigger_error( "IBFileSystem: can not write file $filePath.", E_USER_WARNING );
            continue;
        }
        $convertedFile++;
    }

    closedir( $dh );
}
/**
 * Builds an HTML "<li>" list of the entries of a directory, descending into
 * sub-directories.
 *
 * Each entry contributes "<li>" followed by its HTML-escaped name; a
 * sub-directory additionally contributes its own entries wrapped in
 * "<ul>...</ul>".
 *
 * @param resource|string $dir Either an open directory handle (as returned by
 *        opendir()) or a directory path. With a handle the location of the
 *        directory is unknown, so entries are tested with is_dir() relative
 *        to the current working directory; with a path the full entry path
 *        is used.
 * @return string The generated list markup ("" for an empty or unreadable
 *         directory).
 */
function showdir($dir){
    $isPath = is_string($dir);
    $handle = $isPath ? opendir($dir) : $dir;
    if (!$handle) {
        return "";
    }

    // Start from an empty string instead of appending to an undefined variable.
    $file_list = "";

    // Compare against false explicitly: an entry named "0" is falsy and would
    // otherwise end the loop early.
    while (($file_name = readdir($handle)) !== false) {
        if ($file_name === "." || $file_name === "..") {
            continue;
        }
        // ENT_SUBSTITUTE keeps names that are not valid UTF-8 from being
        // turned into an empty string.
        $file_list .= "<li>" . htmlspecialchars($file_name, ENT_QUOTES | ENT_SUBSTITUTE);

        $entryPath = $isPath ? $dir . "/" . $file_name : $file_name;
        if (is_dir($entryPath)) {
            // Recurse with a path (not a bare name passed off as a handle).
            $file_list .= "<ul>" . showdir($entryPath) . "</ul>";
        }
    }

    // Only close handles opened here; a caller-supplied handle stays open.
    if ($isPath) {
        closedir($handle);
    }
    return $file_list;
}
/**
 * Estimates how likely a byte string is to be UTF-8 encoded.
 *
 * Every non-ASCII byte is checked for being part of a well-formed multi-byte
 * sequence: a lead byte 0xC0-0xDF, 0xE0-0xEF or 0xF0-0xF4 followed by one,
 * two or three continuation bytes (0x80-0xBF) respectively. The score is the
 * percentage of non-ASCII bytes that belong to such sequences.
 *
 * A UTF-8 byte order mark (EF BB BF) is not treated specially; it simply
 * counts as one well-formed three-byte sequence.
 *
 * @param string $rawtextstr Raw file content. Taken by reference to avoid
 *        copying large strings; it is never modified.
 * @return int 0 when the text is empty or pure ASCII (no evidence either
 *         way) or when the score is too low to be trusted; otherwise the
 *         score, which is 99-100, or 96-98 when more than 30 bytes matched.
 */
function utf8_probability(&$rawtextstr) {
    $text = (string) $rawtextstr;
    $rawtextlen = strlen($text);
    $goodbytes = 0;
    $asciibytes = 0;

    // Index the string directly; building per-byte arrays first costs a
    // large multiple of the file size in memory for no benefit.
    for ($i = 0; $i < $rawtextlen; $i++) {
        $byte = ord($text[$i]);

        if ($byte < 0x80) {
            // ASCII is valid in both encodings, so it is left out of the ratio.
            $asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        if (0xC0 <= $byte && $byte <= 0xDF) {
            $trailing = 1;
        } else if (0xE0 <= $byte && $byte <= 0xEF) {
            $trailing = 2;
        } else if (0xF0 <= $byte && $byte <= 0xF4) {
            $trailing = 3; // four-byte sequences (characters beyond U+FFFF)
        } else {
            continue; // stray continuation byte or invalid lead byte
        }

        // The whole sequence must lie strictly before the last byte, matching
        // the bounds the two- and three-byte checks have always used.
        if ($i + $trailing >= $rawtextlen) {
            continue;
        }

        $wellFormed = true;
        for ($k = 1; $k <= $trailing; $k++) {
            $next = ord($text[$i + $k]);
            if ($next < 0x80 || $next > 0xBF) {
                $wellFormed = false;
                break;
            }
        }

        if ($wellFormed) {
            $goodbytes += $trailing + 1;
            $i += $trailing; // skip the continuation bytes just consumed
        }
    }

    // Pure ASCII (or empty) text: ASCII is a subset of UTF-8, nothing to score.
    if ($asciibytes == $rawtextlen) {
        return 0;
    }

    $score = (int)(100 * ($goodbytes / ($rawtextlen - $asciibytes)));

    // Anything not above 98 is reduced to zero to prevent coincidental
    // matches, except for longer texts with only a few malformed sequences.
    if ($score > 98) {
        return $score;
    } else if ($score > 95 && $goodbytes > 30) {
        return $score;
    }
    return 0;
}
?>
//header('Content-Type: text/html; charset=utf-8');
$tm1=microtime();
echo "<BR>";
$extname=".php;.xml;html;.htm;.css;.txt;.js;";
$utf8File=0;
$convertedFile=0;
$notTextFile=0;
listDirTree("d:/usr/www/html/mambog");
echo (microtime()-$tm1)."秒<BR>";
echo "<BR>utf8文件:".$utf8File;
echo "<BR>被转换的文件总数:".$convertedFile;
echo "<BR>非文本文件数目:".$notTextFile;
/** 函数 listDirTree( $dirName = null )
* 功能 列出目录下所有文件及子目录
* 参数 $dirName 目录名称
* 返回 目录结构数组 false为失败
*/
//print_r($tree);
function listDirTree( $dirName = null )
{//global $tree;
global $extname,$utf8File,$convertedFile,$notTextFile;
if( empty( $dirName ) )
exit( "IBFileSystem: directory is empty." );
if( is_dir( $dirName ) )
{
if( $dh = opendir( $dirName ) )
{
//$tree = array();
while( ( $file = readdir( $dh ) ) !== false )
{
if( $file != "." && $file != ".." )
{
$filePath = $dirName . "/" . $file;
if( is_dir( $filePath ) )//为目录,递归
{
//$tree[$file] = listDirTree( $filePath );
listDirTree( $filePath );
}
else//为文件,进行处理
{
$fileext=substr($file,-4,4);
preg_match("/".$fileext."/i",$extname, $matches);
if ($matches[0]) //是文本文件扩展名为:.php,.xml,.css,.js.由数组$matches定义
{
$outfilename=$filePath;
$fstr=file_get_contents($outfilename);
$utf8score=utf8_probability(&$fstr);
if( 90<=$utf8score && $utf8score<=100)
{ $utf8File++;
//echo "<font color=red> $filePath is encoded with utf-8 already </font><BR>";//die;
}
else
{
$fp=fopen($outfilename,'wb');
$foutstr=iconv("GB2312","utf-8",$fstr); //如果碰到 此文件中不存在双字节字符,则这个函数并不起作用
//$foutstr.="\n <!--脚注-->"; //这一句确保不存在双字节字符的文件也能得到转换,js文件经测试也可以。
//<!-- -->形式的注释可以使用于php,css,html,css等文件中
//本脚本(fileconv.php已经是utf-8编码保存的,所以加到转换后的字串里)
//然而,这样不行,搞的配套的代码都不能运行了
fwrite($fp,$foutstr);
$convertedFile++;
//echo $filePath . "<<<<<<<<< $matches[0]<BR>";
fclose($fp);
}
}//是文本文件处理结束
else{
$notTextFile++;
//echo $file."=========<BR>";
}
//$tree[] = $file;
} //文件处理结束
}
}
closedir( $dh );
}
else
{
exit( "IBFileSystem: can not open directory $dirName.");
}
//返回当前的$tree
//return $tree;
}
else
{
exit( "IBFileSystem: $dirName is not a directory.");
}
}
function showdir($dir){
while ($file_name = readdir($dir)) {
if (($file_name != ".") && ($file_name != "..")) {
$file_list .= "<li>$file_name";
if(is_dir($file_name))
showdir($file_name);
}
}
}
function utf8_probability(&$rawtextstr) {
$score = 0;
$i = 0;
$rawtextlen = 0;
$goodbytes = 0;
$asciibytes = 0;
$rawtextarray = preg_split("//",$rawtextstr,-1, PREG_SPLIT_NO_EMPTY); //转换成char数组,如果是php5,则可使用str_split
$rawtext = array();
//var_dump($rawtextarray);die;
for($i=0;$i<count($rawtextarray);$i++)
$rawtext[] = ord($rawtextarray[$i]); //ord(char)
// Maybe also use UTF8 Byte Order Mark(BOM): EF BB BF
//BOM,某些utf8文件流的首3个字节,可以表示这个文件的编码方式
// Check to see if characters fit into acceptable ranges
//print_r($rawtext);
$rawtextlen = strlen($rawtextstr);
for ($i = 0; $i < $rawtextlen; $i++) {
if ($rawtext[$i] < 0x80) { // One byte
$asciibytes++; // Ignore ASCII, can throw off count
} else if (0xC0 <= $rawtext[$i] && $rawtext[$i] <= 0xDF && // Two bytes
$i+1 < $rawtextlen && 0x80 <= $rawtext[$i+1] && $rawtext[$i+1] <= 0xBF) {
$goodbytes += 2; $i++;
} else if (0xE0 <= $rawtext[$i] && $rawtext[$i] <= 0xEF && // Three bytes
$i+2 < $rawtextlen && 0x80 <= $rawtext[$i+1] && $rawtext[$i+1] <= 0xBF &&
0x80 <= $rawtext[$i+2] && $rawtext[$i+2] <= 0xBF) {
$goodbytes += 3; $i+=2;
}
}
//ascii is sub of utf8
if ($asciibytes == $rawtextlen) { return 0; }
$score = (int)(100 * ($goodbytes/($rawtextlen-$asciibytes)));
// If not above 98, reduce to zero to prevent coincidental matches
if ($score > 98) {
return $score;
} else if ($score > 95 && $goodbytes > 30) {
// Allows for some (few) bad formed sequences
return $score;
} else {
return 0;
}
}
?>