ANSI 2 UTF-8：将 ANSI 格式存储的文本文件转为 UTF-8 格式存储的文件（PHP）

<?php

//header('Content-Type: text/html; charset=utf-8');

// Start time as float seconds. microtime() without `true` returns the string
// "msec sec", which cannot be subtracted to obtain an elapsed time.
$tm1 = microtime(true);

echo " ";

// Semicolon-separated list of extensions treated as text files.
// Read by listDirTree() through `global`.
$extname = ".php;.xml;html;.htm;.css;.txt;.js;";

// Counters below are updated by listDirTree() through `global`.

// Files detected as already being UTF-8.
$utf8File = 0;

// Files that were re-encoded and written back.
$convertedFile = 0;

// Files skipped because their extension is not in $extname.
$notTextFile = 0;

listDirTree("d:/usr/www/html/mambog");

// Elapsed wall-clock time in seconds.
echo (microtime(true) - $tm1) . "秒 ";

echo " utf8文件：" . $utf8File;

echo " 被转换的文件总数:" . $convertedFile;

echo " 非文本文件数目:" . $notTextFile;

/**
 * Recursively walk a directory and re-encode its text files to UTF-8 in place.
 *
 * For every regular entry whose extension is listed in the global $extname
 * (semicolon-separated, leading dot optional, compared case-insensitively):
 *   - if utf8_probability() scores the content between 90 and 100 the file is
 *     left alone and $utf8File is incremented;
 *   - otherwise the content is converted with iconv("GB2312" -> "utf-8"),
 *     written back to the same path, and $convertedFile is incremented.
 * Every other regular entry increments $notTextFile.
 *
 * Note: utf8_probability() returns 0 for pure-ASCII content, so such files go
 * through the conversion branch (iconv leaves them unchanged) and are counted
 * in $convertedFile.
 *
 * The function returns nothing; results are reported only through the global
 * counters. It calls exit() when $dirName is empty, is not a directory, or
 * cannot be opened.
 *
 * @param string|null $dirName Directory to process.
 * @return void
 */
function listDirTree( $dirName = null )
{
    global $extname, $utf8File, $convertedFile, $notTextFile;

    if ( empty( $dirName ) ) {
        exit( "IBFileSystem: directory is empty." );
    }

    if ( !is_dir( $dirName ) ) {
        exit( "IBFileSystem: $dirName is not a directory." );
    }

    $dh = opendir( $dirName );
    if ( !$dh ) {
        exit( "IBFileSystem: can not open directory $dirName." );
    }

    // Turn ".php;.xml;html;..." into a lookup set of lower-case extensions.
    // Comparing real extensions avoids building a regex from raw file-name
    // bytes (which broke on special characters and never matched ".js").
    $textExts = array();
    foreach ( explode( ";", (string) $extname ) as $ext ) {
        $ext = strtolower( ltrim( trim( $ext ), "." ) );
        if ( $ext !== "" ) {
            $textExts[$ext] = true;
        }
    }

    while ( ( $file = readdir( $dh ) ) !== false ) {
        if ( $file === "." || $file === ".." ) {
            continue;
        }

        $filePath = $dirName . "/" . $file;

        // Directory: recurse.
        if ( is_dir( $filePath ) ) {
            listDirTree( $filePath );
            continue;
        }

        // Regular file: only handle known text extensions.
        $fileExt = strtolower( pathinfo( $file, PATHINFO_EXTENSION ) );
        if ( !isset( $textExts[$fileExt] ) ) {
            $notTextFile++;
            continue;
        }

        $fstr = file_get_contents( $filePath );
        if ( $fstr === false ) {
            // Unreadable file; PHP has already raised a warning naming it.
            continue;
        }

        // Pass the variable itself: utf8_probability() declares its parameter
        // by reference, and call-time "&$var" is a fatal error since PHP 5.4.
        $utf8score = utf8_probability( $fstr );

        if ( 90 <= $utf8score && $utf8score <= 100 ) {
            // Already UTF-8.
            $utf8File++;
            continue;
        }

        // Convert BEFORE opening the file for writing: fopen(..., 'wb')
        // truncates, so a failed conversion must never reach that point.
        // If the content has no double-byte characters iconv changes nothing.
        $foutstr = iconv( "GB2312", "utf-8", $fstr );
        if ( $foutstr === false ) {
            trigger_error( "listDirTree: could not convert $filePath, file left unchanged", E_USER_WARNING );
            continue;
        }

        $fp = fopen( $filePath, 'wb' );
        if ( !$fp ) {
            // Not writable; PHP has already raised a warning naming the file.
            continue;
        }
        fwrite( $fp, $foutstr );
        fclose( $fp );

        $convertedFile++;
    }

    closedir( $dh );
}

/**
 * Build an HTML "<li>name" listing of the entries read from a directory
 * handle, descending into sub-directories.
 *
 * NOTE(review): the original block was truncated (no closing braces, no
 * return) and is not called anywhere in the visible file; returning the
 * accumulated string is the presumed intent - confirm before relying on it.
 *
 * @param resource $dir  Open directory handle, e.g. from opendir().
 * @param string   $path Path $dir was opened on, used to resolve entry names
 *                       for is_dir()/opendir(). Defaults to the current
 *                       directory, which is what the original bare
 *                       is_dir($file_name) call resolved against.
 * @return string Concatenated "<li>name" items (empty string if none).
 */
function showdir($dir, $path = ".")
{
    $file_list = "";

    // Strict comparison so an entry literally named "0" does not end the loop.
    while (($file_name = readdir($dir)) !== false) {
        if ($file_name === "." || $file_name === "..") {
            continue;
        }

        $file_list .= "<li>$file_name";

        $full_path = $path . "/" . $file_name;
        if (is_dir($full_path)) {
            // readdir() needs a handle, not a name: open the sub-directory.
            $sub = opendir($full_path);
            if ($sub) {
                $file_list .= showdir($sub, $full_path);
                closedir($sub);
            }
        }
    }

    return $file_list;
}

/**
 * Estimate how likely a byte string is to be UTF-8 encoded.
 *
 * Bytes below 0x80 are counted as ASCII and ignored for scoring. Each
 * well-formed multi-byte sequence (lead byte 0xC0-0xDF, 0xE0-0xEF or
 * 0xF0-0xF4 followed by one, two or three bytes in 0x80-0xBF) adds its
 * length to the "good" byte count. The score is the percentage of non-ASCII
 * bytes that belong to such sequences. A leading UTF-8 BOM (EF BB BF) is
 * simply counted as a valid three-byte sequence.
 *
 * @param string $rawtextstr Raw file content. Taken by reference to avoid a
 *                           copy; the function never modifies it.
 * @return int 0 for empty or pure-ASCII input (ASCII is a subset of UTF-8, so
 *             it carries no evidence either way); the score when it is above
 *             98, or above 95 with more than 30 good bytes; otherwise 0, to
 *             prevent coincidental matches.
 */
function utf8_probability(&$rawtextstr)
{
    $rawtextlen = strlen($rawtextstr);
    $goodbytes  = 0;
    $asciibytes = 0;

    // Read bytes straight from the string; no intermediate arrays are needed.
    for ($i = 0; $i < $rawtextlen; $i++) {
        $byte = ord($rawtextstr[$i]);

        if ($byte < 0x80) {
            // One byte: ignore ASCII, it would throw off the count.
            $asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        if (0xC0 <= $byte && $byte <= 0xDF) {
            $need = 1; // two-byte sequence
        } elseif (0xE0 <= $byte && $byte <= 0xEF) {
            $need = 2; // three-byte sequence
        } elseif (0xF0 <= $byte && $byte <= 0xF4) {
            $need = 3; // four-byte sequence
        } else {
            continue;  // stray continuation byte or invalid lead byte
        }

        // Count how many of the announced continuation bytes are present.
        $trail = 0;
        while ($trail < $need && $i + $trail + 1 < $rawtextlen) {
            $next = ord($rawtextstr[$i + $trail + 1]);
            if ($next < 0x80 || $next > 0xBF) {
                break;
            }
            $trail++;
        }

        if ($trail === $need) {
            $goodbytes += $need + 1;
            $i += $need; // skip the continuation bytes just consumed
        }
    }

    // Also covers the empty string, so the division below is never by zero.
    if ($asciibytes == $rawtextlen) {
        return 0;
    }

    $score = (int) ((100 * $goodbytes) / ($rawtextlen - $asciibytes));

    if ($score > 98) {
        return $score;
    }

    if ($score > 95 && $goodbytes > 30) {
        // Allows for some (few) badly formed sequences in a long text.
        return $score;
    }

    return 0;
}

Posted on 2005-06-06 17:21 古代阅读(1210) 评论(0) 收藏举报

刷新页面返回顶部