文本编码的智能识别（它的两个指导哲学真是绝妙）

一点闲话：

2009年的时候，写了几篇博客文章，此外也将以前写的一些文章抄到博客上，完了之后发现没啥好写的了？! 晕啊。为什么别人就能坚持不懈地写那么多文章，而我却想不出来写什么呢？平日里，我可是挺喜欢写些代码，来实现一些自己想要的功能，但我发觉，似乎写代码还算好，写文章还真不容易。

最近总算又想出点东西来了，得，写下来吧，要不然感觉对不起这好不容易下定决心才开的博客啊，哈哈。

近日整理自己写的文本编辑器时，觉得不能自动识别文本编码用起来很不方便，于是研究了一下文本文件的编码方式，发现如果文件中有BOM(Byte Order Mark)头，那识别起来简单；如果没有BOM头，就没那么好处理了。

用Google搜索了老半天，自己也苦思冥想，发现大致可以划分为以下几种情况(这里只考虑中英文)：

1、有BOM头的文件

这个处理起来比较简单，只要识别出各自的编码方式，然后出加以转换就可以了。

2、没有BOM头的UCS2(UTF-16)文件

这种类型的文件如果其中含有Ascii码字符，那也可以用比较简单的方式加以判断：看看文件中是否有为0的字符，如果有，基本上可以认为是UCS2了。当然，文件中有错，或没有Ascii码字符，那就没法判断了，因为UCS2(UTF-16)用的编码是0x0000-0xFFFF，所以难以从字符特征上来判断（可以通过字频、词频统计以及词语特征分析等方法识别，不过本文主要讨论GB/Big5的识别，所以这里不作探讨）。

3、没有BOM头的UTF-8编码的文件

由于UTF-8编码有一定的特征，如下面的说明：

UTF-8

Ascii Chars:

00-7F // 1 Bytes = 0xxxxxxx

Multi Bytes:

C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx

E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx

F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

所以可以从编码特征上来考虑。

4、没有BOM头的ANSI文件

此类文件里可能有Ascii字符，也可能有GB2312/GBK/GB18030/Big5的中文字符。单纯从文件上来看，它与没有BOM头的UTF-8编码文件类似，所以必须考虑如何与UTF-8文件区分开来。

此外，还有不常用的UCS4(UTF-32)，这里就不考虑了。GB18030是与GBK兼容的，这里也只考虑它的2字节部分，4字节部分较少用到，不作考虑。

通过以上分析，可以看出，现在的主要问题是如何区分没有BOM头的UTF-8文件和ANSI文件。进一步，还有如何判断ANSI文件是简体还是繁体的问题。

以下代码是我想出来的一个猜测文本文件编码的方法，这可是原创哦(代码是Delphi的)：

type

TCharEncoding = (ceAnsi, ceBinary, ceUtf_8, ceUcs2_LE, ceUcs2_BE, ceUtf_32, ceGB, ceBig5);

function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding;
var
len: Longint;

function Maybe3BytesUtf8(Index: Integer): Boolean;
begin
Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and
(buf[Index +1] in [#$80..#$BF]) and (buf[Index +2] in [#$80..#$BF]);
end;

var
idx: Longint;
iUtf8: Longint;
maybeGB: Integer; //GB2312/GBK/GB18030
mayBig5: Integer; //Big5
mayUtf8: Integer; //Utf-8
maybeLE: Integer; //Unicode 16 (UCS2) , Little Endian
maybeBE: Integer; //Unicode 16 (UCS2) , Big Endian
mayBins: Integer; //Binary File Chars ?
serZero: Integer;
ratio: Integer;
chsCount: Integer;
utf8Count: Integer;
gbkNulls: Integer;
big5Nulls: Integer;
begin
Result := ceAnsi;
maybeGB := 0;
mayBig5 := 0;
mayUtf8 := 0;
maybeLE := 0;
maybeBE := 0;
mayBins := 0;
serZero := 0;
chsCount := 0;
utf8Count := 0;
gbkNulls := 0;
big5Nulls := 0;
len := Length(buf);
idx := 1;
while idx <= len do begin
if idx < len then begin
if (buf[idx] in [#$A1..#$A7]) and (buf[idx +1] in [#$40..#$A0]) or
(buf[idx] in [#$AA..#$AF, #$F8..#$FE]) and (buf[idx +1] in [#$A1..#$FE]) then begin
Inc(gbkNulls);
end;
if (buf[idx +1] in [#$7F..#$A0]) or
(buf[idx] in [#$C7, #$C8]) and (buf[idx +1] in [#$40..#$FE]) or
(buf[idx] = #$C6) and (buf[idx +1] in [#$A1..#$FE]) then begin
Inc(big5Nulls);
end;
end;
case buf[idx] of
#0: begin
Inc(mayBins);
if (idx < len) and (buf[idx +1] = #0) then begin
Inc(serZero);
end;
if (idx mod 2) = 0 then begin
Inc(maybeLE);
end else begin
Inc(maybeBE);
end;
end;
#1..#8, #11, #12, #14..#31: begin
Inc(mayBins);
end;
#$80: begin
iUtf8 := idx;
Inc(iUtf8);
if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
if Maybe3BytesUtf8(iUtf8) then begin
Inc(mayUtf8, 32);
Inc(utf8Count);
end;
end;
#$81..#$BF: begin
if buf[idx] in [#$81..#$A0] then begin
Inc(maybeGB, 8);
end else begin
Inc(maybeGB, 8);
Inc(mayBig5, 8);
end;
Inc(chsCount);
Inc(idx);
iUtf8 := idx;
if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
if Maybe3BytesUtf8(iUtf8) then begin
Inc(mayUtf8, 32);
Inc(utf8Count);
end;
end;
#$C0..#$DF: begin
if (idx < len) and (buf[idx +1] in [#$80..#$BF]) then begin
Inc(mayUtf8);
Inc(utf8Count);
Inc(chsCount);
if (buf[idx +1] in [#$A1..#$BF]) then begin
Inc(maybeGB);
Inc(mayBig5);
end else begin
Inc(maybeGB, 4);
end;
end else begin
Inc(maybeGB);
Inc(mayBig5);
Inc(chsCount);
end;
Inc(idx);
end;
#$E0..#$EF: begin
if (idx + 2 <= len) and (buf[idx +1] in [#$80..#$BF]) and (buf[idx +2] in [#$80..#$BF]) then begin
Inc(mayUtf8, 32);
Inc(utf8Count);
end;
Inc(maybeGB);
Inc(mayBig5);
Inc(chsCount);
Inc(idx);
end;
#$F0..#$FE: begin
if buf[idx] in [#$FA..#$FE] then begin
Inc(maybeGB, 8);
end;
Inc(maybeGB, 8);
Inc(mayBig5, 8);
Inc(chsCount);
Inc(idx);
end;
end;
Inc(idx);
end;
// set encoding
if (mayBins > 1) or (maybeLE > 1) or (maybeBE > 1) or (mayBins * 8 >= len) or (maybeLE * 8 >= len) or (maybeBE * 8 >= len) then begin
if (mayBins > maybeLE *2) and (mayBins > maybeBE *2) or (serZero > 10) or
(serZero * 8 > Max(maybeLE, maybeBE)) then begin
Result := ceBinary;
end else if maybeLE >= maybeBE then begin
Result := ceUcs2_LE;
end else begin
Result := ceUcs2_BE;
end;
end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) or (chsCount >= utf8Count *2) then begin
ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5));
if Abs(ratio) <= 5 then begin
if gbkNulls > big5Nulls then begin
Result := ceBig5;
end else if gbkNulls < big5Nulls then begin
Result := ceGB;
end else if SeeGBBig5 and ((maybeGB > 0) or (mayBig5 > 0)) then begin
Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len));
end;
end else begin
if ratio > 0 then begin
Result := ceGB;
end else begin
Result := ceBig5;
end;
end;
end else if mayUtf8 > 0 then begin
Result := ceUtf_8;
end;
end;

function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding;
var
buf: string;
begin
SetLength(buf, Min(SamplingSize, AStream.Size - AStream.Position));
AStream.Read(buf[1], Length(buf));
Result := GuessCharEncoding(buf, SeeGBBig5);
end;

请仔细看这段代码，并将它与下面关于编码的说明对比：

{---------------------------- Character Encoding -----------------------------

1. UTF-8

Ascii Chars: 00-7F // 1 Bytes = 0xxxxxxx

Multi Bytes: C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx

E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx

F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

2. Unicode 16 (UCS2) [as UTF-16 for 96.9%]

2 Bytes Characters: 0000-FFFF

3. UTF-16

0000-D7FF = 0000-D7FF // 2 Bytes

E000-FFFF = E000-FFFF // 2 Bytes

10000-10FFFF = D800-D8FF + DC00-DCFF // 4 Bytes

D800-DFFF // Surrogate

4. UTF-32 = 32 bit unsigned integer of character

5. GB2312-80

Ascii Chars: 00-7F

Simplified Chinese Chars: A1-F7 + A1-FE

6. Big5

Ascii Chars: 00-7F

Traditional Chinses Chars: A1-F9 + 40-7E

A1-F9 + A1-FE

7. GBK

Ascii Chars: 00-7F

Chinses Chars: 81-FE + 40-7E

81-FE + 80-FE

8. GB18030-2000

Ascii Chars: 00-7F

Chinese Chars (2 Bytes): 81-FE + 40-7E

81-FE + 80-FE

Chinese Chars (4 Bytes): 81-FE + 30-39 + 81-FE + 30-39

-----------------------------------------------------------------------------}

看出来了吗？这个函数利用UTF-8的编码特征和GB/Big5的编码范围，再加上权值，来判断是UTF-8还是ANSI文档(UCS2(UTF-16)的判断只是附带的，可能也不是很准确)。

以我的测试来看，用这种方法可以比较准确地区分UTF-8和ANSI编码的文件。

等等，还有一个TryToDistinguishGBOrBig5函数的调用，相比较而言，这个函数的实现才是一个高潮部分，哈。

如果你在网上搜索，如何判断文档是GB还是Big5，估计你得到的最靠谱的答案是：统计常用字出现频率，或利用常用词库。统计常用字频率的代表有Mozilla的UniversalCharDet(可在google里搜索“Mozilla 编码识别”)；至于利用常用词库，我也曾想过用这种方法，结果一查，好家伙，常用词库少说也得好几万个，如果加上现在层出不穷的网络词语，估计光是词库就够忙一阵的了。

不知道在哪个Action Script的论坛上，我看到一个建议，说是先将文本尝试着转换一下，但也就一句话，具体怎么做还是没答案。我尝试了一些方法之后，忽然想到：在简体中文系统下，是否可以将文本先转换成繁体，然后再转回成简体，再分析转换前后有什么不同，如果不同的地方超过一定比例，则认为是繁体，否则就是简体？立即动手写了一些代码，就是上述的TryToDistinguishGBOrBig5，试了一下，哈，还真的行啊。

function TryToDistinguishGBOrBig5(const S: string): TCharEncoding;

function MyCompareChineseStr(const s1, s2: string): Boolean;

var

difCount: Integer;

i, k: Integer;

cmpLen: Integer;

begin

difCount := 0;

i := 1;

k := 1;

while (i <= Length(s1)) and (k <= Length(s2)) do begin

if s1[i] <> s2[k] then begin

if (i +2 <= Length(s1)) and (k +2 <= Length(s2)) then begin

//比较原理：两次转换后，某些字可能会转戌一个'?'，以下处理这种情况

if (s1[i +1] = s2[k]) and (s1[i +2] = s2[k +1]) then begin

Inc(i);

end else if (s1[i] = s2[k +1]) and (s1[i +1] = s2[k +2]) then begin

Inc(k);

end else begin

Inc(difCount);

end;

end else begin

Inc(difCount);

end;

Inc(i);

Inc(k);

end;

if ExactCompare then begin

cmpLen := CountChineseChars(S);

end else begin

cmpLen := Length(S);

end;

Result := difCount * 100 div Max(1, cmpLen) <= 6; // different <= 6%

end;

begin

Result := ceAnsi;

if SysLocale.PriLangID = LANG_CHINESE then begin

case SysLocale.SubLangID of

SUBLANG_CHINESE_SIMPLIFIED,

SUBLANG_CHINESE_SINGAPORE: begin

if MyCompareChineseStr(S, Big52GBProc(GB2Big5Proc(S))) then begin

Result := ceGB;

end else begin

Result := ceBig5;

end;

SUBLANG_CHINESE_TRADITIONAL,

SUBLANG_CHINESE_HONGKONG: begin

if MyCompareChineseStr(S, GB2Big5Proc(Big52GBProc(S))) then begin

Result := ceBig5;

end else begin

Result := ceGB;

end;

Big52GBProc/

GB2Big5Proc是两个函数指针，分别用于繁体转简体和简体转繁体。

ExactCompare是一个单元内的全局变量，用于适应不同简繁体转换函数。

CountChineseChars用于统计中文字符个数，具体实现如下：

{ Count Chinese Characters }

function CountChineseChars(const S: string): Integer;

var

i: Integer;

begin

Result := 0;

i := 1;

while i <= Length(S) do begin

if S[i] > #$80 then begin

Inc(Result, 2);

Inc(i, 2);

end else begin

Inc(i);

end;

可能是史上最简单又足够强大的区分GB/Big5编码方法诞生了

！

也许这个方法早就有人发现了，只是没公布出来。用这个方法，我写了个简单的文本测试工具，将我手头能找到的GB/Big5文件找出来试了，还没发现不能识别的呢。

当然，这个方法只是取文件开头的一部分(默认是4K)来作样本，所以并不能识别所有文件，不过，对通常的应用足够了。

   这种方法的缺点：
   1.对短文本识别效果不太好；
   2.如果文本开头的4K区域没有中文字符，那就识别不出来，只能当作ANSI；
   3.对某些字符识别不好，如制表字符；

4.只能在中文系统中使用；

完整的代码（只支持D5~D2007，在D5/D7/D2007下测试通过）我将会在后续贴出来与大家共享。

最后说一句，如果你有什么更好的方法或改进，别忘了告诉我啊。共享万岁！

<iamdream*yeah.net> (*->@)

附：关联文章

简繁体编码识别原理

文本编码的智能识别(续) - 完整代码部分

文本编码的智能识别(续) - Unicode版本代码

文本编码的智能识别(续) - C#版本代码

http://dreamisx.blog.163.com/blog/static/11500483920122134947776/

--------------------------------------------------------------------------------------------------------------

//核心代码 (将其保存为SmartLoadFile.pas)  只支持D5~D2007  
unit SmartLoadFile;

{*******************************************************************************
*                              智能识别文本编码                                *
*                                                                              *
*   功能：  本单元用于智能识别文本文件编码，包括简繁体文本的识别，支持D5-D2007 *
*   实现：  2012.3.10 ~ 2012.3.15 完成Ansi(GB/Big5)/Unicode的识别功能          *
*           2012.3.18 ~ 2012.3.19 完成UTF-16与二进制文件的区分功能             *
*           2012.6.2  ~ 2012.6.3  改进简繁体的识别方法，提高了识别率           * 
*   Blog:   dreamisx.blog.163.com                                              *
*   EMail:  iamdream%yeah.net  (% -> @)                                        *
*******************************************************************************}

interface

{$IFDEF UNICODE}Sorry, not Support Unicode Version!{$ENDIF}

{$IFDEF VER150}             //消除D7警告
  {$WARN UNIT_PLATFORM OFF}
  {$WARN UNSAFE_CODE OFF}
  {$WARN UNSAFE_TYPE OFF}
  {$WARN UNSAFE_CAST OFF}
{$ENDIF}

uses
  Windows, Messages, SysUtils, Classes, Math, GB2BigEx
  {$IFDEF VER130}, Unicode{$ENDIF};

type
  TCharEncoding = (ceAnsi, ceBinary, ceUtf_8, ceUcs2_LE, ceUcs2_BE, ceUtf_32, ceGB, ceBig5);
  TCharConvertProc = function (const S: string): string;


function SysIsSimplifiedChinese: Boolean;
function SysIsTraditionalChinese: Boolean;
function TryToDistinguishGBOrBig5(const S: string): TCharEncoding;
function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding; overload;
function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding; overload;

procedure InitCharConvert(AGB2Big5, ABig52GB: TCharConvertProc;
  AExactCompare: Boolean = False; ASamplingSize: Integer = 4096);
function SmartLoadTextFileToStream(const AFileName: string; AStream: TStream;
  TryToDecode: Boolean = True; SeeGBBig5: Boolean = True): TCharEncoding;

implementation

{---------------------------- Character Encoding -----------------------------

1. UTF-8
  Ascii Chars:  00-7F                          // 1 Bytes = 0xxxxxxx
  Multi Bytes:  C0-DF + 80-BF                  // 2 Bytes = 110xxxxx 10xxxxxx
                E0-EF + 80-BF + 80-BF          // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
                F0-F7 + 80-BF + 80-BF + 80-BF  // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

2. Unicode 16 (UCS2) [as UTF-16 for 96.9%]
  2 Bytes Characters:        0000-FFFF

3. UTF-16
            0000-D7FF     =  0000-D7FF              // 2 Bytes
            E000-FFFF     =  E000-FFFF              // 2 Bytes
            10000-10FFFF  =  D800-D8FF + DC00-DCFF  // 4 Bytes
            D800-DFFF                               // Surrogate

4. UTF-32  = 32 bit unsigned integer of character

5. GB2312-80
  Ascii Chars:               00-7F
  Simplified Chinese Chars:  A1-F7 + A1-FE
  Note: A9 + A4-EF ==> Tabs in Chinese Chars

6. Big5
  Ascii Chars:               00-7F
  Traditional Chinses Chars: A1-F9 + 40-7E
                             A1-F9 + A1-FE

7. GBK
  Ascii Chars:               00-7F
  Chinses Chars:             81-FE + 40-7E
                             81-FE + 80-FE

8. GB18030-2000
  Ascii Chars:               00-7F
  Chinese Chars (2 Bytes):   81-FE + 40-7E
                             81-FE + 80-FE
  Chinese Chars (4 Bytes):   81-FE + 30-39 + 81-FE + 30-39



null GBK:
  A1.40-A0
  ...
  A7.40-A0
           AA.A1-FE
           ...
           AF.A1-FE
           F8.A1-FE
           ...
           FE.A1-FE

null Big5:
  C6.A1-FE
  C7.40-FE
  C8.40-FE
  XX.7F-A0

-----------------------------------------------------------------------------}

var
  ExactCompare: Boolean = False;  // for exact convert GB / Big5
  SamplingSize: Integer = 4096;   // for guessing GB / Big5
  GB2Big5Proc: TCharConvertProc;
  Big52GBProc: TCharConvertProc;


{ Internal functions of transfering text to ANSI }

procedure DoUtf8ToAnsi(Source, Target: TStream);
var
  buf: string;
  ret: string;
begin
  SetLength(buf, Source.Size - Source.Position);
  Source.Read(PChar(buf)^, Length(buf));
  ret := Utf8ToAnsi(buf);
  Target.Write(PChar(ret)^, Length(ret));
end;

procedure DoLittleEndianUnicode16ToAnsi(Source, Target: TStream);
const                         //UCS2 ?  (Cannot process 4 Bytes' Chars!!)
  cBufSize = 4096;
var
  buf: array[1..cBufSize] of Char;
  pIn: PWideChar;
  readSize: Longint;
  sOut: string;
begin
  repeat
    readSize := Source.Read(buf, cBufSize);
    if readSize > 0 then begin
      pIn := PWideChar(@buf[1]);
      sOut := WideCharLenToString(pIn, readSize shr 1);
      Target.Write(Pointer(sOut)^, Length(sOut));
    end;
  until readSize < cBufSize;
end;

procedure DoBigEndianUnicode16ToAnsi(Source, Target: TStream);
const                         //UCS2 ?  (Cannot process 4 Bytes' Chars!!)
  cBufSize = 4096;
var
  buf: array[1..cBufSize] of Char;
  pIn: PWideChar;
  readSize: Longint;
  sOut: string;
  c: Char;
  i: Longint;
begin
  repeat
    readSize := Source.Read(buf, cBufSize);
    if readSize > 0 then begin
      pIn := PWideChar(@buf[1]);
      // adjust byte order
      i := 1;
      while i <= readSize do begin
        c := buf[i];
        buf[i] := buf[i +1];
        buf[i +1] := c;
        Inc(i, 2);
      end;
      sOut := WideCharLenToString(pIn, readSize shr 1);
      Target.Write(Pointer(sOut)^, Length(sOut));
    end;
  until readSize < cBufSize;
end;

procedure DoGBToAnsi(Source, Target: TStream);
var
  S: string;
begin
  //If System is Traditional Chinese, Convert it.
  if SysIsTraditionalChinese() then begin
    SetLength(S, Source.Size);
    Source.Read(PChar(S)^, Length(S));
    S := Gb2Big5Proc(S);
    Target.Write(PChar(S)^, Length(S));
  end else begin
    Target.CopyFrom(Source, Source.Size);
  end;
end;

procedure DoBig5ToAnsi(Source, Target: TStream);
var
  S: string;
begin
  //If System is Simplified Chinese, Convert it.
  if SysIsSimplifiedChinese() then begin
    SetLength(S, Source.Size);
    Source.Read(PChar(S)^, Length(S));
    S := Big52GbProc(S);
    Target.Write(PChar(S)^, Length(S));
  end else begin
    Target.CopyFrom(Source, Source.Size);
  end;
end;

{ Count Chinese Characters }

function CountChineseChars(const S: string): Integer;
var
  i: Integer;
begin
  Result := 0;
  i := 1;
  while i <= Length(S) do begin
    if S[i] > #$80 then begin
      Inc(Result, 2);
      Inc(i, 2);
    end else begin
      Inc(i);
    end;
  end;
end;

//....................... functions of GB / Big5 .............................

function SysIsSimplifiedChinese: Boolean;
begin
  Result := (SysLocale.PriLangID = LANG_CHINESE) and
    (SysLocale.SubLangID in [SUBLANG_CHINESE_SIMPLIFIED, SUBLANG_CHINESE_SINGAPORE]);
end;

function SysIsTraditionalChinese: Boolean;
begin
  Result := (SysLocale.PriLangID = LANG_CHINESE) and
    (SysLocale.SubLangID in [SUBLANG_CHINESE_TRADITIONAL, SUBLANG_CHINESE_HONGKONG]);
end;

function TryToDistinguishGBOrBig5(const S: string): TCharEncoding;

  function MyCompareChineseStr(const s1, s2: string): Boolean;
  var
    difCount: Integer;
    i, k: Integer;
    cmpLen: Integer;
  begin
    difCount := 0;
    i := 1;
    k := 1;
    while (i <= Length(s1)) and (k <= Length(s2)) do begin
      if s1[i] <> s2[k] then begin
        if (i +2 <= Length(s1)) and (k +2 <= Length(s2)) then begin
          //比较原理：两次转换后，某些字可能会转戌一个'?'，以下处理这种情况
          if (s1[i +1] = s2[k]) and (s1[i +2] = s2[k +1]) then begin
            Inc(i);
          end else if (s1[i] = s2[k +1]) and (s1[i +1] = s2[k +2]) then begin
            Inc(k);
          end else begin
            Inc(difCount);
          end;
        end else begin
          Inc(difCount);
        end;
      end;
      Inc(i);
      Inc(k);
    end;
    if ExactCompare then begin
      cmpLen := CountChineseChars(S);
    end else begin
      cmpLen := Length(S);
    end;
    Result := difCount * 100 div Max(1, cmpLen) <= 6; // different <= 6%
  end;

begin
  Result := ceAnsi;
  if SysLocale.PriLangID = LANG_CHINESE then begin
    case SysLocale.SubLangID of
      SUBLANG_CHINESE_SIMPLIFIED,
      SUBLANG_CHINESE_SINGAPORE: begin
        if MyCompareChineseStr(S, Big52GBProc(GB2Big5Proc(S))) then begin
          Result := ceGB;
        end else begin
          Result := ceBig5;
        end;
      end;
      SUBLANG_CHINESE_TRADITIONAL,
      SUBLANG_CHINESE_HONGKONG: begin
        if MyCompareChineseStr(S, GB2Big5Proc(Big52GBProc(S))) then begin
          Result := ceBig5;
        end else begin
          Result := ceGB;
        end;
      end;
    end;
  end;
end;

//............................................................................
// It can distinguish encoding is UTF8 or Ansi,
// but may not distinguish encoding is GB or Big5,
// so it need to process GB / Big5 specially

function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding;
var
  len: Longint;

  function Maybe3BytesUtf8(Index: Integer): Boolean;
  begin
    Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and
      (buf[Index +1] in [#$80..#$BF]) and (buf[Index +2] in [#$80..#$BF]);
  end;

var
  idx: Longint;
  iUtf8: Longint;
  maybeGB: Integer;   //GB2312/GBK/GB18030
  mayBig5: Integer;   //Big5
  mayUtf8: Integer;   //Utf-8
  maybeLE: Integer;   //Unicode 16 (UCS2) , Little Endian
  maybeBE: Integer;   //Unicode 16 (UCS2) , Big Endian
  mayBins: Integer;   //Binary File Chars ?
  serZero: Integer;
  ratio:   Integer;
  chsCount: Integer;
  utf8Count: Integer;
  gbkNulls: Integer;
  big5Nulls: Integer;
begin
  Result  := ceAnsi;
  maybeGB := 0;
  mayBig5 := 0;
  mayUtf8 := 0;
  maybeLE := 0;
  maybeBE := 0;
  mayBins := 0;
  serZero := 0;
  chsCount := 0;
  utf8Count := 0;
  gbkNulls := 0;
  big5Nulls := 0;
  len := Length(buf);
  idx := 1;
  while idx <= len do begin
    if idx < len then begin
      if (buf[idx] in [#$A1..#$A7]) and (buf[idx +1] in [#$40..#$A0]) or
         (buf[idx] in [#$AA..#$AF, #$F8..#$FE]) and (buf[idx +1] in [#$A1..#$FE]) then begin
        Inc(gbkNulls);
      end;
      if (buf[idx +1] in [#$7F..#$A0]) or
         (buf[idx] in [#$C7, #$C8]) and (buf[idx +1] in [#$40..#$FE]) or
         (buf[idx] = #$C6) and (buf[idx +1] in [#$A1..#$FE]) then begin
        Inc(big5Nulls);
      end;
    end;
    case buf[idx] of
      #0: begin
        Inc(mayBins);
        if (idx < len) and (buf[idx +1] = #0) then begin
          Inc(serZero);
        end;
        if (idx mod 2) = 0 then begin
          Inc(maybeLE);
        end else begin
          Inc(maybeBE);
        end;
      end;
      #1..#8, #11, #12, #14..#31: begin
        Inc(mayBins);
      end;
      #$80: begin
        iUtf8 := idx;
        Inc(iUtf8);
        if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
        if Maybe3BytesUtf8(iUtf8) then begin
          Inc(mayUtf8, 32);
          Inc(utf8Count);
        end;
      end;
      #$81..#$BF: begin
        if buf[idx] in [#$81..#$A0] then begin
          Inc(maybeGB, 8);
        end else begin
          Inc(maybeGB, 8);
          Inc(mayBig5, 8);
        end;
        Inc(chsCount);
        Inc(idx);
        iUtf8 := idx;
        if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
        if Maybe3BytesUtf8(iUtf8) then begin
          Inc(mayUtf8, 32);
          Inc(utf8Count);
        end;
      end;
      #$C0..#$DF: begin
        if (idx < len) and (buf[idx +1] in [#$80..#$BF]) then begin
          Inc(mayUtf8);
          Inc(utf8Count);
          Inc(chsCount);
          if (buf[idx +1] in [#$A1..#$BF]) then begin
            Inc(maybeGB);
            Inc(mayBig5);
          end else begin
            Inc(maybeGB, 4);
          end;
        end else begin
          Inc(maybeGB);
          Inc(mayBig5);
          Inc(chsCount);
        end;
        Inc(idx);
      end;
      #$E0..#$EF: begin
        if (idx + 2 <= len) and (buf[idx +1] in [#$80..#$BF]) and (buf[idx +2] in [#$80..#$BF]) then begin
          Inc(mayUtf8, 32);
          Inc(utf8Count);
        end;
        Inc(maybeGB);
        Inc(mayBig5);
        Inc(chsCount);
        Inc(idx);
      end;
      #$F0..#$FE: begin
        if buf[idx] in [#$FA..#$FE] then begin
          Inc(maybeGB, 8);
        end;
        Inc(maybeGB, 8);
        Inc(mayBig5, 8);
        Inc(chsCount);
        Inc(idx);
      end;
    end;
    Inc(idx);
  end;
  // set encoding
  if (mayBins > 1) or (maybeLE > 1) or (maybeBE > 1) or (mayBins * 8 >= len) or (maybeLE * 8 >= len) or (maybeBE * 8 >= len) then begin
    if (mayBins > maybeLE *2) and (mayBins > maybeBE *2) or (serZero > 10) or
       (serZero * 8 > Max(maybeLE, maybeBE)) then begin
      Result := ceBinary;
    end else if maybeLE >= maybeBE then begin
      Result := ceUcs2_LE;
    end else begin
      Result := ceUcs2_BE;
    end;
  end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) or (chsCount >= utf8Count *2) then begin
    ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5));
    if Abs(ratio) <= 5 then begin
      if gbkNulls > big5Nulls then begin
        Result := ceBig5;
      end else if gbkNulls < big5Nulls then begin
        Result := ceGB;
      end else if SeeGBBig5 and ((maybeGB > 0) or (mayBig5 > 0)) then begin
        Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len));
      end;
    end else begin
      if ratio > 0 then begin
        Result := ceGB;
      end else begin
        Result := ceBig5;
      end;
    end;
  end else if mayUtf8 > 0 then begin
    Result := ceUtf_8;
  end;
end;

function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding;
var
  buf: string;
begin
  SetLength(buf, Min(SamplingSize, AStream.Size - AStream.Position));
  AStream.Read(buf[1], Length(buf));
  Result := GuessCharEncoding(buf, SeeGBBig5);
end;

function DoTryToDecode(Source, Target: TStream; SeeGBBig5: Boolean): TCharEncoding;
begin
  // test character encoding
  Result := GuessCharEncoding(Source, SeeGBBig5);

  // transfer encoding
  Source.Seek(0, soFromBeginning);
  case Result of
    ceUtf_8:   DoUtf8ToAnsi(Source, Target);
    ceUcs2_LE: DoLittleEndianUnicode16ToAnsi(Source, Target);
    ceUcs2_BE: DoBigEndianUnicode16ToAnsi(Source, Target);
    ceUtf_32:  raise Exception.Create('UTF-32 not support yet.');
    ceGB:      DoGBToAnsi(Source, Target);
    ceBig5:    DoBig5ToAnsi(Source, Target);
  else         //ceAnsi, ceBinary
    Target.CopyFrom(Source, Source.Size);
  end;
end;

{------------------------- Intialize Char Convert ---------------------------}

procedure InitCharConvert(AGB2Big5, ABig52GB: TCharConvertProc;
  AExactCompare: Boolean; ASamplingSize: Integer);
begin
  ExactCompare := AExactCompare;
  SamplingSize := ASamplingSize;
  if Assigned(AGB2Big5) then begin
    GB2Big5Proc := AGB2Big5;
  end else begin
    GB2Big5Proc := GB2BigEx.Gb2Big5;
  end;
  if Assigned(ABig52GB) then begin
    Big52GBProc := ABig52GB;
  end else begin
    Big52GBProc := GB2BigEx.Big52GB;
  end;
end;

{--------------------------- Smart load text file ---------------------------}

function SmartLoadTextFileToStream(const AFileName: string; AStream: TStream;
  TryToDecode, SeeGBBig5: Boolean): TCharEncoding;
var
  fs: TFileStream;
  bom: array[1..4] of Char;
  len: Longint;
begin
  Result := ceAnsi;
  fs := TFileStream.Create(AFileName, fmOpenRead or fmShareDenyNone);
  with fs do try
    if Size = 0 then Exit;
    len := Read(bom, SizeOf(bom));
    if (len >= 3) and (bom[1] = #$EF) and (bom[2] = #$BB) and (bom[3] = #$BF) then begin //UTF8
      Result := ceUtf_8;
      Seek(3, soFromBeginning);
      DoUtf8ToAnsi(fs, AStream);
    end else if (len >= 2) and (bom[1] = #$FF) and (bom[2] = #$FE) then begin            //UTF-16/UCS-2, little endian
      Result := ceUcs2_LE;
      Seek(2, soFromBeginning);
      DoLittleEndianUnicode16ToAnsi(fs, AStream);
    end else if (len >= 2) and (bom[1] = #$FE) and (bom[2] = #$FF) then begin            //UTF-16/UCS-2, big endian
      Result := ceUcs2_BE;
      Seek(2, soFromBeginning);
      DoBigEndianUnicode16ToAnsi(fs, AStream);
    end else begin
      Seek(0, soFromBeginning);
      if TryToDecode then begin
        Result := DoTryToDecode(fs, AStream, SeeGBBig5);
      end else begin
        AStream.CopyFrom(fs, fs.Size);
      end;
    end;
    AStream.Position := 0;
  finally
    fs.Free;
  end;
end;

initialization
  GB2Big5Proc := GB2BigEx.Gb2Big5;
  Big52GBProc := GB2BigEx.Big52GB;

end.

//******************************************************************************
// GB/Big5相互转换代码(直接调用API实现，也可以用其他类似函数代替)
unit GB2BigEX;

interface

{$IFDEF VER150} //D7
  {$WARN UNSAFE_CODE OFF}
  {$WARN UNSAFE_TYPE OFF}
  {$WARN UNSAFE_CAST OFF}
{$ENDIF}

uses
  Windows;

function Gb2Big5(const S: string): string;
function Big52Gb(const S: string): string;


implementation

function UnicodeEncode(const S: string; CodePage: Integer): WideString;
var
  Len: Integer;
begin
  Len := Length(S) + 1;
  SetLength(Result, Len);
  Len := MultiByteToWideChar(CodePage, 0, PChar(S), -1, PWideChar(Result), Len);
  SetLength(Result, Len - 1); //end is #0
end;

function UnicodeDecode(const S: WideString; CodePage: Integer): string;
var
  Len: Integer;
begin
  Len := Length(S) * 2 + 1; //one for #0
  SetLength(Result, Len);
  Len := WideCharToMultiByte(CodePage, 0, PWideChar(S), -1, PChar(Result), Len, nil, nil);
  SetLength(Result, Len - 1);
end;

function Gb2Big5(const S: string): string;
begin
  SetLength(Result, Length(S));
  LCMapString(GetUserDefaultLCID, LCMAP_TRADITIONAL_CHINESE,
    PChar(S), Length(S),
    PChar(Result), Length(Result));
  Result := UnicodeDecode(UnicodeEncode(Result, 936), 950); //改成Result := UnicodeEncode(Result, 936);则可在GBK中显示繁体字
end;

function Big52Gb(const S: string): string;
var
  tmp: string;
begin
  tmp := UnicodeDecode(UnicodeEncode(S, 950), 936);
  SetLength(Result, Length(tmp));
  LCMapString(GetUserDefaultLCID, LCMAP_SIMPLIFIED_CHINESE,
    PChar(tmp), Length(tmp),
    PChar(Result), Length(Result));
end;

end.


// 如果是在Delphi5下使用，那还需要一个Unicode.pas
{****************************************************************************}
{    Some Function of Ansi, UTF8, Unicode Converting  (copy from Delphi6)    }
{****************************************************************************}

unit Unicode;

interface

uses
  Classes, Windows, SysUtils;

type
  UTF8String = type string;
  PUTF8String = ^UTF8String;

{ PChar/PWideChar Unicode <-> UTF8 conversion }

// UnicodeToUTF8(3):
// UTF8ToUnicode(3):
// Scans the source data to find the null terminator, up to MaxBytes
// Dest must have MaxBytes available in Dest.
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.

function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: Integer): Integer; overload; //deprecated;
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: Integer): Integer; overload; //deprecated;

// UnicodeToUtf8(4):
// UTF8ToUnicode(4):
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.
// Nulls in the source data are not considered terminators - SourceChars must be accurate

function UnicodeToUtf8(Dest: PChar; MaxDestBytes: Cardinal; Source: PWideChar; SourceChars: Cardinal): Cardinal; overload;
function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: Cardinal; Source: PChar; SourceBytes: Cardinal): Cardinal; overload;

{ WideString <-> UTF8 conversion }

function UTF8Encode(const WS: WideString): UTF8String;
function UTF8Decode(const S: UTF8String): WideString;

{ Ansi <-> UTF8 conversion }

function AnsiToUtf8(const S: string): UTF8String;
function Utf8ToAnsi(const S: UTF8String): string;

implementation

// UnicodeToUTF8(3):
// Scans the source data to find the null terminator, up to MaxBytes
// Dest must have MaxBytes available in Dest.

function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: Integer): Integer;
var
  len: Cardinal;
begin
  len := 0;
  if Source <> nil then
    while Source[len] <> #0 do
      Inc(len);
  Result := UnicodeToUtf8(Dest, MaxBytes, Source, len);
end;

// UnicodeToUtf8(4):
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.
// Nulls in the source data are not considered terminators - SourceChars must be accurate

function UnicodeToUtf8(Dest: PChar; MaxDestBytes: Cardinal; Source: PWideChar; SourceChars: Cardinal): Cardinal;
var
  i, count: Cardinal;
  c: Cardinal;
begin
  Result := 0;
  if Source = nil then Exit;
  count := 0;
  i := 0;
  if Dest <> nil then
  begin
    while (i < SourceChars) and (count < MaxDestBytes) do
    begin
      c := Cardinal(Source[i]);
      Inc(i);
      if c <= $7F then
      begin
        Dest[count] := Char(c);
        Inc(count);
      end
      else if c > $7FF then
      begin
        if count + 3 > MaxDestBytes then
          break;
        Dest[count] := Char($E0 or (c shr 12));
        Dest[count+1] := Char($80 or ((c shr 6) and $3F));
        Dest[count+2] := Char($80 or (c and $3F));
        Inc(count,3);
      end
      else //  $7F < Source[i] <= $7FF
      begin
        if count + 2 > MaxDestBytes then
          break;
        Dest[count] := Char($C0 or (c shr 6));
        Dest[count+1] := Char($80 or (c and $3F));
        Inc(count,2);
      end;
    end;
    if count >= MaxDestBytes then count := MaxDestBytes-1;
    Dest[count] := #0;
  end
  else
  begin
    while i < SourceChars do
    begin
      c := Integer(Source[i]);
      Inc(i);
      if c > $7F then
      begin
        if c > $7FF then
          Inc(count);
        Inc(count);
      end;
      Inc(count);
    end;
  end;
  Result := count+1;  // convert zero based index to byte count
end;

function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: Integer): Integer;
var
  len: Cardinal;
begin
  len := 0;
  if Source <> nil then
    while Source[len] <> #0 do
      Inc(len);
  Result := Utf8ToUnicode(Dest, MaxChars, Source, len);
end;

function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: Cardinal; Source: PChar; SourceBytes: Cardinal): Cardinal;
var
  i, count: Cardinal;
  c: Byte;
  wc: Cardinal;
begin
  if Source = nil then
  begin
    Result := 0;
    Exit;
  end;
  Result := Cardinal(-1);
  count := 0;
  i := 0;
  if Dest <> nil then
  begin
    while (i < SourceBytes) and (count < MaxDestChars) do
    begin
      wc := Cardinal(Source[i]);
      Inc(i);
      if (wc and $80) <> 0 then
      begin
        wc := wc and $3F;
        if i > SourceBytes then Exit;           // incomplete multibyte char
        if (wc and $20) <> 0 then
        begin
          c := Byte(Source[i]);
          Inc(i);
          if (c and $C0) <> $80 then  Exit;     // malformed trail byte or out of range char
          if i > SourceBytes then Exit;         // incomplete multibyte char
          wc := (wc shl 6) or (c and $3F);
        end;
        c := Byte(Source[i]);
        Inc(i);
        if (c and $C0) <> $80 then Exit;       // malformed trail byte

        Dest[count] := WideChar((wc shl 6) or (c and $3F));
      end
      else
        Dest[count] := WideChar(wc);
      Inc(count);
    end;
 if count >= MaxDestChars then count := MaxDestChars-1;
 Dest[count] := #0;
  end
  else
  begin
 while (i <= SourceBytes) do
 begin
   c := Byte(Source[i]);
   Inc(i);
   if (c and $80) <> 0 then
   begin
  if (c and $F0) = $F0 then Exit;  // too many bytes for UCS2
  if (c and $40) = 0 then Exit;    // malformed lead byte
  if i > SourceBytes then Exit;         // incomplete multibyte char

  if (Byte(Source[i]) and $C0) <> $80 then Exit;  // malformed trail byte
  Inc(i);
  if i > SourceBytes then Exit;         // incomplete multibyte char
  if ((c and $20) <> 0) and ((Byte(Source[i]) and $C0) <> $80) then Exit; // malformed trail byte
  Inc(i);
   end;
   Inc(count);
 end;
  end;
  Result := count+1;
end;

function Utf8Encode(const WS: WideString): UTF8String;
var
  L: Integer;
  Temp: UTF8String;
begin
  Result := '';
  if WS = '' then Exit;
  SetLength(Temp, Length(WS) * 3); // SetLength includes space for null terminator

  L := UnicodeToUtf8(PChar(Temp), Length(Temp)+1, PWideChar(WS), Length(WS));
  if L > 0 then
    SetLength(Temp, L-1)
  else
    Temp := '';
  Result := Temp;
end;

function Utf8Decode(const S: UTF8String): WideString;
var
  L: Integer;
  Temp: WideString;
begin
  Result := '';
  if S = '' then Exit;
  SetLength(Temp, Length(S));

  L := Utf8ToUnicode(PWideChar(Temp), Length(Temp)+1, PChar(S), Length(S));
  if L > 0 then
    SetLength(Temp, L-1)
  else
    Temp := '';
  Result := Temp;
end;

function AnsiToUtf8(const S: string): UTF8String;
begin
  Result := Utf8Encode(S);
end;

function Utf8ToAnsi(const S: UTF8String): string;
begin
  Result := Utf8Decode(S);
end;

end.


// 测试代码：

procedure TForm1.LoadTextFile(const AFileName: string);
const
  cEncodings: array[TCharEncoding] of string = ('Ansi', 'Binary', 'UTF-8',
    'UCS2(Little Endian)', 'UCS2(Big Endian)', 'UTF-32', 'GB', 'Big5');
var
  stream: TStream;
  charEncoding: TCharEncoding;
begin
  stream := TMemoryStream.Create;
  try
    charEncoding := SmartLoadTextFileToStream(AFileName, stream);
    Memo1.Lines.LoadFromStream(stream);
    Caption := Format('%s - %s (%s)', ['Smart Load Text File', AFileName, cEncodings[charEncoding]]);
  finally
    stream.Free;
  end;
end;

posted @ 2015-10-14 16:22 findumars Views(1236) Comments(0) Edit 收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

朝闻道

时间是不可逆的资源，让我们一起月月水火木金金

文本编码的智能识别（它的两个指导哲学真是绝妙）

公告