使用C#.NET调用ICU进行编码检测和编码转换
ICU的C/C++版本:ICU4C
相关API的用法可查阅官方文档,本例只演示使用P/Invoke调用。
DLL文件需要注意区分32位和64位。
官方API文档:ICU-docs
P/Invoke相关文档:Native interoperability、Interop Marshaling
非常有用的P/Invoke函数签名查询工具:PINVOKE.NET
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;

namespace EncodingConverter.Console
{
    /// <summary>
    /// Demo console program that calls ICU4C through P/Invoke to convert text
    /// between UTF-16 and GBK and to detect the encoding of a file.
    /// </summary>
    class Program
    {
        // Numeric value of ICU's U_BUFFER_OVERFLOW_ERROR. A preflight call (null
        // destination, zero capacity) reports it while still returning the
        // required length, so it is the expected outcome there, not a failure.
        private const int BufferOverflowError = 15;

        /// <summary>
        /// Runs the three demos in order, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Convert_UCS2_To_GBK();
            Convert_GBK_To_UCS2();
            Detect_Encoding();
            System.Console.ReadKey();
        }

        /// <summary>
        /// Throws when <paramref name="status"/> is an ICU failure code.
        /// ICU treats values greater than U_ZERO_ERROR as failures; zero and
        /// negative values (warnings) count as success.
        /// </summary>
        /// <param name="status">Status written by the ICU call.</param>
        /// <param name="operation">Name of the ICU function, used in the message.</param>
        /// <exception cref="InvalidOperationException">The status is a failure code.</exception>
        private static void ThrowIfFailed(ICU4C.UErrorCode status, string operation)
        {
            if ((int)status > (int)ICU4C.UErrorCode.U_ZERO_ERROR)
            {
                throw new InvalidOperationException(
                    $"ICU call {operation} failed with UErrorCode {(int)status}.");
            }
        }

        /// <summary>
        /// Same as <see cref="ThrowIfFailed"/>, but accepts the buffer-overflow
        /// status that a length-only preflight call is expected to report.
        /// </summary>
        /// <param name="status">Status written by the preflight call.</param>
        /// <param name="operation">Name of the ICU function, used in the message.</param>
        private static void ThrowIfPreflightFailed(ICU4C.UErrorCode status, string operation)
        {
            if ((int)status != BufferOverflowError)
            {
                ThrowIfFailed(status, operation);
            }
        }

        /// <summary>
        /// Opens the ICU converter named "GBK" and verifies that it was created.
        /// </summary>
        /// <returns>A converter handle that the caller must pass to ucnv_close.</returns>
        /// <exception cref="InvalidOperationException">The converter could not be opened.</exception>
        private static IntPtr OpenGbkConverter()
        {
            ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
            IntPtr cnv = ICU4C.NativeMethods.ucnv_open("GBK", ref status);
            ThrowIfFailed(status, "ucnv_open");
            if (cnv == IntPtr.Zero)
            {
                throw new InvalidOperationException("ICU call ucnv_open returned a null converter.");
            }

            return cnv;
        }

        /// <summary>
        /// Reads TEST.GBK.TXT, asks the ICU charset detector for the best match
        /// and prints the detected charset name to the console.
        /// </summary>
        /// <exception cref="InvalidOperationException">An ICU call reported a failure status.</exception>
        static void Detect_Encoding()
        {
            byte[] input = File.ReadAllBytes(@"TEST.GBK.TXT");
            string str = null;

            // ucsdet_setText keeps the pointer it is given instead of copying the
            // bytes, and ucsdet_detect reads through it later. The P/Invoke
            // marshaller only pins the array for the duration of a single call, so
            // pin it explicitly until the detector is closed.
            GCHandle pinnedInput = GCHandle.Alloc(input, GCHandleType.Pinned);
            try
            {
                // Open the detector.
                ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
                IntPtr ucsd = ICU4C.NativeMethods.ucsdet_open(ref status);
                ThrowIfFailed(status, "ucsdet_open");

                try
                {
                    // Set the text to inspect.
                    status = ICU4C.UErrorCode.U_ZERO_ERROR;
                    ICU4C.NativeMethods.ucsdet_setText(ucsd, input, input.Length, ref status);
                    ThrowIfFailed(status, "ucsdet_setText");

                    // Run the detection.
                    status = ICU4C.UErrorCode.U_ZERO_ERROR;
                    IntPtr ucsm = ICU4C.NativeMethods.ucsdet_detect(ucsd, ref status);
                    ThrowIfFailed(status, "ucsdet_detect");

                    // A null match means no charset fits the input; leave the name null.
                    if (ucsm != IntPtr.Zero)
                    {
                        // Fetch the result. The returned string belongs to the
                        // detector, so copy it before the detector is closed.
                        status = ICU4C.UErrorCode.U_ZERO_ERROR;
                        IntPtr lpstr = ICU4C.NativeMethods.ucsdet_getName(ucsm, ref status);
                        ThrowIfFailed(status, "ucsdet_getName");
                        str = Marshal.PtrToStringAnsi(lpstr);
                    }
                }
                finally
                {
                    // Close the detector even when one of the calls above throws.
                    ICU4C.NativeMethods.ucsdet_close(ucsd);
                }
            }
            finally
            {
                pinnedInput.Free();
            }

            System.Console.WriteLine("Detected Encoding");
            System.Console.WriteLine($" Result = {str}");
        }

        /// <summary>
        /// Reads TEST.TXT as UTF-16, converts it to GBK with ICU and writes the
        /// resulting bytes to TEST.GBK.TXT.
        /// </summary>
        /// <exception cref="InvalidOperationException">An ICU call reported a failure status.</exception>
        static void Convert_UCS2_To_GBK()
        {
            string input = File.ReadAllText(@"TEST.TXT", Encoding.Unicode);
            byte[] output;

            // Open the converter.
            IntPtr cnv = OpenGbkConverter();
            try
            {
                // Preflight: compute the output length without writing anything.
                ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
                int outputLength = ICU4C.NativeMethods.ucnv_fromUChars(cnv, null, 0, input, input.Length, ref status);
                ThrowIfPreflightFailed(status, "ucnv_fromUChars (preflight)");

                // Output buffer.
                output = new byte[outputLength];

                // Convert into the buffer.
                status = ICU4C.UErrorCode.U_ZERO_ERROR;
                ICU4C.NativeMethods.ucnv_fromUChars(cnv, output, output.Length, input, input.Length, ref status);
                ThrowIfFailed(status, "ucnv_fromUChars");
            }
            finally
            {
                // Close the converter even when conversion fails.
                ICU4C.NativeMethods.ucnv_close(cnv);
            }

            // Write the file.
            File.WriteAllBytes(@"TEST.GBK.TXT", output);

            System.Console.WriteLine("Convert UCS2 to GBK");
            System.Console.WriteLine($" Input Length = {input.Length} characters");
            System.Console.WriteLine($" Output Length = {output.Length} bytes");
        }

        /// <summary>
        /// Reads TEST.GBK.TXT, converts the GBK bytes to UTF-16 with ICU and
        /// writes the text to TEST.UCS2.TXT.
        /// </summary>
        /// <exception cref="InvalidOperationException">An ICU call reported a failure status.</exception>
        static void Convert_GBK_To_UCS2()
        {
            byte[] input = File.ReadAllBytes(@"TEST.GBK.TXT");
            StringBuilder output;

            // Open the converter.
            IntPtr cnv = OpenGbkConverter();
            try
            {
                // Preflight: compute the output length without writing anything.
                ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
                int outputLength = ICU4C.NativeMethods.ucnv_toUChars(cnv, null, 0, input, input.Length, ref status);
                ThrowIfPreflightFailed(status, "ucnv_toUChars (preflight)");

                // Output buffer. One extra UTF-16 unit is reserved so that ICU has
                // room to NUL-terminate the result; the StringBuilder marshaller
                // finds the end of the returned string by that terminator.
                int capacity = outputLength + 1;
                output = new StringBuilder(capacity);

                // Convert into the buffer.
                status = ICU4C.UErrorCode.U_ZERO_ERROR;
                ICU4C.NativeMethods.ucnv_toUChars(cnv, output, capacity, input, input.Length, ref status);
                ThrowIfFailed(status, "ucnv_toUChars");
            }
            finally
            {
                // Close the converter even when conversion fails.
                ICU4C.NativeMethods.ucnv_close(cnv);
            }

            // Write the file.
            File.WriteAllText(@"TEST.UCS2.TXT", output.ToString(), Encoding.Unicode);

            System.Console.WriteLine("Convert GBK to UCS2");
            System.Console.WriteLine($" Input Length = {input.Length} bytes");
            System.Console.WriteLine($" Output Length = {output.Length} characters");
        }
    }
}
P/Invoke API 定义
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

#pragma warning disable IDE1006 // Naming Styles

namespace EncodingConverter.ICU4C
{
    /// <summary>
    /// Subset of ICU's <c>UErrorCode</c> values. ICU treats values greater than
    /// <see cref="U_ZERO_ERROR"/> as failures and negative values as warnings.
    /// Codes not listed here can still be returned by ICU; cast to <c>int</c>
    /// to inspect them.
    /// </summary>
    enum UErrorCode
    {
        /// <summary>Warning: the output filled the buffer exactly, so no terminating NUL was written.</summary>
        U_STRING_NOT_TERMINATED_WARNING = -124,

        /// <summary>No error, no warning.</summary>
        U_ZERO_ERROR = 0,

        /// <summary>An argument was invalid.</summary>
        U_ILLEGAL_ARGUMENT_ERROR = 1,

        /// <summary>A requested file or resource could not be accessed.</summary>
        U_FILE_ACCESS_ERROR = 4,

        /// <summary>Memory allocation failed.</summary>
        U_MEMORY_ALLOCATION_ERROR = 7,

        /// <summary>Conversion met a character with no mapping in the target charset.</summary>
        U_INVALID_CHAR_FOUND = 10,

        /// <summary>Conversion input ended in the middle of a character.</summary>
        U_TRUNCATED_CHAR_FOUND = 11,

        /// <summary>Conversion input contained an illegal byte sequence.</summary>
        U_ILLEGAL_CHAR_FOUND = 12,

        /// <summary>
        /// The destination buffer was too small. Also reported by preflight calls
        /// (null destination, zero capacity), which still return the needed length.
        /// </summary>
        U_BUFFER_OVERFLOW_ERROR = 15,
    }

    /// <summary>
    /// P/Invoke declarations for the ICU4C charset-detection (ucsdet_*) and
    /// converter (ucnv_*) functions used by this sample.
    /// </summary>
    static class NativeMethods
    {
        // Single place to change when moving to another ICU release: the DLL
        // file names and the exported symbol names both carry the version.
        private const string I18nLibrary = "icuin67.dll";
        private const string CommonLibrary = "icuuc67.dll";
        private const string VersionSuffix = "_67";

        /// <summary>Creates a charset detector.</summary>
        /// <param name="status">Receives the ICU status code.</param>
        /// <returns>Detector handle; release it with <see cref="ucsdet_close"/>.</returns>
        [DllImport(I18nLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_open" + VersionSuffix)]
        public static extern IntPtr ucsdet_open(ref UErrorCode status);

        /// <summary>Closes a charset detector.</summary>
        /// <param name="ucsd">Detector handle from <see cref="ucsdet_open"/>.</param>
        [DllImport(I18nLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_close" + VersionSuffix)]
        public static extern void ucsdet_close(IntPtr ucsd);

        /// <summary>Sets the bytes whose charset is to be detected.</summary>
        /// <remarks>
        /// Per the ICU documentation the detector does not copy the input; the
        /// bytes must stay valid and unmoved until the detector is closed or
        /// given other text. The marshaller pins <paramref name="textIn"/> only
        /// for this call, so callers should pin it themselves for that period.
        /// </remarks>
        /// <param name="ucsd">Detector handle.</param>
        /// <param name="textIn">Input bytes.</param>
        /// <param name="len">Number of bytes in <paramref name="textIn"/>.</param>
        /// <param name="status">Receives the ICU status code.</param>
        [DllImport(I18nLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_setText" + VersionSuffix)]
        public static extern void ucsdet_setText(IntPtr ucsd, byte[] textIn, int len, ref UErrorCode status);

        /// <summary>Detects the charset that best matches the current input.</summary>
        /// <param name="ucsd">Detector handle.</param>
        /// <param name="status">Receives the ICU status code.</param>
        /// <returns>Match handle owned by the detector, or <see cref="IntPtr.Zero"/> if nothing matches.</returns>
        [DllImport(I18nLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_detect" + VersionSuffix)]
        public static extern IntPtr ucsdet_detect(IntPtr ucsd, ref UErrorCode status);

        /// <summary>Gets the charset name of a match.</summary>
        /// <param name="ucsm">Match handle from <see cref="ucsdet_detect"/>.</param>
        /// <param name="status">Receives the ICU status code.</param>
        /// <returns>
        /// Pointer to a NUL-terminated name owned by ICU; read it with
        /// <c>Marshal.PtrToStringAnsi</c> and do not free it.
        /// </returns>
        [DllImport(I18nLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_getName" + VersionSuffix)]
        public static extern IntPtr ucsdet_getName(IntPtr ucsm, ref UErrorCode status);

        /// <summary>Opens a converter by charset name.</summary>
        /// <param name="converterName">Charset name, e.g. "GBK".</param>
        /// <param name="err">Receives the ICU status code.</param>
        /// <returns>Converter handle; release it with <see cref="ucnv_close"/>.</returns>
        [DllImport(CommonLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_open" + VersionSuffix)]
        public static extern IntPtr ucnv_open([MarshalAs(UnmanagedType.LPStr)] string converterName, ref UErrorCode err);

        /// <summary>Closes a converter.</summary>
        /// <param name="converter">Converter handle from <see cref="ucnv_open"/>.</param>
        [DllImport(CommonLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_close" + VersionSuffix)]
        public static extern void ucnv_close(IntPtr converter);

        /// <summary>Converts UTF-16 text to bytes in the converter's charset.</summary>
        /// <param name="cnv">Converter handle.</param>
        /// <param name="dest">Destination buffer; may be null when <paramref name="destCapacity"/> is 0 (preflight).</param>
        /// <param name="destCapacity">Capacity of <paramref name="dest"/> in bytes.</param>
        /// <param name="src">Source UTF-16 text.</param>
        /// <param name="srcLength">Number of UTF-16 code units in <paramref name="src"/>.</param>
        /// <param name="pErrorCode">Receives the ICU status code.</param>
        /// <returns>Length of the complete output in bytes, even if it did not fit.</returns>
        [DllImport(CommonLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_fromUChars" + VersionSuffix)]
        public static extern int ucnv_fromUChars(IntPtr cnv, [Out] byte[] dest, int destCapacity, [MarshalAs(UnmanagedType.LPWStr)] string src, int srcLength, ref UErrorCode pErrorCode);

        /// <summary>Converts bytes in the converter's charset to UTF-16 text.</summary>
        /// <remarks>
        /// ICU writes a terminating NUL only when <paramref name="destCapacity"/>
        /// exceeds the output length. Pass a capacity one larger than the
        /// preflighted length so the returned text is terminated.
        /// </remarks>
        /// <param name="cnv">Converter handle.</param>
        /// <param name="dest">Destination buffer; may be null when <paramref name="destCapacity"/> is 0 (preflight).</param>
        /// <param name="destCapacity">Capacity of <paramref name="dest"/> in UTF-16 code units.</param>
        /// <param name="src">Source bytes.</param>
        /// <param name="srcLength">Number of bytes in <paramref name="src"/>.</param>
        /// <param name="pErrorCode">Receives the ICU status code.</param>
        /// <returns>Length of the complete output in UTF-16 code units, even if it did not fit.</returns>
        [DllImport(CommonLibrary, CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_toUChars" + VersionSuffix)]
        public static extern int ucnv_toUChars(IntPtr cnv, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder dest, int destCapacity, byte[] src, int srcLength, ref UErrorCode pErrorCode);
    }
}