使用C#.NET调用ICU进行编码检测和编码转换

ICU的C/C++版本:ICU4C

相关API的用法可查阅官方文档,本例只演示使用P/Invoke调用。

DLL文件需要注意区分32位和64位。

官方API文档:ICU-docs

P/Invoke相关文档:Native interoperability、Interop Marshaling

非常有用的P/Invoke函数签名查询工具:PINVOKE.NET

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;

namespace EncodingConverter.Console
{
    /// <summary>
    /// Demonstrates calling ICU4C through P/Invoke: converts TEST.TXT (UTF-16) to GBK,
    /// converts the GBK file back to UTF-16, and runs charset detection on the GBK file.
    /// </summary>
    class Program
    {
        // Numeric value of ICU's U_BUFFER_OVERFLOW_ERROR (utypes.h). A preflight call
        // (null destination, zero capacity) reports it by design, so it is not a
        // failure in that context.
        private const int BufferOverflowError = 15;

        /// <summary>
        /// Runs the three demos in order, then waits for a key press.
        /// The first demo produces TEST.GBK.TXT, which the other two read.
        /// </summary>
        static void Main(string[] args)
        {
            Convert_UCS2_To_GBK();
            Convert_GBK_To_UCS2();
            Detect_Encoding();

            System.Console.ReadKey();
        }

        /// <summary>
        /// Throws when <paramref name="status"/> is an ICU failure code.
        /// Mirrors ICU's U_FAILURE(): codes greater than U_ZERO_ERROR are errors,
        /// negative codes are warnings and are let through.
        /// </summary>
        /// <exception cref="InvalidOperationException">The status is an ICU failure.</exception>
        private static void ThrowIfFailed(ICU4C.UErrorCode status, string operation)
        {
            if ((int)status > (int)ICU4C.UErrorCode.U_ZERO_ERROR)
            {
                throw new InvalidOperationException(
                    $"{operation} failed with ICU error code {(int)status}.");
            }
        }

        /// <summary>
        /// Same as <see cref="ThrowIfFailed"/>, but tolerates the buffer-overflow status
        /// that a length-only preflight call is expected to produce.
        /// </summary>
        private static void ThrowIfPreflightFailed(ICU4C.UErrorCode status, string operation)
        {
            if ((int)status != BufferOverflowError)
            {
                ThrowIfFailed(status, operation);
            }
        }

        /// <summary>
        /// Detects the charset of TEST.GBK.TXT with ICU's charset detector and prints
        /// the best match's name.
        /// </summary>
        /// <exception cref="InvalidOperationException">An ICU call reported a failure.</exception>
        static void Detect_Encoding()
        {
            byte[] input = File.ReadAllBytes(@"TEST.GBK.TXT");
            string str = null;

            // Open the detector.
            ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
            IntPtr ucsd = ICU4C.NativeMethods.ucsdet_open(ref status);
            ThrowIfFailed(status, "ucsdet_open");

            // ucsdet_setText keeps a pointer to the input instead of copying it, and the
            // P/Invoke marshaller pins a byte[] only for the duration of a single call.
            // Pin the array ourselves so the pointer stays valid until the detector is closed.
            GCHandle pinnedInput = GCHandle.Alloc(input, GCHandleType.Pinned);
            try
            {
                // Set the text to examine.
                ICU4C.NativeMethods.ucsdet_setText(ucsd, input, input.Length, ref status);
                ThrowIfFailed(status, "ucsdet_setText");

                // Run detection. A null match means no charset fits the data.
                IntPtr ucsm = ICU4C.NativeMethods.ucsdet_detect(ucsd, ref status);
                ThrowIfFailed(status, "ucsdet_detect");

                if (ucsm != IntPtr.Zero)
                {
                    // Copy the name out now: the native string is only valid while
                    // the detector (and its match) is alive.
                    IntPtr lpstr = ICU4C.NativeMethods.ucsdet_getName(ucsm, ref status);
                    ThrowIfFailed(status, "ucsdet_getName");
                    str = Marshal.PtrToStringAnsi(lpstr);
                }
            }
            finally
            {
                // Close the detector before releasing the buffer it points into.
                ICU4C.NativeMethods.ucsdet_close(ucsd);
                pinnedInput.Free();
            }

            System.Console.WriteLine($"Detected Encoding");
            System.Console.WriteLine($"  Result = {str}");
        }

        /// <summary>
        /// Reads TEST.TXT as UTF-16 (little endian), converts it to GBK with an ICU
        /// converter and writes the bytes to TEST.GBK.TXT.
        /// </summary>
        /// <exception cref="InvalidOperationException">An ICU call reported a failure.</exception>
        static void Convert_UCS2_To_GBK()
        {
            string input = File.ReadAllText(@"TEST.TXT", Encoding.Unicode);
            byte[] output;

            // Open the converter.
            ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
            IntPtr cnv = ICU4C.NativeMethods.ucnv_open("GBK", ref status);
            ThrowIfFailed(status, "ucnv_open");

            try
            {
                // Preflight: ask ICU how many bytes the result needs.
                int outputLength = ICU4C.NativeMethods.ucnv_fromUChars(cnv, null, 0, input, input.Length, ref status);
                ThrowIfPreflightFailed(status, "ucnv_fromUChars (preflight)");

                // Output buffer of exactly the required size.
                output = new byte[outputLength];

                // Real conversion. Reset the status first: ICU functions return
                // immediately when entered with a failure code such as the
                // preflight's buffer overflow.
                status = ICU4C.UErrorCode.U_ZERO_ERROR;
                ICU4C.NativeMethods.ucnv_fromUChars(cnv, output, output.Length, input, input.Length, ref status);
                ThrowIfFailed(status, "ucnv_fromUChars");
            }
            finally
            {
                // Close the converter even when a step above throws.
                ICU4C.NativeMethods.ucnv_close(cnv);
            }

            // Write the converted bytes.
            File.WriteAllBytes(@"TEST.GBK.TXT", output);

            System.Console.WriteLine("Convert UCS2 to GBK");
            System.Console.WriteLine($"  Input Length = {input.Length} characters");
            System.Console.WriteLine($"  Output Length = {output.Length} bytes");
        }

        /// <summary>
        /// Reads TEST.GBK.TXT, converts it from GBK to UTF-16 with an ICU converter and
        /// writes the text to TEST.UCS2.TXT.
        /// </summary>
        /// <exception cref="InvalidOperationException">An ICU call reported a failure.</exception>
        static void Convert_GBK_To_UCS2()
        {
            byte[] input = File.ReadAllBytes(@"TEST.GBK.TXT");
            StringBuilder output;

            // Open the converter.
            ICU4C.UErrorCode status = ICU4C.UErrorCode.U_ZERO_ERROR;
            IntPtr cnv = ICU4C.NativeMethods.ucnv_open("GBK", ref status);
            ThrowIfFailed(status, "ucnv_open");

            try
            {
                // Preflight: ask ICU how many UTF-16 code units the result needs.
                int outputLength = ICU4C.NativeMethods.ucnv_toUChars(cnv, null, 0, input, input.Length, ref status);
                ThrowIfPreflightFailed(status, "ucnv_toUChars (preflight)");

                // One extra code unit so ICU can NUL-terminate the result itself; the
                // marshaller reads the buffer back as a NUL-terminated string.
                // NOTE(review): text containing U+0000 would still be cut at that
                // character by the StringBuilder marshalling.
                output = new StringBuilder(outputLength + 1);

                // Real conversion; the status must be reset after the preflight.
                status = ICU4C.UErrorCode.U_ZERO_ERROR;
                ICU4C.NativeMethods.ucnv_toUChars(cnv, output, output.Capacity, input, input.Length, ref status);
                ThrowIfFailed(status, "ucnv_toUChars");
            }
            finally
            {
                // Close the converter even when a step above throws.
                ICU4C.NativeMethods.ucnv_close(cnv);
            }

            // Write the converted text.
            File.WriteAllText(@"TEST.UCS2.TXT", output.ToString(), Encoding.Unicode);

            System.Console.WriteLine("Convert GBK to UCS2");
            System.Console.WriteLine($"  Input Length = {input.Length} bytes");
            System.Console.WriteLine($"  Output Length = {output.Length} characters");
        }
    }
}

P/Invoke API 定义

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

#pragma warning disable IDE1006 // Naming Styles

namespace EncodingConverter.ICU4C
{
    /// <summary>
    /// Subset of ICU's UErrorCode (unicode/utypes.h). Values greater than
    /// <see cref="U_ZERO_ERROR"/> are failures; negative values are warnings.
    /// </summary>
    enum UErrorCode
    {
        /// <summary>Warning: the output filled the buffer exactly, so no terminating NUL was written.</summary>
        U_STRING_NOT_TERMINATED_WARNING = -124,

        /// <summary>No error, no warning.</summary>
        U_ZERO_ERROR = 0,

        /// <summary>An argument was invalid (for example a null handle).</summary>
        U_ILLEGAL_ARGUMENT_ERROR = 1,

        /// <summary>A requested resource (for example converter data) was not found.</summary>
        U_MISSING_RESOURCE_ERROR = 2,

        /// <summary>A data file could not be opened.</summary>
        U_FILE_ACCESS_ERROR = 4,

        /// <summary>Memory allocation failed.</summary>
        U_MEMORY_ALLOCATION_ERROR = 7,

        /// <summary>Conversion met a character with no mapping in the target charset.</summary>
        U_INVALID_CHAR_FOUND = 10,

        /// <summary>Conversion met an incomplete byte/code-unit sequence at the end of the input.</summary>
        U_TRUNCATED_CHAR_FOUND = 11,

        /// <summary>Conversion met an illegal input sequence.</summary>
        U_ILLEGAL_CHAR_FOUND = 12,

        /// <summary>
        /// The destination buffer was too small. This is the expected status of a
        /// preflight call made with a null destination and zero capacity.
        /// </summary>
        U_BUFFER_OVERFLOW_ERROR = 15
    }

    /// <summary>
    /// P/Invoke declarations for the ICU4C 67 charset-detection (icuin67.dll) and
    /// converter (icuuc67.dll) functions. Entry points carry ICU's "_67" version suffix.
    /// </summary>
    static class NativeMethods
    {
        /// <summary>Opens a charset detector. Release it with <see cref="ucsdet_close"/>.</summary>
        [DllImport("icuin67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_open_67")]
        public static extern IntPtr ucsdet_open(ref UErrorCode status);

        /// <summary>Closes a detector returned by <see cref="ucsdet_open"/>.</summary>
        [DllImport("icuin67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_close_67")]
        public static extern void ucsdet_close(IntPtr ucsd);

        /// <summary>
        /// Sets the bytes whose charset is to be detected. ICU does not copy the data:
        /// it must stay valid and unmoved until the detector is closed or given new
        /// text, so the caller has to keep <paramref name="textIn"/> pinned beyond
        /// this call (the marshaller only pins it while the call is in progress).
        /// </summary>
        [DllImport("icuin67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_setText_67")]
        public static extern void ucsdet_setText(IntPtr ucsd, byte[] textIn, int len, ref UErrorCode status);

        /// <summary>
        /// Returns the best charset match for the detector's text, or
        /// <see cref="IntPtr.Zero"/> when nothing matches. The match is owned by the
        /// detector and must not be freed by the caller.
        /// </summary>
        [DllImport("icuin67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_detect_67")]
        public static extern IntPtr ucsdet_detect(IntPtr ucsd, ref UErrorCode status);

        /// <summary>
        /// Returns a pointer to the NUL-terminated charset name of a match. The string
        /// is owned by ICU; copy it (e.g. Marshal.PtrToStringAnsi) before closing the detector.
        /// </summary>
        [DllImport("icuin67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucsdet_getName_67")]
        public static extern IntPtr ucsdet_getName(IntPtr ucsm, ref UErrorCode status);

        /// <summary>
        /// Opens a converter for the named charset (e.g. "GBK"). Returns
        /// <see cref="IntPtr.Zero"/> and sets <paramref name="err"/> on failure.
        /// Release it with <see cref="ucnv_close"/>.
        /// </summary>
        [DllImport("icuuc67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_open_67")]
        public static extern IntPtr ucnv_open([MarshalAs(UnmanagedType.LPStr)] string converterName, ref UErrorCode err);

        /// <summary>Closes a converter returned by <see cref="ucnv_open"/>.</summary>
        [DllImport("icuuc67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_close_67")]
        public static extern void ucnv_close(IntPtr converter);

        /// <summary>
        /// Converts UTF-16 text to the converter's charset. Returns the number of
        /// bytes the full result needs; call with a null <paramref name="dest"/> and
        /// zero capacity to preflight that length (status becomes
        /// <see cref="UErrorCode.U_BUFFER_OVERFLOW_ERROR"/>).
        /// </summary>
        [DllImport("icuuc67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_fromUChars_67")]
        public static extern int ucnv_fromUChars(IntPtr cnv, [Out] byte[] dest, int destCapacity, [MarshalAs(UnmanagedType.LPWStr)] string src, int srcLength, ref UErrorCode pErrorCode);

        /// <summary>
        /// Converts bytes in the converter's charset to UTF-16. Returns the number of
        /// UTF-16 code units the full result needs; call with a null
        /// <paramref name="dest"/> and zero capacity to preflight that length.
        /// <paramref name="destCapacity"/> is measured in code units, not bytes.
        /// </summary>
        [DllImport("icuuc67.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ucnv_toUChars_67")]
        public static extern int ucnv_toUChars(IntPtr cnv, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder dest, int destCapacity, byte[] src, int srcLength, ref UErrorCode pErrorCode);
    }
}

 

posted @ 2020-10-09 14:46  Akatsuki-  阅读(587)  评论(0)  编辑  收藏  举报