C++ 判断字符串编码格式(ANSI / UTF16_LE / UTF16_BE / UTF8 / UTF8_BOM)

/// Text encodings distinguished by the detection routines.
/// Enumerator values start at 1 and increase in declaration order.
enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

/**
 * @brief Classifies a BOM-less byte buffer as UTF-8 or ANSI.
 *
 * The buffer is accepted as UTF-8 only if every byte sequence in it is
 * well-formed according to RFC 3629 and at least one multi-byte sequence is
 * present. The following are rejected:
 *   - lead bytes 0x80..0xC1 and 0xF5..0xFF (stray continuation bytes, overlong
 *     2-byte forms, the obsolete 5/6-byte forms, and 0xFE/0xFF);
 *   - overlong 3/4-byte forms, UTF-16 surrogates (U+D800..U+DFFF) and code
 *     points above U+10FFFF;
 *   - sequences truncated by the end of the buffer.
 *
 * @param data Pointer to the bytes to inspect; may be null.
 * @param size Number of bytes readable at @p data.
 * @return Encode::UTF8 for well-formed UTF-8 containing non-ASCII characters;
 *         Encode::ANSI otherwise (including pure 7-bit ASCII, an empty buffer
 *         and a null pointer).
 */
static inline
Encode IsUtf8Data(const uint8_t* data, size_t size)
{
    if (data == nullptr)
    {
        return Encode::ANSI;
    }

    bool sawMultiByte = false;
    size_t pos = 0;

    while (pos < size)
    {
        const uint8_t lead = data[pos];

        // 7-bit ASCII is valid in both encodings and decides nothing.
        if (lead < 0x80)
        {
            ++pos;
            continue;
        }

        // Number of continuation bytes required after the lead byte, and the
        // permitted range of the FIRST continuation byte. That range is
        // narrowed for some lead bytes to exclude overlong forms, surrogates
        // and values beyond U+10FFFF.
        size_t trailing = 0;
        uint8_t firstMin = 0x80;
        uint8_t firstMax = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailing = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2;
            if (lead == 0xE0)
            {
                firstMin = 0xA0;    // below this the 3-byte form is overlong
            }
            else if (lead == 0xED)
            {
                firstMax = 0x9F;    // above this lies the surrogate range
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3;
            if (lead == 0xF0)
            {
                firstMin = 0x90;    // below this the 4-byte form is overlong
            }
            else if (lead == 0xF4)
            {
                firstMax = 0x8F;    // above this exceeds U+10FFFF
            }
        }
        else
        {
            // 0x80..0xC1 or 0xF5..0xFF can never start a UTF-8 sequence.
            return Encode::ANSI;
        }

        // The sequence must fit entirely inside the buffer.
        if (size - pos <= trailing)
        {
            return Encode::ANSI;
        }

        const uint8_t first = data[pos + 1];
        if (first < firstMin || first > firstMax)
        {
            return Encode::ANSI;
        }

        // Remaining continuation bytes must have the bit pattern 10xxxxxx.
        for (size_t k = 2; k <= trailing; ++k)
        {
            if ((data[pos + k] & 0xC0) != 0x80)
            {
                return Encode::ANSI;
            }
        }

        sawMultiByte = true;
        pos += trailing + 1;
    }

    return sawMultiByte ? Encode::UTF8 : Encode::ANSI;
}

/**
 * @brief Detects the encoding of a byte buffer.
 *
 * A leading byte-order mark is checked first:
 *   - FF FE    -> Encode::UTF16_LE
 *   - FE FF    -> Encode::UTF16_BE
 *   - EF BB BF -> Encode::UTF8_BOM
 * A buffer that consists of nothing but the BOM is still recognised. When no
 * BOM is present, the result of IsUtf8Data() on the whole buffer is returned.
 *
 * Note: a UTF-32 LE BOM (FF FE 00 00) also begins with FF FE and is therefore
 * reported as UTF16_LE; the Encode enumeration has no UTF-32 member.
 *
 * @param data Pointer to the bytes to inspect; may be null.
 * @param size Number of bytes readable at @p data.
 * @return The detected encoding; Encode::ANSI when @p data is null.
 */
static inline
Encode DetectEncode(const uint8_t* data, size_t size)
{
    if (data == nullptr)
    {
        return Encode::ANSI;
    }

    if (size >= 2 && data[0] == 0xFF && data[1] == 0xFE)
    {
        return Encode::UTF16_LE;
    }

    if (size >= 2 && data[0] == 0xFE && data[1] == 0xFF)
    {
        return Encode::UTF16_BE;
    }

    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
    {
        return Encode::UTF8_BOM;
    }

    // No BOM: decide between UTF-8 and ANSI from the content itself.
    return IsUtf8Data(data, size);
}

调用例子:

// Load the file as raw bytes, then convert its contents according to the
// detected encoding. NOTE(review): WToA / UTF8ToW are project helpers;
// presumably wide->ANSI and UTF-8->wide conversions -- confirm.
auto s = FILE_READER(sv.begin()->c_str(), std::ios::binary);

switch (DetectEncode(reinterpret_cast<const uint8_t*>(s.data()), s.size()))
{
case ANSI:
    // Nothing to convert.
    break;

case UTF16_LE:
    // Strip the 2-byte BOM in a single erase.
    s.erase(s.begin(), s.begin() + 2);
    s = StringConvertUtils::Instance()->WToA(std::wstring(reinterpret_cast<const wchar_t*>(s.data()), s.length() / sizeof(wchar_t)));
    break;

case UTF16_BE:
    // Strip the 2-byte BOM in a single erase.
    s.erase(s.begin(), s.begin() + 2);
    // Big-endian code units must be byte-swapped before the buffer can be
    // viewed as native wchar_t. Assumes wchar_t is a 16-bit little-endian
    // code unit (as on Windows) -- confirm for other targets.
    for (size_t i = 0; i + 1 < s.size(); i += 2)
    {
        const char high = s[i];
        s[i] = s[i + 1];
        s[i + 1] = high;
    }
    s = StringConvertUtils::Instance()->WToA(std::wstring(reinterpret_cast<const wchar_t*>(s.data()), s.length() / sizeof(wchar_t)));
    break;

case UTF8_BOM:
    // Strip the 3-byte BOM in a single erase, then treat as plain UTF-8.
    s.erase(s.begin(), s.begin() + 3);
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;

case UTF8:
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;

default:
    break;
}
posted @ 2022-05-30 10:40  萧海~  阅读(879)  评论(0)  编辑  收藏  举报