16进制UTF8字符转中文汉字

1 UTF8转UTF16----UTF16---本系统单字节字符,字符串

复制代码
#if defined(_WIN32)
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <string>
#include <iostream>
#endif // WIN32

#ifdef __linux__
#include<string.h>
#include<iconv.h>
#include <string>
#include <locale>
#include <cstring>
#include <stdio.h>
#include <stdlib.h>
#include<stdint.h>
#endif
using namespace std;

#if defined(_WIN32)
/**
 * @brief Converts a UTF-8 byte sequence to the system ANSI code page (CP_ACP).
 *
 * The input is first widened to UTF-16 with MultiByteToWideChar and then
 * narrowed to the active ANSI code page with WideCharToMultiByte. Both
 * conversions are sized with a preliminary "measure" call so the buffers
 * always have exactly the required length.
 *
 * @param URLcode  Pointer to the UTF-8 bytes (does not need to be NUL-terminated).
 * @param bytelen  Number of bytes to convert from URLcode.
 * @return The converted string, or an empty string when the input is
 *         NULL/empty or either conversion fails.
 */
std::string UTF8_URL_DECODE(char* URLcode, int bytelen)
{
    if (URLcode == NULL || bytelen <= 0)
    {
        return "";
    }

    // UTF-8 -> UTF-16: measure first, then convert into an exactly sized buffer.
    const int wideLen = ::MultiByteToWideChar(CP_UTF8, 0, URLcode, bytelen, NULL, 0);
    if (wideLen <= 0)
    {
        return "";
    }
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (::MultiByteToWideChar(CP_UTF8, 0, URLcode, bytelen, &wide[0], wideLen) <= 0)
    {
        return "";
    }

    // UTF-16 -> ANSI: the source length is the number of wide characters
    // (not twice that), and the byte count is measured instead of guessed.
    const int ansiLen = ::WideCharToMultiByte(CP_ACP, 0, wide.c_str(), wideLen, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
    {
        return "";
    }
    std::string OutStr(static_cast<size_t>(ansiLen), '\0');
    if (::WideCharToMultiByte(CP_ACP, 0, wide.c_str(), wideLen, &OutStr[0], ansiLen, NULL, NULL) <= 0)
    {
        return "";
    }

    return OutStr;
}
#endif



#ifdef __linux__

/**
 * @brief Decodes one UTF-8 encoded character into its code point value.
 *
 * Sequences of 1 to 4 bytes are decoded. Lead bytes that announce a 5- or
 * 6-byte form (0xF8..0xFD) and stray continuation bytes are rejected.
 * NOTE: overlong encodings and surrogate values are not rejected here.
 *
 * @param utf8     Pointer to the first byte of the sequence.
 * @param unicode  Receives the decoded value; set to 0 on every error path.
 * @return Number of bytes consumed (1..4), or (unsigned char)-1 (i.e. 255)
 *         on error, because the return type is unsigned.
 */
unsigned char UTF8ToUnicode(unsigned char* utf8, unsigned int* unicode) {
    const unsigned char kError = static_cast<unsigned char>(-1);
    const unsigned char lut_size = 3;
    const unsigned char length_lut[] = { 2, 3, 4 };
    const unsigned char range_lut[] = { 0xE0, 0xF0, 0xF8 };
    const unsigned char mask_lut[] = { 0x1F, 0x0F, 0x07 };

    if (unicode == NULL) {
        return kError;
    }
    // Validate the input pointer before reading the lead byte.
    if (utf8 == NULL) {
        *unicode = 0;
        return kError;
    }

    const unsigned char lead = utf8[0];

    // UTF-8 is ASCII compatible: 0xxxxxxx encodes 0x00..0x7F directly.
    if (lead < 0x80) {
        *unicode = lead;
        return 1;
    }
    // 0x80..0xBF are continuation bytes and cannot start a character;
    // 0xFE/0xFF never appear in UTF-8 (the longest legacy lead is 1111110x).
    if (lead < 0xC0 || lead > 0xFD) {
        *unicode = 0;
        return kError;
    }

    // Pick the sequence length and payload mask from the lead byte.
    unsigned char length = 0;
    unsigned int value = 0;
    for (unsigned int i = 0; i < lut_size; i++) {
        if (lead < range_lut[i]) {
            value = lead & mask_lut[i];
            length = length_lut[i];
            break;
        }
    }
    // Encodings longer than four bytes are not decoded.
    if (length == 0) {
        *unicode = 0;
        return kError;
    }

    // Fold in the continuation bytes; each must look like 10xxxxxx.
    for (unsigned int i = 1; i < length; i++) {
        const unsigned char next = utf8[i];
        if (next < 0x80 || next > 0xBF) {
            *unicode = 0;
            return kError;
        }
        value = (value << 6) | (next & 0x3Fu);
    }

    *unicode = value;
    return length;
}
/**
 * @brief Encodes a 4-byte Unicode (UCS-4) code point as UTF-16.
 *
 * Values up to U+FFFF are stored as a single 16-bit unit equal to the code
 * point. Values from U+10000 to U+10FFFF are stored as a surrogate pair.
 * NOTE: values inside the surrogate range (U+D800..U+DFFF) are passed
 * through unchanged as a single unit.
 *
 * @param unicode  Code point value to encode.
 * @param utf16    Output buffer with room for two units, or NULL to only
 *                 query the required length.
 * @return Number of 16-bit units needed (1 or 2), or 0 when the value is
 *         above U+10FFFF and therefore not encodable.
 */
unsigned char UnicodeToUTF16(unsigned int unicode, unsigned short* utf16) {
    // Basic Multilingual Plane: the UTF-16 unit equals the code point.
    if (unicode <= 0xFFFF) {
        if (utf16 != NULL) {
            *utf16 = static_cast<unsigned short>(unicode);
        }
        return 1;
    }

    // Supplementary planes run up to U+10FFFF and need a surrogate pair.
    if (unicode <= 0x10FFFF) {
        if (utf16 != NULL) {
            const unsigned int offset = unicode - 0x10000;
            // High surrogate carries the upper 10 bits of the offset.
            utf16[0] = static_cast<unsigned short>(0xD800 + (offset >> 10));
            // Low surrogate carries the lower 10 bits of the offset.
            utf16[1] = static_cast<unsigned short>(0xDC00 + (offset & 0x03FF));
        }
        return 2;
    }

    return 0;
}


/**
 * @brief Converts a wide-character string to a multibyte string in the
 *        encoding of the environment's locale, and prints the result.
 *
 * Despite the parameter type, the buffer is reinterpreted as wchar_t*
 * (this is how the caller in this file passes it), so it must really point
 * at wchar_t elements rather than 16-bit UTF-16 units.
 *
 * @param utf16   Pointer to the wide characters (see note above).
 * @param lens    Maximum number of wide characters to read; reading also
 *                stops at the first NUL. Values above 1024 are rejected.
 * @param strout  Receives the converted bytes on success; untouched on failure.
 * @return 1 on success, -1 on failure (bad arguments, too long, empty
 *         input, or a character that cannot be represented in the locale).
 */
int UTF16toStr(unsigned short* utf16, int lens, std::string& strout)
{
    if (utf16 == NULL || lens < 0)
    {
        return -1;
    }
    if (lens > 1024)
    {
        printf("error utf16 is too long\n");
        return -1;
    }

    // Copy at most `lens` characters so the conversion never runs past the
    // caller's buffer and always sees a terminated string.
    const wchar_t* wide = reinterpret_cast<const wchar_t*>(utf16);
    size_t count = 0;
    while (count < static_cast<size_t>(lens) && wide[count] != L'\0')
    {
        ++count;
    }
    const std::wstring bounded(wide, count);

    // Use the environment's locale for the multibyte encoding.
    setlocale(LC_CTYPE, "");

    // Measure first so the output buffer is exactly large enough.
    const size_t needed = wcstombs(NULL, bounded.c_str(), 0);
    if (needed == 0 || needed == static_cast<size_t>(-1))
    {
        printf("error UTF16toStr %d\n", static_cast<int>(needed));
        return -1;
    }

    // One extra byte lets wcstombs write its terminating NUL.
    std::string bytes(needed + 1, '\0');
    wcstombs(&bytes[0], bounded.c_str(), bytes.size());
    bytes.resize(needed);

    printf("%s\n", bytes.c_str());

    // Hand back the converted text itself (not the byte count).
    strout = bytes;
    return 1;
}


string L_UTF8_URL_DECODE(char* URLcode, int bytelen)
{
    if (URLcode == NULL || bytelen <= 0)
    {
        return "";
    }
    char* UTF8str = URLcode;

    //UTF8转换到UTF16
    //size_t mbstowcs( wchar_t* wcstr, const char* mbstr, size_t    count)
    setlocale(LC_CTYPE, "");
    int wcslen = mbstowcs(NULL, URLcode, bytelen);
    printf(" wcslen:%d\n", wcslen);
    wchar_t* wszString = new wchar_t[wcslen + 1];

    //::MultiByteToWideChar(CP_UTF8, NULL, UTF8str, bytelen, wszString, wcslen);
    mbstowcs(wszString, URLcode, bytelen);

    wszString[wcslen] = L'\0';

    wstring DecodeStr = wszString;

    string OutStr;
    UTF16toStr((unsigned short*)wszString, DecodeStr.length(), OutStr);
    delete[] wszString;
    return OutStr;

}

#endif

int main(int argc, char* argv[])

{

    //linux-----------------------------
        // 严 utf8 E4 B8 A5
    printf("Hello world!\n");
    unsigned int buffer;
    uint8_t utf8[20];
    utf8[0] = 0xE4; "\xE4\xB8\xA5";
    utf8[1] = 0xB8;
    utf8[2] = 0xA5;
    utf8[3] = 0xE4; "\xE4\xB8\xA5";
    utf8[4] = 0xB8;
    utf8[5] = 0xA5;
    utf8[6] = '\0';
    std::string stdout2 = L_UTF8_URL_DECODE((char*)utf8, 7);
 

    return 0;


}
复制代码

 

posted on   邗影  阅读(7)  评论(0)  编辑  收藏  举报

相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
历史上的今天:
2018-01-03 PP助手上传失效
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

导航

统计

点击右上角即可分享
微信分享提示