C++ 字符集转换
转码整理, 资料来源于网络
charset.h
#pragma once

#include <iostream>
#include <string>

/**
 * @file charset.h
 * @brief Character-set conversion helpers (ANSI, UTF-8, UTF-16, GBK/GB2312, Big5).
 *
 * Narrow text is carried in std::string, wide ("Unicode") text in std::wstring.
 * Each preprocessor directive and declaration is on its own line: anything that
 * follows `#pragma once` on the same line is treated as part of that directive.
 */

/** @brief Converts wide text to the ANSI code page. */
std::string UnicodeToAnsi(const std::wstring& unicode);

/** @brief Converts ANSI code page text to wide text. */
std::wstring AnsiToUnicode(const std::string& ansi);

/** @brief Converts ANSI code page text to UTF-8. */
std::string AnsiToUtf8(const std::string& strSrc);

/** @brief Converts UTF-8 text to the ANSI code page. */
std::string Utf8ToAnsi(const std::string& strSrc);

/** @brief Converts wide text to UTF-8. */
std::string UnicodeToUtf8(const std::wstring& wstrSrc);

/** @brief Converts UTF-8 text to wide text. */
std::wstring Utf8ToUnicode(const std::string& strSrc);

/** @brief Converts GBK text to UTF-8. */
std::string GBKToUtf8(const std::string& gbk);

/** @brief Converts UTF-8 text to GBK. */
std::string Utf8ToGBK(const std::string& utf8);

/** @brief Converts GB2312 text to wide text. */
std::wstring GB2312ToUnicode(const std::string& gb2312);

/** @brief Converts wide text to GB2312. */
std::string UnicodeToGB2312(const std::wstring& unicode);

/** @brief Converts Big5 text to wide text. */
std::wstring BIG5ToUnicode(const std::string& big5);

/** @brief Converts wide text to Big5. */
std::string UnicodeToBIG5(const std::wstring& unicode);

/** @brief Converts Traditional Chinese Big5 text to Simplified Chinese GB2312 text. */
std::string FBIG5ToGB2312(const std::string& big5);

/**
 * @brief Converts Simplified Chinese GB2312 text to Traditional Chinese Big5 text.
 * @note The parameter is taken by value; the signature is kept as-is so that it
 *       keeps matching the definition in charset.cpp.
 */
std::string GB2312ToFBIG5(const std::string gb2312);

/**
 * @brief Checks whether a byte buffer looks like UTF-8 encoded text.
 * @param pBuffer Start of the buffer to inspect.
 * @param size    Number of bytes in the buffer.
 * @return true when no invalid UTF-8 sequence was found, false otherwise.
 */
bool IsUTF8(const void* pBuffer, long size);
main.cpp
#include "charset.h"

#include <cstdio>
#include <string>

/**
 * @brief Prints a byte range as space-separated two-digit lowercase hex values.
 * @param bytes First byte to print.
 * @param len   Number of bytes to print; nothing is printed when len <= 0.
 */
void showHex(const char* bytes, int len)
{
    for (int i = 0; i < len; i++) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended by printf.
        printf("%02x ", static_cast<unsigned char>(bytes[i]));
    }
}

/**
 * @brief Prints a right-aligned label followed by the hex dump of a narrow string.
 * @param charset Label naming the encoding being shown.
 * @param str     Narrow string whose bytes are dumped.
 */
void showHex(const std::string& charset, const std::string& str)
{
    printf("%10s: ", charset.c_str());
    showHex(str.data(), static_cast<int>(str.size()));
    printf("\n");
}

/**
 * @brief Prints a right-aligned label followed by the hex dump of a wide string.
 *
 * The bytes are shown in memory order. The byte count is derived from
 * sizeof(wchar_t) rather than a hard-coded 2, so the whole string is dumped
 * whatever the width of wchar_t is on the target.
 *
 * @param charset Label naming the encoding being shown.
 * @param str     Wide string whose underlying bytes are dumped.
 */
void showHex(const std::string& charset, const std::wstring& str)
{
    printf("%10s: ", charset.c_str());
    showHex(reinterpret_cast<const char*>(str.data()),
            static_cast<int>(sizeof(wchar_t) * str.size()));
    printf("\n");
}

/**
 * @brief Demo driver: round-trips a sample string through the converters
 *        declared in charset.h and hex-dumps every intermediate result.
 * @return Always 0.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    std::wstring wstr(L"中abc国");
    // NOTE(review): the bytes of this narrow literal depend on the source and
    // execution character sets chosen by the compiler; the demo treats them as ANSI.
    std::string str("中abc国");

    std::string ansi;
    std::string utf8;
    std::string gbk;
    std::wstring unicode;

    // Raw inputs.
    showHex("unicode", wstr);
    showHex("ansi", str);

    // Wide <-> ANSI.
    ansi = UnicodeToAnsi(wstr);
    showHex("ansi", ansi);
    unicode = AnsiToUnicode(ansi);
    showHex("unicode", unicode);

    // ANSI <-> UTF-8.
    utf8 = AnsiToUtf8(str);
    showHex("utf8", utf8);
    ansi = Utf8ToAnsi(utf8);
    showHex("ansi", ansi);

    // Wide <-> UTF-8.
    utf8 = UnicodeToUtf8(wstr);
    showHex("utf8", utf8);
    unicode = Utf8ToUnicode(utf8);
    showHex("unicode", unicode);

    // UTF-8 <-> GBK.
    gbk = Utf8ToGBK(utf8);
    showHex("gbk", gbk);
    utf8 = GBKToUtf8(gbk);
    showHex("utf8", utf8);

    // Wait for a key press before exiting.
    getchar();
    return 0;
}
charset.cpp
#inchude "charset.h" #include <Windows.h> std::string UnicodeToAnsi(const std::wstring& unicode) { LPCWCH ptr = unicode.c_str(); /** 分配目标空间, 一个16位Unicode字符最多可以转为4个字节int size = static_cast<int>( wstrSrc.size() * 4 + 10 );*/ int size = WideCharToMultiByte(CP_THREAD_ACP, 0, ptr, -1, NULL, 0, NULL, NULL); std::string strRet(size, 0); int len = WideCharToMultiByte(CP_THREAD_ACP, 0, ptr, -1, (LPSTR)strRet.c_str(), size, NULL, NULL); return strRet; } std::wstring AnsiToUnicode(const std::string& ansi) { LPCCH ptr = ansi.c_str(); int size = MultiByteToWideChar(CP_ACP, 0, ptr, -1, NULL, NULL); std::wstring wstrRet(size, 0); int len = MultiByteToWideChar(CP_ACP, 0, ptr, -1, (LPWSTR)wstrRet.c_str(), size); return wstrRet; } std::string AnsiToUtf8(const std::string& ansi) { LPCCH ptr = ansi.c_str(); /* 分配目标空间, 长度为 Ansi 编码的两倍 */ int size = MultiByteToWideChar(CP_ACP, 0, ptr, -1, NULL, NULL); std::wstring wstrTemp(size, 0); int len = MultiByteToWideChar(CP_ACP, 0, ptr, -1, (LPWSTR)wstrTemp.c_str(), size); return UnicodeToUtf8(wstrTemp); } std::string Utf8ToAnsi(const std::string& utf8) { std::wstring wstrTemp = Utf8ToUnicode(utf8); LPCWCH ptr = wstrTemp.c_str(); int size = WideCharToMultiByte(CP_ACP, 0, ptr, -1, NULL, 0, NULL, NULL); std::string strRet(size, 0); int len = WideCharToMultiByte(CP_ACP, 0, ptr, -1, (LPSTR)strRet.c_str(), size, NULL, NULL); return strRet; } std::string UnicodeToUtf8(const std::wstring& unicode) { /* 分配目标空间, 一个16位Unicode字符最多可以转为4个字节 */ LPCWCH ptr = unicode.c_str(); int size = WideCharToMultiByte(CP_UTF8, 0, ptr, -1, NULL, 0, NULL, NULL); std::string strRet(size, 0); int len = WideCharToMultiByte(CP_UTF8, 0, ptr, -1, (char*)strRet.c_str(), size, NULL, NULL); return strRet; } std::wstring Utf8ToUnicode(const std::string& utf8) { LPCCH ptr = utf8.c_str(); int size = MultiByteToWideChar(CP_UTF8, 0, ptr, -1, NULL, NULL); std::wstring wstrRet(size, 0); int len = MultiByteToWideChar(CP_UTF8, 0, ptr, -1, (LPWSTR)wstrRet.c_str(), size); return wstrRet; } 
std::string GBKToUtf8(const std::string& gbk) { return AnsiToUtf8(gbk); } std::string Utf8ToGBK(const std::string& utf8) { return Utf8ToAnsi(utf8); } bool IsUTF8(const void* pBuffer, long size) { bool isUTF8 = true; unsigned char* start = (unsigned char*)pBuffer; unsigned char* end = (unsigned char*)pBuffer + size; while (start < end) { if (*start < 0x80) { /*(10000000): 值小于0x80的为ASCII字符*/ start++; } else if (*start < (0xC0)) { /*(11000000): 值介于0x80与0xC0之间的为无效UTF-8字符*/ isUTF8 = false; break; } else if (*start < (0xE0)) { /*(11100000): 此范围内为2字节UTF-8字符 */ if (start >= end - 1) { break; } if ((start[1] & (0xC0)) != 0x80) { isUTF8 = false; break; } start += 2; } else if (*start < (0xF0)) { /**(11110000): 此范围内为3字节UTF-8字符*/ if (start >= end - 2) { break; } if ((start[1] & (0xC0)) != 0x80 || (start[2] & (0xC0)) != 0x80) { isUTF8 = false; break; } start += 3; } else { isUTF8 = false; break; } } return isUTF8; } //GB2312 转换成 Unicode std::wstring GB2312ToUnicode(const std::string& gb2312) { UINT nCodePage = 936; //GB2312 int size = MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, NULL, 0); std::wstring wstrRet(size, 0); MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, (LPWSTR)wstrRet.c_str(), size); return wstrRet; } //BIG5 转换成 Unicode std::wstring BIG5ToUnicode(const std::string& big5) { UINT nCodePage = 950; //BIG5 int size = MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, NULL, 0); std::wstring wstrRet(size, 0); MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, (LPWSTR)wstrRet.c_str(), size); return wstrRet; } //Unicode 转换成 GB2312 std::string UnicodeToGB2312(const std::wstring& unicode) { UINT nCodePage = 936; //GB2312 int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL); std::string strRet(size, 0); WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL); return strRet; } //Unicode 转换成 BIG5 std::string UnicodeToBIG5(const std::wstring& unicode) { UINT nCodePage = 950; //BIG5 int 
size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL); std::string strRet(size, 0); WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL); return strRet; } //繁体中文BIG5 转换成 简体中文 GB2312 std::string FBIG5ToGB2312(const std::string& big5) { LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC); std::wstring unicode = BIG5ToUnicode(big5); std::string gb2312 = UnicodeToGB2312(unicode); int size = LCMapStringA(lcid, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, NULL, 0); std::string strRet(size, 0); LCMapStringA(0x0804, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size); return strRet; } //简体中文 GB2312 转换成 繁体中文BIG5 std::string GB2312ToFBIG5(const std::string gb2312) { LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC); int size = LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, NULL, 0); std::string strRet(size, 0); LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size); std::wstring unicode = GB2312ToUnicode(strRet); std::string big5 = UnicodeToBIG5(unicode); return big5; }