| #include "utfconvert.h" |
| |
| #include <stdint.h> |
| #ifdef __GNUC__ |
| #include <endian.h> |
| #endif |
| |
/// Returns `number` with its two bytes exchanged (0xAABB becomes 0xBBAA).
///
/// A compiler intrinsic is used when the toolchain is recognised; any other
/// compiler gets the portable shift-and-or form.
static inline uint16_t byteswap_ushort(uint16_t number)
{
    uint16_t swapped;
#if defined(_MSC_VER) && _MSC_VER > 1310
    swapped = _byteswap_ushort(number);
#elif defined(__GNUC__)
    swapped = __builtin_bswap16(number);
#else
    // Integer promotion widens the operands to int; the cast keeps the low 16 bits.
    swapped = static_cast<uint16_t>((number >> 8) | (number << 8));
#endif
    return swapped;
}
| |
| |
| |
| |
| |
| |
| |
| std::string utf16_to_utf8(const std::u16string& u16str) |
| { |
| if (u16str.empty()){ return std::string(); } |
| |
| char16_t bom = u16str[0]; |
| switch (bom){ |
| case 0xFEFF: |
| return utf16le_to_utf8(u16str); |
| break; |
| case 0xFFFE: |
| return utf16be_to_utf8(u16str); |
| break; |
| default: |
| return std::string(); |
| } |
| } |
| |
| |
| |
/// Converts UTF-16 text to UTF-8, reading every char16_t as a code unit value
/// exactly as stored (no byte swapping is applied).
///
/// A leading U+FEFF byte-order mark is skipped. A high surrogate immediately
/// followed by a low surrogate is combined into one code point and written as
/// a 4-byte sequence. Any unpaired surrogate (a high surrogate at the end of
/// the input or followed by something other than a low surrogate, or a low
/// surrogate with no preceding high surrogate) is written as U+FFFD, and the
/// unit that follows it is decoded normally rather than being swallowed.
///
/// @param u16str UTF-16 code units; may be empty.
/// @return The UTF-8 encoding of the input; empty for empty input.
std::string utf16le_to_utf8(const std::u16string& u16str)
{
    if (u16str.empty()) { return std::string(); }

    const char16_t* p = u16str.data();
    std::u16string::size_type len = u16str.length();
    if (p[0] == 0xFEFF) {
        // Skip the byte-order mark.
        p += 1;
        len -= 1;
    }

    std::string u8str;
    // Upper bound: one unit yields at most 3 bytes; a pair (2 units) yields 4.
    u8str.reserve(len * 3);

    for (std::u16string::size_type i = 0; i < len; ++i) {
        uint32_t codePoint = p[i];

        if (codePoint >= 0xD800 && codePoint <= 0xDBFF) {
            // High surrogate: only valid when a low surrogate follows inside the input.
            const bool hasLow = (i + 1 < len) && p[i + 1] >= 0xDC00 && p[i + 1] <= 0xDFFF;
            if (hasLow) {
                const uint32_t lowSur = p[i + 1];
                codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (lowSur - 0xDC00);
                ++i;  // the low surrogate has been consumed
            } else {
                codePoint = 0xFFFD;
            }
        } else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codePoint = 0xFFFD;
        }

        if (codePoint < 0x80) {
            // 1 byte: 0xxxxxxx
            u8str.push_back(static_cast<char>(codePoint));
        } else if (codePoint < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 6) | 0xC0));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        } else if (codePoint < 0x10000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 12) | 0xE0));
            u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        } else {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 18) | 0xF0));
            u8str.push_back(static_cast<char>(((codePoint >> 12) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        }
    }

    return u8str;
}
| |
| |
| |
| std::string utf16be_to_utf8(const std::u16string& u16str) |
| { |
| if (u16str.empty()){ return std::string(); } |
| const char16_t* p = u16str.data(); |
| std::u16string::size_type len = u16str.length(); |
| if (p[0] == 0xFEFF){ |
| p += 1; |
| len -= 1; |
| } |
| |
| |
| |
| std::string u8str; |
| u8str.reserve(len * 2); |
| char16_t u16char; |
| for (std::u16string::size_type i = 0; i < len; ++i) { |
| |
| u16char = p[i]; |
| |
| u16char = byteswap_ushort(u16char); |
| |
| |
| if (u16char < 0x0080) { |
| |
| |
| u8str.push_back((char)(u16char & 0x00FF)); |
| continue; |
| } |
| |
| if (u16char >= 0x0080 && u16char <= 0x07FF) { |
| |
| u8str.push_back((char)(((u16char >> 6) & 0x1F) | 0xC0)); |
| u8str.push_back((char)((u16char & 0x3F) | 0x80)); |
| continue; |
| } |
| |
| if (u16char >= 0xD800 && u16char <= 0xDBFF) { |
| |
| uint32_t highSur = u16char; |
| uint32_t lowSur = byteswap_ushort(p[++i]); |
| |
| |
| |
| |
| uint32_t codePoint = highSur - 0xD800; |
| codePoint <<= 10; |
| codePoint |= lowSur - 0xDC00; |
| codePoint += 0x10000; |
| |
| u8str.push_back((char)((codePoint >> 18) | 0xF0)); |
| u8str.push_back((char)(((codePoint >> 12) & 0x3F) | 0x80)); |
| u8str.push_back((char)(((codePoint >> 06) & 0x3F) | 0x80)); |
| u8str.push_back((char)((codePoint & 0x3F) | 0x80)); |
| continue; |
| } |
| |
| { |
| |
| u8str.push_back((char)(((u16char >> 12) & 0x0F) | 0xE0)); |
| u8str.push_back((char)(((u16char >> 6) & 0x3F) | 0x80)); |
| u8str.push_back((char)((u16char & 0x3F) | 0x80)); |
| continue; |
| } |
| } |
| return u8str; |
| } |
| |
| |
| |
| |
| |
| |
| |
/// Decodes UTF-8 text into UTF-16 code units stored as plain values (no byte
/// swapping is applied).
///
/// A leading EF BB BF signature in the input is skipped. Code points above
/// U+FFFF are written as a surrogate pair. Ill-formed input is never read past
/// the end of the string: an invalid lead byte, a stray continuation byte, a
/// truncated sequence, an overlong encoding, an encoded surrogate
/// (U+D800..U+DFFF) or a value above U+10FFFF causes the offending byte to be
/// dropped and decoding to resume at the next byte.
///
/// @param u8str  UTF-8 input; may be empty.
/// @param addbom When true, U+FEFF is written as the first output unit.
/// @param ok     Optional. When non-null it receives true if the whole input
///               was well-formed and false if at least one byte was dropped.
/// @return The decoded UTF-16 code units.
std::u16string utf8_to_utf16le(const std::string& u8str, bool addbom, bool* ok)
{
    std::u16string u16str;
    u16str.reserve(u8str.size());
    if (addbom) {
        u16str.push_back(0xFEFF);
    }

    std::string::size_type len = u8str.length();
    const unsigned char* p = reinterpret_cast<const unsigned char*>(u8str.data());

    // Skip the UTF-8 signature, including when it is the entire input.
    if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
        p += 3;
        len -= 3;
    }

    bool is_ok = true;

    std::string::size_type i = 0;
    while (i < len) {
        const uint32_t lead = p[i];
        if (lead < 0x80) {
            // 1 byte: 0xxxxxxx
            u16str.push_back(static_cast<char16_t>(lead));
            ++i;
            continue;
        }

        // Classify the lead byte: number of continuation bytes, payload bits,
        // and the smallest code point that legitimately needs this length.
        std::string::size_type trail = 0;
        uint32_t codePoint = 0;
        uint32_t smallest = 0;
        if ((lead & 0xE0) == 0xC0) {            // 110xxxxx
            trail = 1; codePoint = lead & 0x1FU; smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {     // 1110xxxx
            trail = 2; codePoint = lead & 0x0FU; smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {     // 11110xxx
            trail = 3; codePoint = lead & 0x07U; smallest = 0x10000;
        }

        // trail == 0 here means a stray continuation byte or an invalid lead.
        bool valid = (trail != 0) && (len - i > trail);
        for (std::string::size_type k = 1; valid && k <= trail; ++k) {
            const uint32_t next = p[i + k];
            if ((next & 0xC0) == 0x80) {        // 10xxxxxx
                codePoint = (codePoint << 6) | (next & 0x3FU);
            } else {
                valid = false;
            }
        }
        if (valid) {
            const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
            valid = codePoint >= smallest && codePoint <= 0x10FFFF && !isSurrogate;
        }

        if (!valid) {
            // Drop only the offending byte so decoding can resynchronise.
            is_ok = false;
            ++i;
            continue;
        }
        i += trail + 1;

        if (codePoint >= 0x10000) {
            // Split into a surrogate pair: 10 high bits, then 10 low bits.
            codePoint -= 0x10000;
            u16str.push_back(static_cast<char16_t>((codePoint >> 10) | 0xD800U));
            u16str.push_back(static_cast<char16_t>((codePoint & 0x03FFU) | 0xDC00U));
        } else {
            u16str.push_back(static_cast<char16_t>(codePoint));
        }
    }

    if (ok != nullptr) { *ok = is_ok; }

    return u16str;
}
| |
| |
| |
| std::u16string utf8_to_utf16be(const std::string& u8str, bool addbom, bool* ok) |
| { |
| |
| std::u16string u16str = utf8_to_utf16le(u8str, addbom, ok); |
| |
| for (size_t i = 0; i < u16str.size(); ++i) { |
| u16str[i] = byteswap_ushort(u16str[i]); |
| } |
| return u16str; |
| } |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理