utf8 ucs4
这个问题不好回答,首先UTF-8编码只不过是一种Unicode的转换,兼容ASCII。
所以,UTF-8编码支持的最大字符编码应该是Unicode支持的最大字符编码。
理论上,UTF-8编码可以支持最大6字节:
00000000-0000007F 0xxxxxxx
00000080-000007FF 110yyyxx 10xxxxxx
00000800-0000FFFF 1110yyyy 10yyyyxx 10xxxxxx
00010000-001FFFFF 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
00200000-03FFFFFF 111110aa 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
04000000-7FFFFFFF 1111110a 10aaaaaa 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
UCS4最多可以表示2^32个字符,但UTF-8最多只能表示2^31个字符。现实当中,Unicode规范根本就还没有规定这么多字符,目前最多也就规定到Plane 16,最大码位是0x10FFFF(十进制1114111),总共1114112个码位,所以网上关于UTF-8编码的资料,大多截止到4字节:
00000000-0000007F 0xxxxxxx
00000080-000007FF 110yyyxx 10xxxxxx
00000800-0000FFFF 1110yyyy 10yyyyxx 10xxxxxx
00010000-0010FFFF 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
Unicode规范规定,10FFFE和10FFFF作为内部保留,所以Plane 16支持的最大字符编码是:10FFFD
转换成UTF-8编码是:F48FBFBD
但是,到底这个字符能否正常显示出来,还要看你系统里装的字体到底支持多少字符。
所以,“utf-8里现在已经用到的字符最大编码是多少”这个问题的答案依赖于Unicode规范的版本、具体字体字库的支持。
0x00-0x7F 同ASCII,也不可能作为任何其他多字节UTF-8字符的一部分
0xC0-0xFD 多字节UTF-8字符的开始字节,而且据此可以判断出该UTF-8字符的长度(字节数)
0x80-0xBF 多字节UTF-8字符的跟随字节
0xFE-0xFF UTF-8未使用
> 字节数 | 位数 | 表示 |
> 1 | 7 | 0bbbbbbb |
> 2 | 11 | 110bbbbb 10bbbbbb |
> 3 | 16 | 1110bbbb 10bbbbbb 10bbbbbb |
> 4 | 21 | 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb |
> 5 | 26 | 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb |
> 6 | 31 | 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb |
> 7 | 36 | 11111110 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb |
> 8 | 42 | 11111111 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb |
typedef uint8 utf8; //typedef uint16 utf16; // removed typedef to prevent usage, as utf16 is not supported (yet) typedef uint32 utf32; // return number of code units in a null terminated string size_type utf_length(const utf8* utf8_str) const { size_type cnt = 0; while (*utf8_str++) cnt++; return cnt; } // return number of code units in a null terminated string size_type utf_length(const utf32* utf32_str) const { size_type cnt = 0; while (*utf32_str++) cnt++; return cnt; }
上面的两个函数分别计算以0结尾的utf8串和utf32串的编码单元(code unit)个数:对utf8来说就是字节数,对utf32来说是32位单元的个数(所占字节数是它的4倍)。
// return number of utf32 code units required to re-encode given utf8 data as utf32. len is number of code units in 'buf'. size_type encoded_size(const utf8* buf, size_type len) const { utf8 tcp; size_type count = 0; while (len--) { tcp = *buf++; ++count; size_type size = 0; if (tcp < 0x80) { } else if (tcp < 0xE0) { size = 1; ++buf; } else if (tcp < 0xF0) { size = 2; buf += 2; } else { size = 3; buf += 3; } if (len >= size) len -= size; else break; } return count; }
计算一段utf8数据所包含的字符(码点)个数,也就是把它转成utf32之后所需的编码单元个数
// return the number of utf8 code units required to encode the given utf32 code point size_type encoded_size(utf32 code_point) const { if (code_point < 0x80) return 1; else if (code_point < 0x0800) return 2; else if (code_point < 0x10000) return 3; else return 4; }
计算一个ucs4(utf32)码点转换成utf8之后所占的字节数
/// Decodes utf8 data into utf32 code points.
///
/// @param src       utf8 source data.
/// @param dest      buffer receiving the decoded utf32 code points.
/// @param dest_len  capacity of 'dest' in utf32 code units.
/// @param src_len   number of utf8 code units in 'src'; 0 means 'src' is null
///                  terminated and its length is measured with utf_length().
/// @return          number of utf32 code units written to 'dest'.
///
/// Decoding stops when the source is exhausted, when 'dest' is full, or when
/// the final sequence is cut short by the end of the source. The sequence
/// length is derived from the lead byte only; continuation bytes are masked
/// with 0x3F but not validated.
size_type encode(const utf8* src, utf32* dest, size_type dest_len, size_type src_len = 0) const
{
    // count length for null terminated source...
    if (src_len == 0)
    {
        src_len = utf_length(src);
    }

    size_type destCapacity = dest_len;
    size_type idx = 0;

    // while there is data in the source buffer, and space in the dest buffer
    while ((idx < src_len) && (destCapacity > 0))
    {
        const utf8 cu = src[idx++];

        // payload bits of the lead byte and number of continuation bytes
        utf32 cp;
        size_type trailing;

        if (cu < 0x80)
        {
            cp = cu;
            trailing = 0;
        }
        else if (cu < 0xE0)
        {
            cp = cu & 0x1F;
            trailing = 1;
        }
        else if (cu < 0xF0)
        {
            cp = cu & 0x0F;
            trailing = 2;
        }
        else
        {
            cp = cu & 0x07;
            trailing = 3;
        }

        // A truncated sequence at the end of the source must not be decoded:
        // its continuation bytes would be read from beyond 'src_len'.
        if (src_len - idx < trailing)
        {
            break;
        }

        // each continuation byte contributes its low six bits
        for (; trailing > 0; --trailing)
        {
            cp = (cp << 6) | (src[idx++] & 0x3F);
        }

        *dest++ = cp;
        --destCapacity;
    }

    return dest_len - destCapacity;
}
从utf8 转到utf32 即ucs4
/// Encodes utf32 code points as utf8.
///
/// @param src       utf32 source data.
/// @param dest      buffer receiving the utf8 code units.
/// @param dest_len  capacity of 'dest' in utf8 code units.
/// @param src_len   number of utf32 code units in 'src'; 0 means 'src' is null
///                  terminated and its length is measured with utf_length().
/// @return          number of utf8 code units written to 'dest'.
///
/// Encoding stops early, without writing a partial sequence, as soon as the
/// next code point no longer fits into the remaining destination space.
size_type encode(const utf32* src, utf8* dest, size_type dest_len, size_type src_len = 0) const
{
    // a zero length requests measuring the null terminated source
    if (src_len == 0)
    {
        src_len = utf_length(src);
    }

    size_type destCapacity = dest_len;

    for (uint idx = 0; idx < src_len; ++idx)
    {
        const utf32 cp = src[idx];
        const size_type needed = encoded_size(cp);

        // stop as soon as the whole sequence would not fit
        if (needed > destCapacity)
        {
            break;
        }

        switch (needed)
        {
        case 1:
            *dest++ = (utf8)cp;
            break;

        case 2:
            *dest++ = (utf8)((cp >> 6) | 0xC0);
            *dest++ = (utf8)((cp & 0x3F) | 0x80);
            break;

        case 3:
            *dest++ = (utf8)((cp >> 12) | 0xE0);
            *dest++ = (utf8)(((cp >> 6) & 0x3F) | 0x80);
            *dest++ = (utf8)((cp & 0x3F) | 0x80);
            break;

        default: // four code units
            *dest++ = (utf8)((cp >> 18) | 0xF0);
            *dest++ = (utf8)(((cp >> 12) & 0x3F) | 0x80);
            *dest++ = (utf8)(((cp >> 6) & 0x3F) | 0x80);
            *dest++ = (utf8)((cp & 0x3F) | 0x80);
            break;
        }

        destCapacity -= needed;
    }

    return dest_len - destCapacity;
}
从utf32 转utf8