gbk转utf-8,奇数中文乱码。
一、乱码的原因
gbk的中文编码是一个汉字用【2】个字节表示,例如汉字“内部”的gbk编码16进制的显示为c4 da b2 bf
utf-8的中文编码是一个汉字用【3】个字节表示,例如汉字“内部”的utf-8编码16进制的显示为e5 86 85 e9 83 a8
很显然,gbk的字节不能直接当作utf-8来使用:两者的字节数和编码规则都不同,必须先得到字符的Unicode码值,再按utf-8规则重新编码(每个汉字由2个字节变为3个字节)。
二、转换的办法
1.首先取得字符串中每个字符的Unicode码值(Java中即char,可通过charAt()或toCharArray()得到),转换成二进制字符串,不足16位时前面补零,共16位。
2.根据UTF-8的汉字编码规则,首字节以1110开头,次字节以10开头,第3字节以10开头。在原始的2进制字符串中插入标志位。最终的长度从16--->16+4+2+2=24。
3.转换完成
通过以下方法将GBK字符转成UTF-8编码格式的byte[]数组
- package test;
- import java.io.UnsupportedEncodingException;
/**
 * Demonstrates hand-written UTF-8 encoding of a Java {@link String}.
 *
 * <p>A Java {@code String} always holds UTF-16 code units, whatever encoding the
 * text originally came from. The "GBK" in the method names is kept only so that
 * existing callers continue to compile.
 */
public class TestEncoder {

    /** UTF-8 charset constant; using it avoids the checked exception of charset-name lookups. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Prints a sample text after an ISO-8859-1 round trip of its UTF-8 bytes,
     * then prints it again after the hand-written UTF-8 conversion.
     *
     * @param args ignored
     * @throws Exception if a charset name is not supported by the runtime
     */
    public static void main(String[] args) throws Exception {
        String gbk = "iteye问答频道编码转换问题";
        // Reinterpreting the UTF-8 bytes as ISO-8859-1 yields mojibake but loses no bytes...
        String iso = new String(gbk.getBytes("UTF-8"), "ISO-8859-1");
        System.out.println(iso);
        // ...so the reverse step restores the original text.
        String utf8 = new String(iso.getBytes("ISO-8859-1"), "UTF-8");
        System.out.println(utf8);
        System.out.println(getUTF8StringFromGBKString(gbk));
    }

    /**
     * Encodes the text to UTF-8 with {@link #getUTF8BytesFromGBKString(String)}
     * and decodes those bytes back into a {@code String}.
     *
     * @param gbkStr the text to convert; must not be {@code null}
     * @return the text decoded from its UTF-8 bytes; unpaired surrogates come back as {@code '?'}
     * @throws NullPointerException if {@code gbkStr} is {@code null}
     */
    public static String getUTF8StringFromGBKString(String gbkStr) {
        return new String(getUTF8BytesFromGBKString(gbkStr), UTF_8);
    }

    /**
     * Encodes the text as UTF-8 by inserting the marker bits by hand.
     *
     * <p>All four UTF-8 sequence lengths are produced:
     * <ul>
     *   <li>U+0000..U+007F: 1 byte ({@code 0xxxxxxx})</li>
     *   <li>U+0080..U+07FF: 2 bytes ({@code 110xxxxx 10xxxxxx})</li>
     *   <li>U+0800..U+FFFF: 3 bytes ({@code 1110xxxx 10xxxxxx 10xxxxxx})</li>
     *   <li>U+10000..U+10FFFF: 4 bytes ({@code 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx})</li>
     * </ul>
     * An unpaired surrogate cannot be represented in UTF-8 and is written as
     * {@code '?'}, the same substitution {@code String.getBytes(UTF_8)} performs.
     *
     * @param gbkStr the text to encode; must not be {@code null}
     * @return a newly allocated array holding exactly the UTF-8 bytes of the text
     * @throws NullPointerException if {@code gbkStr} is {@code null}
     */
    public static byte[] getUTF8BytesFromGBKString(String gbkStr) {
        int n = gbkStr.length();
        // Worst case is 3 bytes per UTF-16 code unit: a surrogate pair spends
        // two code units on 4 bytes, which still fits.
        byte[] utfBytes = new byte[3 * n];
        int k = 0;
        int i = 0;
        while (i < n) {
            int cp = gbkStr.codePointAt(i);
            i += Character.charCount(cp);
            // codePointAt returns the bare surrogate value when it is not part of a valid pair.
            if (cp >= Character.MIN_SURROGATE && cp <= Character.MAX_SURROGATE) {
                cp = '?';
            }
            if (cp < 0x80) {
                utfBytes[k++] = (byte) cp;
            } else if (cp < 0x800) {
                utfBytes[k++] = (byte) (0xc0 | (cp >> 6));
                utfBytes[k++] = (byte) (0x80 | (cp & 0x3f));
            } else if (cp < 0x10000) {
                utfBytes[k++] = (byte) (0xe0 | (cp >> 12));
                utfBytes[k++] = (byte) (0x80 | ((cp >> 6) & 0x3f));
                utfBytes[k++] = (byte) (0x80 | (cp & 0x3f));
            } else {
                utfBytes[k++] = (byte) (0xf0 | (cp >> 18));
                utfBytes[k++] = (byte) (0x80 | ((cp >> 12) & 0x3f));
                utfBytes[k++] = (byte) (0x80 | ((cp >> 6) & 0x3f));
                utfBytes[k++] = (byte) (0x80 | (cp & 0x3f));
            }
        }
        if (k == utfBytes.length) {
            return utfBytes;
        }
        // Trim the unused tail so the result holds only the encoded bytes.
        byte[] trimmed = new byte[k];
        System.arraycopy(utfBytes, 0, trimmed, 0, k);
        return trimmed;
    }
}
或者:
- public static void gbk2Utf() throws UnsupportedEncodingException {
- String gbk = "我来了";
- char[] c = gbk.toCharArray();
- byte[] fullByte = new byte[3*c.length];
- for (int i=0; i<c.length; i++) {
- String binary = Integer.toBinaryString(c[i]);
- StringBuffer sb = new StringBuffer();
- int len = 16 - binary.length();
- //前面补零
- for(int j=0; j<len; j++){
- sb.append("0");
- }
- sb.append(binary);
- //增加位,达到到24位3个字节
- sb.insert(0, "1110");
- sb.insert(8, "10");
- sb.insert(16, "10");
- fullByte[i*3] = Integer.valueOf(sb.substring(0, 8), 2).byteValue();//二进制字符串创建整型
- fullByte[i*3+1] = Integer.valueOf(sb.substring(8, 16), 2).byteValue();
- fullByte[i*3+2] = Integer.valueOf(sb.substring(16, 24), 2).byteValue();
- }
- //模拟UTF-8编码的网站显示
- System.out.println(new String(fullByte,"UTF-8"));
- }