Punycode与中文互转
Punycode是一个根据RFC 3492标准而制定的编码系统,主要用于把域名从地方语言所采用的Unicode编码转换成为可用于DNS系统的编码
“中文域名”不被标准的解析服务器支持,需转化为Punycode码进行解析,例如“百度.中国”的转码为: xn--wxTr44c.xn--fiqs8S
目前,因为操作系统的核心都是英文组成,DNS服务器的解析也是由英文代码交换,所以DNS服务器上并不支持直接的中文域名解析,所有中文域名的解析都需要转成Punycode码,然后由DNS解析Punycode码
其实目前所说的各种浏览器完美支持中文域名,只是浏览器中主动加入了中文域名自动转码,不需要再次安装中文域名转码控件来完成整个流程
如在浏览器中输入"北京大学.com”,然后通过wireshark抓包
GET http://xn--1lq90ic7fzpc.com/ HTTP/1.1
将原作者的.Net版本修改为了Java版本,这里感谢下原作者
public class CharsetTool { static int TMIN = 1; static int TMAX = 26; static int BASE = 36; static int INITIAL_N = 128; static int INITIAL_BIAS = 72; static int DAMP = 700; static int SKEW = 38; static char DELIMITER = '-'; /** * Punycodes a unicode string. * * @param input * Unicode string. * * @return Punycoded string. */ public static String encode(String input) throws Exception { int n = INITIAL_N; int delta = 0; int bias = INITIAL_BIAS; StringBuilder output = new StringBuilder(); // Copy all basic code points to the output int b = 0; for (int i = 0; i < input.length(); i++) { char c = input.charAt(i); if (isBasic(c)) { output.append(c); b++; } } // Append delimiter if (b > 0) { output.append(DELIMITER); } int h = b; while (h < input.length()) { int m = Integer.MAX_VALUE; // Find the minimum code point >= n for (int i = 0; i < input.length(); i++) { int c = input.charAt(i); if (c >= n && c < m) { m = c; } } if (m - n > (Integer.MAX_VALUE - delta) / (h + 1)) { throw new Exception("OVERFLOW"); } delta = delta + (m - n) * (h + 1); n = m; for (int j = 0; j < input.length(); j++) { int c = input.charAt(j); if (c < n) { delta++; if (0 == delta) { throw new Exception("OVERFLOW"); } } if (c == n) { int q = delta; for (int k = BASE;; k += BASE) { int t; if (k <= bias) { t = TMIN; } else if (k >= bias + TMAX) { t = TMAX; } else { t = k - bias; } if (q < t) { break; } output.append((char) digit2codepoint(t + (q - t) % (BASE - t))); q = (q - t) / (BASE - t); } output.append((char) digit2codepoint(q)); bias = adapt(delta, h + 1, h == b); delta = 0; h++; } } delta++; n++; } return output.toString(); } /** * Decode a punycoded string. * * @param input * Punycode string * * @return Unicode string. */ public static String decode(String input) throws Exception { int n = INITIAL_N; int i = 0; int bias = INITIAL_BIAS; StringBuilder output = new StringBuilder(); int d = input.lastIndexOf(DELIMITER); if (d > 0) { for (int j = 0; j < d; j++) { char c = input.charAt(j); if (!isBasic(c)) { throw new Exception("BAD_INPUT"); } output.append(c); } d++; } else { d = 0; } while (d < input.length()) { int oldi = i; int w = 1; for (int k = BASE;; k += BASE) { if (d == input.length()) { throw new Exception("BAD_INPUT"); } int c = input.charAt(d++); int digit = codepoint2digit(c); if (digit > (Integer.MAX_VALUE - i) / w) { throw new Exception("OVERFLOW"); } i = i + digit * w; int t; if (k <= bias) { t = TMIN; } else if (k >= bias + TMAX) { t = TMAX; } else { t = k - bias; } if (digit < t) { break; } w = w * (BASE - t); } bias = adapt(i - oldi, output.length() + 1, oldi == 0); if (i / (output.length() + 1) > Integer.MAX_VALUE - n) { throw new Exception("OVERFLOW"); } n = n + i / (output.length() + 1); i = i % (output.length() + 1); output.insert(i, (char) n); i++; } return output.toString(); } public static int adapt(int delta, int numpoints, boolean first) { if (first) { delta = delta / DAMP; } else { delta = delta / 2; } delta = delta + (delta / numpoints); int k = 0; while (delta > ((BASE - TMIN) * TMAX) / 2) { delta = delta / (BASE - TMIN); k = k + BASE; } return k + ((BASE - TMIN + 1) * delta) / (delta + SKEW); } public static boolean isBasic(char c) { return c < 0x80; } public static int digit2codepoint(int d) throws Exception { if (d < 26) { // 0..25 : 'a'..'z' return d + 'a'; } else if (d < 36) { // 26..35 : '0'..'9'; return d - 26 + '0'; } else { throw new Exception("BAD_INPUT"); } } public static int codepoint2digit(int c) throws Exception { if (c - '0' < 10) { // '0'..'9' : 26..35 return c - '0' + 26; } else if (c - 'a' < 26) { // 'a'..'z' : 0..25 return c - 'a'; } else { throw new Exception("BAD_INPUT"); } } public static void main(String[] args) throws Exception { String strPunycode ="xn--"+ CharsetTool.encode("北京大学"); System.out.println(strPunycode); String strChinese = CharsetTool.decode("1lq90ic7fzpc"); System.out.println(strChinese); } }
运行结果为:
xn--1lq90ic7fzpc 北京大学
可以使用在线Punycode转码工具验证转码的正确性
遗憾的是并不支持“百度.中国”这种带有特殊字符“.”以及“中123国”这种混杂有数字的字符串转码