和荣笔记 -- 从 Unicode 到 GB2312 转换表制作程式
在我发表了 GB2312 到 Unicode 的转换表以后,收到了读者信件,寻求 Unicode 到 GB2312 的转换表。
下面的程式便可以用来制作这样的转换表。程式的输出结果收入下一章之中。
1/**
2* UnicodeGB2312.java
3* Copyright (c) 1997-2003 by Dr. Herong Yang
4*/
5import java.io.*;
6import java.nio.*;
7import java.nio.charset.*;
/**
 * Generates a Unicode-to-GB2312 mapping table and writes it to the file
 * "unicode_gb2312.gb".
 *
 * <p>Every char value from 0x0000 to 0xFFFF is encoded with the JDK "GBK"
 * encoder; results accepted by {@link #validGB(ByteBuffer)} are written as
 * table entries, five entries per line, wrapped in a pre-formatted block.
 */
class UnicodeGB2312 {
    /** Destination of all write helpers; assigned by {@link #main(String[])}. */
    static OutputStream out = null;

    /** Upper-case hexadecimal digits indexed by nibble value. */
    static char hexDigit[] = {'0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /**
     * Inclusive lower bounds of the rejected code ranges, expressed as
     * row*100+column. Paired index-by-index with {@link #e_out}.
     * NOTE(review): presumably the unassigned positions of GB2312 - confirm
     * against the standard.
     */
    static int b_out[] = {201, 267, 279, 293, 484, 587, 625, 657, 734, 782, 827,
        874, 901, 980, 1001, 5590, 8801};

    /** Inclusive upper bounds of the rejected code ranges; see {@link #b_out}. */
    static int e_out[] = {216, 268, 280, 294, 494, 594, 632, 694, 748, 794, 836,
        894, 903, 994, 1594, 5594, 9494};

    /**
     * Creates the output file, writes the table, and closes the file.
     * Any IOException is reported on standard output.
     *
     * @param a command-line arguments (unused)
     */
    public static void main(String[] a) {
        try {
            // Buffered: the helpers below write one byte at a time.
            out = new BufferedOutputStream(
                new FileOutputStream("unicode_gb2312.gb"));
            try {
                writeCode();
            } finally {
                // Close (and flush) even when writeCode() fails, so the file
                // handle is never leaked.
                out.close();
            }
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Writes the header, one entry per accepted character, and the footer to
     * {@link #out}, then prints the number of entries on standard output.
     *
     * <p>Each entry is: the char value as four hex digits, a space, the
     * encoded bytes in hex, a space, and the two encoded bytes themselves.
     * Entries are separated by a space, with a line break after every fifth.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeCode() throws IOException {
        CharsetEncoder gbec = Charset.forName("GBK").newEncoder();
        char[] ca = new char[1];
        CharBuffer cb = null;
        ByteBuffer gbbb = null;
        writeHeader();
        int count = 0;
        for (int i = 0; i < 0x010000; i++) {
            ca[0] = (char) i;
            cb = CharBuffer.wrap(ca);
            try {
                gbbb = gbec.encode(cb);
            } catch (CharacterCodingException e) {
                // The encoder rejected this char; validGB(null) skips it.
                gbbb = null;
            }
            if (validGB(gbbb)) {
                count++;
                writeHex((byte) (ca[0] >>> 8));
                writeHex((byte) (ca[0] & 0xff));
                writeString(" ");
                writeByteBuffer(gbbb, 2);
                writeString(" ");
                writeByte(gbbb.get(0));
                writeByte(gbbb.get(1));
                if (count % 5 == 0) {
                    writeln();
                } else {
                    writeString(" ");
                }
            }
        }
        // Terminate a partially filled last row.
        if (count % 5 != 0) {
            writeln();
        }
        writeFooter();
        System.out.println("Number of GB characters wrote: " + count);
    }

    /**
     * Tells whether an encoder result should appear in the table.
     *
     * <p>The buffer must have a limit of exactly 2, and both bytes minus 0xA0
     * must lie in 1..94. The combined value row*100+column must also fall
     * outside every range listed in {@link #b_out} / {@link #e_out}.
     *
     * @param gbbb the encoded bytes, or null when encoding failed
     * @return true if the bytes pass all checks, false otherwise
     */
    public static boolean validGB(ByteBuffer gbbb) {
        if (gbbb == null) {
            return false;
        }
        if (gbbb.limit() != 2) {
            return false;
        }
        int hi = gbbb.get(0) & 0xFF;
        int lo = gbbb.get(1) & 0xFF;
        if (hi < 0xA0 || lo < 0xA0) {
            return false;
        }
        int i = hi - 0xA0;
        int j = lo - 0xA0;
        if (i < 1 || i > 94) {
            return false;
        }
        if (j < 1 || j > 94) {
            return false;
        }
        // Computed once instead of on every loop iteration.
        int code = i * 100 + j;
        for (int l = 0; l < b_out.length; l++) {
            if (code >= b_out[l] && code <= e_out[l]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes the opening tag and the column-title row (five title groups).
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHeader() throws IOException {
        writeString("<pre>");
        writeln();
        // Four title groups followed by a separator, then a final group.
        for (int k = 0; k < 4; k++) {
            writeString("Uni. GB ");
            writeGBSpace();
            writeString(" ");
        }
        writeString("Uni. GB ");
        writeGBSpace();
        writeln();
        writeln();
    }

    /**
     * Writes the closing tag followed by a line break.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeFooter() throws IOException {
        writeString("</pre>");
        writeln();
    }

    /**
     * Writes a CR LF pair.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeln() throws IOException {
        out.write(0x0D);
        out.write(0x0A);
    }

    /**
     * Writes the two bytes 0xA1 0xA1.
     * NOTE(review): presumably the GB2312 full-width space, used so the
     * title column is as wide as one GB character - confirm.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeGBSpace() throws IOException {
        out.write(0xA1);
        out.write(0xA1);
    }

    /**
     * Writes the bytes of a buffer as hex digits, or "null" for a null
     * buffer, then one space for each byte by which the buffer is shorter
     * than {@code l}.
     * NOTE(review): the padding is one space per missing byte although each
     * byte prints as two hex digits; left unchanged because every caller in
     * this class passes a 2-byte buffer with l == 2.
     *
     * @param b the buffer to print, may be null
     * @param l the number of bytes the column is padded to
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByteBuffer(ByteBuffer b, int l)
            throws IOException {
        int i = 0;
        if (b == null) {
            writeString("null");
            // "null" is counted as two bytes' worth of output.
            i = 2;
        } else {
            for (i = 0; i < b.limit(); i++) {
                writeHex(b.get(i));
            }
        }
        for (int j = i; j < l; j++) {
            writeString(" ");
        }
    }

    /**
     * Writes the low eight bits of each char of a string; null writes nothing.
     *
     * @param s the text to write, may be null
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeString(String s) throws IOException {
        if (s == null) {
            return;
        }
        for (int i = 0; i < s.length(); i++) {
            out.write((int) (s.charAt(i) & 0xFF));
        }
    }

    /**
     * Writes one byte as two upper-case hex digits, high nibble first.
     *
     * @param b the byte to print
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHex(byte b) throws IOException {
        out.write((int) hexDigit[(b >> 4) & 0x0F]);
        out.write((int) hexDigit[b & 0x0F]);
    }

    /**
     * Writes one raw byte.
     *
     * @param b the byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByte(byte b) throws IOException {
        out.write(b & 0xFF);
    }
}
133
上面的程式发表后,又有读者来信要求对程式加以说明,以便理解。其实这个程式的逻辑很简单,阅读时仅需注意以下几点:
一,Unicode 基本多文种平面(BMP)的全体编码都在 0x0000 和 0xFFFF 之间,而 GB2312 的字符全部落在这个范围内,所以子程式 writeCode() 使用了一个循环语句,以变量 i 走遍了这一范围内的全体可能编码。
二,把单个 Unicode 编码转换成 GB2312 编码的关键语句是:gbec.encode(cb),它使用了 JDK 中 CharsetEncoder 的中文编码功能。注意,GBK 是由 GB2312 扩张而成。JDK 只提供 GBK 编码功能。
三,由于 Unicode 字符集比 GB2312 大,gbec.encode(cb) 输出的编码有许多是坏码,或者是 GBK 的扩张码,所以要用子程式 validGB() 进行验证。
四,程式的其它部分主要是用于输出的列表制作。
我来自:向东博客