java字符集(二)
了解了Java的字符集编码之后,那么Java是怎样把内存中的字节转换成所需字符的呢?其实很简单:
Java是通过StringCoding类来完成字节与字符之间转换的,它是JDK内部使用的一个类,现将其源代码拷贝如下:
Code
/**
 * Static helpers that convert between byte arrays and char arrays for a
 * named charset.
 *
 * <p>Each thread caches its most recently used decoder and encoder behind a
 * soft reference. A coder is backed by a {@code java.nio.charset.Charset}
 * when {@code lookupCharset} finds one, and by the legacy
 * {@code ByteToCharConverter} / {@code CharToByteConverter} otherwise.
 */
public class StringCoding {

    // Static utility holder: never instantiated.
    private StringCoding() {
    }

    /*
     * The cached coders for each thread
     */
    private static ThreadLocal decoder = new ThreadLocal();
    private static ThreadLocal encoder = new ThreadLocal();

    // True until the "default charset not supported" warning has been printed.
    private static boolean warnUnsupportedCharset = true;

    /**
     * Returns the object held by the soft reference stored in the given
     * thread-local, or {@code null} when nothing is stored or the referent
     * has been cleared.
     */
    private static Object deref(ThreadLocal tl) {
        SoftReference sr = (SoftReference) tl.get();
        if (sr == null)
            return null;
        return sr.get();
    }

    /** Stores {@code ob} in the thread-local, wrapped in a soft reference. */
    private static void set(ThreadLocal tl, Object ob) {
        tl.set(new SoftReference(ob));
    }

    // Trim the given byte array to the given length
    //
    private static byte[] trim(byte[] ba, int len) {
        if (len == ba.length)
            return ba;
        byte[] tba = new byte[len];
        System.arraycopy(ba, 0, tba, 0, len);
        return tba;
    }

    // Trim the given char array to the given length
    //
    private static char[] trim(char[] ca, int len) {
        if (len == ca.length)
            return ca;
        char[] tca = new char[len];
        System.arraycopy(ca, 0, tca, 0, len);
        return tca;
    }

    /** Scales an input length by a coder's expansion factor to size the output buffer. */
    private static int scale(int len, float expansionFactor) {
        // We need to perform double, not float, arithmetic; otherwise
        // we lose low order bits when len is larger than 2**24.
        return (int) (len * (double) expansionFactor);
    }

    /**
     * Returns the charset registered under {@code csn}, or {@code null} when
     * {@code Charset.isSupported} reports it as unsupported.
     */
    private static Charset lookupCharset(String csn) {
        if (Charset.isSupported(csn)) {
            try {
                return Charset.forName(csn);
            } catch (UnsupportedCharsetException x) {
                // isSupported() just returned true, so this is unexpected.
                throw new Error(x);
            }
        }
        return null;
    }

    /** Prints the unsupported-default-charset warning; only the first call prints. */
    private static void warnUnsupportedCharset(String csn) {
        if (warnUnsupportedCharset) {
            // Use sun.misc.MessageUtils rather than the Logging API or
            // System.err since this method may be called during VM
            // initialization before either is available.
            MessageUtils.err("WARNING: Default charset " + csn
                    + " not supported, using ISO-8859-1 instead");
            warnUnsupportedCharset = false;
        }
    }

    // -- Decoding --

    // Encapsulates either a ByteToCharConverter or a CharsetDecoder
    //
    private static abstract class StringDecoder {
        // The name the caller asked for; used as a cache key alongside charsetName().
        private final String requestedCharsetName;

        protected StringDecoder(String requestedCharsetName) {
            this.requestedCharsetName = requestedCharsetName;
        }

        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        abstract String charsetName();

        abstract char[] decode(byte[] ba, int off, int len);
    }

    // A string decoder based upon a ByteToCharConverter
    //
    private static class ConverterSD extends StringDecoder {
        private final ByteToCharConverter btc;

        private ConverterSD(ByteToCharConverter btc, String rcn) {
            super(rcn);
            this.btc = btc;
        }

        String charsetName() {
            return btc.getCharacterEncoding();
        }

        char[] decode(byte[] ba, int off, int len) {
            int en = scale(len, btc.getMaxCharsPerByte());
            char[] ca = new char[en];
            if (len == 0)
                return ca;
            btc.reset();
            int n = 0;
            try {
                n = btc.convert(ba, off, off + len, ca, 0, en);
                n += btc.flush(ca, btc.nextCharIndex(), en);
            } catch (CharConversionException x) {
                // Yes, this is what we've always done:
                // keep whatever was converted before the failure.
                n = btc.nextCharIndex();
            }
            return trim(ca, n);
        }
    }

    // A string decoder based upon a CharsetDecoder
    //
    private static class CharsetSD extends StringDecoder {
        private final Charset cs;
        private final CharsetDecoder cd;

        private CharsetSD(Charset cs, String rcn) {
            super(rcn);
            this.cs = cs;
            // Malformed or unmappable input is replaced, never reported.
            this.cd = cs.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
        }

        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset) cs).historicalName();
            return cs.name();
        }

        char[] decode(byte[] ba, int off, int len) {
            int en = scale(len, cd.maxCharsPerByte());
            char[] ca = new char[en];
            if (len == 0)
                return ca;
            cd.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
            CharBuffer cb = CharBuffer.wrap(ca);
            try {
                CoderResult cr = cd.decode(bb, cb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = cd.flush(cb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return trim(ca, cb.position());
        }
    }

    /**
     * Decodes {@code len} bytes of {@code ba}, starting at {@code off}, with
     * the named charset.
     *
     * @param charsetName charset name; {@code null} is treated as "ISO-8859-1"
     * @param ba          bytes to decode
     * @param off         index of the first byte to decode
     * @param len         number of bytes to decode
     * @return the decoded characters, trimmed to the number actually produced
     * @throws UnsupportedEncodingException declared for the legacy converter
     *         lookup; presumably raised there when the name is unknown
     */
    static char[] decode(String charsetName, byte[] ba, int off, int len)
            throws UnsupportedEncodingException {
        StringDecoder sd = (StringDecoder) deref(decoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        // Reuse the cached decoder only if it matches the requested name.
        if ((sd == null)
                || !(csn.equals(sd.requestedCharsetName())
                        || csn.equals(sd.charsetName()))) {
            sd = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    sd = new CharsetSD(cs, csn);
            } catch (IllegalCharsetNameException x) {
                // FALL THROUGH to ByteToCharConverter, for compatibility
            }
            if (sd == null)
                sd = new ConverterSD(ByteToCharConverter.getConverter(csn), csn);
            set(decoder, sd);
        }
        return sd.decode(ba, off, len);
    }

    /**
     * Decodes bytes with the default encoding reported by
     * {@code Converters.getDefaultEncodingName()}, falling back to ISO-8859-1
     * (after a one-time warning) when that encoding is unsupported.
     */
    static char[] decode(byte[] ba, int off, int len) {
        String csn = Converters.getDefaultEncodingName();
        try {
            return decode(csn, ba, off, len);
        } catch (UnsupportedEncodingException x) {
            Converters.resetDefaultEncodingName();
            warnUnsupportedCharset(csn);
        }
        try {
            return decode("ISO-8859-1", ba, off, len);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                    + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }

    // -- Encoding --

    // Encapsulates either a CharToByteConverter or a CharsetEncoder
    //
    private static abstract class StringEncoder {
        // The name the caller asked for; used as a cache key alongside charsetName().
        private final String requestedCharsetName;

        protected StringEncoder(String requestedCharsetName) {
            this.requestedCharsetName = requestedCharsetName;
        }

        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        abstract String charsetName();

        abstract byte[] encode(char[] cs, int off, int len);
    }

    // A string encoder based upon a CharToByteConverter
    //
    private static class ConverterSE extends StringEncoder {
        private final CharToByteConverter ctb;

        private ConverterSE(CharToByteConverter ctb, String rcn) {
            super(rcn);
            this.ctb = ctb;
        }

        String charsetName() {
            return ctb.getCharacterEncoding();
        }

        byte[] encode(char[] ca, int off, int len) {
            int en = scale(len, ctb.getMaxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            ctb.reset();
            int n;
            try {
                n = ctb.convertAny(ca, off, (off + len), ba, 0, en);
                n += ctb.flushAny(ba, ctb.nextByteIndex(), en);
            } catch (CharConversionException x) {
                throw new Error("Converter malfunction: "
                        + ctb.getClass().getName(), x);
            }
            return trim(ba, n);
        }
    }

    // A string encoder based upon a CharsetEncoder
    //
    private static class CharsetSE extends StringEncoder {
        private final Charset cs;
        private final CharsetEncoder ce;

        private CharsetSE(Charset cs, String rcn) {
            super(rcn);
            this.cs = cs;
            // Malformed or unmappable input is replaced, never reported.
            this.ce = cs.newEncoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
        }

        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset) cs).historicalName();
            return cs.name();
        }

        byte[] encode(char[] ca, int off, int len) {
            int en = scale(len, ce.maxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            ce.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba);
            CharBuffer cb = CharBuffer.wrap(ca, off, len);
            try {
                CoderResult cr = ce.encode(cb, bb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = ce.flush(bb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return trim(ba, bb.position());
        }
    }

    /**
     * Encodes {@code len} chars of {@code ca}, starting at {@code off}, with
     * the named charset.
     *
     * @param charsetName charset name; {@code null} is treated as "ISO-8859-1"
     * @param ca          characters to encode
     * @param off         index of the first char to encode
     * @param len         number of chars to encode
     * @return the encoded bytes, trimmed to the number actually produced
     * @throws UnsupportedEncodingException declared for the legacy converter
     *         lookup; presumably raised there when the name is unknown
     */
    static byte[] encode(String charsetName, char[] ca, int off, int len)
            throws UnsupportedEncodingException {
        StringEncoder se = (StringEncoder) deref(encoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        // Reuse the cached encoder only if it matches the requested name.
        if ((se == null)
                || !(csn.equals(se.requestedCharsetName())
                        || csn.equals(se.charsetName()))) {
            se = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    se = new CharsetSE(cs, csn);
            } catch (IllegalCharsetNameException x) {
                // FALL THROUGH to CharToByteConverter, for compatibility
            }
            if (se == null)
                se = new ConverterSE(CharToByteConverter.getConverter(csn), csn);
            set(encoder, se);
        }
        return se.encode(ca, off, len);
    }

    /**
     * Encodes chars with the default encoding reported by
     * {@code Converters.getDefaultEncodingName()}, falling back to ISO-8859-1
     * (after a one-time warning) when that encoding is unsupported.
     */
    static byte[] encode(char[] ca, int off, int len) {
        String csn = Converters.getDefaultEncodingName();
        try {
            return encode(csn, ca, off, len);
        } catch (UnsupportedEncodingException x) {
            Converters.resetDefaultEncodingName();
            warnUnsupportedCharset(csn);
        }
        try {
            return encode("ISO-8859-1", ca, off, len);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                    + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }
}
/**
 * Static helpers that convert between byte arrays and char arrays for a
 * named charset.
 *
 * <p>Each thread caches its most recently used decoder and encoder behind a
 * soft reference. A coder is backed by a {@code java.nio.charset.Charset}
 * when {@code lookupCharset} finds one, and by the legacy
 * {@code ByteToCharConverter} / {@code CharToByteConverter} otherwise.
 */
public class StringCoding {

    // Static utility holder: never instantiated.
    private StringCoding() {
    }

    /*
     * The cached coders for each thread
     */
    private static ThreadLocal decoder = new ThreadLocal();
    private static ThreadLocal encoder = new ThreadLocal();

    // True until the "default charset not supported" warning has been printed.
    private static boolean warnUnsupportedCharset = true;

    /**
     * Returns the object held by the soft reference stored in the given
     * thread-local, or {@code null} when nothing is stored or the referent
     * has been cleared.
     */
    private static Object deref(ThreadLocal tl) {
        SoftReference sr = (SoftReference) tl.get();
        if (sr == null)
            return null;
        return sr.get();
    }

    /** Stores {@code ob} in the thread-local, wrapped in a soft reference. */
    private static void set(ThreadLocal tl, Object ob) {
        tl.set(new SoftReference(ob));
    }

    // Trim the given byte array to the given length
    //
    private static byte[] trim(byte[] ba, int len) {
        if (len == ba.length)
            return ba;
        byte[] tba = new byte[len];
        System.arraycopy(ba, 0, tba, 0, len);
        return tba;
    }

    // Trim the given char array to the given length
    //
    private static char[] trim(char[] ca, int len) {
        if (len == ca.length)
            return ca;
        char[] tca = new char[len];
        System.arraycopy(ca, 0, tca, 0, len);
        return tca;
    }

    /** Scales an input length by a coder's expansion factor to size the output buffer. */
    private static int scale(int len, float expansionFactor) {
        // We need to perform double, not float, arithmetic; otherwise
        // we lose low order bits when len is larger than 2**24.
        return (int) (len * (double) expansionFactor);
    }

    /**
     * Returns the charset registered under {@code csn}, or {@code null} when
     * {@code Charset.isSupported} reports it as unsupported.
     */
    private static Charset lookupCharset(String csn) {
        if (Charset.isSupported(csn)) {
            try {
                return Charset.forName(csn);
            } catch (UnsupportedCharsetException x) {
                // isSupported() just returned true, so this is unexpected.
                throw new Error(x);
            }
        }
        return null;
    }

    /** Prints the unsupported-default-charset warning; only the first call prints. */
    private static void warnUnsupportedCharset(String csn) {
        if (warnUnsupportedCharset) {
            // Use sun.misc.MessageUtils rather than the Logging API or
            // System.err since this method may be called during VM
            // initialization before either is available.
            MessageUtils.err("WARNING: Default charset " + csn
                    + " not supported, using ISO-8859-1 instead");
            warnUnsupportedCharset = false;
        }
    }

    // -- Decoding --

    // Encapsulates either a ByteToCharConverter or a CharsetDecoder
    //
    private static abstract class StringDecoder {
        // The name the caller asked for; used as a cache key alongside charsetName().
        private final String requestedCharsetName;

        protected StringDecoder(String requestedCharsetName) {
            this.requestedCharsetName = requestedCharsetName;
        }

        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        abstract String charsetName();

        abstract char[] decode(byte[] ba, int off, int len);
    }

    // A string decoder based upon a ByteToCharConverter
    //
    private static class ConverterSD extends StringDecoder {
        private final ByteToCharConverter btc;

        private ConverterSD(ByteToCharConverter btc, String rcn) {
            super(rcn);
            this.btc = btc;
        }

        String charsetName() {
            return btc.getCharacterEncoding();
        }

        char[] decode(byte[] ba, int off, int len) {
            int en = scale(len, btc.getMaxCharsPerByte());
            char[] ca = new char[en];
            if (len == 0)
                return ca;
            btc.reset();
            int n = 0;
            try {
                n = btc.convert(ba, off, off + len, ca, 0, en);
                n += btc.flush(ca, btc.nextCharIndex(), en);
            } catch (CharConversionException x) {
                // Yes, this is what we've always done:
                // keep whatever was converted before the failure.
                n = btc.nextCharIndex();
            }
            return trim(ca, n);
        }
    }

    // A string decoder based upon a CharsetDecoder
    //
    private static class CharsetSD extends StringDecoder {
        private final Charset cs;
        private final CharsetDecoder cd;

        private CharsetSD(Charset cs, String rcn) {
            super(rcn);
            this.cs = cs;
            // Malformed or unmappable input is replaced, never reported.
            this.cd = cs.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
        }

        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset) cs).historicalName();
            return cs.name();
        }

        char[] decode(byte[] ba, int off, int len) {
            int en = scale(len, cd.maxCharsPerByte());
            char[] ca = new char[en];
            if (len == 0)
                return ca;
            cd.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
            CharBuffer cb = CharBuffer.wrap(ca);
            try {
                CoderResult cr = cd.decode(bb, cb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = cd.flush(cb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return trim(ca, cb.position());
        }
    }

    /**
     * Decodes {@code len} bytes of {@code ba}, starting at {@code off}, with
     * the named charset.
     *
     * @param charsetName charset name; {@code null} is treated as "ISO-8859-1"
     * @param ba          bytes to decode
     * @param off         index of the first byte to decode
     * @param len         number of bytes to decode
     * @return the decoded characters, trimmed to the number actually produced
     * @throws UnsupportedEncodingException declared for the legacy converter
     *         lookup; presumably raised there when the name is unknown
     */
    static char[] decode(String charsetName, byte[] ba, int off, int len)
            throws UnsupportedEncodingException {
        StringDecoder sd = (StringDecoder) deref(decoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        // Reuse the cached decoder only if it matches the requested name.
        if ((sd == null)
                || !(csn.equals(sd.requestedCharsetName())
                        || csn.equals(sd.charsetName()))) {
            sd = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    sd = new CharsetSD(cs, csn);
            } catch (IllegalCharsetNameException x) {
                // FALL THROUGH to ByteToCharConverter, for compatibility
            }
            if (sd == null)
                sd = new ConverterSD(ByteToCharConverter.getConverter(csn), csn);
            set(decoder, sd);
        }
        return sd.decode(ba, off, len);
    }

    /**
     * Decodes bytes with the default encoding reported by
     * {@code Converters.getDefaultEncodingName()}, falling back to ISO-8859-1
     * (after a one-time warning) when that encoding is unsupported.
     */
    static char[] decode(byte[] ba, int off, int len) {
        String csn = Converters.getDefaultEncodingName();
        try {
            return decode(csn, ba, off, len);
        } catch (UnsupportedEncodingException x) {
            Converters.resetDefaultEncodingName();
            warnUnsupportedCharset(csn);
        }
        try {
            return decode("ISO-8859-1", ba, off, len);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                    + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }

    // -- Encoding --

    // Encapsulates either a CharToByteConverter or a CharsetEncoder
    //
    private static abstract class StringEncoder {
        // The name the caller asked for; used as a cache key alongside charsetName().
        private final String requestedCharsetName;

        protected StringEncoder(String requestedCharsetName) {
            this.requestedCharsetName = requestedCharsetName;
        }

        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        abstract String charsetName();

        abstract byte[] encode(char[] cs, int off, int len);
    }

    // A string encoder based upon a CharToByteConverter
    //
    private static class ConverterSE extends StringEncoder {
        private final CharToByteConverter ctb;

        private ConverterSE(CharToByteConverter ctb, String rcn) {
            super(rcn);
            this.ctb = ctb;
        }

        String charsetName() {
            return ctb.getCharacterEncoding();
        }

        byte[] encode(char[] ca, int off, int len) {
            int en = scale(len, ctb.getMaxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            ctb.reset();
            int n;
            try {
                n = ctb.convertAny(ca, off, (off + len), ba, 0, en);
                n += ctb.flushAny(ba, ctb.nextByteIndex(), en);
            } catch (CharConversionException x) {
                throw new Error("Converter malfunction: "
                        + ctb.getClass().getName(), x);
            }
            return trim(ba, n);
        }
    }

    // A string encoder based upon a CharsetEncoder
    //
    private static class CharsetSE extends StringEncoder {
        private final Charset cs;
        private final CharsetEncoder ce;

        private CharsetSE(Charset cs, String rcn) {
            super(rcn);
            this.cs = cs;
            // Malformed or unmappable input is replaced, never reported.
            this.ce = cs.newEncoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
        }

        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset) cs).historicalName();
            return cs.name();
        }

        byte[] encode(char[] ca, int off, int len) {
            int en = scale(len, ce.maxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            ce.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba);
            CharBuffer cb = CharBuffer.wrap(ca, off, len);
            try {
                CoderResult cr = ce.encode(cb, bb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = ce.flush(bb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return trim(ba, bb.position());
        }
    }

    /**
     * Encodes {@code len} chars of {@code ca}, starting at {@code off}, with
     * the named charset.
     *
     * @param charsetName charset name; {@code null} is treated as "ISO-8859-1"
     * @param ca          characters to encode
     * @param off         index of the first char to encode
     * @param len         number of chars to encode
     * @return the encoded bytes, trimmed to the number actually produced
     * @throws UnsupportedEncodingException declared for the legacy converter
     *         lookup; presumably raised there when the name is unknown
     */
    static byte[] encode(String charsetName, char[] ca, int off, int len)
            throws UnsupportedEncodingException {
        StringEncoder se = (StringEncoder) deref(encoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        // Reuse the cached encoder only if it matches the requested name.
        if ((se == null)
                || !(csn.equals(se.requestedCharsetName())
                        || csn.equals(se.charsetName()))) {
            se = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    se = new CharsetSE(cs, csn);
            } catch (IllegalCharsetNameException x) {
                // FALL THROUGH to CharToByteConverter, for compatibility
            }
            if (se == null)
                se = new ConverterSE(CharToByteConverter.getConverter(csn), csn);
            set(encoder, se);
        }
        return se.encode(ca, off, len);
    }

    /**
     * Encodes chars with the default encoding reported by
     * {@code Converters.getDefaultEncodingName()}, falling back to ISO-8859-1
     * (after a one-time warning) when that encoding is unsupported.
     */
    static byte[] encode(char[] ca, int off, int len) {
        String csn = Converters.getDefaultEncodingName();
        try {
            return encode(csn, ca, off, len);
        } catch (UnsupportedEncodingException x) {
            Converters.resetDefaultEncodingName();
            warnUnsupportedCharset(csn);
        }
        try {
            return encode("ISO-8859-1", ca, off, len);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                    + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }
}
好,测试一下:
Code
/**
 * Demo that encodes the two characters '你' and '好' with the default
 * charset, GBK and UTF-8, printing every resulting byte as eight binary
 * digits, and then decodes one fixed four-byte sequence as GBK and as UTF-8.
 */
public class testCode {

    /**
     * Formats one byte as exactly eight binary digits.
     *
     * @param b the byte to format
     * @return an 8-character string of '0' and '1', most significant bit first
     */
    private static String toBits(byte b) {
        // Mask to 0..255 and set bit 8 so toBinaryString always yields nine
        // digits; dropping the leading '1' leaves the eight zero-padded bits.
        // (Taking substring(24, 32) of the sign-extended value only works for
        // negative bytes and throws for any byte in 0..127.)
        return Integer.toBinaryString((b & 0xFF) | 0x100).substring(1);
    }

    /** Prints each byte of {@code bytes} on its own line as eight binary digits. */
    private static void printBits(byte[] bytes) {
        for (int i = 0; i < bytes.length; i++) {
            System.out.println(toBits(bytes[i]));
        }
    }

    public static void main(String args[]) throws UnsupportedEncodingException {
        char[] a = { '你', '好' };

        System.out.println("default:");
        printBits(StringCoding.encode(a, 0, a.length));

        System.out.println("GBK:");
        printBits(StringCoding.encode("GBK", a, 0, a.length));

        System.out.println("UTF-8:");
        printBits(StringCoding.encode("utf-8", a, 0, a.length));

        // The same four bytes, interpreted under two different charsets.
        byte[] code = new byte[4];
        code[0] = (byte) Integer.parseInt("11000100", 2);
        code[1] = (byte) Integer.parseInt("11100011", 2);
        code[2] = (byte) Integer.parseInt("10111010", 2);
        code[3] = (byte) Integer.parseInt("11000011", 2);
        System.out.println(StringCoding.decode("GBK", code, 0, code.length));
        System.out.println(StringCoding.decode("utf-8", code, 0, code.length));
    }
}
/**
 * Demo that encodes the two characters '你' and '好' with the default
 * charset, GBK and UTF-8, printing every resulting byte as eight binary
 * digits, and then decodes one fixed four-byte sequence as GBK and as UTF-8.
 */
public class testCode {

    /**
     * Formats one byte as exactly eight binary digits.
     *
     * @param b the byte to format
     * @return an 8-character string of '0' and '1', most significant bit first
     */
    private static String toBits(byte b) {
        // Mask to 0..255 and set bit 8 so toBinaryString always yields nine
        // digits; dropping the leading '1' leaves the eight zero-padded bits.
        // (Taking substring(24, 32) of the sign-extended value only works for
        // negative bytes and throws for any byte in 0..127.)
        return Integer.toBinaryString((b & 0xFF) | 0x100).substring(1);
    }

    /** Prints each byte of {@code bytes} on its own line as eight binary digits. */
    private static void printBits(byte[] bytes) {
        for (int i = 0; i < bytes.length; i++) {
            System.out.println(toBits(bytes[i]));
        }
    }

    public static void main(String args[]) throws UnsupportedEncodingException {
        char[] a = { '你', '好' };

        System.out.println("default:");
        printBits(StringCoding.encode(a, 0, a.length));

        System.out.println("GBK:");
        printBits(StringCoding.encode("GBK", a, 0, a.length));

        System.out.println("UTF-8:");
        printBits(StringCoding.encode("utf-8", a, 0, a.length));

        // The same four bytes, interpreted under two different charsets.
        byte[] code = new byte[4];
        code[0] = (byte) Integer.parseInt("11000100", 2);
        code[1] = (byte) Integer.parseInt("11100011", 2);
        code[2] = (byte) Integer.parseInt("10111010", 2);
        code[3] = (byte) Integer.parseInt("11000011", 2);
        System.out.println(StringCoding.decode("GBK", code, 0, code.length));
        System.out.println(StringCoding.decode("utf-8", code, 0, code.length));
    }
}
运行结果如下:
default:
11000100
11100011
10111010
11000011
GBK:
11000100
11100011
10111010
11000011
UTF-8:
11100100
10111101
10100000
11100101
10100101
10111101
你好
???
看出不同了吧?Windows操作系统(简体中文环境)默认使用GBK编码方式。特别是代码最后几行:我们把同一组字节按不同的编码解码输出,就得到了不同的结果。想必您已经知道您的程序为什么会出现乱码了吧。