java判断byte[]数组的原字符串编码类型
最近在做 HTML 脚本导入库的功能时,读取内容总会出现乱码。这里记录一种方法:先判断 byte[] 的原始编码,再按该编码把乱码还原成正确的字符串输出。
参考原文:
https://blog.csdn.net/ajaxhu/article/details/12446917
<!-- Maven dependency required by the GetByteEncode class (charset detection) --> <dependency> <groupId>com.googlecode.juniversalchardet</groupId> <artifactId>juniversalchardet</artifactId> <version>1.0.3</version> </dependency>
@Slf4j public class Test { @Test public void encode() throws IOException { String file = "C:\\Users\\Victory-x\\Desktop\\code.html"; byte[] bytes = file2byte(file); //编码判断 String encoding = GetByteEncode.getEncoding(bytes); System.out.println("字符编码是:" + encoding); System.out.println("原乱码输出:" + new String(bytes)); System.out.println("//***********************//"); System.out.println("根据文件编码输出:" + new String(bytes, encoding)); } public static byte[] file2byte(String filePath) throws IOException { byte[] buffer = null; try { File file = new File(filePath); FileInputStream fis = new FileInputStream(file); ByteArrayOutputStream bos = new ByteArrayOutputStream(); byte[] b = new byte[1024]; int n; while ((n = fis.read(b)) != -1) { bos.write(b, 0, n); } fis.close(); bos.close(); buffer = bos.toByteArray(); } catch (FileNotFoundException e) { e.printStackTrace(); } return buffer; } }
GetByteEncode:
import lombok.extern.slf4j.Slf4j;
import org.mozilla.universalchardet.UniversalDetector;

/**
 * Determines the character encoding of file content.
 *
 * @author XSL
 * @version Id: GetByteEncode.java, V 1.0 2018/11/30 10:03 XSL Exp $$
 */
@Slf4j
public class GetByteEncode {

    /** Encoding name returned when there is no input or the detector reports no charset. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Determines the encoding of the given bytes.
     *
     * @param bytes raw file content; may be {@code null} or empty
     * @return the charset name reported by the detector, or {@code "UTF-8"} when the input is
     *         {@code null}/empty or the detector reports no charset
     */
    public static String getEncoding(byte[] bytes) {
        // Nothing to inspect: skip the detector instead of failing on bytes.length.
        if (bytes == null || bytes.length == 0) {
            return DEFAULT_ENCODING;
        }

        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(bytes, 0, bytes.length);
        detector.dataEnd();
        // Read the result before reset() is called on the detector.
        String encoding = detector.getDetectedCharset();
        detector.reset();
        log.info("字符编码是：{}", encoding);

        return encoding != null ? encoding : DEFAULT_ENCODING;
    }
}
其它乱码转换方法可参考原文:
http://daikainan.iteye.com/blog/1439322