识别常见编码格式文件并转换成 UTF-8 编码的 Java 实现(源码)
package com.buptsse.ate.utils; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * * @author michaelw * @email michael.wang1028@gmail.com * @date 2012-09-03 */ public class ConverEncoding { static String CODE = "UTF-8"; static String FILE_SUFFIX = ".txt";//文件扩展名 // static String FILE_SUFFIX = ".css"; // static String FILE_SUFFIX = ".js"; // static String FILE_SUFFIX = ".htm"; static String srcDir = "C:\\WorkTools\\weenCompany_ChineseEnglish_JT_V5.3.0_UTF8";//文件所在目录 /** * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { List<String> files = new ArrayList<String>(); fetchFileList(srcDir, files, FILE_SUFFIX); String filecode = ""; for (String fileName : files) { filecode = codeString(fileName); if (!filecode.equals(CODE)) { convert(fileName, filecode, fileName, CODE); } } } public static void convert(String oldFile, String oldCharset, String newFlie, String newCharset) { BufferedReader bin; FileOutputStream fos; StringBuffer content = new StringBuffer(); try { System.out.println("the old file is :"+oldFile); System.out.println("The oldCharset is : "+oldCharset); bin = new BufferedReader(new InputStreamReader(new FileInputStream( oldFile), oldCharset)); String line = null; while ((line = bin.readLine()) != null) { // System.out.println("content:" + content); content.append(line); content.append(System.getProperty("line.separator")); } bin.close(); File dir = new File(newFlie.substring(0, newFlie.lastIndexOf("\\"))); if (!dir.exists()) { dir.mkdirs(); } fos = new FileOutputStream(newFlie); Writer out = new 
OutputStreamWriter(fos, newCharset); out.write(content.toString()); out.close(); fos.close(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public static void fetchFileList(String strPath, List<String> filelist, final String regex) { File dir = new File(strPath); File[] files = dir.listFiles(); Pattern p = Pattern.compile(regex); if (files == null) return; for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()) { fetchFileList(files[i].getAbsolutePath(), filelist, regex); } else { String strFileName = files[i].getAbsolutePath().toLowerCase(); Matcher m = p.matcher(strFileName); if (m.find()) { filelist.add(strFileName); } } } } /** * 判断文件的编码格式 * * @param fileName * :file * @return 文件编码格式 * @throws Exception */ public static String codeString(String fileName) throws Exception { BufferedInputStream bin = new BufferedInputStream(new FileInputStream( fileName)); int p = (bin.read() << 8) + bin.read(); String code = null; switch (p) { case 0xefbb: code = "UTF-8"; break; case 0xfffe: code = "Unicode"; break; case 0xfeff: code = "UTF-16BE"; break; default: code = "GBK"; } return code; } }
posted on 2012-09-03 12:06 Michael.Wang 阅读(9610) 评论(3) 编辑 收藏 举报