文件编码探测工具

java

https://github.com/CruelPaw/CPDetector
maven依赖

<dependency>
    <groupId>net.sourceforge.cpdetector</groupId>
    <artifactId>cpdetector</artifactId>
    <version>1.0.10</version>
</dependency>

<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.5</version>
</dependency>

import java.io.File;
import java.io.FileFilter;
import java.nio.charset.Charset;
import java.util.List;

import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Assert;
import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.ByteOrderMarkDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;

/**
 * Utilities for detecting the character encoding of a file and for converting
 * source-code files from their detected encoding to a target encoding.
 */
public class FileCharsetUtil {

	// Shared encoding detector; the individual detectors are registered once below.
	private static final CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();

	static {
		// Registration order: BOM check, parsing detector, jchardet, ASCII fallback.
		detector.add(new ByteOrderMarkDetector());
		detector.add(new ParsingDetector(true)); // be verbose about parsing.
		detector.add(JChardetFacade.getInstance()); // Another singleton.
		detector.add(ASCIIDetector.getInstance()); // Fallback, see javadoc.
	}

	/**
	 * Detects the character encoding of a file.
	 *
	 * @param descFile the file to inspect; must be non-null and must exist
	 * @return the detected charset, or {@code null} when detection fails or the
	 *         detector's answer is not a charset this JVM supports
	 * @throws IllegalStateException if {@code descFile} is null or does not exist
	 */
	public static Charset getFileCharset(File descFile) {
		Assert.state(descFile != null && descFile.exists(), "文件不存在!");
		try {
			Charset detected = detector.detectCodepage(descFile.toURI().toURL());
			// NOTE(review): cpdetector is documented to answer with placeholder
			// Charset objects (e.g. its "unknown" marker) instead of null when it
			// cannot decide. Such placeholders cannot be used for decoding, so any
			// charset the JVM does not support is reported as "not detected".
			if (detected == null || !Charset.isSupported(detected.name())) {
				return null;
			}
			return detected;
		} catch (Exception e) {
			// Deliberate best effort: callers treat null as "could not detect".
			return null;
		}
	}

	/**
	 * Converts the encoding of code files under a directory, in place.
	 * <p>
	 * Every file whose suffix is listed in {@code suffix} is inspected with
	 * {@link #getFileCharset(File)}. Files whose encoding cannot be detected are
	 * reported on standard output and skipped; files already in
	 * {@code desCharset} are left untouched. All other files are re-written in
	 * {@code desCharset}. The content is transferred as a single string so that
	 * the original line terminators are kept as they are.
	 *
	 * @param srcDirPath path of the source-code directory
	 * @param desCharset target encoding; must not be null
	 * @param suffix     file suffixes to process, without the dot (for example
	 *                   {@code java}, {@code cpp}, {@code js}); matching is
	 *                   case-sensitive, and no file matches when none is given
	 * @throws IllegalArgumentException if {@code desCharset} is null
	 * @throws IllegalStateException    if {@code srcDirPath} does not exist
	 */
	public static void parseCodeCharset(String srcDirPath, Charset desCharset, String... suffix) {
		// Fail fast instead of hitting a NullPointerException halfway through the run.
		Assert.notNull(desCharset, "目标编码不能为空!");

		// Source directory
		File srcDir = new File(srcDirPath);
		Assert.state(srcDir.exists(), srcDirPath + "不存在!");

		// Accepted suffixes (a null varargs array is treated as "none")
		List<String> suffixList = ListUtil.toList(suffix == null ? new String[] {} : suffix);

		// Collect the code files whose suffix is in the accepted list
		List<File> codeFiles = FileUtil.loopFiles(srcDir,
				candidate -> suffixList.contains(FileUtil.getSuffix(candidate)));

		// Convert each file whose encoding differs from the target
		for (File f : codeFiles) {
			Charset charset = getFileCharset(f);
			if (charset == null) {
				System.out.println(f.getPath() + "文件编码探测异常, 跳过");
				continue;
			}
			if (desCharset.equals(charset)) {
				continue;
			}
			// Read/write the whole content rather than line by line: a line-based
			// round trip would replace every line terminator with the platform one.
			String content = FileUtil.readString(f, charset);
			FileUtil.writeString(content, f, desCharset);
		}
	}

}

js

https://github.com/aadsm/jschardet
先安装

npm install jschardet
import jschardet from 'jschardet';

/**
 * Reads a File/Blob and resolves with its raw bytes as a binary string
 * (one character per byte), which is the form passed on to jschardet.
 *
 * @param {Blob} file the file to read
 * @returns {Promise<string>} resolves with the binary string; rejects with the
 *          reader's error if the read fails
 */
function readFile(file) {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = function (evt) {
            // Turn the ArrayBuffer into a byte-per-character string.
            const bytes = new Uint8Array(evt.target.result);
            let binary = '';
            for (let i = 0; i < bytes.length; i++) {
                binary += String.fromCharCode(bytes[i]);
            }
            resolve(binary);
        };
        reader.onerror = function () {
            reject(reader.error);
        };
        // Without starting a read, onload never fires and the promise never settles.
        reader.readAsArrayBuffer(file);
    });
}

/**
 * Reads the given file and runs jschardet's detection on its content.
 *
 * @param {Blob} file the file to analyse
 * @returns {Promise<*>} resolves with whatever jschardet.detect returns
 */
async function judgmentCode(file) {
    return readFile(file).then((content) => jschardet.detect(content));
}
// Example usage.
// NOTE(review): `file` is not defined in this snippet; the caller is expected
// to supply it (presumably a File taken from a file input — confirm).
judgmentCode(file)
    .then((code) => {
        console.log(code);
    })
    .catch((err) => {
        // Report read/detection failures instead of leaving the rejection unhandled.
        console.error(err);
    });

python

https://github.com/chardet/chardet

pip install chardet
posted @ 2022-03-31 23:24  iminifly  阅读(387)  评论(0)  编辑  收藏  举报