js 判断上传文件是否为utf8编码格式
首先利用fileReader.readAsText(filePart) 默认通过utf8格式读取文件,如果文件中有非utf8字符会替换成�, 如果有�说明有非utf8字符。
Windows 下还有带 BOM 的 UTF-8 编码,这里通过检查文件开头的 3 个字节(EF BB BF)来判断文件是否是 UTF-8 BOM 编码。最后,如果文件太大,达到 GB 级别,一次性加载入内存中可能直接让浏览器卡死,
因此下面的代码对大文件(50MB 及以上)分段抽样,每次只读取 1MB 的片段来逐块判断。
/**
 * Chooses which pieces of a file should be inspected for encoding errors.
 *
 * Files smaller than 50 MiB are returned whole. Larger files are divided into
 * 100 equal regions and a 1 MiB slice is taken from the start of every region,
 * so at most about 100 MiB is ever read regardless of the file size.
 *
 * @param {Blob} file - File/Blob-like object exposing `size` and `slice(start, end)`.
 * @returns {Blob[]} The whole file as a single element, or 100 sampled slices.
 */
const getSamples = (file) => {
  const WHOLE_FILE_LIMIT = 50 * 1024 * 1024;
  const SAMPLE_COUNT = 100;
  const SAMPLE_SIZE = 1024 * 1024;

  const filesize = file.size;
  if (filesize < WHOLE_FILE_LIMIT) {
    return [file];
  }

  // Distance between the starts of two consecutive samples. For files below
  // 100 MiB this is smaller than SAMPLE_SIZE, so neighbouring samples overlap.
  const stride = Math.floor(filesize / SAMPLE_COUNT);

  const parts = [];
  // Visit every one of the SAMPLE_COUNT regions, including the last one, so
  // the tail of the file is sampled as well.
  for (let i = 0; i < SAMPLE_COUNT; i++) {
    const start = i * stride;
    parts.push(file.slice(start, start + SAMPLE_SIZE));
  }
  return parts;
};
/**
 * Checks whether a file, or a slice of one, decodes cleanly as UTF-8.
 *
 * The part is read with `FileReader.readAsText`, and the decoded text is
 * searched for U+FFFD (the replacement character). A few characters at both
 * edges are ignored because a slice may start or end in the middle of a
 * multi-byte sequence, which yields replacement characters that do not
 * indicate a real encoding problem.
 *
 * @param {Blob} filePart - File/Blob (or slice) to inspect.
 * @returns {Promise<void>} Resolves with `undefined` when no replacement
 *   character is found; rejects with an `Error` when one is found or when the
 *   part cannot be read.
 */
const isUtf8 = (filePart) => {
  // A UTF-8 sequence is at most 4 bytes long, so a slice cut mid-sequence
  // starts with at most 3 orphaned continuation bytes; the margin of 4 used
  // by the previous implementation is kept.
  const LEADING_MARGIN = 4;
  // An incomplete sequence at the end of the input decodes to a single U+FFFD.
  const TRAILING_MARGIN = 1;

  return new Promise((resolve, reject) => {
    const fileReader = new FileReader();

    fileReader.onload = (e) => {
      const str = e.target.result;
      // Scan everything between the margins rather than only the first half,
      // so invalid bytes in the second half of the part are detected too.
      const body = str.slice(LEADING_MARGIN, -TRAILING_MARGIN);
      if (body.includes("\uFFFD")) {
        reject(new Error("encoding format error, please upload UTF-8 format file"));
      } else {
        resolve(void 0);
      }
    };

    fileReader.onerror = () => {
      reject(new Error("Failed to read the content of the file, please check whether the file is damaged"));
    };

    // Start reading only after both handlers are attached.
    fileReader.readAsText(filePart);
  });
};
/**
 * Checks whether a file starts with the UTF-8 byte order mark (EF BB BF).
 *
 * Only the first three bytes are read, so the check stays cheap no matter how
 * large the file is.
 *
 * @param {Blob} file - File/Blob to inspect; must support `slice(start, end)`.
 * @returns {Promise<void>} Resolves with `undefined` when the file does NOT
 *   start with a BOM; rejects with an `Error` when it does, or when the file
 *   cannot be read.
 */
const isBOM = (file) => {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const head = new Uint8Array(reader.result);
      // 0xEF 0xBB 0xBF is the UTF-8 encoding of U+FEFF (byte order mark).
      if (head[0] === 0xef && head[1] === 0xbb && head[2] === 0xbf) {
        console.log('File has byte order mark (BOM)');
        reject(new Error("File has byte order mark (BOM)"));
      } else {
        resolve(void 0);
      }
    };

    reader.onerror = () => {
      reject(new Error("Failed to read the content of the file, please check whether the file is damaged"));
    };

    // Read just the leading three bytes instead of loading the whole file
    // into memory.
    reader.readAsArrayBuffer(file.slice(0, 3));
  });
};
/**
 * Decides whether an uploaded file is acceptable UTF-8 without a BOM.
 *
 * The BOM check runs first; afterwards every part returned by `getSamples`
 * is checked one at a time with `isUtf8`, stopping at the first failure.
 * Failures are logged to the console and reported as `false` rather than
 * thrown.
 *
 * @param {Blob} file - The file to validate.
 * @returns {Promise<boolean>} `true` when all checks pass, otherwise `false`.
 */
export default async function (file) {
  // A rejected BOM check (BOM present or read failure) ends validation.
  try {
    await isBOM(file);
  } catch (bomError) {
    console.log(bomError);
    return false;
  }

  const parts = getSamples(file);

  // Parts are awaited sequentially so only one is being read at a time.
  for (const part of parts) {
    try {
      await isUtf8(part);
    } catch (decodeError) {
      console.log("error: ", decodeError);
      return false;
    }
  }

  return true;
}