js 判断上传文件是否为utf8编码格式

首先利用fileReader.readAsText(filePart) 默认通过utf8格式读取文件,如果文件中有非utf8字符会替换成�, 如果有�说明有非utf8字符。
windows下还有utf8 BOM格式的编码,这里通过判断文件开头的几个字节来判断文件是否是utf8 BOM编码。最后如果文件太大,达到GB级别,如果一次性加载入内存中,可能直接让浏览器卡死,
下面的代码对大文件按1MB大小抽取样本,逐块读取来判断。

/**
 * Choose which parts of a file should be inspected for encoding problems.
 *
 * Files smaller than 50 MiB are returned whole. Larger files are divided into
 * 100 equally sized regions and a slice of up to 1 MiB is taken from the start
 * of every region, so the whole length of the file is covered by samples.
 *
 * @param {Blob} file - Object exposing `size` and `slice(start, end)`
 *   (presumably a browser File/Blob).
 * @returns {Blob[]} `[file]` for small files, otherwise the slices in file order.
 */
const getSamples = (file) => {
  const fullReadLimit = 50 * 1024 * 1024;
  const sampleCount = 100;
  const sampleSize = 1024 * 1024;

  if (file.size < fullReadLimit) {
    return [file];
  }

  const chunkSize = Math.floor(file.size / sampleCount);
  const parts = [];
  // Visit every region, including the last one: the previous countdown loop
  // stopped one iteration early and never sampled the tail of the file.
  for (let index = 0; index < sampleCount; index += 1) {
    const start = index * chunkSize;
    // The end offset may run past the end of the file for the final regions;
    // this relies on slice() clamping it, as Blob.prototype.slice does.
    parts.push(file.slice(start, start + sampleSize));
  }
  return parts;
};

/**
 * Check that a file part decodes as UTF-8 without replacement characters.
 *
 * The part is read with `FileReader.readAsText`, which decodes as UTF-8 by
 * default and substitutes U+FFFD for byte sequences it cannot decode. Finding
 * U+FFFD in the inspected window is therefore treated as "not UTF-8".
 *
 * NOTE(review): a valid UTF-8 file that genuinely contains U+FFFD is rejected
 * as well; this heuristic cannot tell the two cases apart.
 *
 * @param {Blob} filePart - Whole file or a slice of it.
 * @returns {Promise<void>} Resolves with `undefined` when no replacement
 *   character is found; rejects with an `Error` when one is found or when the
 *   part cannot be read.
 */
const isUtf8 = (filePart) => {
  return new Promise((resolve, reject) => {
    // A slice may start or end in the middle of a multi-byte sequence, which
    // decodes to replacement characters at the very edges of the text. Those
    // edge characters are skipped so they are not mistaken for bad encoding.
    const edgeMargin = 4;
    const fileReader = new FileReader();

    fileReader.onload = () => {
      const str = fileReader.result;
      // Scan up to the trailing margin instead of only the first half, so
      // undecodable bytes in the second half are no longer missed. The window
      // is never shorter than the previous "first half" window.
      const halfWindowEnd = edgeMargin + Math.floor(str.length / 2);
      const windowEnd = Math.max(halfWindowEnd, str.length - edgeMargin);
      const sampleStr = str.slice(edgeMargin, windowEnd);

      if (sampleStr.includes("\uFFFD")) {
        reject(new Error("encoding format error, please upload UTF-8 format file"));
        return;
      }
      resolve(void 0);
    };
    fileReader.onerror = () => {
      reject(new Error("Failed to read the content of the file, please check whether the file is damaged"));
    };

    // Start reading only after both handlers are attached.
    fileReader.readAsText(filePart);
  });
};

/**
 * Reject when the file starts with the UTF-8 byte order mark (EF BB BF).
 *
 * Only the first three bytes are read, so the check does not load the whole
 * file into memory regardless of its size.
 *
 * @param {Blob} file - Object exposing `slice(start, end)` that FileReader
 *   can read (presumably a browser File/Blob).
 * @returns {Promise<void>} Resolves with `undefined` when no BOM is present;
 *   rejects with an `Error` when a BOM is found or the file cannot be read.
 */
const isBOM = (file) => {
  return new Promise((resolve, reject) => {
    const bomLength = 3;
    const reader = new FileReader();

    reader.onload = () => {
      const head = new Uint8Array(reader.result);
      // 0xEF 0xBB 0xBF is the UTF-8 encoding of the byte order mark.
      if (head[0] === 0xef && head[1] === 0xbb && head[2] === 0xbf) {
        console.log('File has byte order mark (BOM)');
        reject(new Error("File has byte order mark (BOM)"));
        return;
      }
      resolve(void 0);
    };
    reader.onerror = () => {
      reject(new Error("Failed to read the content of the file, please check whether the file is damaged"));
    };

    // Read just the leading bytes instead of the entire file.
    reader.readAsArrayBuffer(file.slice(0, bomLength));
  });
};

/**
 * Decide whether a file passes the BOM check and the UTF-8 sample checks.
 *
 * The BOM check runs first; if it rejects, the error is logged and the result
 * is `false`. Otherwise each part returned by `getSamples` is checked in order
 * with `isUtf8`, stopping at the first part that is rejected.
 *
 * @param {Blob} file - File to inspect.
 * @returns {Promise<boolean>} `true` when every check passes, `false` otherwise.
 */
export default async function (file) {
  try {
    await isBOM(file);
  } catch (bomError) {
    console.log(bomError);
    return false;
  }

  for (const sample of getSamples(file)) {
    try {
      await isUtf8(sample);
    } catch (sampleError) {
      // The first failing sample settles the answer; later ones are not read.
      console.log("error: ", sampleError);
      return false;
    }
  }
  return true;
}

参考链接

  1. JsChardet
  2. Use js to determine whether the file is utf-8 encoding
  3. Detecting if a file has a byte order mark (BOM) using JavaScript
posted @ 2022-05-06 22:55  yihailin  阅读(1214)  评论(0)  编辑  收藏  举报