VC++ 读取UTF-8和ANSI编码文件
判断是否是UTF-8文件:
/// Heuristically checks whether a byte buffer holds well-formed UTF-8.
///
/// Only the structure of each sequence is validated (lead byte followed by the
/// right number of 10xxxxxx continuation bytes); overlong encodings and
/// surrogate code points are not rejected. Pure ASCII input is reported as
/// UTF-8. A multi-byte sequence that is cut off by the end of the buffer stops
/// the scan without failing, so a prefix of a larger UTF-8 stream is accepted.
///
/// @param pBuffer  Start of the bytes to inspect (may be NULL when size <= 0).
/// @param size     Number of bytes available at pBuffer.
/// @return true if no malformed sequence was found, false otherwise.
bool IsUTF8Text(const void* pBuffer, long size)
{
    if (pBuffer == NULL || size <= 0)
    {
        // Nothing to inspect: an empty buffer contains no invalid sequence.
        return true;
    }

    const unsigned char* start = static_cast<const unsigned char*>(pBuffer);
    const unsigned char* end = start + size;

    while (start < end)
    {
        const unsigned char lead = *start;
        long length = 0;

        if (lead < 0x80)        // 0xxxxxxx: ASCII
        {
            length = 1;
        }
        else if (lead < 0xC0)   // 10xxxxxx: continuation byte without a lead byte
        {
            return false;
        }
        else if (lead < 0xE0)   // 110xxxxx: 2-byte sequence
        {
            length = 2;
        }
        else if (lead < 0xF0)   // 1110xxxx: 3-byte sequence
        {
            length = 3;
        }
        else if (lead < 0xF5)   // 11110xxx up to 0xF4: 4-byte sequence (U+10000..U+10FFFF)
        {
            length = 4;
        }
        else                    // 0xF5..0xFF can never appear in UTF-8
        {
            return false;
        }

        if (end - start < length)
        {
            // Sequence is truncated by the end of the buffer: stop scanning
            // and accept what has been validated so far.
            break;
        }

        for (long i = 1; i < length; ++i)
        {
            if ((start[i] & 0xC0) != 0x80)
            {
                return false;
            }
        }

        start += length;
    }

    return true;
}

/// Loads a whole file into memory and runs IsUTF8Text on its bytes.
///
/// @param pFileName  Path of the file to inspect.
/// @return true if the file could be read and its content passes IsUTF8Text;
///         false if the file cannot be opened, its size cannot be determined,
///         or the content is not UTF-8. An empty file yields true.
bool IsUTF8File(const char* pFileName)
{
    if (pFileName == NULL)
    {
        return false;
    }

    FILE* f = NULL;
#if defined(_MSC_VER)
    fopen_s(&f, pFileName, "rb");
#else
    // fopen_s is not available outside the Microsoft CRT.
    f = fopen(pFileName, "rb");
#endif
    if (NULL == f)
    {
        return false;
    }

    // Determine the file size; ftell reports -1 on failure, which must not be
    // used as an allocation or read size.
    long lSize = -1;
    if (fseek(f, 0, SEEK_END) == 0)
    {
        lSize = ftell(f);
    }
    if (lSize < 0 || fseek(f, 0, SEEK_SET) != 0)
    {
        fclose(f);
        return false;
    }

    char* pBuff = new char[lSize + 1];
    // Read byte-wise so the return value is the number of bytes actually read;
    // only those bytes are inspected if the read comes up short.
    const size_t bytesRead = fread(pBuff, 1, static_cast<size_t>(lSize), f);
    fclose(f);

    const bool bIsUTF8 = IsUTF8Text(pBuff, static_cast<long>(bytesRead));

    delete[] pBuff;
    return bIsUTF8;
}
读取文件:
/// Reads an entire file and converts its bytes to a CString.
///
/// A leading UTF-8 byte-order mark (EF BB BF) is skipped when present. The
/// remaining bytes are converted with MultiByteToWideChar using CodePage.
/// The result ends at the first NUL character in the converted text.
///
/// @param filename  Path of the file to read.
/// @param CodePage  Code page passed to MultiByteToWideChar (e.g. CP_UTF8, CP_ACP).
/// @return The converted text, or an empty string if the file cannot be
///         opened, is empty, is too large to convert in a single call,
///         memory cannot be allocated, or the conversion fails.
CString GetFile(CString filename, UINT CodePage)
{
    CString strFile = L"";

    CFile fileR;
    if (!fileR.Open(filename, CFile::modeRead | CFile::typeBinary))
    {
        return strFile;
    }

    // Zero-initialised so a file shorter than three bytes can never match the
    // BOM through leftover stack contents.
    BYTE head[3] = { 0, 0, 0 };
    const UINT headRead = fileR.Read(head, 3);
    const bool hasBom = (headRead == 3 &&
                         head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF);
    if (!hasBom)
    {
        fileR.SeekToBegin();
    }

    // Only the bytes after the current position are left to read; when the BOM
    // was skipped this is three less than the file length.
    const ULONGLONG remaining = fileR.GetLength() - fileR.GetPosition();

    // CFile::Read takes a UINT and MultiByteToWideChar takes an int, so larger
    // payloads cannot be handled in one call without silent truncation.
    const ULONGLONG kMaxBytes = 0x7FFFFFFE;
    if (remaining == 0 || remaining > kMaxBytes)
    {
        fileR.Close();
        return strFile;
    }

    char* pContent = (char*)calloc((size_t)remaining + 1, sizeof(char));
    if (pContent == NULL)
    {
        fileR.Close();
        return strFile;
    }

    const UINT bytesRead = fileR.Read(pContent, (UINT)remaining);
    fileR.Close();

    if (bytesRead > 0)
    {
        // First call computes the required number of wide characters.
        const int n = MultiByteToWideChar(CodePage, 0, pContent, (int)bytesRead, NULL, 0);
        if (n > 0)
        {
            // One extra zeroed element keeps the buffer NUL-terminated, since
            // an explicit byte count means no terminator is written for us.
            wchar_t* pWideChar = (wchar_t*)calloc((size_t)n + 1, sizeof(wchar_t));
            if (pWideChar != NULL)
            {
                MultiByteToWideChar(CodePage, 0, pContent, (int)bytesRead, pWideChar, n);
                strFile = CString(pWideChar);
                free(pWideChar);
            }
        }
    }

    free(pContent);
    return strFile;
}