转自http://blog.csdn.net/lightlater/article/details/6326338
关于文本文件的文件头
第一 ANSI文件的文件头为空,不需要处理;
第二 UNICODE(即UTF-16小端序)文件的文件头(BOM)为0xFF,0xFE共计两个字节,读取时需要偏移两个字节后再行读取;
第三 带BOM的UTF-8文件的文件头为0xEF,0xBB,0xBF共计三个字节,读取时需要偏移三个字节后再行读取;
关于文本文件类型的判断
根据文本文件的文件头,就可以判断文本文件的类型了。
假设有如下文件类型定义:
// Encodings of a text file that the samples tell apart by the file's
// leading marker bytes.
enum FileType
{
    ANSI    = 0,  // no marker bytes at the start of the file
    UNICODE = 1,  // file starts with 0xFF 0xFE
    UTF8    = 2   // file starts with 0xEF 0xBB 0xBF
};

// Upper-case alias used throughout the samples.
typedef enum FileType FILETYPE;
我们就可以根据上述特性,来判断文本文件的类型了,下面是一段示例代码:
// Detects the encoding of a text file from the marker bytes at its start.
//
// strFileName: path of the file to inspect.
// Returns UNICODE when the file starts with 0xFF 0xFE, UTF8 when it starts
// with 0xEF 0xBB 0xBF, and ANSI in every other case (including a file that
// cannot be opened or that is shorter than the marker).
FILETYPE GetTextFileType(const std::string & strFileName)
{
    FILETYPE fileType = ANSI;

    // Binary mode: the marker is compared byte for byte, so the stream must
    // not apply any text-mode translation to what it reads.
    std::ifstream file(strFileName.c_str(),
                       std::ios_base::in | std::ios_base::binary);
    if (file.good())
    {
        unsigned char szFlag[3] = {0};
        file.read(reinterpret_cast<char *>(szFlag), sizeof(szFlag));

        // A short file sets failbit on read(); gcount() still reports how
        // many marker bytes were actually delivered.
        const std::streamsize nRead = file.gcount();

        if (nRead >= 2 && szFlag[0] == 0xFF && szFlag[1] == 0xFE)
        {
            fileType = UNICODE;
        }
        else if (nRead >= 3
                 && szFlag[0] == 0xEF
                 && szFlag[1] == 0xBB
                 && szFlag[2] == 0xBF)
        {
            fileType = UTF8;
        }
    }

    // The ifstream destructor closes the file.
    return fileType;
}
ANSI文本文件的读取
ANSI文本文件不需要进行文件头的处理,可以直接读取。
下面是简单示例:
char szBuf[FBLOCK_MAX_BYTES];
memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
std::string strMessage;
FILE * fp = NULL;
fp = fopen(strFileName.c_str(), "rb");
if (fp != NULL)
{
// common file do not offset.
while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0)
{
strMessage += szBuf;
memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
}
}
std::cout << strMessage << std::endl;
fclose(fp);
UNICODE文本文件读取
由于UNICODE(UTF-16)普遍采用双字节来表示字符,因此读取时应当使用wchar_t类型(在Windows上为两个字节)来读取,并使用fopen、fread来进行操作。
下面是简单示例:
wchar_t szBuf[FBLOCK_MAX_BYTES];
memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES);
std::string strMessage;
FILE * fp = NULL;
fp = fopen(strFileName.c_str(), "rb");
if (fp != NULL)
{
// Unicode file should offset wchar_t bits(2 byte) from start.
fseek(fp, sizeof(wchar_t), 0);
while(fread(szBuf, sizeof(wchar_t), FBLOCK_MAX_BYTES, fp) > 0)
{
char szTemp[FBLOCK_MAX_BYTES] = {0};
UnicodeToANSI(szTemp, szBuf);
strMessage += szTemp;
memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES);
}
}
std::cout << strMessage << std::endl;
fclose(fp);
UTF8文本文件的读取
UTF8是可变字节,使用单一字节读取比较合理,所以读取时,使用char作为基本类型。
下面是简单示例代码:
char szBuf[FBLOCK_MAX_BYTES];
memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
std::string strMessage;
FILE * fp = NULL;
fp = fopen(strFileName.c_str(), "rb");
if (fp != NULL)
{
// UTF-8 file should offset 3 byte from start position.
fseek(fp, sizeof(char) * 3, 0);
while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0)
{
strMessage += szBuf;
memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES);
}
}
std::cout << strMessage << std::endl;
fclose(fp);
1 #include <assert.h> 2 #include <windows.h> 3 #include <iostream> 4 #include <fstream> 5 #include <string> 6 7 const int FBLOCK_MAX_BYTES = 256; 8 9 // File Type. 10 typedef enum FileType 11 { 12 ANSI = 0, 13 UNICODE, 14 UTF8, 15 }FILETYPE; 16 17 FILETYPE GetTextFileType(const std::string & strFileName); 18 19 int UnicodeToANSI(char * pDes, const wchar_t * pSrc); 20 21 void main() 22 { 23 // file test. 24 std::string strFileANSI = "C://Hello_ANSI.txt"; 25 std::string strFileUNICODE = "C://Hello_UNICODE.txt"; 26 std::string strFileUTF8 = "C://Hello_UTF8.txt"; 27 28 // please change the file name to test. 29 std::string strFileName = strFileUTF8; 30 31 FILETYPE fileType = GetTextFileType(strFileName); 32 33 if (UNICODE == fileType) 34 { 35 wchar_t szBuf[FBLOCK_MAX_BYTES]; 36 memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES); 37 38 std::string strMessage; 39 40 FILE * fp = NULL; 41 fp = fopen(strFileName.c_str(), "rb"); 42 if (fp != NULL) 43 { 44 // Unicode file should offset wchar_t bits(2 byte) from start. 45 fseek(fp, sizeof(wchar_t), 0); 46 while(fread(szBuf, sizeof(wchar_t), FBLOCK_MAX_BYTES, fp) > 0) 47 { 48 char szTemp[FBLOCK_MAX_BYTES] = {0}; 49 50 UnicodeToANSI(szTemp, szBuf); 51 strMessage += szTemp; 52 memset(szBuf, 0, sizeof(wchar_t) * FBLOCK_MAX_BYTES); 53 } 54 } 55 56 std::cout << strMessage << std::endl; 57 58 fclose(fp); 59 } 60 else if (UTF8 == fileType) 61 { 62 char szBuf[FBLOCK_MAX_BYTES]; 63 memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES); 64 65 std::string strMessage; 66 67 FILE * fp = NULL; 68 fp = fopen(strFileName.c_str(), "rb"); 69 if (fp != NULL) 70 { 71 // UTF-8 file should offset 3 byte from start position. 
72 fseek(fp, sizeof(char) * 3, 0); 73 while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0) 74 { 75 strMessage += szBuf; 76 memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES); 77 } 78 } 79 80 std::cout << strMessage << std::endl; 81 82 fclose(fp); 83 } 84 else 85 { 86 char szBuf[FBLOCK_MAX_BYTES]; 87 memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES); 88 89 std::string strMessage; 90 91 FILE * fp = NULL; 92 fp = fopen(strFileName.c_str(), "rb"); 93 if (fp != NULL) 94 { 95 // common file do not offset. 96 while(fread(szBuf, sizeof(char), FBLOCK_MAX_BYTES, fp) > 0) 97 { 98 strMessage += szBuf; 99 memset(szBuf, 0, sizeof(char) * FBLOCK_MAX_BYTES); 100 } 101 } 102 103 std::cout << strMessage << std::endl; 104 105 fclose(fp); 106 } 107 108 #ifdef _DEBUG 109 getchar(); 110 #endif 111 } 112 113 FILETYPE GetTextFileType(const std::string & strFileName) 114 { 115 FILETYPE fileType = ANSI; 116 std::ifstream file; 117 file.open(strFileName.c_str(), std::ios_base::in); 118 119 bool bUnicodeFile = false; 120 if (file.good()) 121 { 122 char szFlag[3] = {0}; 123 file.read(szFlag, sizeof(char) * 3); 124 if ((unsigned char)szFlag[0] == 0xFF 125 && (unsigned char)szFlag[1] == 0xFE) 126 { 127 fileType = UNICODE; 128 } 129 else if ((unsigned char)szFlag[0] == 0xEF 130 && (unsigned char)szFlag[1] == 0xBB 131 && (unsigned char)szFlag[2] == 0xBF) 132 { 133 fileType = UTF8; 134 } 135 } 136 137 file.close(); 138 139 return fileType; 140 } 141 142 int UnicodeToANSI(char * pDes, const wchar_t * pSrc) 143 { 144 assert(pDes != NULL); 145 assert(pSrc != NULL); 146 147 int nLen = ::WideCharToMultiByte(CP_ACP, 0, pSrc, -1, NULL, 0, NULL, NULL); 148 if (nLen == 0) 149 { 150 return -1; 151 } 152 153 return ::WideCharToMultiByte(CP_ACP, 0, pSrc, -1, pDes, nLen, NULL, NULL); 154 }