WideCharToMultiByte
//z 2014-03-25 08:18:41 IS2120@BG57IV3 T3343244181.K.F1434403198[T1,L68,R2,V15]
/*
 * Converts a NUL-terminated wide-character string to the system ANSI code
 * page (CP_ACP).
 *
 *   in      NUL-terminated source string.
 *   out     Destination buffer; receives the converted text including its
 *           terminating NUL.
 *   cchout  Capacity of `out`, in bytes.
 *
 * ErrorExit() is called when `in` is NULL or when WideCharToMultiByte reports
 * failure (for example because `out` is too small for the converted text).
 */
void UnicodeToAnsi(WCHAR *in, char *out, int cchout)
{
    int len ;

    if (in == NULL)
    {
        ErrorExit("UnicodeToAnsi: source string is NULL") ;
        return ;    /* in case ErrorExit() returns to its caller */
    }

    /* A source length of -1 tells the API to convert up to and including the
       terminating NUL; this is what wcslen(in)+1 expressed, without narrowing
       a size_t to int. */
    len = WideCharToMultiByte(CP_ACP,
                              0,
                              in,
                              -1,
                              out,
                              cchout,
                              NULL,
                              NULL) ;

    /* Zero means failure; the usual cause is an undersized destination
       buffer, not memory exhaustion. */
    if (len == 0)
        ErrorExit("UnicodeToAnsi: conversion failed") ;
}
//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
2. 一个例子,将文件自动转换为 utf-8
// ChangeFileEncoding.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include "ChangeFileEncoding.h" #include <string> #ifdef _DEBUG #define new DEBUG_NEW #endif // 唯一的应用程序对象 CWinApp theApp; using namespace std; void recursiveFile(CString strFileType); void convertGBToUTF8(CString strWritePath, const char* gb2312); int _tmain(int argc, TCHAR* argv[], TCHAR* envp[]) { int nRetCode = 0; // 初始化 MFC 并在失败时显示错误 if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0)) { // TODO: 更改错误代码以符合您的需要 _tprintf(_T("错误: MFC 初始化失败\n")); nRetCode = 1; } else { /*for(int i = 0; i < argc; i++) { MessageBox(NULL, argv[i], L"Arglist contents", MB_OK); }*/ //声明一个CFileFind类变量,以用来搜索 //接受一个参数作为源代码文件的根目录 TCHAR *lpszDirName = argv[1]; CString strFileType; strFileType.Format(_T("%s\\*.*"), lpszDirName); //递归此目录下的.h文件和.cpp文件,如果发现不是utf8编码则转换为utf8编码 recursiveFile(strFileType); } return nRetCode; } void recursiveFile( CString strFileType) { CFileFind finder; BOOL isFinded = finder.FindFile(strFileType);//查找第一个文件 while(isFinded) { isFinded = finder.FindNextFile(); //递归搜索其他的文件 if(!finder.IsDots()) //如果不是"."目录 { CString strFoundFile = finder.GetFilePath(); if(finder.IsDirectory()) //如果是目录,则递归地调用 { CString strNextFileType; strNextFileType.Format(_T("%s\\*.*"), strFoundFile); recursiveFile(strNextFileType); } else { //如果是头文件或cpp文件 if(strFoundFile.Right(4) == _T(".cpp") || strFoundFile.Right(2) == _T(".h")) { CFile fileReader(strFoundFile, CFile::modeRead); byte head[3]; fileReader.Read(head, 3); //判断是否带有BOM文件头 if(head[0] == 0xef && head[1]==0xbb && head[2] == 0xbf ) { fileReader.Close(); continue; } fileReader.SeekToBegin(); int bufLength = 256; char *buf = new char[bufLength]; ZeroMemory(buf, bufLength); int nReadLength; std::string strContent; while((nReadLength = fileReader.Read(buf, bufLength))) { strContent.append(buf, nReadLength); ZeroMemory(buf, nReadLength); } delete buf; fileReader.Close(); convertGBToUTF8(strFoundFile, strContent.c_str()); } } } } finder.Close(); } void 
convertGBToUTF8(CString strWritePath, const char* gb2312) { CFile fp; fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary,NULL); int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0); wchar_t* wstr = new wchar_t[len+1]; memset(wstr, 0, len+1); MultiByteToWideChar(CP_ACP, 0, gb2312, -1, wstr, len); len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL); char* str = new char[len+1]; memset(str, 0, len+1); len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL); if(wstr) delete[] wstr; str[len] = '\n'; const unsigned char aryBOM[] = {0xEF, 0xBB, 0xBF}; fp.Write(aryBOM, sizeof(aryBOM)); fp.Write(str,len); delete[] str; fp.Close(); }//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
http://blog.csdn.net/visualcatsharp/article/details/7345854
//z 2014-05-06 12:00:46 L.239'43154 BG57IV3@XCL T1109932947.K.F253293061 [T409,L5358,R263,V7006]
3. v2
// ConvertZ.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include "ConvertZ.h" #include <string> using namespace std; #ifdef _DEBUG #define new DEBUG_NEW #endif // 唯一的应用程序对象 CWinApp theApp; void recursiveFile(CString strFileType); void convertGBToUTF8(CString strWritePath, const char* gb2312); int _tmain(int argc, TCHAR* argv[], TCHAR* envp[]) { int nRetCode = 0; // 初始化 MFC 并在失败时显示错误 if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0)) { // TODO: 更改错误代码以符合您的需要 _tprintf(_T("错误: MFC 初始化失败\n")); nRetCode = 1; } else { /*for(int i = 0; i < argc; i++) { MessageBox(NULL, argv[i], L"Arglist contents", MB_OK); }*/ //声明一个CFileFind类变量,以用来搜索 if(argc != 2) { CString strUsage; strUsage.Format(_T("usage : \n %s dir\n dir [sample] : c:\\src\n"),argv[0]); _tprintf(strUsage.GetBuffer()); strUsage.ReleaseBuffer(); return nRetCode; } //接受一个参数作为源代码文件的根目录 TCHAR *lpszDirName = argv[1]; CString strFileType; strFileType.Format(_T("%s\\*.*"), lpszDirName); //递归此目录下的.h文件和.cpp文件,如果发现不是utf8编码则转换为utf8编码 recursiveFile(strFileType); } return nRetCode; } bool isSrcType(const CString strFileType) { CString strExt_R4 = strFileType.Right(4); CString strExt_R2 = strFileType.Right(2); if ((strExt_R4.CompareNoCase(_T(".cpp")) == 0) || (strExt_R2.CompareNoCase(_T(".c")) == 0) || (strExt_R2.CompareNoCase(_T(".h")) == 0) || (strExt_R4.CompareNoCase(_T(".cxx")) == 0) || (strExt_R4.CompareNoCase(_T(".hpp")) == 0) ) { return true; } return false; } void recursiveFile( CString strFileType) { CFileFind finder; BOOL isFinded = finder.FindFile(strFileType);//查找第一个文件 while(isFinded) { isFinded = finder.FindNextFile(); //递归搜索其他的文件 if(!finder.IsDots()) //如果不是"."目录 { CString strFoundFile = finder.GetFilePath(); if(finder.IsDirectory()) //如果是目录,则递归地调用 { CString strNextFileType; strNextFileType.Format(_T("%s\\*.*"), strFoundFile); recursiveFile(strNextFileType); } else { //如果是头文件或cpp文件 if(isSrcType(strFoundFile)) { CFile fileReader(strFoundFile, CFile::modeRead|CFile::typeBinary); byte head[3]; 
fileReader.Read(head, 3); //判断是否带有BOM文件头 if(head[0] == 0xef && head[1]==0xbb && head[2] == 0xbf ) { fileReader.Close(); continue; } fileReader.SeekToBegin(); int bufLength = 256; char *buf = new char[bufLength]; ZeroMemory(buf, bufLength); int nReadLength; std::string strContent; while((nReadLength = fileReader.Read(buf, bufLength))) { strContent.append(buf, nReadLength); ZeroMemory(buf, nReadLength); } delete buf; fileReader.Close(); convertGBToUTF8(strFoundFile, strContent.c_str()); } } } } finder.Close(); } void convertGBToUTF8(CString strWritePath, const char* gb2312) { CFile fp; fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary,NULL); const int ngblen = static_cast<int>(strlen(gb2312)); int len = MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, NULL, 0); wchar_t* wstr = new wchar_t[len+1]; memset(wstr, 0, (len+1)*sizeof(wchar_t)); MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, wstr, len); wstr[len] = '\0'; int newLen = 0; newLen = WideCharToMultiByte(CP_UTF8, 0, wstr, len, NULL, 0, NULL, NULL); char* str = new char[newLen+1]; memset(str, 0, (newLen+1)*sizeof(char)); newLen = WideCharToMultiByte(CP_UTF8, 0, wstr, len, str, newLen, NULL, NULL); if(wstr) { delete[] wstr; wstr = NULL; } str[newLen] = '\0'; const unsigned char aryBOM[] = {0xEF, 0xBB, 0xBF}; fp.Write(aryBOM, sizeof(aryBOM)); fp.Write(str,newLen); delete[] str; fp.Close(); }
//z 2014-05-22 16:55:50 L.223'25450 BG57IV3 T427209771 .K.F253293061 [T484,L6693,R325,V8206]
Simple Character Encoding Detection
|
|
Introduction
One very commonly asked question in programming is how to detect the character encoding of a string. Well, I'm going to share a cool method I came up with that can detect if a string is UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE in just 4 lines of code.
Explanation
We'll be working with null-terminated strings, so the first rule is that we must terminate all strings with a quadruple null, regardless of encoding. You may wish to add a definition such as the following:
/* Three explicit NUL bytes; appended to a string literal, they combine with
   the literal's own implicit terminator to form a quadruple null. */
#define NT "\0\0\0"
/* Adjacent string literals are concatenated, so this string ends in four NUL bytes. */
char *exampleString = "This is UTF-8" NT;
Next is an explanation of how the checking works.
1:===== If a string doesn't contain nulls, it's UTF-8
:
else
:
2:===== If a string doesn't contain double nulls, it's UTF-16
:--.
: 3:== If the nulls are on odd numbered indices, it's UTF-16LE
: :
: else
: :
: 4'== The string defaults to UTF-16BE
:
else
:
5:===== If the index modulo 4 is 0 and the character is greater than
: 0x7F, the string is UTF-32LE. This is because the range of
: UTF-32 only goes up to 0x7FFFFFFF, meaning approximately 22%
: of the characters that can be represented will validate that
: the string is not big endian; including a BOM.
:
else
:
6'===== The string defaults to UTF-32BE
The Code
We check every byte until we reach a quadruple null:
/*
 * Guesses the Unicode encoding of a byte string that is terminated by four
 * consecutive NUL bytes (the quadruple null is required for every encoding).
 *
 * Decision procedure:
 *   - no NUL byte before the terminator          -> UTF-8
 *   - NUL bytes, but never two in a row          -> UTF-16
 *         a NUL on an odd index                  -> UTF-16LE, else UTF-16BE
 *   - two NUL bytes in a row                     -> UTF-32
 *         a byte > 0x7F on an index that is a
 *         multiple of 4                          -> UTF-32LE, else UTF-32BE
 *
 * Returns 0 = UTF-8, 1 = UTF-16BE, 2 = UTF-16LE, 3 = UTF-32BE, 4 = UTF-32LE.
 *
 * This is a heuristic: very short strings and UTF-32 text whose low bytes
 * are all within the ASCII range can still be classified incorrectly.
 */
int String_GetEncoding(char *string)
{
    /* Work on unsigned bytes so values >= 0x80 compare as expected. */
    const unsigned char *bytes = (const unsigned char *)string;
    unsigned i;
    int sawNull = 0;            /* any NUL before the terminator          */
    int nullOnOddIndex = 0;     /* a NUL at an odd index                  */
    int sawDoubleNull = 0;      /* a NUL directly followed by another NUL */
    int highByteAtUnitStart = 0;/* byte > 0x7F at an index with i % 4 == 0 */

    /* Scan every byte until the quadruple-null terminator is reached. */
    for (i = 0; bytes[i] | bytes[i + 1] | bytes[i + 2] | bytes[i + 3]; i++) {
        if (bytes[i] == 0) {
            sawNull = 1;
            if (i & 1)
                nullOnOddIndex = 1;
            if (bytes[i + 1] == 0)
                sawDoubleNull = 1;
        } else if ((i % 4) == 0 && bytes[i] > 0x7F) {
            highByteAtUnitStart = 1;
        }
    }

    /* Each observation is consulted only on the branch it belongs to, so a
       byte > 0x7F in UTF-8 or UTF-16 text cannot shift the result. */
    if (!sawNull)
        return 0;
    if (!sawDoubleNull)
        return nullOnOddIndex ? 2 : 1;
    return highByteAtUnitStart ? 4 : 3;
}
The output:
0 = UTF-8
1 = UTF-16BE
2 = UTF-16LE
3 = UTF-32BE
4 = UTF-32LE
Notes
Since UTF-32 encoding can contain several null bytes, its byte order checking is done through an alternative method that doesn't work 100% of the time, e.g., if all the characters are within the ASCII range and there isn't a BOM, it'll return UTF-32BE when it might actually be UTF-32LE.
This isn't really a big issue since UTF-32 is never used for storage, so chances are anyone that might use it will already know the byte ordering without having to check. However, if you're OCD, you could perform an additional check by treating UTF-32BE as UTF-16 and determining that string's byte ordering.
License
This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)
About the Author
United States
![United States](http://www.codeproject.com/script/Geo/Images/US.gif)
IsTextUnicode function
Syntax
BOOL IsTextUnicode( _In_ const VOID *lpv, _In_ int iSize, _Inout_opt_ LPINT lpiResult );
IMultiLanguage2::DetectInputCodepage method
Detects the code page of the given string.
Syntax
HRESULT DetectInputCodepage( [in] DWORD dwFlag, [in] DWORD dwPrefWinCodePage, [in] __wchar_t *pSrcStr, [in, out] INT *pcSrcSize, [in, out] DetectEncodingInfo *lpEncoding, [in, out] INT *pnScores );
Parameters
- dwFlag [in]
-
One of the MLDETECTCP-defined bit flag values that specify the type of incoming source text. Setting the bit flags helps the detection engines produce more accurate results.
- dwPrefWinCodePage [in]
-
The preferred Windows code page. If this value is set to zero, this API returns all possible encodings. Otherwise, it lists only those encodings related to this parameter.
- pSrcStr [in]
-
The source string for which the client wants to detect the code page.
- pcSrcSize [in, out]
-
The address of the buffer that stores the size of pSrcStr, in bytes. When this method is successful, it returns the number of bytes processed to this buffer.
- lpEncoding [in, out]
-
A pointer to an array of DetectEncodingInfo structures where the detection information is returned.
- pnScores [in, out]
-
A pointer to a buffer that contains the number of DetectEncodingInfo structures allocated in lpEncoding. When this method is successful, this parameter returns the number of elements of lpEncoding that are filled in.
Return value
Returns one of the following values.
Return code | Description |
---|---|
S_OK | Success. |
S_FALSE | The method cannot determine the code page of the input stream. |
E_FAIL | An error occurred. |
Remarks
The caller is responsible for allocating and freeing the lpEncoding array.
Requirements
Requirement | Value |
---|---|
Minimum supported client | Windows XP |
Minimum supported server | Windows 2000 Server |
Header | Mlang.h |
IDL | Mlang.idl |
DLL | Mlang.dll |
See also
Detect encoding of a string in C/C++
Assuming you know the length of the input array, you can make the following guesses:
- First, check to see if the first few bytes match any well-known byte order marks (BOM) for Unicode. If they do, you're done!
- Next, search for '\0' before the last byte. If you find one, you might be dealing with UTF-16 or UTF-32. If you find multiple consecutive '\0's, it's probably UTF-32.
- If any character is from 0x80 to 0xff, it's certainly not ASCII or UTF-7. If you are restricting your input to some variant of Unicode, you can assume it's UTF-8. Otherwise, you have to do some guessing to determine which multi-byte character set it is. That will not be fun.
- At this point it is either: ASCII, UTF-7, Base64, or ranges of UTF-16 or UTF-32 that just happen to not use the top bit and do not have any null characters.
It's not an easy problem to solve, and generally relies on heuristics to take a best guess at what the input encoding is, which can be tripped up by relatively innocuous inputs - for example, take a look at this Wikipedia article and The Notepad file encoding Redux for more details.
If you're looking for a Windows-only solution with minimal dependencies, you can look at using a combination of IsTextUnicode and MLang's DetectInputCodePage to attempt character set detection.
If you are looking for portability, but don't mind taking on a fairly large dependency in the form of ICU then you can make use of its character set detection routines to achieve the same thing in a portable manner.
The Notepad file encoding problem, redux
////////////////////////////////////////////////////////////////////////// // // FILE: utf8conv.h // // Header file defining helper functions for converting strings // between Unicode UTF-8 and UTF-16. // // UTF-8 is stored in std::string; UTF-16 is stored in std::wstring. // // This code just uses Win32 Platform SDK and C++ standard library; // so it can be used also with the Express editions of Visual Studio. // // // February 4th, 2011 // // by Giovanni Dicanio <gdicanio@mvps.org> // ////////////////////////////////////////////////////////////////////////// #pragma once //------------------------------------------------------------------------ // INCLUDES //------------------------------------------------------------------------ #include <stdarg.h> // variable argument lists... #include <stdio.h> // ...and vsprintf_s #include <exception> // std::exception #include <string> // STL string classes #include <Windows.h> // Win32 Platform SDK main header namespace utf8util { //------------------------------------------------------------------------ // Exception class representing an error occurred during UTF-8 conversion. //------------------------------------------------------------------------ class utf8_error : public std::exception { public: // Constructs an utf8_error with a message string that can use a // printf-like syntax for formatting. explicit utf8_error(const char * format, ...); // Override from std::exception::what() const char * what() const; // // IMPLEMENTATION // private: char m_message[512]; // buffer for error message }; inline utf8_error::utf8_error(const char * format, ...) 
{ // Format error message in buffer va_list args; va_start(args, format); vsprintf_s(m_message, format, args); va_end(args); } inline const char * utf8_error::what() const { return m_message; } //------------------------------------------------------------------------ //------------------------------------------------------------------------ // Converts a string from UTF-8 to UTF-16. // On error, can throw an utf8_error exception. //------------------------------------------------------------------------ inline std::wstring utf16_from_utf8(const std::string & utf8) { // // Special case of empty input string // if (utf8.empty()) return std::wstring(); // // Get length (in wchar_t's) of resulting UTF-16 string // const int utf16_length = ::MultiByteToWideChar( CP_UTF8, // convert from UTF-8 0, // default flags utf8.data(), // source UTF-8 string utf8.length(), // length (in chars) of source UTF-8 string NULL, // unused - no conversion done in this step 0 // request size of destination buffer, in wchar_t's ); if (utf16_length == 0) { // Error DWORD error = ::GetLastError(); throw utf8_error( "Can't get length of UTF-16 string (MultiByteToWideChar set last error to %lu).", error); } // // Allocate destination buffer for UTF-16 string // std::wstring utf16; utf16.resize(utf16_length); // // Do the conversion from UTF-8 to UTF-16 // if ( ! ::MultiByteToWideChar( CP_UTF8, // convert from UTF-8 0, // default flags utf8.data(), // source UTF-8 string utf8.length(), // length (in chars) of source UTF-8 string &utf16[0], // destination buffer utf16.length() // size of destination buffer, in wchar_t's ) ) { // Error DWORD error = ::GetLastError(); throw utf8_error( "Can't convert string from UTF-8 to UTF-16 (MultiByteToWideChar set last error to %lu).", error); } // // Return resulting UTF-16 string // return utf16; } //------------------------------------------------------------------------ // Converts a string from UTF-16 to UTF-8. 
// On error, can throw an utf8_error exception. //------------------------------------------------------------------------ inline std::string utf8_from_utf16(const std::wstring & utf16) { // // Special case of empty input string // if (utf16.empty()) return std::string(); // // Get length (in chars) of resulting UTF-8 string // const int utf8_length = ::WideCharToMultiByte( CP_UTF8, // convert to UTF-8 0, // default flags utf16.data(), // source UTF-16 string utf16.length(), // source string length, in wchar_t's, NULL, // unused - no conversion required in this step 0, // request buffer size NULL, NULL // unused ); if (utf8_length == 0) { // Error DWORD error = ::GetLastError(); throw utf8_error( "Can't get length of UTF-8 string (WideCharToMultiByte set last error to %lu).", error); } // // Allocate destination buffer for UTF-8 string // std::string utf8; utf8.resize(utf8_length); // // Do the conversion from UTF-16 to UTF-8 // if ( ! ::WideCharToMultiByte( CP_UTF8, // convert to UTF-8 0, // default flags utf16.data(), // source UTF-16 string utf16.length(), // source string length, in wchar_t's, &utf8[0], // destination buffer utf8.length(), // destination buffer size, in chars NULL, NULL // unused ) ) { // Error DWORD error = ::GetLastError(); throw utf8_error( "Can't convert string from UTF-16 to UTF-8 (WideCharToMultiByte set last error to %lu).", error); } // // Return resulting UTF-8 string // return utf8; } } // namespace utf8util //////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////// // // FILE: TestUTF8Conversion.cpp // // Defines the entry point for the console test application. // // By Giovanni Dicanio <gdicanio@mvps.org> // ////////////////////////////////////////////////////////////////////////// #include "stdafx.h" // precompiled headers #include "utf8conv.h" // UTF-8 conversion helpers using namespace std; using namespace utf8util; //------------------------------------------------------------------------ // Some tests for UTF-8 <-> UTF-16 conversion. //------------------------------------------------------------------------ void test() { // // Test a simple UTF-16 <-> UTF-8 conversion // // Source UTF-16 string wstring utf16(L"Euro sign (U+20AC): \x20AC"); // Convert from UTF-16 to UTF-8 string utf8 = utf8_from_utf16(utf16); // Convert back from UTF-8 to UTF-16 wstring utf16_new = utf16_from_utf8(utf8); // Check conversion result if (utf16_new != utf16) throw runtime_error("UTF-16 <-> UTF-8 conversion failed."); // // Test with empty strings // if (! utf16_from_utf8("").empty()) throw runtime_error("Empty UTF-8 string not converted to empty UTF-16 string."); if (! utf8_from_utf16(L"").empty()) throw runtime_error("Empty UTF-16 string not converted to empty UTF-8 string."); // // Test with invalid UTF-8 bytes // // 0xC0 0xAF UTF-8 sequence is discussed in "Writing Secure Code" // (Chapter 11, "How UTF-8 Encodes Data", page 380) char utf8_invalid[] = "UTF-8 invalid sequence: \xC0\xAF"; wstring utf16_invalid = utf16_from_utf8(utf8_invalid); // // Unicode UTF-16 'REPLACEMENT CHARACTER' (U+FFFD) // is used for the invalid UTF-8 bytes. // // http://www.fileformat.info/info/unicode/char/fffd/index.htm // } //------------------------------------------------------------------------ // Entry-point. 
//------------------------------------------------------------------------ int wmain(int argc, wchar_t* argv[]) { static const int ok = 0; static const int fail = 1; int exit_code = ok; try { cout << "*** Testing UTF-8 <-> UTF-16 Conversion ***" << endl; test(); cout << "All right." << endl; } catch(const exception & e) { cerr << "*** ERROR: " << e.what() << endl; exit_code = fail; } return exit_code; } //////////////////////////////////////////////////////////////////////////
@IS2120#CNBLOGS.T2169364049[T1,L65,R1,V259]:备忘
$ € ₤ ₭ ₪ ₩ ₮ ₦ ₱ ฿ ₡ ₫ ﷼ ¥ ﷼ ₫ ₡ ฿ ₱ ₦ ₮ ₩ ₪ ₭ ₤ € $