WideCharToMultiByte

1. 了解下这个 API
//z 2014-03-25 08:18:41 IS2120@BG57IV3 T3343244181.K.F1434403198[T1,L68,R2,V15]
// Converts the NUL-terminated wide string `in` to the ANSI code page (CP_ACP)
// and stores the result in `out`.
//
//   in     - source wide string; its terminating NUL is converted as well
//   out    - destination buffer for the converted bytes
//   cchout - value passed to WideCharToMultiByte as the destination size
//
// A zero result from WideCharToMultiByte is treated as failure and reported
// through ErrorExit.
void UnicodeToAnsi(WCHAR *in, char *out, int cchout)
{
    // Source length in wide characters, counting the terminator.
    const int cchIn = static_cast<int>(wcslen(in) + 1);

    const int written = WideCharToMultiByte(CP_ACP, 0, in, cchIn,
                                            out, cchout, NULL, NULL);
    if (written == 0)
    {
        ErrorExit("out of memory");
    }
}

//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
2. 一个例子，将文件自动转换为 utf-8

// ChangeFileEncoding.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "ChangeFileEncoding.h"
#include <string>

#ifdef _DEBUG
#define new DEBUG_NEW
#endif


// 唯一的应用程序对象

CWinApp theApp;

using namespace std;

void recursiveFile(CString strFileType);
void convertGBToUTF8(CString strWritePath, const char* gb2312);

// Console entry point: initializes MFC, then converts the source files found
// under the directory given as the first command-line argument.
//
//   argc / argv - command line; argv[1] is the root directory to scan
//   envp        - environment block (unused)
//
// Returns 0 on success, 1 if MFC could not be initialized or no directory
// argument was supplied.
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
	int nRetCode = 0;

	// Initialize MFC and report an error if that fails.
	if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
	{
		// TODO: change the error code to suit your needs
		_tprintf(_T("错误: MFC 初始化失败\n"));
		nRetCode = 1;
	}
	else if (argc < 2)
	{
		// Without this guard argv[1] would be read although it was never supplied.
		_tprintf(_T("usage : \n    %s dir\n    dir [sample] : c:\\src\n"), argv[0]);
		nRetCode = 1;
	}
	else
	{
		// The single argument is the root directory of the source tree.
		TCHAR *lpszDirName = argv[1];
		CString strFileType;
		strFileType.Format(_T("%s\\*.*"), lpszDirName);

		// Walk the tree; .h/.cpp files that lack a UTF-8 BOM are rewritten as UTF-8.
		recursiveFile(strFileType);
	}

	return nRetCode;
}

// Walks every entry matching the wildcard pattern strFileType (for example
// "c:\\src\\*.*"), descending into sub-directories, and hands each ".cpp" /
// ".h" file that does not start with a UTF-8 BOM to convertGBToUTF8.
//
// Files that cannot be opened, and files containing NUL bytes, are skipped.
// NOTE: only the BOM is checked, so a UTF-8 file without a BOM is treated as
// ANSI text and converted again.
void recursiveFile( CString strFileType)
{
	CFileFind finder;
	BOOL isFinded = finder.FindFile(strFileType); // start the search
	while(isFinded)
	{
		// FindNextFile must be called before the current entry can be queried.
		isFinded = finder.FindNextFile();
		if(finder.IsDots()) // skip the "." and ".." entries
			continue;

		CString strFoundFile = finder.GetFilePath();
		if(finder.IsDirectory())
		{
			// Recurse into the sub-directory. The explicit cast is required
			// because Format is variadic and expects a plain string pointer.
			CString strNextFileType;
			strNextFileType.Format(_T("%s\\*.*"), static_cast<LPCTSTR>(strFoundFile));
			recursiveFile(strNextFileType);
			continue;
		}

		// Only header and cpp files are of interest.
		if(!(strFoundFile.Right(4) == _T(".cpp") || strFoundFile.Right(2) == _T(".h")))
			continue;

		// Open() reports failure through its return value, so one unreadable
		// file does not abort the whole walk with an exception.
		CFile fileReader;
		if(!fileReader.Open(strFoundFile, CFile::modeRead))
			continue;

		// Detect the UTF-8 BOM; files shorter than three bytes cannot have one.
		byte head[3] = {0, 0, 0};
		const UINT nHeadLength = fileReader.Read(head, sizeof(head));
		if(nHeadLength == sizeof(head)
			&& head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
		{
			fileReader.Close();
			continue;
		}
		fileReader.SeekToBegin();

		// Read the whole file through a small stack buffer.
		char buf[256];
		std::string strContent;
		UINT nReadLength;
		while((nReadLength = fileReader.Read(buf, sizeof(buf))) != 0)
		{
			strContent.append(buf, nReadLength);
		}
		fileReader.Close();

		// The content is handed over as a NUL-terminated string; a file with
		// embedded NUL bytes (UTF-16 text, binary data) would be cut off at
		// the first one and lose data when rewritten, so leave it untouched.
		if(strContent.find('\0') != std::string::npos)
			continue;

		convertGBToUTF8(strFoundFile, strContent.c_str());
	}
	finder.Close();
}

// Rewrites the file strWritePath as UTF-8 with a BOM.
//
//   strWritePath - path of the file to (re)create
//   gb2312       - NUL-terminated file content, interpreted in the system
//                  ANSI code page (CP_ACP)
//
// The text is converted ANSI -> UTF-16 -> UTF-8 in memory first; the file is
// only opened (and thereby truncated) once the conversion has succeeded, so a
// failed conversion leaves the original file intact.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
	if(gb2312 == NULL)
		return;

	// A source length of -1 makes the API process the terminating NUL too,
	// so every length reported below includes that terminator.
	const int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);
	std::wstring wide(wideLen > 0 ? wideLen : 0, L'\0');
	bool converted = wideLen > 0
		&& MultiByteToWideChar(CP_ACP, 0, gb2312, -1, &wide[0], wideLen) != 0;

	std::string utf8;
	if(converted)
	{
		const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
		utf8.resize(utf8Len > 0 ? utf8Len : 0);
		converted = utf8Len > 0
			&& WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) != 0;
	}

	if(!converted)
	{
		_tprintf(_T("conversion failed, file left unchanged: %s\n"), static_cast<LPCTSTR>(strWritePath));
		return;
	}

	// Drop the terminating NUL so it is not written into the file.
	utf8.resize(utf8.size() - 1);

	CFile fp;
	if(!fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary, NULL))
	{
		_tprintf(_T("cannot open for writing: %s\n"), static_cast<LPCTSTR>(strWritePath));
		return;
	}

	const unsigned char aryBOM[]  = {0xEF, 0xBB, 0xBF};
	fp.Write(aryBOM, sizeof(aryBOM));
	if(!utf8.empty())
		fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
	fp.Close();
}

//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
http://blog.csdn.net/visualcatsharp/article/details/7345854

//z 2014-05-06 12:00:46 L.239'43154 BG57IV3@XCL T1109932947.K.F253293061 [T409,L5358,R263,V7006]
3. v2

// ConvertZ.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "ConvertZ.h"
#include <string>

using namespace std;

#ifdef _DEBUG
#define new DEBUG_NEW
#endif

// 唯一的应用程序对象
CWinApp theApp;

void recursiveFile(CString strFileType);
void convertGBToUTF8(CString strWritePath, const char* gb2312);

// Console entry point: initializes MFC, validates the command line and
// converts the source files found under the given directory.
//
//   argc / argv - command line; exactly one argument (the root directory)
//                 is expected
//   envp        - environment block (unused)
//
// Returns 0 on success, 1 if MFC could not be initialized or the command
// line is invalid.
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
	int nRetCode = 0;

	// Initialize MFC and report an error if that fails.
	if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
	{
		// TODO: change the error code to suit your needs
		_tprintf(_T("错误: MFC 初始化失败\n"));
		nRetCode = 1;
	}
	else
	{
		if(argc != 2)
		{
			// The program name is passed as an argument rather than being
			// baked into the format string, so a '%' in the executable path
			// cannot be interpreted as a conversion specifier.
			_tprintf(_T("usage : \n    %s dir\n    dir [sample] : c:\\src\n"), argv[0]);

			// A usage error is a failure, not a successful run.
			nRetCode = 1;
			return nRetCode;
		}

		// The single argument is the root directory of the source tree.
		TCHAR *lpszDirName = argv[1];
		CString strFileType;
		strFileType.Format(_T("%s\\*.*"), lpszDirName);

		// Walk the tree; source files that lack a UTF-8 BOM are rewritten as UTF-8.
		recursiveFile(strFileType);
	}

	return nRetCode;
}

bool isSrcType(const CString strFileType)
{
	CString strExt_R4 = strFileType.Right(4);
	CString strExt_R2 = strFileType.Right(2);

	if ((strExt_R4.CompareNoCase(_T(".cpp")) == 0)
		|| (strExt_R2.CompareNoCase(_T(".c")) == 0)
		|| (strExt_R2.CompareNoCase(_T(".h")) == 0)
		|| (strExt_R4.CompareNoCase(_T(".cxx")) == 0)
		|| (strExt_R4.CompareNoCase(_T(".hpp")) == 0)
		)
	{
		return true;
	}

	return false;
}

// Walks every entry matching the wildcard pattern strFileType (for example
// "c:\\src\\*.*"), descending into sub-directories, and hands each source
// file (as decided by isSrcType) that does not start with a UTF-8 BOM to
// convertGBToUTF8.
//
// Files that cannot be opened, and files containing NUL bytes, are skipped.
// NOTE: only the BOM is checked, so a UTF-8 file without a BOM is treated as
// ANSI text and converted again.
void recursiveFile( CString strFileType)
{
	CFileFind finder;
	BOOL isFinded = finder.FindFile(strFileType); // start the search
	while(isFinded)
	{
		// FindNextFile must be called before the current entry can be queried.
		isFinded = finder.FindNextFile();
		if(finder.IsDots()) // skip the "." and ".." entries
			continue;

		CString strFoundFile = finder.GetFilePath();
		if(finder.IsDirectory())
		{
			// Recurse into the sub-directory. The explicit cast is required
			// because Format is variadic and expects a plain string pointer.
			CString strNextFileType;
			strNextFileType.Format(_T("%s\\*.*"), static_cast<LPCTSTR>(strFoundFile));
			recursiveFile(strNextFileType);
			continue;
		}

		// Only source and header files are of interest.
		if(!isSrcType(strFoundFile))
			continue;

		// Open() reports failure through its return value, so one unreadable
		// file does not abort the whole walk with an exception.
		CFile fileReader;
		if(!fileReader.Open(strFoundFile, CFile::modeRead|CFile::typeBinary))
			continue;

		// Detect the UTF-8 BOM; files shorter than three bytes cannot have one.
		byte head[3] = {0, 0, 0};
		const UINT nHeadLength = fileReader.Read(head, sizeof(head));
		if(nHeadLength == sizeof(head)
			&& head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
		{
			fileReader.Close();
			continue;
		}
		fileReader.SeekToBegin();

		// Read the whole file through a small stack buffer.
		char buf[256];
		std::string strContent;
		UINT nReadLength;
		while((nReadLength = fileReader.Read(buf, sizeof(buf))) != 0)
		{
			strContent.append(buf, nReadLength);
		}
		fileReader.Close();

		// The content is handed over as a NUL-terminated string; a file with
		// embedded NUL bytes (UTF-16 text, binary data) would be cut off at
		// the first one and lose data when rewritten, so leave it untouched.
		if(strContent.find('\0') != std::string::npos)
			continue;

		convertGBToUTF8(strFoundFile, strContent.c_str());
	}
	finder.Close();
}

// Rewrites the file strWritePath as UTF-8 with a BOM.
//
//   strWritePath - path of the file to (re)create
//   gb2312       - NUL-terminated file content, interpreted in the system
//                  ANSI code page (CP_ACP)
//
// The text is converted ANSI -> UTF-16 -> UTF-8 in memory first; the file is
// only opened (and thereby truncated) once the conversion has succeeded, so a
// failed conversion leaves the original file intact. Empty content produces a
// file that holds just the BOM.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
	if(gb2312 == NULL)
		return;

	const int ngblen = static_cast<int>(strlen(gb2312));

	std::string utf8;
	bool converted = true;
	if(ngblen > 0) // the conversion APIs reject a zero source length
	{
		// Explicit lengths are used throughout, so no terminating NUL is
		// produced and none ends up in the output file.
		const int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, NULL, 0);
		std::wstring wide(wideLen > 0 ? wideLen : 0, L'\0');
		converted = wideLen > 0
			&& MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, &wide[0], wideLen) != 0;

		if(converted)
		{
			const int newLen = WideCharToMultiByte(CP_UTF8, 0, wide.data(), wideLen, NULL, 0, NULL, NULL);
			utf8.resize(newLen > 0 ? newLen : 0);
			converted = newLen > 0
				&& WideCharToMultiByte(CP_UTF8, 0, wide.data(), wideLen, &utf8[0], newLen, NULL, NULL) != 0;
		}
	}

	if(!converted)
	{
		_tprintf(_T("conversion failed, file left unchanged: %s\n"), static_cast<LPCTSTR>(strWritePath));
		return;
	}

	CFile fp;
	if(!fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary, NULL))
	{
		_tprintf(_T("cannot open for writing: %s\n"), static_cast<LPCTSTR>(strWritePath));
		return;
	}

	const unsigned char aryBOM[]  = {0xEF, 0xBB, 0xBF};
	fp.Write(aryBOM, sizeof(aryBOM));
	if(!utf8.empty())
		fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
	fp.Close();
}

//z 2014-05-22 16:55:50 L.223'25450 BG57IV3 T427209771 .K.F253293061 [T484,L6693,R325,V8206]

Introduction

One very commonly asked question in programming is how to detect the character encoding of a string. Well, I'm going to share a cool method I came up with that can detect if a string is UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE in just 4 lines of code.

Explanation

We'll be working with null terminated strings, so the first rule is that we must terminate all strings with a quadruple null, regardless of encoding. You may wish to add a definition such as the following:

Collapse | Copy Code

#define NT "\0\0\0" 
 
char *exampleString = "This is UTF-8" NT;

Next is an explanation of how the checking works.

Collapse | Copy Code

1.===== If a string doesn't contain nulls, it's UTF-8
 :
else
 :
2:===== If a string doesn't contain double nulls, it's UTF-16
 :--.
 : 3:== If the nulls are on odd numbered indices, it's UTF-16LE
 :  :
 : else
 :  :
 : 4'== The string defaults to UTF-16BE
 :
else
 :
5:===== If the index modulo 4 is 0 and the character is greater than
 :      0x7F, the string is UTF-32LE. This is because the range of
 :      UTF-32 only goes up to 0x7FFFFFFF, meaning approximately 22%
 :      of the characters that can be represented will validate that
 :      the string is not big endian; including a BOM.
 :
else
 :
6'===== The string defaults to UTF-32BE

The Code

We check every byte until we reach a quadruple null:

Collapse | Copy Code

/*
 * Guesses the Unicode encoding of a buffer from the position of its zero
 * bytes.
 *
 * string: buffer to inspect; it must end with four consecutive zero bytes,
 *         because the scan looks three bytes ahead of the current index.
 *
 * Returns 0 = UTF-8, 1 = UTF-16BE, 2 = UTF-16LE, 3 = UTF-32BE, 4 = UTF-32LE.
 *
 * Decision tree:
 *   - no zero byte at all                      -> UTF-8
 *   - zero bytes, but never two in a row       -> UTF-16
 *       (LE if a zero byte sits on an odd index, otherwise BE)
 *   - two zero bytes in a row                  -> UTF-32
 *       (LE if a byte above 0x7F was seen before the first zero byte,
 *        e.g. an FF FE 00 00 BOM; otherwise BE)
 *
 * The result is derived from the individual flags rather than from how many
 * of them are set, so UTF-8 text containing bytes above 0x7F is still
 * reported as UTF-8 and a BOM does not shift UTF-16 results.
 */
int String_GetEncoding(char *string)
{
    enum {
        HAS_NULL         = 1, /* at least one zero byte was seen            */
        NULL_ON_ODD      = 2, /* a zero byte was seen on an odd index       */
        DOUBLE_NULL      = 4, /* a zero byte was followed by another one    */
        HIGH_BEFORE_NULL = 8  /* a byte > 0x7F preceded the first zero byte */
    };
    unsigned flags = 0;
    unsigned i = 0;

    /* Stop at the quadruple-null terminator. */
    while (string[i] | string[i + 1] | string[i + 2] | string[i + 3])
    {
        const unsigned char c = (unsigned char)string[i];
        ++i; /* from here on i is the index of the NEXT byte */

        if (c != 0)
        {
            if (!(flags & (HAS_NULL | NULL_ON_ODD)) && c > 0x7F)
                flags |= HIGH_BEFORE_NULL;
        }
        else
        {
            flags |= HAS_NULL;
            if (!(i & 1))           /* next index even => this index odd */
                flags |= NULL_ON_ODD;
            if (string[i] == 0)
                flags |= DOUBLE_NULL;
        }
    }

    if (!(flags & HAS_NULL))
        return 0;                                  /* UTF-8    */
    if (!(flags & DOUBLE_NULL))
        return (flags & NULL_ON_ODD) ? 2 : 1;      /* UTF-16   */
    return (flags & HIGH_BEFORE_NULL) ? 4 : 3;     /* UTF-32   */
}

The output:

Collapse | Copy Code

0  = UTF-8
1  = UTF-16BE
2  = UTF-16LE
3  = UTF-32BE
4  = UTF-32LE

Notes

Since UTF-32 encoding can contain several null bytes, its byte order checking is done through an alternative method that doesn't work 100% of the time, e.g., if all the characters are within the ASCII range and there isn't a BOM, it'll return UTF-32BE when it might actually be UTF-32LE.

This isn't really a big issue since UTF-32 is never used for storage, so chances are anyone that might use it will already know the byte ordering without having to check. However, if you're OCD, you could perform an additional check by treating UTF-32BE as UTF-16 and determining that string's byte ordering.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

About the Author

Ghosuwa Wogomon

United States

IsTextUnicode function

Determines if a buffer is likely to contain a form of Unicode text.

Syntax

C++

BOOL IsTextUnicode(
  _In_         const VOID *lpv,
  _In_         int iSize,
  _Inout_opt_  LPINT lpiResult
);

IMultiLanguage2::DetectInputCodepage method

2 out of 4 rated this helpful

Detects the code page of the given string.

Syntax

C++

HRESULT DetectInputCodepage(
  [in]       DWORD dwFlag,
  [in]       DWORD dwPrefWinCodePage,
  [in]       __wchar_t *pSrcStr,
  [in, out]  INT *pcSrcSize,
  [in, out]  DetectEncodingInfo *lpEncoding,
  [in, out]  INT *pnScores
);

Parameters

dwFlag [in]: One of the MLDETECTCP-defined bit flag values that specify the type of incoming source text. Setting the bit flags helps the detection engines produce more accurate results.
dwPrefWinCodePage [in]: The preferred Windows code page. If this value is set to zero, this API returns all possible encodings. Otherwise, it lists only those encodings related to this parameter.
pSrcStr [in]: The source string for which the client wants to detect the code page.
pcSrcSize [in, out]: The address of the buffer that stores the size of pSrcStr, in bytes. When this method is successful, it returns the number of bytes processed to this buffer.
lpEncoding [in, out]: A pointer to an array of DetectEncodingInfo structures where the detection information is returned.
pnScores [in, out]: A pointer to a buffer that contains the number of DetectEncodingInfo structures allocated in lpEncoding. When this method is successful, this parameter returns the number of elements of lpEncoding that are filled in.

Return value

Returns one of the following values.

Return code	Description
S_OK	Success.
S_FALSE	The method cannot determine the code page of the input stream.
E_FAIL	An error occurred.

Remarks

The caller is responsible for allocating and freeing the lpEncoding array.

Requirements

Minimum supported client	Windows XP
Minimum supported server	Windows 2000 Server
Header	Mlang.h
IDL	Mlang.idl
DLL	Mlang.dll

Detect encoding of a string in C/C++

Assuming you know the length of the input array, you can make the following guesses:

First, check to see if the first few bytes match any well-known byte order marks (BOM) for Unicode. If they do, you're done!
Next, search for '\0' before the last byte. If you find one, you might be dealing with UTF-16 or UTF-32. If you find multiple consecutive '\0's, it's probably UTF-32.
If any character is from 0x80 to 0xff, it's certainly not ASCII or UTF-7. If you are restricting your input to some variant of Unicode, you can assume it's UTF-8. Otherwise, you have to do some guessing to determine which multi-byte character set it is. That will not be fun.
At this point it is either: ASCII, UTF-7, Base64, or ranges of UTF-16 or UTF-32 that just happen to not use the top bit and do not have any null characters.

answered Sep 23 '11 at 1:42

MSN
29.8k23661

It's not an easy problem to solve, and generally relies on heuristics to take a best guess at what the input encoding is, which can be tripped up by relatively innocuous inputs - for example, take a look at this Wikipedia article and The Notepad file encoding Redux for more details.

If you're looking for a Windows-only solution with minimal dependencies, you can look at using a combination of IsTextUnicode and MLang's DetectInputCodePage to attempt character set detection.

If you are looking for portability, but don't mind taking on a fairly large dependency in the form of ICU then you can make use of its character set detection routines to achieve the same thing in a portable manner.

answered Sep 23 '11 at 1:49

russw_uk
53134

The Notepad file encoding problem, redux

RATE THIS

17 Apr 2007 10:00 AM

About every ten months, somebody new discovers the Notepad file encoding problem. Let's see what else there is to say about it.

First of all, can we change Notepad's detection algorithm? The problem is that there are a lot of different text files out there. Let's look just at the ones that Notepad supports.

8-bit ANSI (of which 7-bit ASCII is a subset). These have no BOM; they just dive right in with bytes of text. They are also probably the most common type of text file.
UTF-8. These usually begin with a BOM but not always.
Unicode big-endian (UTF-16BE). These usually begin with a BOM but not always.
Unicode little-endian (UTF-16LE). These usually begin with a BOM but not always.

If a BOM is found, then life is easy, since the BOM tells you what encoding the file uses. The problem is when there is no BOM. Now you have to guess, and when you guess, you can guess wrong. For example, consider this file:

D0 AE

Depending on which encoding you assume, you get very different results.

If you assume 8-bit ANSI (with code page 1252), then the file consists of the two characters U+00D0 U+00AE, or "Ð®". Sure this looks strange, but maybe it's part of the word VATNIÐ® which might be the name of an Icelandic hotel.
If you assume UTF-8, then the file consists of the single Cyrillic character U+042E, or "Ю".
If you assume Unicode big-endian, then the file consists of the Korean Hangul syllable U+D0AE, or "킮".
If you assume Unicode little-endian, then the file consists of the Korean Hangul syllable U+AED0, or "껐".

Okay, so this file can be interpreted in four different ways. Are you going to use the "try to guess" algorithm from IsTextUnicode? (Michael Kaplan has some thoughts on this subject.) If so, then you are right where Notepad is today. Notice that all four interpretations are linguistically plausible.

Some people might say that the rule should be "All files without a BOM are 8-bit ANSI." In that case, you're going to misinterpret all the files that use UTF-8 or UTF-16 and don't have a BOM. Note that the Unicode standard even advises against using a BOM for UTF-8, so you're already throwing out everybody who follows the recommendation.

Okay, given that the Unicode folks recommend against using a BOM for UTF-8, maybe your rule is "All files without a BOM are UTF-8." Well, that messes up all 8-bit ANSI files that use characters above 127.

Maybe you're willing to accept that ambiguity, and use the rule, "If the file looks like valid UTF-8, then use UTF-8; otherwise use 8-bit ANSI, but under no circumstances should you treat the file as UTF-16LE or UTF-16BE." In other words, "never auto-detect UTF-16". First, you still have ambiguous cases, like the file above, which could be either 8-bit ANSI or UTF-8. And second, you are going to be flat-out wrong when you run into a Unicode file that lacks a BOM, since you're going to misinterpret it as either UTF-8 or (more likely) 8-bit ANSI. You might decide that programs that generate UTF-16 files without a BOM are broken, but that doesn't mean that they don't exist. For example,

cmd /u /c dir >results.txt

This generates a UTF-16LE file without a BOM. If you poke around your Windows directory, you'll probably find other Unicode files without a BOM. (For example, I found COM+.log.) These files still "worked" under the old IsTextUnicode algorithm, but now they are unreadable. Maybe you consider that an acceptable loss.

The point is that no matter how you decide to resolve the ambiguity, somebody will win and somebody else will lose. And then people can start experimenting with the "losers" to find one that makes your algorithm look stupid for choosing "incorrectly".

//////////////////////////////////////////////////////////////////////////
//
// FILE: utf8conv.h
//
// Header file defining helper functions for converting strings
// between Unicode UTF-8 and UTF-16.
//
// UTF-8 is stored in std::string; UTF-16 is stored in std::wstring.
//
// This code just uses Win32 Platform SDK and C++ standard library; 
// so it can be used also with the Express editions of Visual Studio.
//
//
// February 4th, 2011
//
// by Giovanni Dicanio <gdicanio@mvps.org>
//
//////////////////////////////////////////////////////////////////////////


#pragma once


//------------------------------------------------------------------------
//                              INCLUDES
//------------------------------------------------------------------------

#include <limits.h>     // INT_MAX
#include <stdarg.h>     // variable argument lists...
#include <stdio.h>      // ...and vsprintf_s

#include <exception>    // std::exception
#include <string>       // STL string classes

#include <Windows.h>    // Win32 Platform SDK main header



namespace utf8util {


//------------------------------------------------------------------------
// Exception class representing an error occurred during UTF-8 conversion.
//------------------------------------------------------------------------
class utf8_error 
    : public std::exception
{
public:
   
    // Constructs an utf8_error with a message string that can use a
    // printf-like syntax for formatting.
    // Messages that do not fit the internal buffer are truncated.
    explicit utf8_error(const char * format, ...);

    // Override from std::exception::what().
    // Returns the formatted, NUL-terminated message.
    // The empty exception specification matches the non-throwing
    // declaration of std::exception::what(); it is spelled throw() so that
    // pre-C++11 compilers accept it as well.
    const char * what() const throw();


    //
    // IMPLEMENTATION
    //
private:
    char m_message[512];    // buffer for error message
};


inline utf8_error::utf8_error(const char * format, ...)
    : m_message()           // zero-fill, so the buffer always holds a valid string
{
    // Format error message in buffer; vsnprintf never writes past the
    // given size, so an over-long message is cut instead of overflowing.
    va_list args;
    va_start(args, format);
    vsnprintf(m_message, sizeof(m_message), format, args);
    va_end(args);

    // Guarantee termination even with runtimes whose vsnprintf leaves a
    // completely filled buffer unterminated.
    m_message[sizeof(m_message) - 1] = '\0';
}


inline const char * utf8_error::what() const throw()
{
    return m_message;
}

//------------------------------------------------------------------------



//------------------------------------------------------------------------
// Converts a string from UTF-8 to UTF-16.
// On error, can throw an utf8_error exception.
//------------------------------------------------------------------------
inline std::wstring utf16_from_utf8(const std::string & utf8)
{
    //
    // Special case of empty input string
    // (the Win32 conversion API rejects a zero source length)
    //
    if (utf8.empty())
        return std::wstring();


    //
    // The Win32 API takes lengths as int: reject input that does not fit
    // instead of silently truncating the size_t length.
    //
    if (utf8.length() > static_cast<size_t>(INT_MAX))
    {
        throw utf8_error(
            "Can't convert string from UTF-8 to UTF-16 (input longer than INT_MAX bytes).");
    }
    const int utf8_length = static_cast<int>(utf8.length());


    //
    // Get length (in wchar_t's) of resulting UTF-16 string
    //
    const int utf16_length = ::MultiByteToWideChar(
        CP_UTF8,            // convert from UTF-8
        0,                  // default flags
        utf8.data(),        // source UTF-8 string
        utf8_length,        // length (in chars) of source UTF-8 string
        NULL,               // unused - no conversion done in this step
        0                   // request size of destination buffer, in wchar_t's
        );
    if (utf16_length == 0)
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't get length of UTF-16 string (MultiByteToWideChar set last error to %lu).", 
            error);
    }


    //
    // Allocate destination buffer for UTF-16 string
    //
    std::wstring utf16;
    utf16.resize(utf16_length);


    //
    // Do the conversion from UTF-8 to UTF-16
    //
    if ( ! ::MultiByteToWideChar(
        CP_UTF8,            // convert from UTF-8
        0,                  // default flags
        utf8.data(),        // source UTF-8 string
        utf8_length,        // length (in chars) of source UTF-8 string
        &utf16[0],          // destination buffer
        utf16_length        // size of destination buffer, in wchar_t's
        ) )
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't convert string from UTF-8 to UTF-16 (MultiByteToWideChar set last error to %lu).", 
            error);
    }

    //
    // Return resulting UTF-16 string
    //
    return utf16;
}


//------------------------------------------------------------------------
// Converts a string from UTF-16 to UTF-8.
// On error, can throw an utf8_error exception.
//------------------------------------------------------------------------
inline std::string utf8_from_utf16(const std::wstring & utf16)
{
    //
    // Special case of empty input string
    // (the Win32 conversion API rejects a zero source length)
    //
    if (utf16.empty())
        return std::string();


    //
    // The Win32 API takes lengths as int: reject input that does not fit
    // instead of silently truncating the size_t length.
    //
    if (utf16.length() > static_cast<size_t>(INT_MAX))
    {
        throw utf8_error(
            "Can't convert string from UTF-16 to UTF-8 (input longer than INT_MAX wchar_t's).");
    }
    const int utf16_length = static_cast<int>(utf16.length());


    //
    // Get length (in chars) of resulting UTF-8 string
    //
    const int utf8_length = ::WideCharToMultiByte(
        CP_UTF8,            // convert to UTF-8
        0,                  // default flags
        utf16.data(),       // source UTF-16 string
        utf16_length,       // source string length, in wchar_t's,
        NULL,               // unused - no conversion required in this step
        0,                  // request buffer size
        NULL, NULL          // unused
        );
    if (utf8_length == 0)
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't get length of UTF-8 string (WideCharToMultiByte set last error to %lu).", 
            error);
    }


    //
    // Allocate destination buffer for UTF-8 string
    //
    std::string utf8;
    utf8.resize(utf8_length);


    //
    // Do the conversion from UTF-16 to UTF-8
    //
    if ( ! ::WideCharToMultiByte(
        CP_UTF8,                // convert to UTF-8
        0,                      // default flags
        utf16.data(),           // source UTF-16 string
        utf16_length,           // source string length, in wchar_t's,
        &utf8[0],               // destination buffer
        utf8_length,            // destination buffer size, in chars
        NULL, NULL              // unused
        ) )
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't convert string from UTF-16 to UTF-8 (WideCharToMultiByte set last error to %lu).", 
            error);
    }


    //
    // Return resulting UTF-8 string
    //
    return utf8;
}


} // namespace utf8util


//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
//
// FILE: TestUTF8Conversion.cpp
//
// Defines the entry point for the console test application.
//
// By Giovanni Dicanio <gdicanio@mvps.org>
//
//////////////////////////////////////////////////////////////////////////


#include "stdafx.h"       // precompiled headers
#include "utf8conv.h"     // UTF-8 conversion helpers

using namespace std;
using namespace utf8util;



//------------------------------------------------------------------------
// Some tests for UTF-8 <-> UTF-16 conversion.
//------------------------------------------------------------------------
void test()
{
    //
    // Test a simple UTF-16 <-> UTF-8 conversion
    //
    
    // Source UTF-16 string
    wstring utf16(L"Euro sign (U+20AC): \x20AC");
    
    // Convert from UTF-16 to UTF-8
    string utf8 = utf8_from_utf16(utf16);
    
    // Convert back from UTF-8 to UTF-16
    wstring utf16_new = utf16_from_utf8(utf8);
    
    // Check conversion result
    if (utf16_new != utf16)
        throw runtime_error("UTF-16 <-> UTF-8 conversion failed.");



    //
    // Test with empty strings
    //
    if (! utf16_from_utf8("").empty())
        throw runtime_error("Empty UTF-8 string not converted to empty UTF-16 string.");

    if (! utf8_from_utf16(L"").empty())
        throw runtime_error("Empty UTF-16 string not converted to empty UTF-8 string.");



    //
    // Test with invalid UTF-8 bytes
    //

    // 0xC0 0xAF UTF-8 sequence is discussed in "Writing Secure Code" 
    // (Chapter 11, "How UTF-8 Encodes Data", page 380)
    char utf8_invalid[] = "UTF-8 invalid sequence: \xC0\xAF";
    wstring utf16_invalid = utf16_from_utf8(utf8_invalid);
    //
    // Unicode UTF-16 'REPLACEMENT CHARACTER' (U+FFFD) 
    // is used for the invalid UTF-8 bytes.
    //
    // http://www.fileformat.info/info/unicode/char/fffd/index.htm
    //
}



//------------------------------------------------------------------------
// Entry-point.
//------------------------------------------------------------------------
int wmain(int argc, wchar_t* argv[])
{
    static const int ok = 0;
    static const int fail = 1;
    int exit_code = ok;

    try
    {
        cout << "*** Testing UTF-8 <-> UTF-16 Conversion ***" << endl;
        test();
        cout << "All right." << endl;
    }
    catch(const exception & e)
    {
        cerr << "*** ERROR: " << e.what() << endl;
        exit_code = fail;
    }

    return exit_code;
}


//////////////////////////////////////////////////////////////////////////

posted @ 2014-03-31 20:36 BiG5 阅读(828) 评论(0) 收藏举报

刷新页面返回顶部

BiG5

TiME MachInE

WideCharToMultiByte

Simple Character Encoding Detection

Introduction

Explanation

The Code

Notes

License

About the Author

IsTextUnicode function

Syntax

IMultiLanguage2::DetectInputCodepage method

Syntax

Parameters

Return value

Remarks

Requirements

See also

Detect encoding of a string in C/C++

The Notepad file encoding problem, redux

公告