音标编码转 Unicode 输出(VC++)
字典的音标有的用 Kingsoft Phonetic Plain 字体,有的用 KK 字体,两者对应的编码也不同。输出时虽然可以设置对应的字体,但需要先安装该字体,程序中还要切换字体。
统一转换成 UTF-16 后就可以统一输出了:Windows 程序一般都用 UTF-16,接口转换也比较方便。
对应的转换关系可以在网上找到,不过网上的不一定准确,还需要自己测试并调整:
// Maps one character of a "Kingsoft Phonetic Plain" encoded transcription to
// the UTF-16 code unit of the IPA symbol it stands for.
//
// k       : character as stored in the dictionary data.
// returns : the mapped code unit, or k itself when the table has no entry.
wchar_t KingCodeToUTF16(wchar_t k)
{
    switch (k)
    {
    case L'5':  return 0x02c8; // 'ˈ' primary stress
    case L'7':  return 0x02cc; // 'ˌ' (original note: not encountered in the data)
    case L'9':  return 0x02cc; // 'ˌ' secondary stress
    case L'A':  return 0x00e6; // 'æ'
    case L'B':  return 0x0251; // 'ɑ'
    case L'C':  return 0x0254; // 'ɔ'
    case L'E':  return 0x0259; // 'ə'
    case L'F':  return 0x0283; // 'ʃ'
    case L'I':  return 0x026a; // 'ɪ'
    case L'J':  return 0x028a; // 'ʊ'
    case L'N':  return 0x014b; // 'ŋ'
    case L'Q':  return 0x028c; // 'ʌ'
    case L'R':  return 0x0252; // 'ɒ' (IPA88 writes this as 'ɔ', same as 'C')
    case L'T':  return 0x00f0; // 'ð'
    case L'U':  return 0x028a; // 'ʊ' -- NOTE(review): the author was unsure whether
                               // an upper-case 'U' should be plain 'u'; left as is.
    case L'V':  return 0x0292; // 'ʒ'
    case L'W':  return 0x03b8; // 'θ'
    case L'\\': return 0x025c; // 'ɜ' (was 0x0292, which is 'ʒ' and already produced
                               // by 'V'; IPA88 may use 'ə' here instead)
    case L'^':  return 0x0261; // 'ɡ'
    case L'Z':  return 0x025b; // 'ɛ'
    default:    return k;      // everything else is already the right character
    }
}

// Maps one character of a "KK" font encoded transcription to the UTF-16 code
// unit of the phonetic symbol it stands for.
//
// kk      : character as stored in the dictionary data.
// returns : the mapped code unit, or kk itself when the table has no entry.
//
// The King-only codes 'R', 'U' and '^' are intentionally not mapped here.
wchar_t KKToUTF16(wchar_t kk)
{
    switch (kk)
    {
    case L'!':  return 0x026a; // 'ɪ'
    case L'"':  return 0x02c8; // 'ˈ'
    case L'#':  return 0x00e6; // 'æ'
    case L'$':  return 0x0251; // 'ɑ'
    case L'%':  return 0x0254; // 'ɔ'
    case L'&':  return 0x028a; // 'ʊ'
    case L'(':  return 0x028c; // 'ʌ'
    case L')':  return 0x025c; // 'ɜ' (was 0x0292 = 'ʒ', contradicting the intended
                               // symbol) -- NOTE(review): confirm against the KK font
    case L'*':  return 0x0259; // 'ə' (the label was written as the invalid escape '\*')
    case L'+':  return 0x0259; // 'ə'
    case L'.':  return 0x0283; // 'ʃ'
    case L'/':  return 0x0292; // 'ʒ' (the label was the multi-character constant '//',
                               // which can never equal a single input character)
    case L'0':  return 0x014b; // 'ŋ'
    case L'6':  return 0x02c8; // 'ˈ'
    case L'7':  return 0x02cc; // 'ˌ'
    case L'8':  return L':';   // length mark, written as a plain colon
    case L'9':  return 0x0292; // 'ʒ'
    case L'<':  return 0x02cc; // 'ˌ'
    case L'G':  return 0x03b8; // 'θ'
    case L'H':  return 0x00f0; // 'ð'
    case L'W':  return 0x025b; // 'ɛ'
    default:    return kk;
    }
}

// Converts a NUL-terminated string character by character through mapChar.
// Returns a buffer allocated with new[] (release it with delete[]), or
// nullptr when src is nullptr.
static wchar_t* MapPhoneticString(const wchar_t* src, wchar_t (*mapChar)(wchar_t))
{
    if (src == nullptr)
    {
        return nullptr;
    }

    const size_t len = wcslen(src);
    wchar_t* dst = new wchar_t[len + 1];
    for (size_t i = 0; i < len; ++i)
    {
        dst[i] = mapChar(src[i]);
    }
    dst[len] = L'\0';
    return dst;
}

// Converts a Kingsoft Phonetic Plain encoded string to UTF-16.
// The result is allocated with new[]; the caller must release it with delete[].
// Returns nullptr when pking is nullptr.
wchar_t* kingtouc16(const wchar_t* pking)
{
    return MapPhoneticString(pking, KingCodeToUTF16);
}

// Converts a KK font encoded string to UTF-16.
// The result is allocated with new[]; the caller must release it with delete[].
// Returns nullptr when pking is nullptr.
wchar_t* kktouc16(const wchar_t* pking)
{
    return MapPhoneticString(pking, KKToUTF16);
}
测试程序运行效果:
完整测试程序(其中还包含一些其他测试,不影响音标输出测试):
// TW32.cpp : Defines the entry point for the application. // #include "stdafx.h" #include "resource.h" #include<stdio.h> #include <commctrl.h> #include <richedit.h> #define MAX_LOADSTRING 100 // Global Variables: HINSTANCE hInst; // current instance TCHAR szTitle[MAX_LOADSTRING] = _T("TW32"); // The title bar text TCHAR szWindowClass[MAX_LOADSTRING] =_T("TW32_XGZ_2023"); // The title bar text // Foward declarations of functions included in this code module: ATOM MyRegisterClass(HINSTANCE hInstance); BOOL InitInstance(HINSTANCE, int); LRESULT CALLBACK WndProc(HWND, UINT, WPARAM, LPARAM); LRESULT CALLBACK About(HWND, UINT, WPARAM, LPARAM); //检查文件类型 #define GFT_NULL 0 //没有打开文件 #define GFT_ANSI 1 //ANSI #define GFT_UTF16LE 2 //UTF-16 LE #define GFT_UTF16BE 3 //UTF-16 BE #define GFT_UTF8BOM 4 //UTF-8 BOM 文件头3字节为标识,明确是UTF8 #define GFT_UTF8 5 //UTF-8 没有BOM,有可能是UTF8 //#define GFT_UTF8_0 6 //没有UTF8长字符 //没必要 //#define GFT_UTF8_N 7 //有UTF8长字符 #define GFT_CKLENMAX 1000000 //最多检查1000,000 个字符,避免大文件 bool IsUTF8Count(const void* pBuffer, long size, long& utf8num); int GetFileType(const TCHAR* PathFileName); unsigned int a_encode = CP_UTF8; //从UTF16转换存储建议统一用UTF8,程序中最好显式调用 wchar_t* CharToWchar(const char* c, size_t m_encode = CP_ACP); // 因为VS2022 IDE中的ANSI串是ACP, char* WcharToChar(const wchar_t* wp, size_t m_encode = CP_ACP); // int PRINT(const TCHAR* fmt, ...); //int setFormat(int size, DWORD c); int setFormat(HWND hWndREdit, int size, DWORD c, TCHAR* szFaceName = _T("Tahoma")); int RPRINT(const TCHAR* fmt, ...); int OnCreate(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam); int OnPaint(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam); int OnSize(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam); #define IDC_RICHEDIT 1010 #define IDC_EDIT 1011 #define IDC_LIST 1012 HWND hWndRichEdit; HWND hWndEdit; HWND hWndList; int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow) { // TODO: Place code here. 
MSG msg; HACCEL hAccelTable; MyRegisterClass(hInstance); // Perform application initialization: if (!InitInstance (hInstance, nCmdShow)) { return FALSE; } hAccelTable = LoadAccelerators(hInstance, (LPCTSTR)IDC_TW32); // Main message loop: while (GetMessage(&msg, NULL, 0, 0)) { if (!TranslateAccelerator(msg.hwnd, hAccelTable, &msg)) { TranslateMessage(&msg); DispatchMessage(&msg); } } return msg.wParam; } ATOM MyRegisterClass(HINSTANCE hInstance) { WNDCLASSEX wcex; wcex.cbSize = sizeof(WNDCLASSEX); wcex.style = CS_HREDRAW | CS_VREDRAW; wcex.lpfnWndProc = (WNDPROC)WndProc; wcex.cbClsExtra = 0; wcex.cbWndExtra = 0; wcex.hInstance = hInstance; wcex.hIcon = LoadIcon(hInstance, (LPCTSTR)IDI_TW32); wcex.hCursor = LoadCursor(NULL, IDC_ARROW); wcex.hbrBackground = (HBRUSH)(COLOR_WINDOW+1); wcex.lpszMenuName = (LPCTSTR)IDC_TW32; //XGZ wcex.lpszClassName = szWindowClass; wcex.hIconSm = LoadIcon(wcex.hInstance, (LPCTSTR)IDI_SMALL); return RegisterClassEx(&wcex); } BOOL InitInstance(HINSTANCE hInstance, int nCmdShow) { HWND hWnd; hInst = hInstance; // Store instance handle in our global variable hWnd = CreateWindow(szWindowClass, szTitle, WS_OVERLAPPEDWINDOW, CW_USEDEFAULT, 0, CW_USEDEFAULT, 0, NULL, NULL, hInstance, NULL); if (!hWnd) { return FALSE; } ShowWindow(hWnd, nCmdShow); UpdateWindow(hWnd); return TRUE; } LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { int wmId, wmEvent; TCHAR szHello[MAX_LOADSTRING]; LoadString(hInst, IDS_HELLO, szHello, MAX_LOADSTRING); switch (message) { case WM_COMMAND: wmId = LOWORD(wParam); wmEvent = HIWORD(wParam); // Parse the menu selections: switch (wmId) { case IDM_ABOUT: DialogBox(hInst, (LPCTSTR)IDD_ABOUTBOX, hWnd, (DLGPROC)About); break; case IDM_EXIT: DestroyWindow(hWnd); break; default: return DefWindowProc(hWnd, message, wParam, lParam); } break; case WM_CREATE: OnCreate( hWnd, message, wParam, lParam); break; case WM_PAINT: OnPaint(hWnd, message, wParam, lParam); break; case WM_SIZE: OnSize(hWnd, 
message, wParam, lParam); break; case WM_DESTROY: PostQuitMessage(0); break; default: return DefWindowProc(hWnd, message, wParam, lParam); } return 0; } wchar_t KingCodeToUTF16(wchar_t k) { wchar_t uc = 0; switch (k) { case '5': //uc = 'ˈ'; uc = 0x02c8; break; case '7': uc = 0x02cc;// 'ˌ'; //没有 break; case '9': uc = 0x02cc;// 'ˌ'; break; case 'A': uc = 0x00e6; // 'æ'; break; case 'B': uc = 0x0251; // 'ɑ'; break; case 'C': uc = 0x0254; // 'ɔ'; break; case 'E': uc = 0x0259;// 'ə'; break; case 'F': uc = 0x0283;// 'ʃ'; break; case 'I': uc = 0x026a; // 'ɪ'; break; case 'J': uc = 0x028a; // 'ʊ'; break; case 'N': uc = 0x014b; // 'ŋ'; break; case 'Q': uc = 0x028c;// 'ʌ'; break; case 'R': uc = 0x0252; // 'ɔ'; 'ɒ' IPA88 同 C break; case 'T': uc = 0x00f0; // 'ð'; break; case 'U': uc = 0x028a; // 'u'; ?? 这是大写U break; case 'V': uc = 0x0292; // 'ʒ'; break; case 'W': uc = 0x03b8; // 'θ'; break; case '\\': uc = 0x0292;// 'ɜ'; // ?? IPA88 可以用E break; case '^': uc = 0x0261;// 'ɡ'; break; case 'Z': uc = 0x025b;// 'ɛ'; break; default: uc = k; break; } return uc; } wchar_t* kingtouc16(wchar_t* pking) { int i; int len = wcslen(pking); wchar_t* puc = new wchar_t[len + 1]; for(i = 0;i< len; i++) { puc[i] = KingCodeToUTF16(pking[i]); } puc[i] = 0; return puc; } wchar_t KKToUTF16(wchar_t kk) { wchar_t uc = 0; switch (kk) { case '!': uc = 0x026a; // 'ɪ'; break; case '"': uc = 0x02c8; //'ˈ'; break; case '#': uc = 0x00e6; // 'æ'; break; case '$': uc = 0x0251; // 'ɑ'; break; case '%': uc = 0x0254; // 'ɔ'; break; case '&': uc = 0x028a; // 'ʊ'; break; case '(': uc = 0x028c;// 'ʌ'; break; case ')': uc = 0x0292;// 'ɜ'; break; case '\*': uc = 0x0259;// 'ə'; break; case '+': uc = 0x0259;// 'ə'; break; case '.': uc = 0x0283;// 'ʃ'; break; case '//': uc = 0x0292; // 'ʒ'; break; case '0': uc = 0x014b; // 'ŋ'; break; case '6': uc = 0x02c8; //'ˈ'; break; case '7': uc = 0x02cc;// 'ˌ'; break; case '8': uc = ':'; // ':'; break; case '9': uc = 0x0292; // 'ʒ'; break; case '<': uc = 0x02cc;// 'ˌ'; break; case 
'G': uc = 0x03b8; // 'θ'; break; case 'H': uc = 0x00f0; // 'ð'; break; case 'W': uc = 0x025b;// 'ɛ'; break; /* //下面这些是King case 'R': uc = 0x0252; // 'ɔ'; 'ɒ' IPA88 同 C break; case 'U': uc = 0x028a; // 'u'; ?? 这是大写U break; case '^': uc = 0x0261;// 'ɡ'; break; */ default: uc = kk; break; } return uc; } wchar_t* kktouc16(wchar_t* pking) { int i; int len = wcslen(pking); wchar_t* puc = new wchar_t[len + 1]; for (i = 0; i < len; i++) { puc[i] = KKToUTF16(pking[i]); } puc[i] = 0; return puc; } int OnCreate(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { // InitCommonControls(); HINSTANCE hRich = LoadLibrary(_T("Riched20.dll")); hWndRichEdit = CreateWindowEx(WS_EX_CLIENTEDGE, RICHEDIT_CLASS, NULL,// _T("RichEdit20W"), NULL, WS_CHILD | WS_VISIBLE | WS_VSCROLL | ES_MULTILINE, // | ES_READONLY 0, 0, 500, 500, hWnd, (HMENU)IDC_RICHEDIT, hInst, NULL); hWndEdit = CreateWindow(_T("edit"), NULL, WS_CHILD | WS_BORDER | WS_VISIBLE | ES_MULTILINE | WS_VSCROLL ,// | ES_READONLY 0, 0, 0, 0, hWnd, (HMENU)IDC_EDIT, hInst, NULL); hWndList = CreateWindow(_T("listbox"), NULL, WS_CHILD | WS_BORDER | WS_VISIBLE | WS_VSCROLL | WS_HSCROLL | LBS_NOTIFY,// | LBS_SORT, 0, 0, 0, 0, hWnd, (HMENU)IDC_LIST, hInst, NULL); PRINT(_T("=== [ Main OnCreate Test PRINT 圆周率 = %f ] ===\r\n"), 3.1415926); //=============== RPRINT(_T("\r\nTest\r\n")); //先输入 setFormat(hWndRichEdit,5, 0x800000, _T("宋体")); wchar_t* pWord = _T("enthymeme"); wchar_t *pking = _T("(`ZnWE9mim; 5enWE9mi:m)"); wchar_t* pkk = _T("6WnG*7mim"); wchar_t *pUc; RPRINT(_T("\r\n\r\n %s"), pWord); pUc = kingtouc16(pking); RPRINT(_T("\r\nKing = %s"), pUc); delete pUc; pUc = kktouc16(pkk); RPRINT(_T("\r\nKK = %s"), pUc); delete pUc; wchar_t* pWord2 = _T("actual"); wchar_t* pking2 = _T("(`AktFJEl; 5AktFuEl)"); wchar_t* pkk2 = _T("6#kt.u*l"); RPRINT(_T("\r\n\r\n %s"), pWord2); pUc = kingtouc16(pking2); RPRINT(_T("\r\nKing = %s"), pUc); delete pUc; pUc = kktouc16(pkk2); RPRINT(_T("\r\nKK = %s"), pUc); delete pUc; //============ TCHAR* PathFile 
= _T("R:\\21世纪UTF8.txt"); int iFType = GetFileType(PathFile); PRINT(_T("\r\n%s"), PathFile); PRINT(_T("\r\nGetFileType = %d"), iFType); FILE* fp; fp = _tfopen(PathFile, _T("r,ccs=UTF-8")); //ftell == 3 if (NULL == fp) { PRINT(_T("\r\n<FAIL> fopen failed!")); return 1; } LARGE_INTEGER t1, t2, tc; //计时 double time; QueryPerformanceFrequency(&tc); QueryPerformanceCounter(&t1); //计时 int fstrlen; fpos_t fpos; int i; int j; int f1; int f2; int fm; //fseek(fp, 0, SEEK_SET); //直接到0,不管有没有BOM头 ,rewind(fp)也是一样 //fseek(fp, 3, SEEK_SET); //若要跳过BOM头,则定位到3 f1 = ftell(fp); //用ccs=UTF-8打开后当前位置3 j = fseek(fp, 0, SEEK_END); f2 = ftell(fp); fm = (f1 + f2) / 2; fseek(fp, fm, 0); fgetpos(fp, &fpos); PRINT(_T("\r\nf1=%d,fm=%d,f2=%d"), f1, fm, f2); TCHAR* buf = new TCHAR[100000]; TCHAR* bufyb = new TCHAR[100]; fseek(fp, 3, SEEK_SET); for (int i = 0; i < 10; i++) { fgetws(buf, 1000000, fp); int len = lstrlen(buf); for (int j = 0; j < len; j++) { if ((buf[j] == '\`')&&(buf[j+1]=='3') && (buf[j + 2] == '\`') && (buf[j + 3] == '(')) { int k; for (k = j+4; k < j+100; k++) { if(buf[k] == ')') break; bufyb[k] = buf[k]; } bufyb[k] = 0; PRINT(bufyb+j+4); } } //PRINT(_T("\r\n%s"), buf); PRINT(_T("\r\n")); } //fsetpos(fp, &fpos); //if (fgetws(buf, 1000000, fp) != NULL) //if (fgets(buf, 1000000, fp) != NULL) //{ //ShowWTXT(buf); //} delete bufyb; delete buf; fclose(fp); QueryPerformanceCounter(&t2); time = (double)(t2.QuadPart - t1.QuadPart) / (double)tc.QuadPart; PRINT(_T("\r\nRun time = %f"), time); return 0; } int OnPaint(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { HDC hdc; PAINTSTRUCT ps; HFONT hFont; wchar_t* wp = L"Chinese中文测试 =3.1415926 immethodical (ˌɪmə`θɑdɪkl; ˌimiˈθɔdikəl)形容词 无秩序的; 无规则的; 漫无章法的; 紊乱的"; wchar_t wbuf[1024]; wchar_t wbuf2[1024]; wchar_t wbuf3[1024]; hdc = BeginPaint(hWnd, &ps); LOGFONT logfont; ZeroMemory(&logfont, sizeof(LOGFONT)); logfont.lfCharSet = DEFAULT_CHARSET; logfont.lfHeight = -20; lstrcpy(logfont.lfFaceName, _T("华文中宋")); //若在不同系统下运行不要用默认 hFont = 
CreateFontIndirect(&logfont); SelectObject(hdc, hFont); /*GetObject(hFont, sizeof(LOGFONT), &logfont); logfont.lfHeight = 16; _tcscpy(logfont.lfFaceName, _T("Times New Roman")); //单用时,中会用系统默认 hFont = CreateFontIndirect(&logfont); SelectObject(hdc, hFont);*/ SetTextColor(hdc, RGB(100, 0, 0)); SetBkColor(hdc, RGB(255, 255, 0)); //高亮,亮黄底色 //WritePrivateProfileString(_T("Default"), _T("Str中文1"), wp, _T("R:\\tes中文1.ini")); GetPrivateProfileString(_T("Default"), _T("Str中文1"), _T("NULL"), wbuf, 1024, _T("R:\\tes中文1.ini")); GetPrivateProfileString(_T("Default"), _T("Str中文2"), _T("NULL"), wbuf2, 1024, _T("R:\\tes中文1.ini")); GetPrivateProfileString(_T("Default"), _T("Str3"), _T("NULL"), wbuf3, 1024, _T("R:\\tes中文1.ini")); RECT rt; GetClientRect(hWnd, &rt); DrawText(hdc, wp, lstrlen(wp), &rt, DT_LEFT); TextOut(hdc, 10, 50, wbuf, lstrlen(wbuf)); TextOut(hdc, 10, 150, wbuf2, lstrlen(wbuf2)); TextOut(hdc, 10, 250, wbuf3, lstrlen(wbuf3)); EndPaint(hWnd, &ps); return 0; } int OnSize(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { int cxClient, cyClient; cxClient = LOWORD(lParam); cyClient = HIWORD(lParam); int top = 100; //MoveWindow(hWndEditInput, 5, 0, 200, 30, TRUE); MoveWindow(hWndList, 5, top, 250, cyClient - top - 20, TRUE); //MoveWindow(hWndMainToolbar, 210, 0, 400, 25, TRUE); MoveWindow(hWndRichEdit, 260, top, cxClient - 265, cyClient - top -200 -20, TRUE); MoveWindow(hWndEdit, 260, cyClient - 200, cxClient - 265, 170, TRUE); //SendMessage(hWndList, CB_SETHORIZONTALEXTENT, (WPARAM)10, (LPARAM)0); return DefWindowProc(hWnd, message, wParam, lParam); } // Mesage handler for about box. LRESULT CALLBACK About(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) { switch (message) { case WM_INITDIALOG: return TRUE; case WM_COMMAND: if (LOWORD(wParam) == IDOK || LOWORD(wParam) == IDCANCEL) { EndDialog(hDlg, LOWORD(wParam)); return TRUE; } break; } return FALSE; } TCHAR buffer[10000]; int PRINT(const TCHAR* fmt, ...) 
{ va_list argptr; int cnt; int iEditTextLength; HWND hWnd = hWndEdit; if (NULL == hWnd) return 0; va_start(argptr, fmt); cnt = _vstprintf(buffer, fmt, argptr); // A or W but ISO C //cnt = vswprintf(buffer, fmt, argptr); // only W //cnt = wvsprintf(buffer, fmt, argptr); // not %f va_end(argptr); iEditTextLength = GetWindowTextLength(hWnd); if (iEditTextLength + cnt > 30000) // edit text max length is 30000 { SendMessage(hWnd, EM_SETSEL, 0, 10000); SendMessage(hWnd, WM_CLEAR, 0, 0); iEditTextLength = iEditTextLength - 10000; } SendMessage(hWnd, EM_SETSEL, iEditTextLength, iEditTextLength); SendMessage(hWnd, EM_REPLACESEL, 0, (LPARAM)buffer); return(cnt); } int setFormat( HWND hWndREdit, int size, DWORD c, TCHAR * szFaceName) { CHARFORMAT cf; ZeroMemory(&cf, sizeof(CHARFORMAT)); cf.cbSize = sizeof(CHARFORMAT); cf.dwMask |= CFM_COLOR; cf.crTextColor = RGB(0, 0, 0); //设置颜色 cf.dwMask |= CFM_SIZE; cf.yHeight = 200;//设置高度 cf.dwMask |= CFM_FACE; //wcscpy(cf.szFaceName, _T("Kingsoft Phonetic Plain"));//设置字体 //wcscpy(cf.szFaceName, _T("Tahoma"));//设置字体 lstrcpy(cf.szFaceName, szFaceName);//设置字体 if (size != 0) { cf.dwMask |= CFM_SIZE; cf.yHeight = size * 50; } else { cf.dwMask &= ~CFM_SIZE; } int r, g, b; r = c >> 16 & 0xff; g = c >> 8 & 0xff; b = c & 0xff; cf.dwMask |= CFM_COLOR; cf.crTextColor = RGB(r, g, b); //设置颜色 SendMessage(hWndREdit, EM_SETCHARFORMAT, SCF_SELECTION, (LPARAM)&cf); return 0; } int RPRINT(const TCHAR* fmt, ...) 
{ va_list argptr; int cnt; int iEditTextLength; HWND hWnd = hWndRichEdit; if (NULL == hWnd) return 0; va_start(argptr, fmt); cnt = _vstprintf(buffer, fmt, argptr); // A or W but ISO C //cnt = vswprintf(buffer, fmt, argptr); // only W //cnt = wvsprintf(buffer, fmt, argptr); // not %f va_end(argptr); iEditTextLength = GetWindowTextLength(hWnd); if (iEditTextLength + cnt > 30000) // edit text max length is 30000 { SendMessage(hWnd, EM_SETSEL, 0, 10000); SendMessage(hWnd, WM_CLEAR, 0, 0); iEditTextLength = iEditTextLength - 10000; } SendMessage(hWnd, EM_SETSEL, iEditTextLength, iEditTextLength); SendMessage(hWnd, EM_REPLACESEL, 0, (LPARAM)buffer); return(cnt); } bool IsUTF8Count(const void* pBuffer, long size, long& utf8num) { bool IsUTF8 = true; unsigned char* start = (unsigned char*)pBuffer; unsigned char* end = (unsigned char*)pBuffer + size; long utf8num2 = 0; long utf8num3 = 0; long utf8num4 = 0; utf8num = 0; while (start < end) { if (*start < 0x80) // (10000000): 值小于0x80的为ASCII字符 { start++; } else if (*start < (0xC0)) // (11000000): 值介于0x80与0xC0之间的为无效UTF-8字符 { IsUTF8 = false; //第一个不可能小于 1100 0000且大于 80 break; } else if (*start < (0xE0)) // (11100000): 此范围内为2字节UTF-8字符 { utf8num2++; if (start >= end - 1) { break; } if ((start[1] & (0xC0)) != 0x80) { IsUTF8 = false; break; } start += 2; } else if (*start < (0xF0)) // (11110000): 此范围内为3字节UTF-8字符 { utf8num3++; if (start >= end - 2) { break; } if ((start[1] & (0xC0)) != 0x80 || (start[2] & (0xC0)) != 0x80) { IsUTF8 = false; break; } start += 3; } else if (*start < (0xF8)) // (11111000): 此范围内为4字节UTF-8字符 { utf8num4++; if (start >= end - 3) { break; } if ((start[1] & (0xC0)) != 0x80 || (start[2] & (0xC0)) != 0x80 || (start[3] & (0xC0)) != 0x80) { IsUTF8 = false; break; } start += 4; //多一个判断可以+4,也不会慢 } else { IsUTF8 = false; break; } } //false 100% 不是UTF8 utf8num = utf8num2 + utf8num3 + utf8num4; return IsUTF8; //ture 说明符合UTF8规律, utf8num / size; } int GetFileType(const TCHAR* PathFileName) { long lSize; long lutf8num = 0; 
unsigned char Head[3]; FILE* fp = NULL; fp = _tfopen(PathFileName, _T("rb")); if (NULL == fp) { return GFT_NULL; // 文件不存在 } fseek(fp, 0, SEEK_END); lSize = ftell(fp); if (lSize < 3) { fclose(fp); return GFT_ANSI; // 1 ANSI 文件,没有BOM头 } fseek(fp, 0, SEEK_SET); fread(Head, 3, 1, fp); if ((Head[0] == 0xff) && (Head[1] == 0xfe)) { return GFT_UTF16LE; } if ((Head[0] == 0xfe) && (Head[1] == 0xff)) { return GFT_UTF16BE;; } if ((Head[0] == 0xef) && (Head[1] == 0xbb) && (Head[2] == 0xbf)) { return GFT_UTF8BOM; } //没有编码头的情况 if (lSize > GFT_CKLENMAX) lSize = GFT_CKLENMAX; fseek(fp, 0, SEEK_SET); //或rewind(f); char* pBuff = new char[lSize + 1]; memset(pBuff, 0, lSize + 1); fread(pBuff, lSize, 1, fp); fclose(fp); //bool bIsUTF8 = IsUTF8Text(pBuff, lSize); bool bIsUTF8 = IsUTF8Count(pBuff, lSize, lutf8num); delete pBuff; pBuff = NULL; if (!bIsUTF8) { return GFT_ANSI; //明确不是UTF8,UTF8解码会出错 } //if (lutf8num == 0) return GFT_UTF8_0; // 没有UTF8字符的UTF8 //return GFT_UTF8_N; // 有UTF8字符的UTF8 return GFT_UTF8; //其实程序不必管有有没有UTF8长字符 } wchar_t* CharToWchar(const char* c, size_t m_encode) { if (!c) return nullptr; int len = MultiByteToWideChar(m_encode, 0, c, strlen(c), NULL, 0); wchar_t* m_wchar = new wchar_t[len + 1]; MultiByteToWideChar(m_encode, 0, c, strlen(c), m_wchar, len); m_wchar[len] = '\0'; return m_wchar; } char* WcharToChar(const wchar_t* wp, size_t m_encode) { if (!wp) return nullptr; int len = WideCharToMultiByte(m_encode, 0, wp, wcslen(wp), NULL, 0, NULL, NULL); char* m_char = new char[len + 1]; WideCharToMultiByte(m_encode, 0, wp, wcslen(wp), m_char, len, NULL, NULL); m_char[len] = '\0'; return m_char; }