VC之网页获取(能区分GB2312与UTF8)
测试版,使用时还需要优化个别地方
代码
// Guesses which code page a downloaded page is encoded in, so the bytes can be
// handed to MultiByteToWideChar.
//
// Parameters:
//   p      - raw bytes of the page.
//   nLen   - number of valid bytes in p.
//   theUrl - URL the bytes were fetched from; used only in the debug trace.
//
// Returns CP_UTF8 or CP_ACP. The result is never negative: when there is too
// little data to judge (fewer than 8 bytes, or no buffer) CP_ACP is returned.
//
// Heuristic: a byte-order mark decides immediately. Otherwise bytes 4 .. nLen-5
// are scanned for 3- and 4-byte UTF-8 lead bytes. The first lead byte that is
// not followed by the required continuation bytes ends the scan with CP_ACP.
// 2-byte UTF-8 sequences are not examined.
int FindCodePage(PBYTE p,int nLen,CString theUrl)
{
    // Not enough data to inspect: fall back to the ANSI code page instead of
    // returning an invalid code page identifier.
    if (p == NULL || nLen < 8)
        return CP_ACP;

    // FF FE is the UTF-16 little-endian byte-order mark.
    // NOTE(review): the result is still CP_UTF8 here, as in the original code;
    // a UTF-16 page will not decode correctly through MultiByteToWideChar with
    // that code page - confirm how callers should treat UTF-16 input.
    if (p[0] == 0xFF && p[1] == 0xFE && p[2] != 0xFF)
        return CP_UTF8;

    // EF BB BF is the UTF-8 byte-order mark.
    if (p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF)
        return CP_UTF8;

    int nResult = CP_ACP;       // default when no multi-byte lead byte is found
    bool bSawLeadByte = false;  // true once any 3/4-byte lead byte was inspected

    // The upper bound keeps p[i + 3] inside the buffer.
    for (int i = 4; i < nLen - 4; i++)
    {
        const BYTE lead = p[i];
        int nTrail = 0;
        if ((lead & 0xF8) == 0xF0)          // 11110xxx: lead byte of a 4-byte sequence
            nTrail = 3;
        else if ((lead & 0xF0) == 0xE0)     // 1110xxxx: lead byte of a 3-byte sequence
            nTrail = 2;
        else
            continue;

        bSawLeadByte = true;

        // Every continuation byte must look like 10xxxxxx.
        bool bValid = true;
        for (int k = 1; k <= nTrail; k++)
        {
            if ((p[i + k] & 0xC0) != 0x80)
            {
                bValid = false;
                break;
            }
        }

        if (!bValid)
        {
            // A broken sequence means the text is not UTF-8; stop scanning.
            nResult = CP_ACP;
            break;
        }

        nResult = CP_UTF8;
        i += nTrail;    // skip the continuation bytes that were just validated
    }

    // The URL is passed as an argument, never as part of the format string:
    // URLs routinely contain '%' (e.g. "%20"), which would otherwise be parsed
    // as format specifiers.
    if (bSawLeadByte)
        TRACE(_T("%s PageCode = %d \n"), static_cast<LPCTSTR>(theUrl), nResult);

    return nResult;
}
// Downloads the resource at theUrl and returns its content as a CString.
//
// The raw bytes are read into a fixed 2 MB buffer, their code page is guessed
// with FindCodePage, and the bytes are converted with MultiByteToWideChar.
//
// Parameters:
//   theUrl - URL to fetch.
//
// Returns the decoded text, or an empty string when the URL cannot be opened,
// nothing is read, or the conversion fails. Content beyond 2 MB is truncated.
// Exceptions raised while reading or converting propagate to the caller after
// the local buffers and the file object have been released.
CString GetSourceHtml(CString theUrl)
{
    CString retVal;
    CInternetSession session;
    CStdioFile* file = NULL;    // OpenURL returns CStdioFile*; no downcast is needed
    try
    {
        // Try to connect to the given URL.
        file = session.OpenURL(theUrl);
    }
    catch (CInternetException* pException)
    {
        // Opening failed: report it as an empty result.
        pException->Delete();
        return retVal;
    }
    if (file == NULL)
        return retVal;

    const DWORD dwBufLen = 2097152;   // 2 MB download cap
    const DWORD dwChunk = 4096;       // bytes requested per Read call
    BYTE* pBuf = NULL;
    wchar_t* wszString = NULL;
    try
    {
        pBuf = new BYTE[dwBufLen];

        // Read until the server has nothing more to send or the buffer is full.
        // Each request is clamped to the space left, so the buffer is never overrun.
        DWORD dwTotal = 0;
        while (dwTotal < dwBufLen)
        {
            DWORD dwWant = dwBufLen - dwTotal;
            if (dwWant > dwChunk)
                dwWant = dwChunk;
            const UINT nRead = file->Read(pBuf + dwTotal, dwWant);
            if (nRead < 1)
                break;
            dwTotal += nRead;
        }

        if (dwTotal > 0)
        {
            const int cbInput = static_cast<int>(dwTotal);   // at most 2 MB, fits in int
            int nCodePage = FindCodePage(pBuf, cbInput, theUrl);
            if (nCodePage < 0)
                nCodePage = CP_ACP;   // never pass an invalid code page to the converter

            // First pass: ask how many wide characters the text needs.
            const int wcsLen = ::MultiByteToWideChar(nCodePage, 0,
                reinterpret_cast<LPCSTR>(pBuf), cbInput, NULL, 0);
            if (wcsLen > 0)
            {
                // One extra element for the terminator; MultiByteToWideChar does not
                // add one when an explicit input length is given.
                wszString = new wchar_t[wcsLen + 1];
                memset(wszString, 0, sizeof(wchar_t) * (wcsLen + 1));

                // Second pass: the actual conversion.
                if (::MultiByteToWideChar(nCodePage, 0,
                        reinterpret_cast<LPCSTR>(pBuf), cbInput, wszString, wcsLen) > 0)
                {
                    retVal = CString(wszString);
                }
            }
        }
    }
    catch (...)
    {
        // Release everything acquired so far, then let the caller see the error.
        delete[] wszString;
        delete[] pBuf;
        file->Close();
        delete file;
        throw;
    }

    delete[] wszString;
    delete[] pBuf;      // allocated with new[], so it must be released with delete[]
    file->Close();
    delete file;
    return retVal;
}
// NOTE(review): this brace-delimited block has no function signature in front of it.
// Its statements match the body of FindCodePage defined earlier in this file, so it
// looks like a duplicated paste - confirm, then remove it or restore the missing header.
// The statements use p, nLen and theUrl, none of which are declared inside the block.
{
int nResult = -1;
UINT u[4];
UINT uUTF8Count = 0;
UINT uACPCount = 0;
nResult = -1;
// Fewer than 8 bytes: give up and return -1.
if(nLen < 8)
return nResult;
// Bytes FF FE at the start (third byte not FF): result is CP_UTF8.
if (p[0] == 0xFF && p[1] == 0xFE && p[2] != 0xFF)//Unicode
{
nResult = CP_UTF8;
}
// Bytes EF BB BF at the start: result is CP_UTF8.
else if (p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF)//UTF8
{
nResult = CP_UTF8;
}
else
{
// Scan from offset 4 up to (but not including) nLen-4, looking at four bytes at a time.
for(DWORD i=4;i<nLen-4;i++)
{
u[0] = p[i];
u[1] = p[i+1];
u[2] = p[i+2];
u[3] = p[i+3];
// 248 = 0xF8, 240 = 0xF0: first byte has the form 11110xxx.
if((u[0]&248) ==240) //& B11111000 must be:B11110XXX
{
// 192 = 0xC0, 128 = 0x80: the next three bytes must all have the form 10xxxxxx.
if((u[1]&192) == 128
&&(u[2]&192) == 128
&&(u[3]&192) == 128)
{
nResult = CP_UTF8;
uUTF8Count++;
// Skip the three following bytes that were just checked.
i +=3;
//break;
}
else
{
// Pattern not matched: settle on CP_ACP and stop scanning.
nResult = CP_ACP;
i ++;
uACPCount++;
break;
}
}
// 224 = 0xE0: first byte has the form 1110xxxx.
else if((u[0]&240) ==224) //& B11110000 must be:B1110XXXX
{
//if((p[i+1] & 192 ==128)
//&&(p[i+2] & 192 ==128))
// The next two bytes must both have the form 10xxxxxx.
if((u[1]&192) == 128
&&(u[2]&192) == 128)
{
nResult = CP_UTF8;
uUTF8Count++;
// Skip the two following bytes that were just checked.
i +=2;
//break;
}
else// if(u[0]>=128 && u[1] >=128)
{
// Pattern not matched: settle on CP_ACP and stop scanning.
nResult = CP_ACP;
i ++;
uACPCount++;
break;
}
}
//else if((u[0]&224) ==192) //& B11100000 must be:B110XXXXX
//{
// if((u[1]&192) == 128)
// {
// nResult = CP_UTF8;
// break;
// }
//}
/*else if(p[i]>160)
{
if((p[i+1]>160))
{
nResult = CP_ACP;
break;
}
}*/
}
}
// Nothing decided during the scan: default to CP_ACP.
if(nResult<0)
nResult = CP_ACP;
// Trace only when the scan inspected at least one candidate byte.
// NOTE(review): theUrl becomes part of the format string here, so a '%' inside the
// URL would be interpreted as a format specifier.
if(uUTF8Count+uACPCount>0)
TRACE(theUrl+CString(" PageCode = %d \n"),nResult);
return nResult;
}
// Downloads the resource at theUrl and returns its content as a CString.
//
// The raw bytes are read into a fixed 2 MB buffer, their code page is guessed
// with FindCodePage, and the bytes are converted with MultiByteToWideChar.
//
// Parameters:
//   theUrl - URL to fetch.
//
// Returns the decoded text, or an empty string when the URL cannot be opened,
// nothing is read, or the conversion fails. Content beyond 2 MB is truncated.
// Exceptions raised while reading or converting propagate to the caller after
// the local buffers and the file object have been released.
CString GetSourceHtml(CString theUrl)
{
    CString retVal;
    CInternetSession session;
    CStdioFile* file = NULL;    // OpenURL returns CStdioFile*; no downcast is needed
    try
    {
        // Try to connect to the given URL.
        file = session.OpenURL(theUrl);
    }
    catch (CInternetException* pException)
    {
        // Opening failed: report it as an empty result.
        pException->Delete();
        return retVal;
    }
    if (file == NULL)
        return retVal;

    const DWORD dwBufLen = 2097152;   // 2 MB download cap
    const DWORD dwChunk = 4096;       // bytes requested per Read call
    BYTE* pBuf = NULL;
    wchar_t* wszString = NULL;
    try
    {
        pBuf = new BYTE[dwBufLen];

        // Read until the server has nothing more to send or the buffer is full.
        // Each request is clamped to the space left, so the buffer is never overrun.
        DWORD dwTotal = 0;
        while (dwTotal < dwBufLen)
        {
            DWORD dwWant = dwBufLen - dwTotal;
            if (dwWant > dwChunk)
                dwWant = dwChunk;
            const UINT nRead = file->Read(pBuf + dwTotal, dwWant);
            if (nRead < 1)
                break;
            dwTotal += nRead;
        }

        if (dwTotal > 0)
        {
            const int cbInput = static_cast<int>(dwTotal);   // at most 2 MB, fits in int
            int nCodePage = FindCodePage(pBuf, cbInput, theUrl);
            if (nCodePage < 0)
                nCodePage = CP_ACP;   // never pass an invalid code page to the converter

            // First pass: ask how many wide characters the text needs.
            const int wcsLen = ::MultiByteToWideChar(nCodePage, 0,
                reinterpret_cast<LPCSTR>(pBuf), cbInput, NULL, 0);
            if (wcsLen > 0)
            {
                // One extra element for the terminator; MultiByteToWideChar does not
                // add one when an explicit input length is given.
                wszString = new wchar_t[wcsLen + 1];
                memset(wszString, 0, sizeof(wchar_t) * (wcsLen + 1));

                // Second pass: the actual conversion.
                if (::MultiByteToWideChar(nCodePage, 0,
                        reinterpret_cast<LPCSTR>(pBuf), cbInput, wszString, wcsLen) > 0)
                {
                    retVal = CString(wszString);
                }
            }
        }
    }
    catch (...)
    {
        // Release everything acquired so far, then let the caller see the error.
        delete[] wszString;
        delete[] pBuf;
        file->Close();
        delete file;
        throw;
    }

    delete[] wszString;
    delete[] pBuf;      // allocated with new[], so it must be released with delete[]
    file->Close();
    delete file;
    return retVal;
}