VC之网页获取（能区分GB2312与UTF8）

测试版，使用时还需要优化个别地方

代码

// Guesses the code page of a raw byte buffer (e.g. a downloaded web page).
//
// Parameters:
//   p      - pointer to the raw bytes; may be NULL (treated as empty input).
//   nLen   - number of valid bytes at p.
//   theUrl - used only to label the debug TRACE output.
//
// Returns CP_UTF8 or CP_ACP; never a negative value.
//
// Heuristic:
//   1. A leading byte-order mark decides immediately.
//   2. Otherwise the buffer is scanned for UTF-8 lead bytes of 3-byte
//      (1110xxxx) and 4-byte (11110xxx) sequences. The first lead byte that
//      is NOT followed by the required 10xxxxxx continuation bytes ends the
//      scan with CP_ACP. If only well-formed sequences were seen the result
//      is CP_UTF8. Two-byte sequences (110xxxxx) are not examined.
//   3. If nothing conclusive is found the result is CP_ACP.
int FindCodePage(PBYTE p,int nLen,CString theUrl)
{
    // Too short to even hold a BOM: fall back to the ANSI code page instead
    // of returning an invalid (negative) code page to the caller.
    if (p == NULL || nLen < 3)
        return CP_ACP;

    // FF FE is the UTF-16 little-endian BOM.
    // NOTE(review): this case is still reported as CP_UTF8, exactly as
    // before; decoding such data correctly needs a caller that understands
    // UTF-16, which is outside this function.
    if (p[0] == 0xFF && p[1] == 0xFE && p[2] != 0xFF)
        return CP_UTF8;

    // EF BB BF is the UTF-8 BOM.
    if (p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF)
        return CP_UTF8;

    int  nResult    = CP_ACP;
    UINT uUTF8Count = 0;   // well-formed multi-byte UTF-8 sequences seen
    UINT uACPCount  = 0;   // malformed sequences seen (at most one: scan stops)

    // The scan deliberately skips the first and last four bytes, as the
    // original implementation did. The upper bound guarantees that
    // p[i + 3] is always inside the buffer. For nLen < 9 the loop body
    // never runs.
    for (int i = 4; i < nLen - 4; i++)
    {
        const BYTE lead = p[i];

        int nTrail;                       // continuation bytes required
        if ((lead & 0xF8) == 0xF0)        // 11110xxx: 4-byte sequence
            nTrail = 3;
        else if ((lead & 0xF0) == 0xE0)   // 1110xxxx: 3-byte sequence
            nTrail = 2;
        else
            continue;                     // not a lead byte we inspect

        bool bWellFormed = true;
        for (int k = 1; k <= nTrail; k++)
        {
            if ((p[i + k] & 0xC0) != 0x80)   // must be 10xxxxxx
            {
                bWellFormed = false;
                break;
            }
        }

        if (!bWellFormed)
        {
            // One malformed sequence is taken as proof of a non-UTF-8 page.
            nResult = CP_ACP;
            uACPCount++;
            break;
        }

        nResult = CP_UTF8;
        uUTF8Count++;
        i += nTrail;   // skip the continuation bytes just validated
    }

    // The URL is passed as an argument, never as part of the format string:
    // URLs routinely contain '%' (e.g. "%20"), which would otherwise be
    // interpreted as format specifiers.
    if (uUTF8Count + uACPCount > 0)
        TRACE(_T("%s  PageCode = %d  \n"), (LPCTSTR)theUrl, nResult);

    return nResult;
}
// Downloads the resource at theUrl and returns its content as a CString.
//
// The raw bytes are read completely, their code page is guessed with
// FindCodePage(), and the bytes are then converted to wide characters with
// MultiByteToWideChar().
//
// Returns an empty string when the URL cannot be opened, when nothing was
// read, or when the conversion fails. Exceptions raised while reading or
// allocating are rethrown to the caller after all local resources have been
// released.
CString GetSourceHtml(CString theUrl)
{
    CString retVal;
    CInternetSession session;
    CInternetFile* file = NULL;

    try
    {
        // Try to open the given URL.
        file = static_cast<CInternetFile*>(session.OpenURL(theUrl));
    }
    catch (CInternetException* pException)
    {
        // Opening failed: release the exception object and report "no content".
        pException->Delete();
        return retVal;
    }

    if (file == NULL)
        return retVal;

    const UINT  uChunkSize    = 4096;         // bytes requested per Read()
    const DWORD dwMaxCapacity = 0x40000000;   // 1 GiB cap keeps lengths within int
    DWORD dwCapacity = 2097152;               // initial buffer size: 2 MB
    DWORD dwPos      = 0;                     // bytes received so far
    BYTE*    pBuf      = NULL;
    wchar_t* wszString = NULL;

    try
    {
        pBuf = new BYTE[dwCapacity];

        // Read until the source reports no more data.
        for (;;)
        {
            // Ensure a whole chunk fits before reading; the buffer doubles
            // on demand so large pages can no longer overrun it.
            if (dwCapacity - dwPos < uChunkSize)
            {
                if (dwCapacity >= dwMaxCapacity)
                    break;   // refuse to grow further; keep what we have

                const DWORD dwNewCapacity = dwCapacity * 2;
                BYTE* pGrown = new BYTE[dwNewCapacity];
                memcpy(pGrown, pBuf, dwPos);
                delete[] pBuf;
                pBuf = pGrown;
                dwCapacity = dwNewCapacity;
            }

            const UINT uRead = file->Read(pBuf + dwPos, uChunkSize);
            if (uRead < 1)
                break;
            dwPos += uRead;
        }

        if (dwPos > 0)
        {
            const int nBytes = static_cast<int>(dwPos);

            int nCodePage = FindCodePage(pBuf, nBytes, theUrl);
            if (nCodePage < 0)
                nCodePage = CP_ACP;   // never hand an invalid code page to the API

            // First pass: ask how many wide characters the result needs.
            const int wcsLen = ::MultiByteToWideChar(
                nCodePage, 0, reinterpret_cast<LPCSTR>(pBuf), nBytes, NULL, 0);

            if (wcsLen > 0)
            {
                // MultiByteToWideChar does not append a terminator when an
                // explicit input length is given, so reserve one extra slot
                // and zero the whole buffer up front.
                wszString = new wchar_t[wcsLen + 1];
                memset(wszString, 0, sizeof(wchar_t) * (wcsLen + 1));

                // Second pass: the actual conversion.
                ::MultiByteToWideChar(
                    nCodePage, 0, reinterpret_cast<LPCSTR>(pBuf), nBytes,
                    wszString, wcsLen);

                retVal = CString(wszString);
            }
        }

        file->Close();
    }
    catch (...)
    {
        // Release everything acquired so far, then let the caller see the
        // original exception unchanged.
        delete[] wszString;
        delete[] pBuf;
        delete file;
        throw;
    }

    delete[] wszString;
    delete[] pBuf;   // array form: the buffer was allocated with new[]
    delete file;

    return retVal;
}

posted @ 2009-11-29 16:08 吾非无心 阅读(617) 评论(0) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

吾非无心

——Thinkin……g

VC之网页获取（能区分GB2312与UTF8）

公告