使用mshtml解析html

测试用例

 

测试用例
<html>
<head>
<title>
    Just a Test
</title>
</head>
<body>
gaofeng hello!!
<div>
<table bgcolor="red">

<tr>
<td bgcolor="yellow" border="2">Name</td>
<td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td>
</tr>
<tr>
<td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td>
<td></td>
</tr>
<tr>
<td>address</td>
<td></td>
</tr>
</table>
</div>
</body>
</html>

 

 

头文件

 

#include <comdef.h>
#include <mshtml.h>

#include <cstdio>
#include <cwchar>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#import <mshtml.tlb> no_auto_exclude

 

 

测试代码

 

代码
// TestMSHTML.cpp : 定义控制台应用程序的入口点。
//

#include 
"stdafx.h"
#include 
"TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#endif


// The one and only application object

CWinApp theApp;
// Output file written by PrintTabs and VisitNode; opened and closed in _tmain.
FILE 
* fout;
using namespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");

// Key type of the map parameter taken by FindAllElementHavingBg.
typedef  
int BorderAttribute;
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
    IHTMLElement 
* pBody;
    pNewDoc
->get_body(&pBody);
    pBody
->Release();
}

void PrintTabs(int n)
{
    
for (int i = 0;i<n;i++)
    {
        
//cout << '\t';
        fwprintf(fout,_T("\t"));
    }
}

void VisitNode(IHTMLElement* pElement,int level)
{
    BSTR strName,strId,strTag;
    PrintTabs(level);
    pElement
->get_className(&strName);
    pElement
->get_id(&strId);
    pElement
->get_tagName(&strTag);
    
if (strTag!=NULL)
    {
        fwprintf(fout,_T(
"TagName:%s "),strTag);
    }
    
if (strName!=NULL)
    {
        fwprintf(fout,_T(
"className:%s "),strName);
    }
    
if (strId != NULL)
    {
        fwprintf(fout,_T(
"Id:%s "),strId);
    }
    SysFreeString(strName);
    SysFreeString(strId);
    SysFreeString(strTag);
    BSTR strAttrName1 
= _T("border");
    BSTR strAttrName2 
= _T("bgcolor");
    VARIANT val;

    pElement
->getAttribute(strAttrName1,2,&val);
    
if (val.vt != VT_NULL)
    {
        
if (val.bstrVal != NULL)
        {
            fwprintf(fout,_T(
"border:%s "),val.bstrVal);
        }
    }


    pElement
->getAttribute(strAttrName2,2,&val);
    
if (val.vt != VT_NULL)
    {
        
if (val.bstrVal != NULL)
        {
            fwprintf(fout,_T(
"bgcolor:%s "),val.bstrVal);
        }
    }

    
    fwprintf(fout,_T(
"\n"));
}
//将DOM树打印出来
void Run(IHTMLElement * pElement,int level)
{
    IHTMLElementCollection 
* children;

    VisitNode(pElement,level);


    IDispatch
* pDisp;
    pElement
->get_children(&pDisp);
    pDisp
->QueryInterface(IID_IHTMLElementCollection,(void**)&children);
    pDisp
->Release();

    
long len;
    children
->get_length(&len);
    VARIANT dummy;
    dummy.vt 
= VT_I4;
    
for (int i = 0;i < len;i++)
    {
        IHTMLElement
* child;
        dummy.intVal 
= i;
        children
->item(dummy,dummy,(IDispatch**)&pDisp);
        pDisp
->QueryInterface(IID_IHTMLElement,(void**)&child);
        pDisp
->Release();
        Run(child,level 
+ 1);
        child
->Release();
    }
    children
->Release();
}
void TestParse(IHTMLDocument2 * pNewDoc)
{
    BSTR strText;
    IHTMLElement 
*pBody;
    pNewDoc
->get_body(&pBody);
    pBody
->get_innerText(&strText);
    wprintf(_T(
"%s\n"),strText);
    SysFreeString(strText);
    

    pNewDoc
->get_title(&strText);
    wprintf(_T(
"%s\n"),strText);
    SysFreeString(strText);
    
    cout 
<< "Run begin...."<<endl;
    Run(pBody,
0);
    cout 
<< "Run end...."<<endl;

    pBody
->Release();

    
//FindAllElementHavingBg(pNewDoc);

}
void TestMSHTML(wchar_t * wcontent)
{
    IHTMLDocument2 
*pDoc = NULL;
    CoInitialize(NULL);
    CoCreateInstance(CLSID_HTMLDocument, 
                     NULL, 
                     CLSCTX_INPROC_SERVER, 
                     IID_IHTMLDocument2, 
                    (LPVOID 
*&pDoc);

    
if (pDoc)
    {
        IPersistStreamInit 
*pPersist = NULL;
        pDoc
->QueryInterface(IID_IPersistStreamInit, 
                             (LPVOID 
*&pPersist);
        
if (pPersist)
        {
            IMarkupServices 
*pMS = NULL;
            pPersist
->InitNew();
            pPersist
->Release();
            pDoc
->QueryInterface(IID_IMarkupServices, 
                                (LPVOID 
*&pMS);

            
if (pMS)
            {
                IMarkupContainer 
*pMC = NULL;
                IMarkupPointer 
*pMkStart = NULL;
                IMarkupPointer 
*pMkFinish = NULL;
                pMS
->CreateMarkupPointer(&pMkStart);
                pMS
->CreateMarkupPointer(&pMkFinish);
                pMS
->ParseString(wcontent,
                    
0
                    
&pMC, 
                    pMkStart, 
                    pMkFinish);

                
if (pMC)
                {
                    IHTMLDocument2 
*pNewDoc = NULL;

                    pMC
->QueryInterface(IID_IHTMLDocument, 
                        (LPVOID 
*&pNewDoc);

                    
if (pNewDoc)
                    {
                        
// do anything with pNewDoc, in this case 
                        
// get the body innerText.
                        TestParse(pNewDoc);
    
                        pNewDoc
->Release();
                    }

                    pMC
->Release();
                }

                
if (pMkStart)
                    pMkStart
->Release();

                
if (pMkFinish)
                    pMkFinish
->Release();

                pMS
->Release();
            }
        }

        pDoc
->Release();
    }

    CoUninitialize();

}

inline wchar_t
* AnsiToUnicode( const char* szStr )
{
    
int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    
if (nLen == 0)
    {
        
return NULL;
    }
    wchar_t
* pResult = new wchar_t[nLen+1];
    MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, 
-1, pResult, nLen );
    pResult[nLen] 
= L'\0';
    
return pResult;
}

//调用者负责delete wcontent
wchar_t * ReadFromHtmlFile(string str,string & content)
{
    ifstream fin(str.c_str());
    
string line;
    
while(getline(fin,line))
    {
    
//    cout << line << endl;
        content = content + line;
    }
    
//cout << content << endl;
    
//cout << content.size() << endl;
    
//printf("original html code\n%s\n",content.c_str());
    wchar_t * wcontent = AnsiToUnicode(content.c_str()); 
    
//wprintf(L"after transferred\n%s\n",wcontent);
    
//delete[] wcontent;
    fin.close();
    fin.clear();
    
return wcontent;
}

int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
    
int nRetCode = 0;

    
// 初始化 MFC 并在失败时显示错误
    if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
    {
        
// TODO: 更改错误代码以符合您的需要
        _tprintf(_T("错误: MFC 初始化失败\n"));
        nRetCode 
= 1;
    }
    
else
    {
        fout 
= fopen("out.txt","w");
        
string str = "test.html";
        
string content;
        wchar_t 
* wcontent = ReadFromHtmlFile(str,content);
        
int len = wcslen(wcontent);
        
//cout << len << endl;
        
        TestMSHTML(wcontent);
        delete[] wcontent;
        fclose(fout);
    }
    
    
return nRetCode;
}
输出结果
TagName:BODY
 TagName:DIV
  TagName:TABLE bgcolor:#ff0000
   TagName:TBODY
    TagName:TR
     TagName:TD border:2 bgcolor:#ffff00
     TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
    TagName:TR
     TagName:TD
      TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
     TagName:TD
    TagName:TR
     TagName:TD
     TagName:TD

 

 

posted on 2010-08-11 16:42  speedmancs  阅读(3201)  评论(0)  编辑  收藏  举报

导航