使用mshtml解析html
测试用例
测试用例
<!-- Test fixture: a small page with a title, body text and one table whose cells and paragraph carry id, class, border and bgcolor attributes. -->
<html>
<head>
<title>
Just a Test
</title>
</head>
<body>
gaofeng hello!!
<div>
<table bgcolor="red">
<tr>
<td bgcolor="yellow" border="2">Name</td>
<td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td>
</tr>
<tr>
<td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td>
<td></td>
</tr>
<tr>
<td>address</td>
<td></td>
</tr>
</table>
</div>
</body>
</html>
<head>
<title>
Just a Test
</title>
</head>
<body>
gaofeng hello!!
<div>
<table bgcolor="red">
<tr>
<td bgcolor="yellow" border="2">Name</td>
<td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td>
</tr>
<tr>
<td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td>
<td></td>
</tr>
<tr>
<td>address</td>
<td></td>
</tr>
</table>
</div>
</body>
</html>
头文件
#include <iostream>
#include <comdef.h>
#include <mshtml.h>
#include <string>
#include <fstream>
#include <vector>
#include <map>
#import <mshtml.tlb> no_auto_exclude
#include <comdef.h>
#include <mshtml.h>
#include <string>
#include <fstream>
#include <vector>
#include <map>
#import <mshtml.tlb> no_auto_exclude
测试代码
代码
// TestMSHTML.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
// The one and only application object
CWinApp theApp;
// Output stream written by PrintTabs and VisitNode; opened and closed in _tmain.
FILE * fout;
using namespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");
// Key type of the map taken by FindAllElementHavingBg.
typedef int BorderAttribute;
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
IHTMLElement * pBody;
pNewDoc->get_body(&pBody);
pBody->Release();
}
// Writes n tab characters to the global output stream fout.
// Does nothing when fout has not been opened or n is not positive.
void PrintTabs(int n)
{
    if (fout == NULL)
    {
        return;
    }
    for (int i = 0; i < n; ++i)
    {
        // fwprintf always takes a wide format string, so use an L"" literal
        // rather than _T(""), which is narrow in non-UNICODE builds.
        fwprintf(fout, L"\t");
    }
}
void VisitNode(IHTMLElement* pElement,int level)
{
BSTR strName,strId,strTag;
PrintTabs(level);
pElement->get_className(&strName);
pElement->get_id(&strId);
pElement->get_tagName(&strTag);
if (strTag!=NULL)
{
fwprintf(fout,_T("TagName:%s "),strTag);
}
if (strName!=NULL)
{
fwprintf(fout,_T("className:%s "),strName);
}
if (strId != NULL)
{
fwprintf(fout,_T("Id:%s "),strId);
}
SysFreeString(strName);
SysFreeString(strId);
SysFreeString(strTag);
BSTR strAttrName1 = _T("border");
BSTR strAttrName2 = _T("bgcolor");
VARIANT val;
pElement->getAttribute(strAttrName1,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("border:%s "),val.bstrVal);
}
}
pElement->getAttribute(strAttrName2,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("bgcolor:%s "),val.bstrVal);
}
}
fwprintf(fout,_T("\n"));
}
// Prints the DOM subtree rooted at pElement: the element itself via
// VisitNode, then each child element recursively with level + 1.
//
// pElement  root of the subtree; nothing happens when NULL
// level     nesting depth of pElement, forwarded to VisitNode
void Run(IHTMLElement * pElement,int level)
{
    if (pElement == NULL)
    {
        return;
    }

    VisitNode(pElement, level);

    IDispatch * pChildrenDisp = NULL;
    if (FAILED(pElement->get_children(&pChildrenDisp)) || pChildrenDisp == NULL)
    {
        return;
    }

    IHTMLElementCollection * children = NULL;
    HRESULT hr = pChildrenDisp->QueryInterface(IID_IHTMLElementCollection,
                                               reinterpret_cast<void **>(&children));
    pChildrenDisp->Release();
    if (FAILED(hr) || children == NULL)
    {
        return;
    }

    long len = 0;
    if (SUCCEEDED(children->get_length(&len)))
    {
        for (long i = 0; i < len; ++i)
        {
            // The same integer variant is passed as both name and index,
            // which selects the i-th item of the collection.
            VARIANT index;
            VariantInit(&index);
            index.vt = VT_I4;
            index.lVal = i;

            IDispatch * pItemDisp = NULL;
            if (FAILED(children->item(index, index, &pItemDisp)) || pItemDisp == NULL)
            {
                continue;
            }

            IHTMLElement * child = NULL;
            hr = pItemDisp->QueryInterface(IID_IHTMLElement,
                                           reinterpret_cast<void **>(&child));
            pItemDisp->Release();
            if (SUCCEEDED(hr) && child != NULL)
            {
                Run(child, level + 1);
                child->Release();
            }
        }
    }
    children->Release();
}
// Prints the body text and the title of pNewDoc to stdout, then dumps the
// element tree below the body through Run.
//
// pNewDoc  parsed document; nothing happens when NULL. A document without
//          a body still gets its title printed, but no tree dump.
void TestParse(IHTMLDocument2 * pNewDoc)
{
    if (pNewDoc == NULL)
    {
        return;
    }

    IHTMLElement * pBody = NULL;
    if (FAILED(pNewDoc->get_body(&pBody)))
    {
        pBody = NULL;
    }

    BSTR strText = NULL;
    if (pBody != NULL && SUCCEEDED(pBody->get_innerText(&strText)))
    {
        // A NULL BSTR stands for the empty string.
        wprintf(L"%ls\n", strText != NULL ? strText : L"");
    }
    SysFreeString(strText);
    strText = NULL;

    if (SUCCEEDED(pNewDoc->get_title(&strText)))
    {
        wprintf(L"%ls\n", strText != NULL ? strText : L"");
    }
    SysFreeString(strText);

    if (pBody != NULL)
    {
        std::cout << "Run begin...." << std::endl;
        Run(pBody, 0);
        std::cout << "Run end...." << std::endl;
        pBody->Release();
    }
    //FindAllElementHavingBg(pNewDoc);
}
void TestMSHTML(wchar_t * wcontent)
{
IHTMLDocument2 *pDoc = NULL;
CoInitialize(NULL);
CoCreateInstance(CLSID_HTMLDocument,
NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2,
(LPVOID *) &pDoc);
if (pDoc)
{
IPersistStreamInit *pPersist = NULL;
pDoc->QueryInterface(IID_IPersistStreamInit,
(LPVOID *) &pPersist);
if (pPersist)
{
IMarkupServices *pMS = NULL;
pPersist->InitNew();
pPersist->Release();
pDoc->QueryInterface(IID_IMarkupServices,
(LPVOID *) &pMS);
if (pMS)
{
IMarkupContainer *pMC = NULL;
IMarkupPointer *pMkStart = NULL;
IMarkupPointer *pMkFinish = NULL;
pMS->CreateMarkupPointer(&pMkStart);
pMS->CreateMarkupPointer(&pMkFinish);
pMS->ParseString(wcontent,
0,
&pMC,
pMkStart,
pMkFinish);
if (pMC)
{
IHTMLDocument2 *pNewDoc = NULL;
pMC->QueryInterface(IID_IHTMLDocument,
(LPVOID *) &pNewDoc);
if (pNewDoc)
{
// do anything with pNewDoc, in this case
// get the body innerText.
TestParse(pNewDoc);
pNewDoc->Release();
}
pMC->Release();
}
if (pMkStart)
pMkStart->Release();
if (pMkFinish)
pMkFinish->Release();
pMS->Release();
}
}
pDoc->Release();
}
CoUninitialize();
}
// Converts a NUL-terminated string in the system ANSI code page (CP_ACP)
// to a newly allocated, NUL-terminated wide string.
//
// szStr    source string
// returns  buffer allocated with new[] that the caller must delete[], or
//          NULL when szStr is NULL or the conversion fails
inline wchar_t* AnsiToUnicode( const char* szStr )
{
    if (szStr == NULL)
    {
        return NULL;
    }

    // With a source length of -1 the returned count includes the terminator.
    const int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    if (nLen <= 0)
    {
        return NULL;
    }

    wchar_t* pResult = new wchar_t[nLen];
    if (MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, pResult, nLen ) != nLen)
    {
        // Never hand back a partially written buffer.
        delete[] pResult;
        return NULL;
    }
    pResult[nLen - 1] = L'\0';
    return pResult;
}
//调用者负责delete wcontent
wchar_t * ReadFromHtmlFile(string str,string & content)
{
ifstream fin(str.c_str());
string line;
while(getline(fin,line))
{
// cout << line << endl;
content = content + line;
}
//cout << content << endl;
//cout << content.size() << endl;
//printf("original html code\n%s\n",content.c_str());
wchar_t * wcontent = AnsiToUnicode(content.c_str());
//wprintf(L"after transferred\n%s\n",wcontent);
//delete[] wcontent;
fin.close();
fin.clear();
return wcontent;
}
// Program entry point: initialises MFC, reads test.html, parses it with
// MSHTML and writes the element tree to out.txt.
//
// returns  0 on success; 1 when MFC initialisation fails, out.txt cannot be
//          opened, or the input could not be converted
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
    int nRetCode = 0;
    // Initialise MFC and report an error on failure
    if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
    {
        // TODO: change the error code to suit your needs
        _tprintf(_T("错误: MFC 初始化失败\n"));
        nRetCode = 1;
    }
    else
    {
        fout = fopen("out.txt","w");
        if (fout == NULL)
        {
            fprintf(stderr, "Error: cannot open out.txt for writing\n");
            nRetCode = 1;
        }
        else
        {
            std::string str = "test.html";
            std::string content;
            wchar_t * wcontent = ReadFromHtmlFile(str,content);
            if (wcontent != NULL)
            {
                TestMSHTML(wcontent);
                delete[] wcontent;
            }
            else
            {
                fprintf(stderr, "Error: cannot convert the contents of test.html\n");
                nRetCode = 1;
            }
            fclose(fout);
            fout = NULL;
        }
    }
    return nRetCode;
}
//
#include "stdafx.h"
#include "TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
// The one and only application object
CWinApp theApp;
// Output stream written by PrintTabs and VisitNode; opened and closed in _tmain.
FILE * fout;
using namespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");
// Key type of the map taken by FindAllElementHavingBg.
typedef int BorderAttribute;
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
IHTMLElement * pBody;
pNewDoc->get_body(&pBody);
pBody->Release();
}
// Writes n tab characters to the global output stream fout.
// Does nothing when fout has not been opened or n is not positive.
void PrintTabs(int n)
{
    if (fout == NULL)
    {
        return;
    }
    for (int i = 0; i < n; ++i)
    {
        // fwprintf always takes a wide format string, so use an L"" literal
        // rather than _T(""), which is narrow in non-UNICODE builds.
        fwprintf(fout, L"\t");
    }
}
void VisitNode(IHTMLElement* pElement,int level)
{
BSTR strName,strId,strTag;
PrintTabs(level);
pElement->get_className(&strName);
pElement->get_id(&strId);
pElement->get_tagName(&strTag);
if (strTag!=NULL)
{
fwprintf(fout,_T("TagName:%s "),strTag);
}
if (strName!=NULL)
{
fwprintf(fout,_T("className:%s "),strName);
}
if (strId != NULL)
{
fwprintf(fout,_T("Id:%s "),strId);
}
SysFreeString(strName);
SysFreeString(strId);
SysFreeString(strTag);
BSTR strAttrName1 = _T("border");
BSTR strAttrName2 = _T("bgcolor");
VARIANT val;
pElement->getAttribute(strAttrName1,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("border:%s "),val.bstrVal);
}
}
pElement->getAttribute(strAttrName2,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("bgcolor:%s "),val.bstrVal);
}
}
fwprintf(fout,_T("\n"));
}
// Prints the DOM subtree rooted at pElement: the element itself via
// VisitNode, then each child element recursively with level + 1.
//
// pElement  root of the subtree; nothing happens when NULL
// level     nesting depth of pElement, forwarded to VisitNode
void Run(IHTMLElement * pElement,int level)
{
    if (pElement == NULL)
    {
        return;
    }

    VisitNode(pElement, level);

    IDispatch * pChildrenDisp = NULL;
    if (FAILED(pElement->get_children(&pChildrenDisp)) || pChildrenDisp == NULL)
    {
        return;
    }

    IHTMLElementCollection * children = NULL;
    HRESULT hr = pChildrenDisp->QueryInterface(IID_IHTMLElementCollection,
                                               reinterpret_cast<void **>(&children));
    pChildrenDisp->Release();
    if (FAILED(hr) || children == NULL)
    {
        return;
    }

    long len = 0;
    if (SUCCEEDED(children->get_length(&len)))
    {
        for (long i = 0; i < len; ++i)
        {
            // The same integer variant is passed as both name and index,
            // which selects the i-th item of the collection.
            VARIANT index;
            VariantInit(&index);
            index.vt = VT_I4;
            index.lVal = i;

            IDispatch * pItemDisp = NULL;
            if (FAILED(children->item(index, index, &pItemDisp)) || pItemDisp == NULL)
            {
                continue;
            }

            IHTMLElement * child = NULL;
            hr = pItemDisp->QueryInterface(IID_IHTMLElement,
                                           reinterpret_cast<void **>(&child));
            pItemDisp->Release();
            if (SUCCEEDED(hr) && child != NULL)
            {
                Run(child, level + 1);
                child->Release();
            }
        }
    }
    children->Release();
}
// Prints the body text and the title of pNewDoc to stdout, then dumps the
// element tree below the body through Run.
//
// pNewDoc  parsed document; nothing happens when NULL. A document without
//          a body still gets its title printed, but no tree dump.
void TestParse(IHTMLDocument2 * pNewDoc)
{
    if (pNewDoc == NULL)
    {
        return;
    }

    IHTMLElement * pBody = NULL;
    if (FAILED(pNewDoc->get_body(&pBody)))
    {
        pBody = NULL;
    }

    BSTR strText = NULL;
    if (pBody != NULL && SUCCEEDED(pBody->get_innerText(&strText)))
    {
        // A NULL BSTR stands for the empty string.
        wprintf(L"%ls\n", strText != NULL ? strText : L"");
    }
    SysFreeString(strText);
    strText = NULL;

    if (SUCCEEDED(pNewDoc->get_title(&strText)))
    {
        wprintf(L"%ls\n", strText != NULL ? strText : L"");
    }
    SysFreeString(strText);

    if (pBody != NULL)
    {
        std::cout << "Run begin...." << std::endl;
        Run(pBody, 0);
        std::cout << "Run end...." << std::endl;
        pBody->Release();
    }
    //FindAllElementHavingBg(pNewDoc);
}
void TestMSHTML(wchar_t * wcontent)
{
IHTMLDocument2 *pDoc = NULL;
CoInitialize(NULL);
CoCreateInstance(CLSID_HTMLDocument,
NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2,
(LPVOID *) &pDoc);
if (pDoc)
{
IPersistStreamInit *pPersist = NULL;
pDoc->QueryInterface(IID_IPersistStreamInit,
(LPVOID *) &pPersist);
if (pPersist)
{
IMarkupServices *pMS = NULL;
pPersist->InitNew();
pPersist->Release();
pDoc->QueryInterface(IID_IMarkupServices,
(LPVOID *) &pMS);
if (pMS)
{
IMarkupContainer *pMC = NULL;
IMarkupPointer *pMkStart = NULL;
IMarkupPointer *pMkFinish = NULL;
pMS->CreateMarkupPointer(&pMkStart);
pMS->CreateMarkupPointer(&pMkFinish);
pMS->ParseString(wcontent,
0,
&pMC,
pMkStart,
pMkFinish);
if (pMC)
{
IHTMLDocument2 *pNewDoc = NULL;
pMC->QueryInterface(IID_IHTMLDocument,
(LPVOID *) &pNewDoc);
if (pNewDoc)
{
// do anything with pNewDoc, in this case
// get the body innerText.
TestParse(pNewDoc);
pNewDoc->Release();
}
pMC->Release();
}
if (pMkStart)
pMkStart->Release();
if (pMkFinish)
pMkFinish->Release();
pMS->Release();
}
}
pDoc->Release();
}
CoUninitialize();
}
// Converts a NUL-terminated string in the system ANSI code page (CP_ACP)
// to a newly allocated, NUL-terminated wide string.
//
// szStr    source string
// returns  buffer allocated with new[] that the caller must delete[], or
//          NULL when szStr is NULL or the conversion fails
inline wchar_t* AnsiToUnicode( const char* szStr )
{
    if (szStr == NULL)
    {
        return NULL;
    }

    // With a source length of -1 the returned count includes the terminator.
    const int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    if (nLen <= 0)
    {
        return NULL;
    }

    wchar_t* pResult = new wchar_t[nLen];
    if (MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, pResult, nLen ) != nLen)
    {
        // Never hand back a partially written buffer.
        delete[] pResult;
        return NULL;
    }
    pResult[nLen - 1] = L'\0';
    return pResult;
}
//调用者负责delete wcontent
wchar_t * ReadFromHtmlFile(string str,string & content)
{
ifstream fin(str.c_str());
string line;
while(getline(fin,line))
{
// cout << line << endl;
content = content + line;
}
//cout << content << endl;
//cout << content.size() << endl;
//printf("original html code\n%s\n",content.c_str());
wchar_t * wcontent = AnsiToUnicode(content.c_str());
//wprintf(L"after transferred\n%s\n",wcontent);
//delete[] wcontent;
fin.close();
fin.clear();
return wcontent;
}
// Program entry point: initialises MFC, reads test.html, parses it with
// MSHTML and writes the element tree to out.txt.
//
// returns  0 on success; 1 when MFC initialisation fails, out.txt cannot be
//          opened, or the input could not be converted
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
    int nRetCode = 0;
    // Initialise MFC and report an error on failure
    if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
    {
        // TODO: change the error code to suit your needs
        _tprintf(_T("错误: MFC 初始化失败\n"));
        nRetCode = 1;
    }
    else
    {
        fout = fopen("out.txt","w");
        if (fout == NULL)
        {
            fprintf(stderr, "Error: cannot open out.txt for writing\n");
            nRetCode = 1;
        }
        else
        {
            std::string str = "test.html";
            std::string content;
            wchar_t * wcontent = ReadFromHtmlFile(str,content);
            if (wcontent != NULL)
            {
                TestMSHTML(wcontent);
                delete[] wcontent;
            }
            else
            {
                fprintf(stderr, "Error: cannot convert the contents of test.html\n");
                nRetCode = 1;
            }
            fclose(fout);
            fout = NULL;
        }
    }
    return nRetCode;
}
输出结果
TagName:BODY
TagName:DIV
TagName:TABLE bgcolor:#ff0000
TagName:TBODY
TagName:TR
TagName:TD border:2 bgcolor:#ffff00
TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
TagName:TR
TagName:TD
TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
TagName:TD
TagName:TR
TagName:TD
TagName:TD
TagName:DIV
TagName:TABLE bgcolor:#ff0000
TagName:TBODY
TagName:TR
TagName:TD border:2 bgcolor:#ffff00
TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
TagName:TR
TagName:TD
TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
TagName:TD
TagName:TR
TagName:TD
TagName:TD
posted on 2010-08-11 16:42 speedmancs 阅读(3203) 评论(0) 编辑 收藏 举报