抓取网页上所有看得见的字符

最近老板让我实现一个功能:抓取指定页面中所有可见的字符串。因为原先没做过这方面的工作,实现起来遇到了一些问题,在这里做一下记录,希望以后再遇到这样的要求时能想起来。
对于网页来说,“可见的字符串”是什么概念?就是元素的样式不能是 display: none 或 visibility: hidden。“所有字符串”又是什么概念?它既包括容器标签中的文本,如
<td>hello</td> <div>hello</div> <span>hello</span>,也包括控件的标题,如 <input type="submit" value="确定"> 这样的字符串。我采用了两个函数来分别处理这两种情况:
 1/// <summary>
 2        /// traverse the html tree and seek string which in the container
 3        /// </summary>
 4        /// <param name="container"></param>
 5        /// <param name="showIframeIndex"></param>

 6        private void SeekStringsInContianer(mshtml.IHTMLElement container, ref int showIframeIndex)
 7        {
 8            try
 9            {
10                if (null == container || null == container.innerHTML)
11                    return;
12
13                // if the container is hidden
14                if (null != container.style &&
15                    ((null != container.style.display &&  container.style.display.Equals("none")) ||
16                     (null != container.style.visibility && container.style.visibility.Equals("hidden"))))
17                    return;
18
19                if (container.tagName.ToLower().Equals("iframe"))
20                    return;
21                
22                // if the container isn't a container
23                if (null != container.innerText &&
24                    container.innerHTML.IndexOf("<IFRAME"== -1 &&
25                    container.innerHTML.IndexOf("<TABLE"== -1 &&
26                    container.innerHTML.IndexOf("<TD"== -1 &&
27                    container.innerHTML.IndexOf("<TR"== -1 &&
28                    container.innerHTML.IndexOf("<DIV"== -1 )
29                {
30                    if (((mshtml.IHTMLElementCollection)container.children).length == 0)
31                        mHuntedStringsList.Add(container.innerText);
32                    else
33                        mHuntedStringsList.Add(SpanFilters(container));
34                }

35                else
36                {
37                    mshtml.IHTMLElementCollection collection = (mshtml.IHTMLElementCollection)container.children;
38
39                    // traverse container's childs
40                    int i = 0;
41                    foreach (mshtml.IHTMLElement elem in collection)
42                    {
43                        // jump over  the hidden iframes
44                        if (elem.tagName.ToLower().Equals("iframe"))
45                        {
46                            if (null != elem.style &&
47                                null != elem.style.cssText &&
48                                elem.style.cssText.IndexOf("none"!= -1)
49                                i++;
50                            else
51                                showIframeIndex = i;
52                        }

53                        else
54                        {
55                            SeekStringsInContianer(elem, ref showIframeIndex);
56                        }

57                    }

58                }

59            }

60            catch (ArgumentNullException e)
61            {
62                throw new Exception("null container!", e);
63            }

64        }

65
/// <summary>
/// Collects the captions of input controls found anywhere below
/// <paramref name="container"/> and appends them to <c>mHuntedStringsList</c>.
/// </summary>
/// <remarks>
/// An <c>input</c> element contributes its <c>value</c> attribute when its <c>type</c>
/// is "submit" or when its <c>id</c> contains "txt".
/// </remarks>
/// <param name="container">Element whose descendants are scanned; null is ignored.</param>
private void SeekStringsInControl(mshtml.IHTMLElement container)
{
    try
    {
        if (null == container)
            return;

        mshtml.IHTMLElementCollection collection = (mshtml.IHTMLElementCollection)container.all;
        if (null == collection)
            return;

        foreach (mshtml.IHTMLElement elem in collection)
        {
            if (null == elem ||
                !string.Equals(elem.tagName, "input", StringComparison.OrdinalIgnoreCase))
                continue;

            // getAttribute returns object; "as string" avoids an InvalidCastException
            // when the attribute is missing or is not a string.
            string type = elem.getAttribute("type", 0) as string;
            string id = elem.getAttribute("id", 0) as string;

            bool isSubmit = string.Equals(type, "submit", StringComparison.OrdinalIgnoreCase);
            bool isTextBoxById = null != id && id.IndexOf("txt", StringComparison.Ordinal) != -1;

            if (isSubmit || isTextBoxById)
            {
                // Never put a null caption into the result list.
                string caption = elem.getAttribute("value", 0) as string;
                if (null != caption)
                    mHuntedStringsList.Add(caption);
            }
        }
    }
    catch (ArgumentNullException e)
    {
        // Kept as in the original so callers that catch Exception keep working.
        throw new Exception("null document during seeking string in control!", e);
    }
}

这两个函数主要用到了递归操作。由于 HTML 代码具有嵌套性,并且可以不规范地书写,所以在取得可见字符串的时候做了一些额外的工作。我使用下面的函数来完成这个工作:
 1 /// <summary>
 2        /// filter the span which is hidden in the element
 3        /// </summary>
 4        /// <param name="element"></param>
 5        /// <returns></returns>

 6        private string SpanFilters(mshtml.IHTMLElement element)
 7        {
 8            string resultStr = element.innerText;
 9
10            if (((mshtml.IHTMLElementCollection)element.children).length == 0)
11                return resultStr;
12
13            FilterHiddenText(element, ref resultStr);
14            return resultStr;
15        }

16
/// <summary>
/// Removes from <paramref name="srcStr"/> the text of every hidden element found in the
/// subtree rooted at <paramref name="element"/>.
/// </summary>
/// <remarks>
/// An element counts as hidden when its <c>style</c> reports <c>visibility: hidden</c>
/// or <c>display: none</c>. For a hidden element that has text, the first occurrence of
/// that text is cut out of <paramref name="srcStr"/> and its children are not visited;
/// any other element has each of its children examined in turn.
/// NOTE(review): only the first occurrence is removed, so identical visible text that
/// appears earlier in the string would be removed instead - confirm this is acceptable.
/// </remarks>
/// <param name="element">Root of the subtree to inspect; null is ignored.</param>
/// <param name="srcStr">Text to filter; left untouched when null or empty.</param>
private void FilterHiddenText(mshtml.IHTMLElement element, ref string srcStr)
{
    // Nothing to inspect, or nothing left to remove from.
    if (null == element || string.IsNullOrEmpty(srcStr))
        return;

    string hiddenText = element.innerText;
    mshtml.IHTMLStyle style = element.style;

    if (null != hiddenText &&
        null != style &&
        (string.Equals(style.visibility, "hidden", StringComparison.OrdinalIgnoreCase) ||
         string.Equals(style.display, "none", StringComparison.OrdinalIgnoreCase)))
    {
        // Ordinal search keeps the found position consistent with hiddenText.Length.
        // Guarding pos avoids ArgumentOutOfRangeException from Remove(-1, ...) when the
        // hidden text is not present in srcStr.
        int pos = srcStr.IndexOf(hiddenText, StringComparison.Ordinal);
        if (pos >= 0)
            srcStr = srcStr.Remove(pos, hiddenText.Length);
        return;
    }

    mshtml.IHTMLElementCollection collection = (mshtml.IHTMLElementCollection)element.children;
    if (null == collection)
        return;

    foreach (mshtml.IHTMLElement elem in collection)
    {
        FilterHiddenText(elem, ref srcStr);
    }
}
posted @ 2007-07-11 20:53  moonz-wu  阅读(980)  评论(0)  编辑  收藏  举报