nutch源代码--html的头信息解析

主要解析 meta、base 等标签的信息
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;

import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;

/**
 * Class for parsing META directives from DOM trees. This class
 * handles specifically Robots META directives (all, none, nofollow,
 * noindex, noarchive), finding BASE HREF tags, and HTTP-EQUIV no-cache
 * and refresh instructions. All meta directives are stored in a
 * HTMLMetaTags instance.
 *
 * <p>In short: this is the parser for the HTML head information, mainly the
 * <code>meta</code> and <code>base</code> elements.
 *
 * <p>All case-insensitive comparisons are done independently of the JVM
 * default locale, so that e.g. <code>NOINDEX</code> is still recognized when
 * the default locale is Turkish (where <code>"I".toLowerCase()</code> is a
 * dotless i).
 */
public class HTMLMetaProcessor {

  /**
   * Resets <code>metaTags</code> and then sets its indicators to appropriate
   * values, based on any META and BASE tags found under the given
   * <code>node</code>.
   *
   * @param metaTags receives the directives that are found; it is reset
   *                 before the tree is scanned
   * @param node     root of the DOM (sub)tree to scan
   * @param currURL  URL of the page being parsed, used to resolve relative
   *                 refresh and base URLs; may be <code>null</code>
   */
  public static final void getMetaTags (
    HTMLMetaTags metaTags, Node node, URL currURL) {

    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  /**
   * Recursively walks the tree rooted at <code>node</code>, processing every
   * <code>meta</code> and <code>base</code> element. The walk does not
   * descend into <code>body</code>.
   */
  private static void getMetaTagsHelper(
    HTMLMetaTags metaTags, Node node, URL currURL) {

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      String tagName = node.getNodeName();

      if ("body".equalsIgnoreCase(tagName)) {
        // META tags should not be under body
        return;
      }

      if ("meta".equalsIgnoreCase(tagName)) {
        processMeta(metaTags, node, currURL);
      } else if ("base".equalsIgnoreCase(tagName)) {
        processBase(metaTags, node, currURL);
      }
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        getMetaTagsHelper(metaTags, children.item(i), currURL);
      }
    }
  }

  /**
   * Handles a single <code>meta</code> element, for example:
   * <pre>
   * &lt;meta name="keywords" content="HTML,ASP,PHP,SQL"&gt;
   * &lt;meta http-equiv="Refresh" content="5;url=http://www.w3school.com.cn"&gt;
   * </pre>
   * A <code>meta</code> element without a <code>content</code> attribute is
   * ignored.
   */
  private static void processMeta(
    HTMLMetaTags metaTags, Node metaNode, URL currURL) {

    NamedNodeMap attrs = metaNode.getAttributes();
    Node nameNode = null;
    Node equivNode = null;
    Node contentNode = null;

    // Retrieve the name, http-equiv and content attributes. Attribute names
    // are matched case-insensitively; if several attributes differ only in
    // case, the last one wins.
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      String attrName = attr.getNodeName();
      if ("name".equalsIgnoreCase(attrName)) {
        nameNode = attr;
      } else if ("http-equiv".equalsIgnoreCase(attrName)) {
        equivNode = attr;
      } else if ("content".equalsIgnoreCase(attrName)) {
        contentNode = attr;
      }
    }

    if (contentNode == null) {
      // neither the name nor the http-equiv form carries information
      // without a content attribute
      return;
    }
    String content = contentNode.getNodeValue();

    if (nameNode != null) {
      String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
      metaTags.getGeneralTags().setProperty(name, content);
      if ("robots".equals(name)) {
        applyRobotsDirectives(metaTags, content);
      }
    }

    if (equivNode != null) {
      String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
      metaTags.getHttpEquivTags().setProperty(name, content);
      if ("pragma".equals(name)) {
        if (content.toLowerCase(Locale.ROOT).indexOf("no-cache") >= 0) {
          metaTags.setNoCache();
        }
      } else if ("refresh".equals(name)) {
        processRefresh(metaTags, content, currURL);
      }
    }
  }

  /**
   * Applies the directives of a Robots META tag.
   *
   * <p>The Robots META tag lives in the page head and tells crawlers how to
   * treat this particular page. It is case-insensitive;
   * <code>name="robots"</code> addresses all crawlers. The content holds
   * comma-separated directives: <code>index</code>, <code>noindex</code>,
   * <code>follow</code>, <code>nofollow</code>. The default is
   * <code>index,follow</code>, which may also be written <code>all</code>;
   * <code>noindex,nofollow</code> may be written <code>none</code>. Google
   * additionally defines <code>noarchive</code>, which forbids keeping a
   * cached snapshot of the page.
   *
   * <p>Directives are detected by substring search, so a directive is
   * recognized wherever it occurs inside the content value.
   */
  private static void applyRobotsDirectives(
    HTMLMetaTags metaTags, String content) {

    String directives = content.toLowerCase(Locale.ROOT);

    if (directives.indexOf("none") >= 0) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }

    // "all" (and plain "index" / "follow") is the default: nothing to set.

    if (directives.indexOf("noindex") >= 0) {
      metaTags.setNoIndex();
    }

    if (directives.indexOf("nofollow") >= 0) {
      metaTags.setNoFollow();
    }

    if (directives.indexOf("noarchive") >= 0) {
      metaTags.setNoCache();
    }
  }

  /**
   * Handles the content of
   * <code>&lt;meta http-equiv="refresh" content="5;url=..."&gt;</code>.
   * The part before the first <code>';'</code> is the refresh time; the
   * optional rest names the target URL. When no usable URL is given, the
   * refresh target is <code>currURL</code> itself.
   */
  private static void processRefresh(
    HTMLMetaTags metaTags, String content, URL currURL) {

    int separator = content.indexOf(';');
    String time = (separator == -1) // just the refresh time
        ? content
        : content.substring(0, separator);

    try {
      // tolerate whitespace around the number, e.g. "5 ; url=..."
      metaTags.setRefreshTime(Integer.parseInt(time.trim()));
      metaTags.setRefresh(true);
    } catch (NumberFormatException ignored) {
      // the refresh flag is deliberately left untouched if we couldn't
      // parse the time
    }

    if (!metaTags.getRefresh()) {
      return;
    }

    URL refreshUrl = null;
    if (separator != -1) {
      // Search the original string (not a lower-cased copy) so that the
      // offset is always valid for the substring below.
      int urlKey = indexOfIgnoreCase(content, "url=");
      int start = (urlKey == -1)
          ? separator + 1 // assume a mis-formatted entry with just the url
          : urlKey + 4;
      refreshUrl = resolveRefreshUrl(currURL, content.substring(start));
    }

    if (refreshUrl == null) {
      // apparently only refresh time was present. set the URL
      // to the same URL.
      refreshUrl = currURL;
    }
    metaTags.setRefreshHref(refreshUrl);
  }

  /**
   * Parses a refresh target, first as an absolute URL and then relative to
   * <code>base</code>.
   *
   * @return the resolved URL, or <code>null</code> if it cannot be parsed
   */
  private static URL resolveRefreshUrl(URL base, String spec) {
    try {
      return new URL(spec);
    } catch (MalformedURLException notAbsolute) {
      // XXX according to the spec, this has to be an absolute
      // XXX url. However, many websites use relative URLs and
      // XXX expect browsers to handle that.
      // XXX Unfortunately, in some cases this may create a
      // XXX infinitely recursive paths (a crawler trap)...
      try {
        return new URL(base, spec);
      } catch (MalformedURLException unresolvable) {
        return null;
      }
    }
  }

  /**
   * Handles a <code>base</code> element, which defines the default base for
   * relative URLs of the page. Its <code>href</code> is resolved against
   * <code>currURL</code> (or parsed on its own when <code>currURL</code> is
   * <code>null</code>); an unparsable href is ignored.
   */
  private static void processBase(
    HTMLMetaTags metaTags, Node baseNode, URL currURL) {

    Node hrefNode = findAttribute(baseNode.getAttributes(), "href");
    if (hrefNode == null) {
      return;
    }

    try {
      // a null context makes this equivalent to new URL(String)
      metaTags.setBaseHref(new URL(currURL, hrefNode.getNodeValue()));
    } catch (MalformedURLException ignored) {
      // best effort: a malformed base href leaves the base unset
    }
  }

  /**
   * Looks up an attribute by name, preferring an exact match and falling
   * back to a case-insensitive one, consistent with how <code>meta</code>
   * attributes are matched.
   *
   * @return the attribute node, or <code>null</code> if there is none
   */
  private static Node findAttribute(NamedNodeMap attrs, String name) {
    Node exact = attrs.getNamedItem(name);
    if (exact != null) {
      return exact;
    }
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      if (name.equalsIgnoreCase(attr.getNodeName())) {
        return attr;
      }
    }
    return null;
  }

  /**
   * Locale-independent, case-insensitive <code>indexOf</code>.
   *
   * @return index of the first occurrence of <code>needle</code> in
   *         <code>text</code>, or -1 if there is none
   */
  private static int indexOfIgnoreCase(String text, String needle) {
    int last = text.length() - needle.length();
    for (int i = 0; i <= last; i++) {
      if (text.regionMatches(true, i, needle, 0, needle.length())) {
        return i;
      }
    }
    return -1;
  }

}
posted @ 2011-11-18 15:29 xiao晓阅读(673) 评论(0) 编辑收藏举报
刷新页面返回顶部
xiao晓

serendipity

nutch源代码--html的头信息解析

公告