nutch源代码--html的头信息解析

主要解析 meta、base 等标签的信息

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.parse.html;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;

import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;

/**
 * Class for parsing META Directives from DOM trees. This class
 * handles specifically Robots META directives (all, none, nofollow,
 * noindex, noarchive), finding BASE HREF tags, and HTTP-EQUIV no-cache
 * and refresh instructions. All meta directives are stored in a
 * HTMLMetaTags instance.
 *
 * <p>In short: parses the information found in the HTML head, mainly the
 * <code>meta</code> and <code>base</code> elements.
 */
public class HTMLMetaProcessor {

  /**
   * Sets the indicators in <code>metaTags</code> to appropriate
   * values, based on any META and BASE tags found under the given
   * <code>node</code>. <code>metaTags</code> is reset before the tree
   * is walked.
   *
   * @param metaTags receiver for everything that is found
   * @param node     root of the DOM (sub)tree to inspect
   * @param currURL  URL of the page being parsed; used to resolve relative
   *                 refresh/base URLs and as the default refresh target
   */
  public static final void getMetaTags (
      HTMLMetaTags metaTags, Node node, URL currURL) {

    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  /**
   * Recursive worker: handles <code>node</code> itself when it is a META or
   * BASE element, then descends into its children. Recursion stops at BODY.
   */
  private static final void getMetaTagsHelper(
      HTMLMetaTags metaTags, Node node, URL currURL) {

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      String nodeName = node.getNodeName();

      if ("body".equalsIgnoreCase(nodeName)) {
        // META tags should not be under body
        return;
      }

      if ("meta".equalsIgnoreCase(nodeName)) {
        // See http://www.w3school.com.cn/tags/tag_meta.asp
        processMeta(metaTags, node, currURL);
      } else if ("base".equalsIgnoreCase(nodeName)) {
        // See http://www.w3school.com.cn/tags/tag_base.asp
        processBase(metaTags, node, currURL);
      }
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        getMetaTagsHelper(metaTags, children.item(i), currURL);
      }
    }
  }

  /**
   * Handles a single META element: records its name/http-equiv + content
   * pair and interprets the robots, pragma and refresh directives.
   */
  private static void processMeta(
      HTMLMetaTags metaTags, Node node, URL currURL) {

    NamedNodeMap attrs = node.getAttributes();
    Node nameNode = null;
    Node equivNode = null;
    Node contentNode = null;

    // Retrieve the name, http-equiv and content attributes, e.g.
    //   <meta name="keywords" content="HTML,ASP,PHP,SQL">
    //   <meta http-equiv="Refresh" content="5;url=http://www.w3school.com.cn">
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      // Locale.ROOT: the default locale may map 'I' to a dotless 'ı'
      // (Turkish), which would make "HTTP-EQUIV" unrecognisable.
      String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
      if (attrName.equals("name")) {
        nameNode = attr;
      } else if (attrName.equals("http-equiv")) {
        equivNode = attr;
      } else if (attrName.equals("content")) {
        contentNode = attr;
      }
    }

    if (contentNode == null) {
      // Neither the name nor the http-equiv form is usable without content.
      return;
    }
    String content = contentNode.getNodeValue();

    if (nameNode != null) {
      String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
      metaTags.getGeneralTags().setProperty(name, content);

      if ("robots".equals(name)) {
        processRobotsDirectives(metaTags, content);
      }
    }

    // http-equiv: see http://www.w3school.com.cn/htmldom/prop_meta_httpequiv.asp
    if (equivNode != null) {
      String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
      metaTags.getHttpEquivTags().setProperty(name, content);

      if ("pragma".equals(name)) {
        if (content.toLowerCase(Locale.ROOT).contains("no-cache")) {
          metaTags.setNoCache();
        }
      } else if ("refresh".equals(name)) {
        processRefresh(metaTags, content, currURL);
      }
    }
  }

  /**
   * Interprets the content of a <code>&lt;meta name="robots"&gt;</code> tag.
   *
   * <p>The directives are case-insensitive and comma separated, e.g.
   * <code>CONTENT="NOINDEX,FOLLOW"</code>. <code>ALL</code> is shorthand for
   * <code>INDEX,FOLLOW</code> and <code>NONE</code> for
   * <code>NOINDEX,NOFOLLOW</code>; <code>noarchive</code> asks that no cached
   * copy of the page be kept.
   *
   * <p>Directives are detected by substring search rather than by
   * tokenising, so any separator between them is accepted.
   */
  private static void processRobotsDirectives(
      HTMLMetaTags metaTags, String content) {

    // Locale.ROOT keeps "NOINDEX"/"NOARCHIVE" matching under locales with
    // special lower-casing rules for 'I'.
    String directives = content.toLowerCase(Locale.ROOT);

    if (directives.contains("none")) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }

    // "all" needs no handling: it sets none of the restricting flags.

    if (directives.contains("noindex")) {
      metaTags.setNoIndex();
    }

    if (directives.contains("nofollow")) {
      metaTags.setNoFollow();
    }

    if (directives.contains("noarchive")) {
      metaTags.setNoCache();
    }
  }

  /**
   * Interprets the content of a <code>&lt;meta http-equiv="refresh"&gt;</code>
   * tag, e.g. <code>content="5;url=http://www.w3school.com.cn"</code>: the
   * part before the first ';' is the delay, the rest is the target URL.
   */
  private static void processRefresh(
      HTMLMetaTags metaTags, String content, URL currURL) {

    int semicolon = content.indexOf(';');
    // Without a ';' the whole value is just the refresh time.
    String time = (semicolon == -1) ? content : content.substring(0, semicolon);

    try {
      // Surrounding whitespace ("5 ; url=...") must not defeat the parse.
      metaTags.setRefreshTime(Integer.parseInt(time.trim()));
      // skip this if we couldn't parse the time
      metaTags.setRefresh(true);
    } catch (NumberFormatException ignored) {
      // Unparseable delay: leave the refresh state untouched.
    }

    if (!metaTags.getRefresh()) {
      return;
    }

    URL refreshUrl = null;
    if (semicolon != -1) { // set the URL
      // Search the original string case-insensitively; searching a
      // lower-cased copy could yield an offset that does not line up.
      int urlStart = indexOfIgnoreCase(content, "url=");
      if (urlStart == -1) { // assume a mis-formatted entry with just the url
        urlStart = semicolon + 1;
      } else {
        urlStart += 4;
      }
      refreshUrl = resolveRefreshUrl(content.substring(urlStart), currURL);
    }

    if (refreshUrl == null) {
      // apparently only refresh time was present. set the URL
      // to the same URL.
      refreshUrl = currURL;
    }
    metaTags.setRefreshHref(refreshUrl);
  }

  /**
   * Turns the URL part of a refresh directive into a URL, first as an
   * absolute URL and then relative to <code>currURL</code>.
   *
   * @return the resolved URL, or <code>null</code> if neither attempt works
   */
  private static URL resolveRefreshUrl(String url, URL currURL) {
    try {
      return new URL(url);
    } catch (MalformedURLException notAbsolute) {
      // XXX according to the spec, this has to be an absolute
      // XXX url. However, many websites use relative URLs and
      // XXX expect browsers to handle that.
      // XXX Unfortunately, in some cases this may create a
      // XXX infinitely recursive paths (a crawler trap)...
      try {
        return new URL(currURL, url);
      } catch (MalformedURLException ignored) {
        return null;
      }
    }
  }

  /**
   * Handles a BASE element: stores its <code>href</code> (the default base
   * for relative URLs), resolved against <code>currURL</code> when one is
   * available. A malformed href is skipped.
   */
  private static void processBase(
      HTMLMetaTags metaTags, Node node, URL currURL) {

    NamedNodeMap attrs = node.getAttributes();
    Node hrefNode = attrs.getNamedItem("href");
    if (hrefNode == null) {
      return;
    }

    String urlString = hrefNode.getNodeValue();
    try {
      URL url = (currURL == null)
          ? new URL(urlString)
          : new URL(currURL, urlString);
      metaTags.setBaseHref(url);
    } catch (MalformedURLException ignored) {
      // Best effort: an unusable BASE href leaves the base unset.
    }
  }

  /**
   * Returns the index of the first case-insensitive occurrence of
   * <code>needle</code> in <code>text</code>, or -1 if there is none.
   */
  private static int indexOfIgnoreCase(String text, String needle) {
    int needleLen = needle.length();
    int last = text.length() - needleLen;
    for (int i = 0; i <= last; i++) {
      if (text.regionMatches(true, i, needle, 0, needleLen)) {
        return i;
      }
    }
    return -1;
  }

}



posted @ 2011-11-18 15:29  xiao晓  阅读(673)  评论(0编辑  收藏  举报