本文接下来分析CrawlServer类和CrawlHost类,两者都实现了IdentityCacheable接口(可缓存对象接口)
CrawlServer对象代表服务器,里面存储了服务器的相关信息,包括服务器标识、端口、robots信息、Credential(凭证)集合及相关操作等
/** Serialization version identifier for this class. */
private static final long serialVersionUID = 3L;

/** Sentinel stored in {@link #robotsFetched} while no robots.txt fetch has been recorded. */
public static final long ROBOTS_NOT_FETCHED = -1;

/**
 * Only check if a robots fetch is perhaps superfluous after this many
 * tries; below this count a failed lookup is simply left invalid so it
 * can be retried.
 */
public static final long MIN_ROBOTS_RETRIES = 3;

/** Server identifier as handed to the constructor; actually host+port in the https case. */
private String server;

/** Port parsed from the trailing ":port" of {@link #server}, or -1 when none could be parsed. */
private int port;

/** Robots rules currently held for this server; assigned by {@code updateRobots}. */
protected Robotstxt robotstxt;

/**
 * Time (milliseconds, from {@code System.currentTimeMillis()}) of the most
 * recent {@code updateRobots} call, or {@link #ROBOTS_NOT_FETCHED}.
 */
long robotsFetched = ROBOTS_NOT_FETCHED;

/** Whether usable robots information is currently held; starts out false. */
boolean validRobots;

/** Fetch statistics holder for this server (maintained outside the code shown here). */
FetchStats substats = new FetchStats();

/**
 * How many consecutive connection errors have been encountered; used to
 * drive an exponentially increasing retry timeout or the decision to
 * 'freeze' an entire class (queue) of URIs. Starts at zero.
 */
protected int consecutiveConnectionErrors;

/**
 * Set of credentials. Transient; stays null until the first credential
 * is added.
 */
private transient Set<Credential> credentials;
String server表示站点服务器的标识,其构造方法如下(初始化站点服务器的标识和端口)
/** * Creates a new CrawlServer object. * * @param h the host string for the server. */ public CrawlServer(String h) { // TODO: possibly check for illegal host string server = h; int colonIndex = server.lastIndexOf(":"); if (colonIndex < 0) { port = -1; } else { try { port = Integer.parseInt(server.substring(colonIndex + 1)); } catch (NumberFormatException e) { port = -1; } } }
下面的方法是有关Robotstxt robotstxt对象操作的
public Robotstxt getRobotstxt() { return robotstxt; } /** Update the robotstxt * * @param curi the crawl URI containing the fetched robots.txt * @throws IOException */ public synchronized void updateRobots(CrawlURI curi) { robotsFetched = System.currentTimeMillis(); boolean gotSomething = curi.getFetchType() == HTTP_GET && (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND); if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) { // robots.txt lookup failed, still trying, no reason to consider IGNORE yet validRobots = false; return; } // special deeming for a particular kind of connection-lost (empty server response) if (curi.getFetchStatus() == S_CONNECT_LOST && CollectionUtils.exists(curi.getNonFatalFailures(), PredicateUtils.instanceofPredicate(NoHttpResponseException.class))) { curi.setFetchStatus(S_DEEMED_NOT_FOUND); gotSomething = true; } if (!gotSomething) { // robots.txt fetch failed and exceptions (ignore/deeming) don't apply; no valid robots info yet validRobots = false; return; } int fetchStatus = curi.getFetchStatus(); if (fetchStatus < 200 || fetchStatus >= 300) { // Not found or anything but a status code in the 2xx range is // treated as giving access to all of a sites' content. // This is the prevailing practice of Google, since 4xx // responses on robots.txt are usually indicative of a // misconfiguration or blanket-block, not an intentional // indicator of partial blocking. 
// TODO: consider handling server errors, redirects differently robotstxt = Robotstxt.NO_ROBOTS; validRobots = true; return; } InputStream contentBodyStream = null; try { BufferedReader reader; contentBodyStream = curi.getRecorder().getContentReplayInputStream(); reader = new BufferedReader(new InputStreamReader(contentBodyStream)); robotstxt = new Robotstxt(reader); validRobots = true; } catch (IOException e) { robotstxt = Robotstxt.NO_ROBOTS; logger.log(Level.WARNING,"problem reading robots.txt for "+curi,e); validRobots = true; curi.getNonFatalFailures().add(e); } finally { IOUtils.closeQuietly(contentBodyStream); } } /** * If true then valid robots.txt information has been retrieved. If false * either no attempt has been made to fetch robots.txt or the attempt * failed. * * @return Returns the validRobots. */ public synchronized boolean isValidRobots() { return validRobots; } /** * Is the robots policy expired. * * This method will also return true if we haven't tried to get the * robots.txt for this server. * * @param curi * @return true if the robots policy is expired. */ public synchronized boolean isRobotsExpired(int validityDuration) { if (robotsFetched == ROBOTS_NOT_FETCHED) { // Have not attempted to fetch robots return true; } long duration = validityDuration*1000L; if (duration == 0) { // When zero, robots should be valid forever return false; } if (robotsFetched + duration < System.currentTimeMillis()) { // Robots is still valid return true; } return false; }
下面是操作Set<Credential> credentials(凭证集合)的方法
/**
 * @return the credentials attached to this server, or null if none have
 *         been added.
 */
public Set<Credential> getCredentials() {
    return credentials;
}

/**
 * @return true if at least one credential is attached to this instance.
 */
public boolean hasCredentials() {
    if (credentials == null) {
        return false;
    }
    return !credentials.isEmpty();
}

/**
 * Attach a credential, creating the backing set on first use.
 *
 * @param cred credential to add to this server's set.
 */
public void addCredential(Credential cred) {
    if (credentials == null) {
        credentials = new HashSet<Credential>();
    }
    credentials.add(cred);
}
根据UURI uuri对象生成key的静态方法(用于站点服务器标识)
/** * Get key to use doing lookup on server instances. * * @param cauri CandidateURI we're to get server key for. * @return String to use as server key. * @throws URIException */ /** * 根据UURI uuri对象生成key * 这里的key不同于classkey,应该保证同一域名下的所有url的key的一致性 * @param uuri * @return * @throws URIException */ public static String getServerKey(UURI uuri) throws URIException { // TODO: evaluate if this is really necessary -- why not // make the server of a dns CandidateURI the looked-up domain, // also simplifying FetchDNS? String key = uuri.getAuthorityMinusUserinfo(); if (key == null) { // Fallback for cases where getAuthority() fails (eg 'dns:'. // DNS UURIs have the 'domain' in the 'path' parameter, not // in the authority). key = uuri.getCurrentHierPath(); if (key != null && !key.matches("[-_\\w\\.:]+")) { // Not just word chars and dots and colons and dashes and // underscores; throw away key = null; } } if (key != null && uuri.getScheme().equals(UURIFactory.HTTPS)) { // If https and no port specified, add default https port to // distinuish https from http server without a port. if (!key.matches(".+:[0-9]+")) { key += UURIFactory.HTTPS_PORT; } } return key; }
CrawlHost对象代表主机,里面存储了主机标识(域名)、IP地址、IP解析时间、国家代码等信息
/** Flag value indicating an always-valid IP. */
public static final long IP_NEVER_EXPIRES = -1;

/** Flag value indicating an IP has not yet been looked up. */
public static final long IP_NEVER_LOOKED_UP = -2;

/** Host name identifying this host. */
private String hostname;

/** Country code for this host; null when none was supplied. */
private String countryCode;

/** Address last set for this host; may be null. */
private InetAddress ip;

/**
 * Time (milliseconds, from {@code System.currentTimeMillis()}) at which
 * the IP was last set, or {@link #IP_NEVER_LOOKED_UP}.
 */
private long ipFetched = IP_NEVER_LOOKED_UP;

/** Fetch statistics holder for this host (maintained outside the code shown here). */
protected FetchStats substats = new FetchStats();

/**
 * TTL gotten from the dns record.
 *
 * From RFC 1035:
 * <pre>
 * TTL       a 32 bit unsigned integer that specifies the time
 *           interval (in seconds) that the resource record may be
 *           cached before it should be discarded.  Zero values are
 *           interpreted to mean that the RR can only be used for the
 *           transaction in progress, and should not be cached.
 * </pre>
 */
private long ipTTL = IP_NEVER_LOOKED_UP;

/** Used when bandwidth constraints are in effect; starts at zero. */
private long earliestNextURIEmitTime;
构造方法初始化主机标识
/**
 * Create a new CrawlHost object with no country code.
 *
 * @param hostname the host name for this host.
 */
public CrawlHost(String hostname) {
    this(hostname, null);
}

/**
 * Create a new CrawlHost object.
 *
 * <p>If {@code InetAddressUtil.getIPHostAddress} yields an address for
 * the host name, that address is stored right away with a never-expiring
 * TTL. (Presumably that happens only when the name is itself a numeric
 * IP -- confirm against InetAddressUtil.)
 *
 * @param hostname the host name for this host.
 * @param countryCode the country code for this host.
 */
public CrawlHost(String hostname, String countryCode) {
    this.hostname = hostname;
    this.countryCode = countryCode;

    final InetAddress literalAddress = InetAddressUtil.getIPHostAddress(hostname);
    if (literalAddress == null) {
        // Nothing to pre-populate; the IP stays "never looked up".
        return;
    }
    setIP(literalAddress, IP_NEVER_EXPIRES);
}
下面的方法用于设置IP地址
/** Return true if the IP for this host has been looked up. * * Returns true even if the lookup failed. * * @return true if the IP for this host has been looked up. */ public boolean hasBeenLookedUp() { return ipFetched != IP_NEVER_LOOKED_UP; } /** * Set the IP address for this host. * * @param address * @param ttl the TTL from the dns record in seconds or -1 if it should live * forever (is a numeric IP). */ /** * 设置IP FetchNDS处理器解析IP * @param address * @param ttl */ public void setIP(InetAddress address, long ttl) { this.ip = address; // Assume that a lookup as occurred by the time // a caller decides to set this (even to null) this.ipFetched = System.currentTimeMillis(); this.ipTTL = ttl; if (logger.isLoggable(Level.FINE)) { logger.fine(hostname + ": " + ((address != null)? address.toString(): "null")); } }
---------------------------------------------------------------------------
本系列Heritrix 3.1.0 源码解析系本人原创
转载请注明出处 博客园 刺猬的温驯
本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/29/3050940.html