This article analyzes the CrawlServer and CrawlHost classes; both implement the IdentityCacheable interface (the interface for cacheable objects).

A CrawlServer object represents a site server. It stores the server-related information: the server identifier, port, robots.txt state, the set of Credential objects, and the operations on them. Its main fields are:

private static final long serialVersionUID = 3L;

    public static final long ROBOTS_NOT_FETCHED = -1;
    /** only check if robots-fetch is perhaps superfluous 
     * after this many tries */
    public static final long MIN_ROBOTS_RETRIES = 3;

    private String server; // actually, host+port in the https case
    private int port;
    protected Robotstxt robotstxt;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    FetchStats substats = new FetchStats();
    
    // how many consecutive connection errors have been encountered;
    // used to drive exponentially increasing retry timeout or decision
    // to 'freeze' entire class (queue) of URIs
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credentials.
     */
    private transient Set<Credential> credentials =  null;

The String field server is the site server's identifier. The constructor below initializes that identifier and parses the port from it:

/**
     * Creates a new CrawlServer object.
     *
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int colonIndex = server.lastIndexOf(":");
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server.substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }
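
As a quick usage sketch (not part of the original post; the getPort() accessor and the org.archive.modules.net package path are assumptions based on the Heritrix 3.1.0 tree), the constructor leaves port at -1 when the key carries no port and parses whatever follows the last colon otherwise:

import org.archive.modules.net.CrawlServer; // assumed package path

public class CrawlServerKeyDemo {
    public static void main(String[] args) {
        // No port in the key: port stays -1
        CrawlServer plain = new CrawlServer("example.com");
        // Explicit port in the key: parsed after the last ':'
        CrawlServer withPort = new CrawlServer("example.com:8443");
        System.out.println(plain.getPort());    // -1
        System.out.println(withPort.getPort()); // 8443
    }
}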

The following methods operate on the Robotstxt robotstxt field:

public Robotstxt getRobotstxt() {
        return robotstxt;
    }
    
    /** Update the robotstxt
    *
    * @param curi the crawl URI containing the fetched robots.txt
    * @throws IOException
    */
   public synchronized void updateRobots(CrawlURI curi) {

       robotsFetched = System.currentTimeMillis();
       
       boolean gotSomething = curi.getFetchType() == HTTP_GET 
           && (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND);
       
       
       if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
           // robots.txt lookup failed, still trying, no reason to consider IGNORE yet
           validRobots = false;
           return;
       }
              
       // special deeming for a particular kind of connection-lost (empty server response)
        if (curi.getFetchStatus() == S_CONNECT_LOST
                && CollectionUtils.exists(curi.getNonFatalFailures(),
                        PredicateUtils.instanceofPredicate(NoHttpResponseException.class))) {
            curi.setFetchStatus(S_DEEMED_NOT_FOUND);
            gotSomething = true;
        }
       
       if (!gotSomething) {
           // robots.txt fetch failed and exceptions (ignore/deeming) don't apply; no valid robots info yet
           validRobots = false;
           return;
       }
       
       int fetchStatus = curi.getFetchStatus();
       if (fetchStatus < 200 || fetchStatus >= 300) {
           // Not found or anything but a status code in the 2xx range is
           // treated as giving access to all of a site's content.
           // This is the prevailing practice of Google, since 4xx
           // responses on robots.txt are usually indicative of a 
           // misconfiguration or blanket-block, not an intentional
           // indicator of partial blocking. 
           // TODO: consider handling server errors, redirects differently
           robotstxt = Robotstxt.NO_ROBOTS;
           validRobots = true;
           return;
       }

       InputStream contentBodyStream = null;
       try {
           BufferedReader reader;
           contentBodyStream = curi.getRecorder().getContentReplayInputStream();

           reader = new BufferedReader(new InputStreamReader(contentBodyStream));
           robotstxt = new Robotstxt(reader); 
           validRobots = true;
       } catch (IOException e) {
           robotstxt = Robotstxt.NO_ROBOTS;
           logger.log(Level.WARNING,"problem reading robots.txt for "+curi,e);
           validRobots = true;
           curi.getNonFatalFailures().add(e);
       } finally {
           IOUtils.closeQuietly(contentBodyStream);
       }
   }    
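
For reference, a minimal sketch of what the tail of updateRobots effectively does with a fetched robots.txt body: hand it to the Robotstxt parser. The getDirectivesFor()/allows() accessors and the package paths are assumptions about the Heritrix 3.1.0 API rather than something quoted above:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import org.archive.modules.net.Robotstxt; // assumed package path

public class RobotstxtParseDemo {
    public static void main(String[] args) throws IOException {
        String body = "User-agent: *\nDisallow: /private/\n";
        // Same construction as in updateRobots: a BufferedReader over the body
        Robotstxt robots = new Robotstxt(new BufferedReader(new StringReader(body)));
        // Assumed accessors: per-user-agent directives, then a path check
        System.out.println(robots.getDirectivesFor("heritrix").allows("/private/x")); // expected: false
        System.out.println(robots.getDirectivesFor("heritrix").allows("/public/y"));  // expected: true
    }
}
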
/**
     * If true then valid robots.txt information has been retrieved. If false
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return Returns the validRobots.
     */
    public synchronized boolean isValidRobots() {
        return validRobots;
    }
/**
     * Is the robots policy expired.
     *
     * This method will also return true if we haven't tried to get the
     * robots.txt for this server.
     *
     * @param validityDuration how long fetched robots info stays valid, in seconds; 0 means it never expires
     * @return true if the robots policy is expired.
     */
    public synchronized boolean isRobotsExpired(int validityDuration) {
        if (robotsFetched == ROBOTS_NOT_FETCHED) {
            // Have not attempted to fetch robots
            return true;
        }
        long duration = validityDuration*1000L;
        if (duration == 0) {
            // When zero, robots should be valid forever
            return false;
        }
        if (robotsFetched + duration < System.currentTimeMillis()) {
            // Robots info has expired; a refetch is due
            return true;
        }
        return false;
    }
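
A small sketch of the expiry arithmetic above (package path assumed): validityDuration is in seconds and is multiplied by 1000 before being compared with the millisecond timestamp robotsFetched, and a server whose robots.txt has never been fetched always counts as expired:

import org.archive.modules.net.CrawlServer; // assumed package path

public class RobotsExpiryDemo {
    public static void main(String[] args) {
        CrawlServer server = new CrawlServer("example.com");
        // Nothing fetched yet (robotsFetched == ROBOTS_NOT_FETCHED),
        // so the policy counts as expired regardless of the duration.
        System.out.println(server.isRobotsExpired(86400)); // true
        System.out.println(server.isRobotsExpired(0));     // true
    }
}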

Methods for the Set<Credential> credentials field (the credential set):

/**
     * @return Credential avatars for this server.  Returns null if none.
     */
    public Set<Credential> getCredentials() {
        return this.credentials;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentials() {
        return this.credentials != null && this.credentials.size() > 0;
    }

    /**
     * Add a credential.
     *
     * @param cred Credential to add to the set.
     */
    public void addCredential(Credential cred) {
        if (this.credentials == null) {
            this.credentials = new HashSet<Credential>();
        }
        this.credentials.add(cred);
    }

The static method below generates a lookup key from a UURI object (used as the server identifier):

/**
     * Get key to use doing lookup on server instances.
     *
     * Note that this key is not the classKey: it must be identical for all
     * URLs under the same host, so that they all map to the same CrawlServer.
     *
     * @param uuri UURI to get the server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(UURI uuri) throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = uuri.getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails (eg 'dns:'.
            // DNS UURIs have the 'domain' in the 'path' parameter, not
            // in the authority).
            key = uuri.getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Not just word chars and dots and colons and dashes and
                // underscores; throw away
                key = null;
            }
        }
        if (key != null && uuri.getScheme().equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add default https port to
            // distinguish https from http server without a port.
            if (!key.matches(".+:[0-9]+")) {
                key += UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }
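
A sketch of the resulting keys (the UURI/UURIFactory/URIException package paths are assumptions based on the Heritrix 3.1.0 tree): the same host yields different keys for http and https, because the https key carries the default port:

import org.apache.commons.httpclient.URIException; // assumed package path
import org.archive.modules.net.CrawlServer;        // assumed package path
import org.archive.net.UURI;                       // assumed package path
import org.archive.net.UURIFactory;                // assumed package path

public class ServerKeyDemo {
    public static void main(String[] args) throws URIException {
        UURI http  = UURIFactory.getInstance("http://example.com/index.html");
        UURI https = UURIFactory.getInstance("https://example.com/index.html");
        // Plain http: the key is the authority without userinfo, e.g. "example.com"
        System.out.println(CrawlServer.getServerKey(http));
        // https without an explicit port: the default https port is appended,
        // so it maps to a different CrawlServer than the plain-http one
        System.out.println(CrawlServer.getServerKey(https));
    }
}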

A CrawlHost object represents a host. It stores the host identifier (domain name), the IP address and when it was fetched, the country code, and related information. Its main fields are:

/** Flag value indicating always-valid IP */
    public static final long IP_NEVER_EXPIRES = -1;
    /** Flag value indicating an IP has not yet been looked up */
    public static final long IP_NEVER_LOOKED_UP = -2;
    private String hostname;
    private String countryCode;
    private InetAddress ip;
    private long ipFetched = IP_NEVER_LOOKED_UP;
    protected FetchStats substats = new FetchStats(); 
    /**
     * TTL gotten from dns record.
     *
     * From rfc1035:
     * <pre>
     * TTL       a 32 bit unsigned integer that specifies the time
     *           interval (in seconds) that the resource record may be
     *           cached before it should be discarded.  Zero values are
     *           interpreted to mean that the RR can only be used for the
     *           transaction in progress, and should not be cached.
     * </pre>
     */
    private long ipTTL = IP_NEVER_LOOKED_UP;

    // Used when bandwidth constraints are in effect
    private long earliestNextURIEmitTime = 0;

The constructors initialize the host identifier:

/** 
     * Create a new CrawlHost object.
     *
     * @param hostname the host name for this host.
     */
    public CrawlHost(String hostname) {
            this(hostname, null);
    }

    /** 
     * Create a new CrawlHost object.
     *
     * @param hostname the host name for this host.
     * @param countryCode the country code for this host.
     */
    public CrawlHost(String hostname, String countryCode) {
        this.hostname = hostname;
        this.countryCode = countryCode;
        InetAddress tmp = InetAddressUtil.getIPHostAddress(hostname);
        if (tmp != null) {
            setIP(tmp, IP_NEVER_EXPIRES);
        }
    }
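
A sketch of the constructor behaviour (package path assumed): a hostname that is already a literal IP is resolved on the spot via InetAddressUtil.getIPHostAddress and stored with IP_NEVER_EXPIRES, while an ordinary DNS name waits for the FetchDNS processor:

import org.archive.modules.net.CrawlHost; // assumed package path

public class CrawlHostDemo {
    public static void main(String[] args) {
        CrawlHost byName = new CrawlHost("example.com"); // DNS name, not resolved yet
        CrawlHost byIp   = new CrawlHost("192.0.2.1");   // literal IP, resolved in the constructor
        System.out.println(byName.hasBeenLookedUp()); // false
        System.out.println(byIp.hasBeenLookedUp());   // true
    }
}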

The following methods deal with the host's IP address:

/** Return true if the IP for this host has been looked up.
     *
     * Returns true even if the lookup failed.
     *
     * @return true if the IP for this host has been looked up.
     */
    public boolean hasBeenLookedUp() {
        return ipFetched != IP_NEVER_LOOKED_UP;
    }

    /**
     * Set the IP address for this host.
     *
     * The FetchDNS processor resolves the address and calls this setter.
     *
     * @param address
     * @param ttl the TTL from the dns record in seconds or -1 if it should live
     * forever (is a numeric IP).
     */
    public void setIP(InetAddress address, long ttl) {
        this.ip = address;
        // Assume that a lookup has occurred by the time
        // a caller decides to set this (even to null)
        this.ipFetched = System.currentTimeMillis();
        this.ipTTL = ttl;
        if (logger.isLoggable(Level.FINE)) {
            logger.fine(hostname + ": " +
                ((address != null)? address.toString(): "null"));
        }
    }
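
A minimal caller sketch (standing in for what the FetchDNS processor does; the TTL value is made up and the package path is assumed): record a resolved address together with the TTL from the DNS answer:

import java.net.InetAddress;
import java.net.UnknownHostException;
import org.archive.modules.net.CrawlHost; // assumed package path

public class SetIpDemo {
    public static void main(String[] args) throws UnknownHostException {
        CrawlHost host = new CrawlHost("example.com");
        // Literal address, so no real DNS lookup happens in this demo
        InetAddress addr = InetAddress.getByName("192.0.2.1");
        long ttlSeconds = 3600L; // hypothetical TTL; FetchDNS would take it from the DNS record
        host.setIP(addr, ttlSeconds);
        System.out.println(host.hasBeenLookedUp()); // true once setIP has run
    }
}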

---------------------------------------------------------------------------

This Heritrix 3.1.0 source-code analysis series is the author's original work.

Please credit the source when reposting: 博客园 刺猬的温驯

Original post: http://www.cnblogs.com/chenying99/archive/2013/04/29/3050940.html
