君子博学而日参省乎己 则知明而行无过矣

博客园 首页 新随笔 联系 订阅 管理

上文分析了Heritrix3.1.0系统对请求认证机制的封装,本文接下来分析Heritrix3.1.0系统对cookies处理的封装

Heritrix3.1.0系统提供了CookieStorage接口,用于提供cookies的存储

CookieStorage接口很简单,声明了保存cookies对象的Map容器的方法和获取cookies对象的Map容器的方法

/**
 * Storage abstraction for cookies. Declares one method that returns the
 * sorted map of cookies and one method that saves a map of cookies.
 */
public interface CookieStorage extends Lifecycle {

    /**
     * Returns the sorted map holding the cookies of this storage.
     *
     * @return sorted map of cookies keyed by String
     */
    SortedMap<String,Cookie> getCookiesMap();

    /**
     * Saves the given map of cookies.
     *
     * @param map cookies to save, keyed by String
     */
    void saveCookiesMap(Map<String,Cookie> map);

}

抽象类AbstractCookieStorage实现了CookieStorage接口,用于为具体实现类提供公用模板 

/**
 * Common template for {@link CookieStorage} implementations.
 *
 * <p>On {@link #start()} the subclass-provided map ({@link #prepareMap()}) is
 * populated from the optional {@code cookiesLoadFile}. On
 * {@link #saveCookiesMap(Map)} the subclass hook
 * {@link #innerSaveCookiesMap(Map)} is invoked and, when a
 * {@code cookiesSaveFile} is configured, the cookies are also written to that
 * file.
 *
 * <p>Cookie files hold one cookie per line as 7 tab-delimited fields, in this
 * order: domain, domain-attribute-specified flag, path, secure flag,
 * expiration in epoch seconds (a negative value means no expiry date), name,
 * value. Blank lines and lines starting with {@code #} are skipped.
 *
 * <p>NOTE(review): {@code FileReader} and {@code String.getBytes()} use the
 * platform default charset here; consider pinning an explicit charset if
 * cookie files are moved between machines.
 */
public abstract class AbstractCookieStorage 
    implements CookieStorage, 
               Lifecycle, // InitializingBean, 
               Closeable {

    private static final Logger LOGGER = 
        Logger.getLogger(AbstractCookieStorage.class.getName());

    /** Number of tab-delimited fields in one cookie line. */
    private static final int COOKIE_FIELD_COUNT = 7;

    /**
     * Matches blank lines and '#' comment lines. Compiled once instead of
     * once per input line (which is what String.matches would do).
     */
    private static final java.util.regex.Pattern BLANK_OR_COMMENT =
        java.util.regex.Pattern.compile("\\s*(?:#.*)?");

    /** Cookies configuration file (used for loading); may be null. */
    protected ConfigFile cookiesLoadFile = null;
    public ConfigFile getCookiesLoadFile() {
        return cookiesLoadFile;
    }
    public void setCookiesLoadFile(ConfigFile cookiesLoadFile) {
        this.cookiesLoadFile = cookiesLoadFile;
    }

    /** Cookies file path (used for saving); may be null. */
    protected ConfigPath cookiesSaveFile = null;
    public ConfigPath getCookiesSaveFile() {
        return cookiesSaveFile;
    }
    public void setCookiesSaveFile(ConfigPath cookiesSaveFile) {
        this.cookiesSaveFile = cookiesSaveFile;
    }

    /** Set by {@link #start()}, cleared by {@link #stop()}. */
    boolean isRunning = false; 

    /**
     * Initializes the storage: asks the subclass for its map and, when a load
     * file is configured, fills that map from the file. Does nothing when
     * already running.
     */
    @Override
    public void start() {
        if(isRunning()) {
            return;
        }
        SortedMap<String,Cookie> cookies = prepareMap();
        if (getCookiesLoadFile()!=null) {
            // load cookies from the configured cookies file into the map
            loadCookies(getCookiesLoadFile(), cookies);
        }
        isRunning = true; 
    }

    @Override
    public boolean isRunning() {
        return isRunning;
    }

    @Override
    public void stop() {
        isRunning = false; 
    }

    /**
     * Creates/initializes the cookie map; implemented by concrete subclasses.
     *
     * @return the map that {@link #start()} populates from the load file
     */
    protected abstract SortedMap<String,Cookie> prepareMap();    
    
    /**
     * Loads cookies from the given reader into the given map, keyed by
     * {@code Cookie.getSortKey()}.
     *
     * <p>Malformed lines (wrong field count or a non-numeric expiration) are
     * logged at WARNING and skipped; the remaining lines are still loaded.
     * An {@link IOException} while reading is logged and ends the load. The
     * reader is not closed here; that is left to the caller.
     *
     * @param reader source of cookie lines
     * @param cookies map receiving the parsed cookies
     */
    public static void loadCookies(Reader reader,
            SortedMap<String, Cookie> cookies) {
        BufferedReader br = new BufferedReader(reader);
        try {
            String line;
            int lineNo = 1;
            while ((line = br.readLine()) != null) {
                if (!BLANK_OR_COMMENT.matcher(line).matches()) { // skip blank lines and comments
                    Cookie cookie = parseCookieLine(line, lineNo);
                    if (cookie != null) {
                        // avoid building the message unless it will be emitted
                        if (LOGGER.isLoggable(Level.FINE)) {
                            LOGGER.fine("Adding cookie: domain " + cookie.getDomain() + " cookie " + cookie.toExternalForm());
                        }
                        cookies.put(cookie.getSortKey(), cookie);
                    }
                }
                
                lineNo++;
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING,e.getMessage(), e);
        }
    }

    /**
     * Parses one non-blank, non-comment cookie line.
     *
     * @param line the raw input line
     * @param lineNo 1-based line number, used only in warning messages
     * @return the parsed cookie, or null when the line is malformed (a
     *         warning has then already been logged)
     */
    private static Cookie parseCookieLine(String line, int lineNo) {
        String[] tokens = line.split("\\t");
        // String.split drops trailing empty strings, so a cookie that
        // saveCookies wrote with an empty value arrives as six tokens
        // followed by a tab. Restore the empty value so such files reload.
        if (tokens.length == COOKIE_FIELD_COUNT - 1 && line.endsWith("\t")) {
            String[] padded = new String[COOKIE_FIELD_COUNT];
            System.arraycopy(tokens, 0, padded, 0, tokens.length);
            padded[COOKIE_FIELD_COUNT - 1] = "";
            tokens = padded;
        }
        if (tokens.length != COOKIE_FIELD_COUNT) {
            LOGGER.warning("cookies input line " + lineNo + " invalid, expected 7 tab-delimited tokens");
            return null;
        }

        long epochSeconds;
        try {
            epochSeconds = Long.parseLong(tokens[4]);
        } catch (NumberFormatException e) {
            // one bad expiration field must not abort the whole load
            LOGGER.warning("cookies input line " + lineNo
                    + " invalid, expiration '" + tokens[4] + "' is not a number");
            return null;
        }
        // negative expiration means "no expiry date"
        Date expirationDate = (epochSeconds >= 0 ? new Date(epochSeconds * 1000) : null);
        Cookie cookie = new Cookie(tokens[0], tokens[5],
                tokens[6], tokens[2], expirationDate, 
                Boolean.valueOf(tokens[3]).booleanValue());
        cookie.setDomainAttributeSpecified(Boolean.valueOf(tokens[1]).booleanValue());
        return cookie;
    }

    /**
     * Loads cookies from a configuration file into the given map. The reader
     * obtained from the file is always closed.
     *
     * @param file configuration file to read cookie lines from
     * @param cookies map receiving the parsed cookies
     */
    protected static void loadCookies(ConfigFile file,
            SortedMap<String, Cookie> cookies) {
        
        Reader reader = null;
        try {
            reader = file.obtainReader();
            loadCookies(reader, cookies);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Loads cookies from the file at the given path into the given map.
     * Does nothing when the path is null or empty; a missing file is logged
     * at WARNING.
     *
     * @param cookiesFile path of the cookies file, may be null or empty
     * @param result map receiving the parsed cookies
     */
    public static void loadCookies(String cookiesFile, 
            SortedMap<String,Cookie> result) {

        // Do nothing if cookiesFile is not specified.
        if (cookiesFile == null || cookiesFile.length() <= 0) {
            return;
        }
        
        FileReader reader = null;
        try {
            reader = new FileReader(cookiesFile);
            loadCookies(reader, result);
        } catch (FileNotFoundException e) {
            LOGGER.log(Level.WARNING,"Could not find file: " + cookiesFile, e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Writes the cookies of the given map to a file, one line per cookie in
     * the same 7-field tab-delimited layout that
     * {@link #loadCookies(Reader, SortedMap)} reads. A missing expiry date is
     * written as -1 and a null value as an empty field. Does nothing when the
     * path is null or empty; an {@link IOException} is logged at SEVERE.
     *
     * @param saveCookiesFile path of the file to write, may be null or empty
     * @param cookies cookies to write
     */
    public static void saveCookies(String saveCookiesFile, Map<String,Cookie> cookies) { 
        // Do nothing if cookiesFile is not specified. 
        if (saveCookiesFile == null || saveCookiesFile.length() <= 0) { 
            return; 
        }
      
        FileOutputStream out = null; 
        try { 
            out = new FileOutputStream(new File(saveCookiesFile)); 
            String tab ="\t"; 
            out.write("# Heritrix Cookie File\n".getBytes()); 
            out.write("# This file is the Netscape cookies.txt format\n\n".getBytes()); 
            for (Cookie cookie: cookies.values()) { 
                // Guess an initial size 
                MutableString line = new MutableString(1024 * 2); 
                line.append(cookie.getDomain()); 
                line.append(tab);
                line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE"); 
                line.append(tab); 
                line.append(cookie.getPath());
                line.append(tab); 
                line.append(cookie.getSecure() ? "TRUE" : "FALSE"); 
                line.append(tab);
                // expiry is stored in epoch seconds; -1 marks "no expiry date"
                line.append(cookie.getExpiryDate() != null ? cookie.getExpiryDate().getTime() / 1000 : -1);
                line.append(tab);
                line.append(cookie.getName());
                line.append(tab);                
                line.append(cookie.getValue() != null ? cookie.getValue() : ""); 
                line.append("\n");
                out.write(line.toString().getBytes()); 
            } 
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Unable to write " + saveCookiesFile, e);
        } finally {
            IOUtils.closeQuietly(out);
        } 
    }

    /**
     * Returns the cookie map; implemented by concrete subclasses.
     */
    @Override
    public abstract SortedMap<String,Cookie> getCookiesMap();

    /**
     * Saves the cookies of the given map: first through the subclass hook,
     * then to the configured save file when one is set.
     *
     * @param map cookies to save
     */
    @Override
    public void saveCookiesMap(Map<String, Cookie> map) {
        // abstract hook implemented by concrete subclasses
        innerSaveCookiesMap(map);
        if (getCookiesSaveFile()!=null) {
            saveCookies(getCookiesSaveFile().getFile().getAbsolutePath(), map);
        }
    }

    /**
     * Storage-specific part of {@link #saveCookiesMap(Map)}; implemented by
     * concrete subclasses.
     *
     * @param map cookies to save
     */
    protected abstract void innerSaveCookiesMap(Map<String,Cookie> map);

    /**
     * No-op in this base class.
     */
    @Override
    public void close() throws IOException {
    }

}

Heritrix3.1.0提供了两个继承类,分别为BdbCookieStorage和SimpleCookieStorage,前者将cookies保存在BDB数据库,后者保存在Map对象里面

BdbCookieStorage类的相关方法如下

/** BDB module that supplies the class catalog and opens the cookie database. */
protected BdbModule bdb;
    /**
     * Sets the BDB module used by this storage. Annotated
     * {@code @Autowired}.
     *
     * @param bdb the BDB module to use
     */
    @Autowired
    public void setBdbModule(BdbModule bdb) {
        this.bdb = bdb;
    }
    
    /** are we a checkpoint recovery? (in which case, reuse stored cookie data?) */
    boolean isCheckpointRecovery = false; 
    
    /** Database name passed to the BDB module when opening the cookie database. */
    public static String COOKIEDB_NAME = "http_cookies";
 
    /** Database handle assigned in prepareMap(). */
    private transient Database cookieDb;
    /** Cookie map built on cookieDb in prepareMap() and returned by getCookiesMap(). */
    private transient StoredSortedMap<String,Cookie> cookies;

    /** No-argument constructor; performs no initialization of its own. */
    public BdbCookieStorage() {
    }

    /**
     * Opens the cookie database through the BDB module and wraps it in a
     * {@code StoredSortedMap} with String keys and serialized {@code Cookie}
     * values. The database is opened non-transactional with creation
     * allowed; {@code isCheckpointRecovery} is forwarded to the open call.
     *
     * @return the database-backed cookie map, also kept in {@code cookies}
     * @throws RuntimeException wrapping any {@code DatabaseException}
     */
    protected SortedMap<String,Cookie> prepareMap() {
        try {
            StoredClassCatalog catalog = bdb.getClassCatalog();

            BdbModule.BdbConfig config = new BdbModule.BdbConfig();
            config.setTransactional(false);
            config.setAllowCreate(true);
            cookieDb = bdb.openDatabase(COOKIEDB_NAME, config, isCheckpointRecovery);

            StringBinding keyBinding = new StringBinding();
            SerialBinding<Cookie> valueBinding =
                new SerialBinding<Cookie>(catalog, Cookie.class);
            // NOTE(review): the trailing 'true' is presumably the
            // write-allowed flag of StoredSortedMap; confirm against the API
            cookies = new StoredSortedMap<String,Cookie>(
                    cookieDb, keyBinding, valueBinding, true);
            return cookies;
        } catch (DatabaseException dbFailure) {
            throw new RuntimeException(dbFailure);
        }
    }

    /**
     * Returns the cookie map created by prepareMap(). The null check is
     * commented out, so this returns null if prepareMap() has not assigned
     * the field yet.
     *
     * @return the stored cookie map, possibly null
     */
    public SortedMap<String, Cookie> getCookiesMap() {
//        assert cookies != null : "cookie map not set up";
        return cookies;
    }

    /**
     * Storage-specific save hook; empty in this class.
     * NOTE(review): presumably nothing is needed because the map is backed
     * by the BDB database; confirm.
     *
     * @param map cookies to save (unused)
     */
    protected void innerSaveCookiesMap(Map<String, Cookie> map) {
    }

SimpleCookieStorage类与之类似,不在这里贴出来了

这里需要注意的是,Heritrix3.1.0系统改写了HttpClient组件的Cookie类,逻辑与HttpClient组件的Cookie类类似

那么Heritrix3.1.0系统怎样将CookieStorage接口实现类获取的SortedMap<String, Cookie>容器中的Cookies添加到HttpClient组件的相关对象中呢?

Heritrix3.1.0系统还改写了HttpClient组件的HttpState类,添加了设置SortedMap cookiesMap对象的方法,相关方法如下

// Cookie map of this HTTP state; starts as a ConcurrentSkipListMap and can be
// swapped for an external map through setCookiesMap(SortedMap).
private SortedMap cookiesMap = new ConcurrentSkipListMap();
// START IA/HERITRIX ADDITIONS
    /**
     * Returns a sorted map of {@link Cookie cookies} that this HTTP
     * state currently contains.
     * 
     * Any operations on this map should be synchronized with respect 
     * to this HttpState instance.
     * 
     * @return sorted map of {@link Cookie cookies}; the live map, not a copy
     */
    public SortedMap getCookiesMap() {
        return cookiesMap;
    }
    
    /**
     * Replace the standard sorted map with an external implementation 
     * (such as one backed by persistent store, like BDB's StoredSortedMap.)
     * The given map is stored as-is, without copying or a null check.
     * 
     * @param map alternate sorted map to use to store cookies
     */
    public void setCookiesMap(SortedMap map) {
        this.cookiesMap = map;
    }
// END IA/HERITRIX ADDITIONS

同时HttpMethodBase对象相关方法里面从HttpState state对象获取Cookies对象也做了相应的改写 

/**
     * Rebuilds the <tt>Cookie</tt> request headers of this method. Any
     * autogenerated <tt>Cookie</tt> header is removed first; then the
     * {@link Cookie cookie}s of the state's cookie map that match the target
     * host, port, path and security level are added back, either combined
     * into a single header or as one header per cookie, depending on
     * {@link HttpMethodParams#SINGLE_COOKIE_HEADER}.
     *
     * @param state the {@link HttpState state} information associated with this method
     * @param conn the {@link HttpConnection connection} used to execute
     *        this HTTP method
     *
     * @throws IOException if an I/O (transport) error occurs. Some transport exceptions
     *                     can be recovered from.
     * @throws HttpException  if a protocol exception occurs. Usually protocol exceptions 
     *                    cannot be recovered from.
     */
    protected void addCookieRequestHeader(HttpState state, HttpConnection conn)
        throws IOException, HttpException {

        LOG.trace("enter HttpMethodBase.addCookieRequestHeader(HttpState, "
                  + "HttpConnection)");

        // Discard Cookie headers generated by an earlier pass; headers that
        // were set explicitly are left alone.
        for (Header existing : getRequestHeaderGroup().getHeaders("Cookie")) {
            if (existing.isAutogenerated()) {
                getRequestHeaderGroup().removeHeader(existing);
            }
        }

        CookieSpec spec = getCookieSpec(state);

        // A configured virtual host takes precedence over the connection host.
        String virtualHost = this.params.getVirtualHost();
        String targetHost = (virtualHost != null) ? virtualHost : conn.getHost();

        // BEGIN IA/HERITRIX CHANGES
        Cookie[] matched = spec.match(targetHost, conn.getPort(),
            getPath(), conn.isSecure(), state.getCookiesMap());
        // END IA/HERITRIX CHANGES
        if (matched == null || matched.length == 0) {
            return;
        }

        if (getParams().isParameterTrue(HttpMethodParams.SINGLE_COOKIE_HEADER)) {
            // In strict mode put all cookies on the same header
            String combined = spec.formatCookies(matched);
            getRequestHeaderGroup().addHeader(new Header("Cookie", combined, true));
            return;
        }

        // In non-strict mode put each cookie on a separate header
        for (Cookie cookie : matched) {
            String formatted = spec.formatCookie(cookie);
            getRequestHeaderGroup().addHeader(new Header("Cookie", formatted, true));
        }
    }

最后我们怎样在配置文件crawler-beans.cxml配置cookie文件呢,本人做了一个示例

 <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
 <!-- cookiesLoadFile: cookie file read when the storage starts;
      cookiesSaveFile: file the cookies are written to when they are saved. -->
 <bean id="cookieStorage" 
   class="org.archive.modules.fetcher.BdbCookieStorage">
  <property name="cookiesLoadFile"><ref bean="cookieInit"/></property> 
  <property name="cookiesSaveFile"><ref bean="cookieSave"/></property>
  <!-- The setter is setBdbModule(BdbModule), so the bean property name is
       "bdbModule"; a property named "bdb" has no matching setter. -->
  <property name="bdbModule">
        <ref bean="bdb"/>
       </property>
 </bean>
 <!-- Input cookie file (7 tab-separated fields per line). -->
 <bean id="cookieInit" class="org.archive.spring.ConfigFile">
    <property name="name" value="cookie.txt" />
    <property name="path" value="/root/stpl/cookie.txt" />
</bean>
<!-- Output path for the cookie dump. -->
<bean id="cookieSave" class="org.archive.spring.ConfigPath">
    <property name="name" value="cookies_dump.txt" />
    <property name="path" value="/root/stpl/cookies_dump.txt" />
</bean>

cookie.txt文件格式可以参考这段英文注释,这段注释你懂的

* format. Example entry of cookies.txt file:
     * <p>
     * www.archive.org FALSE / FALSE 1311699995 details-visit texts-cralond
     * </p>
     * <p>
     * Each line has 7 tab-separated fields:
     * </p>
     * <ol>
     * <li>DOMAIN: The domain that created and has access to the cookie value.</li>
     * <li>FLAG: A TRUE or FALSE value indicating if hosts within the given
     * domain can access the cookie value.</li>
     * <li>PATH: The path within the domain that the cookie value is valid for.</li>
     * <li>SECURE: A TRUE or FALSE value indicating if to use a secure
     * connection to access the cookie value.</li>
     * <li>EXPIRATION: The expiration time of the cookie value, or -1 for no
     * expiration</li>
     * <li>NAME: The name of the cookie value</li>
     * <li>VALUE: The cookie value</li>
     * </ol>

---------------------------------------------------------------------------

本系列Heritrix 3.1.0 源码解析系本人原创

转载请注明出处 博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/28/3049673.html

posted on 2013-04-28 18:43  刺猬的温驯  阅读(1070)  评论(0编辑  收藏  举报