网络爬虫速成指南（四） URL判重

如果采集量比较小：
布隆过滤器详解原理

如果采集量比较大：
redis：主要是把URL转为md5，作为key来进行判重


关于布隆过滤器器的参数说明，简单点说：
private static BloomFilter<String> bloomFilter = new BloomFilter<String>(2X, X);
这样用就可以了

附：布隆过滤器实现，@author Magnus Skjegstad <magnus@skjegstad.com>

import java.io.Serializable;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.Collection;

/**
 * Implementation of a Bloom-filter, as described here:
 * http://en.wikipedia.org/wiki/Bloom_filter
 * 
 * Inspired by the SimpleBloomFilter-class written by Ian Clarke. This
 * implementation provides a more evenly distributed Hash-function by using a
 * proper digest instead of the Java RNG. Many of the changes were proposed in
 * comments in his blog:
 * http://blog.locut.us/2008/01/12/a-decent-stand-alone-java
 * -bloom-filter-implementation/
 * 
 * @param <E>
 *            Object type that is to be inserted into the Bloom filter, e.g.
 *            String or Integer.
 * @author Magnus Skjegstad <magnus@skjegstad.com>
 */
public class BloomFilter<E> implements Serializable {
    private BitSet bitset;
    private int bitSetSize;
    private double bitsPerElement;
    private int expectedNumberOfFilterElements; // expected (maximum) number of
                                                // elements to be added
    private int numberOfAddedElements; // number of elements actually added to
                                        // the Bloom filter
    private int k; // number of hash functions

    static final Charset charset = Charset.forName("UTF-8"); // encoding used
                                                                // for storing
                                                                // hash values
                                                                // as strings

    static final String hashName = "MD5"; // MD5 gives good enough accuracy in
                                            // most circumstances. Change to
                                            // SHA1 if it's needed
    static final MessageDigest digestFunction;
    static { // The digest method is reused between instances
        MessageDigest tmp;
        try {
            tmp = java.security.MessageDigest.getInstance(hashName);
        } catch (NoSuchAlgorithmException e) {
            tmp = null;
        }
        digestFunction = tmp;
    }

    /**
     * Constructs an empty Bloom filter. The total length of the Bloom filter
     * will be c*n.
     * 
     * @param c
     *            is the number of bits used per element.
     * @param n
     *            is the expected number of elements the filter will contain.
     * @param k
     *            is the number of hash functions used.
     */
    public BloomFilter(double c, int n, int k) {
        this.expectedNumberOfFilterElements = n;
        this.k = k;
        this.bitsPerElement = c;
        this.bitSetSize = (int) Math.ceil(c * n);
        numberOfAddedElements = 0;
        this.bitset = new BitSet(bitSetSize);
    }

    /**
     * Constructs an empty Bloom filter. The optimal number of hash functions
     * (k) is estimated from the total size of the Bloom and the number of
     * expected elements.
     * 
     * @param bitSetSize
     *            defines how many bits should be used in total for the filter.
     * @param expectedNumberOElements
     *            defines the maximum number of elements the filter is expected
     *            to contain.
     */
    public BloomFilter(int bitSetSize, int expectedNumberOElements) {
        this(bitSetSize / (double) expectedNumberOElements,
                expectedNumberOElements, (int) Math
                        .round((bitSetSize / (double) expectedNumberOElements)
                                * Math.log(2.0)));
    }

    /**
     * Constructs an empty Bloom filter with a given false positive probability.
     * The number of bits per element and the number of hash functions is
     * estimated to match the false positive probability.
     * 
     * @param falsePositiveProbability
     *            is the desired false positive probability.
     * @param expectedNumberOfElements
     *            is the expected number of elements in the Bloom filter.
     */
    public BloomFilter(double falsePositiveProbability,
            int expectedNumberOfElements) {
        this(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))
                / Math.log(2), // c = k / ln(2)
                expectedNumberOfElements, (int) Math.ceil(-(Math
                        .log(falsePositiveProbability) / Math.log(2)))); // k =
                                                                            // ceil
                                                                            // (
                                                                            // -
                                                                            // log_2
                                                                            // (
                                                                            // false
                                                                            // prob
                                                                            // .
                                                                            // )
                                                                            // )
    }

    /**
     * Construct a new Bloom filter based on existing Bloom filter data.
     * 
     * @param bitSetSize
     *            defines how many bits should be used for the filter.
     * @param expectedNumberOfFilterElements
     *            defines the maximum number of elements the filter is expected
     *            to contain.
     * @param actualNumberOfFilterElements
     *            specifies how many elements have been inserted into the
     *            <code>filterData</code> BitSet.
     * @param filterData
     *            a BitSet representing an existing Bloom filter.
     */
    public BloomFilter(int bitSetSize, int expectedNumberOfFilterElements,
            int actualNumberOfFilterElements, BitSet filterData) {
        this(bitSetSize, expectedNumberOfFilterElements);
        this.bitset = filterData;
        this.numberOfAddedElements = actualNumberOfFilterElements;
    }

    /**
     * Generates a digest based on the contents of a String.
     * 
     * @param val
     *            specifies the input data.
     * @param charset
     *            specifies the encoding of the input data.
     * @return digest as long.
     */
    public static long createHash(String val, Charset charset) {
        return createHash(val.getBytes(charset));
    }

    /**
     * Generates a digest based on the contents of a String.
     * 
     * @param val
     *            specifies the input data. The encoding is expected to be
     *            UTF-8.
     * @return digest as long.
     */
    public static long createHash(String val) {
        return createHash(val, charset);
    }

    /**
     * Generates a digest based on the contents of an array of bytes.
     * 
     * @param data
     *            specifies input data.
     * @return digest as long.
     */
    public static long createHash(byte[] data) {
        long h = 0;
        byte[] res;

        synchronized (digestFunction) {
            res = digestFunction.digest(data);
        }

        for (int i = 0; i < 4; i++) {
            h <<= 8;
            h |= ((int) res[i]) & 0xFF;
        }
        return h;
    }

    /**
     * Compares the contents of two instances to see if they are equal.
     * 
     * @param obj
     *            is the object to compare to.
     * @return True if the contents of the objects are equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        final BloomFilter<E> other = (BloomFilter<E>) obj;
        if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) {
            return false;
        }
        if (this.k != other.k) {
            return false;
        }
        if (this.bitSetSize != other.bitSetSize) {
            return false;
        }
        if (this.bitset != other.bitset
                && (this.bitset == null || !this.bitset.equals(other.bitset))) {
            return false;
        }
        return true;
    }

    /**
     * Calculates a hash code for this class.
     * 
     * @return hash code representing the contents of an instance of this class.
     */
    @Override
    public int hashCode() {
        int hash = 7;
        hash = 61 * hash + (this.bitset != null ? this.bitset.hashCode() : 0);
        hash = 61 * hash + this.expectedNumberOfFilterElements;
        hash = 61 * hash + this.bitSetSize;
        hash = 61 * hash + this.k;
        return hash;
    }

    /**
     * Calculates the expected probability of false positives based on the
     * number of expected filter elements and the size of the Bloom filter. <br
     * /><br /> The value returned by this method is the <i>expected</i> rate of
     * false positives, assuming the number of inserted elements equals the
     * number of expected elements. If the number of elements in the Bloom
     * filter is less than the expected value, the true probability of false
     * positives will be lower.
     * 
     * @return expected probability of false positives.
     */
    public double expectedFalsePositiveProbability() {
        return getFalsePositiveProbability(expectedNumberOfFilterElements);
    }

    /**
     * Calculate the probability of a false positive given the specified number
     * of inserted elements.
     * 
     * @param numberOfElements
     *            number of inserted elements.
     * @return probability of a false positive.
     */
    public double getFalsePositiveProbability(double numberOfElements) {
        // (1 - e^(-k * n / m)) ^ k
        return Math.pow((1 - Math.exp(-k * (double) numberOfElements
                / (double) bitSetSize)), k);

    }

    /**
     * Get the current probability of a false positive. The probability is
     * calculated from the size of the Bloom filter and the current number of
     * elements added to it.
     * 
     * @return probability of false positives.
     */
    public double getFalsePositiveProbability() {
        return getFalsePositiveProbability(numberOfAddedElements);
    }

    /**
     * Returns the value chosen for K.<br /> <br /> K is the optimal number of
     * hash functions based on the size of the Bloom filter and the expected
     * number of inserted elements.
     * 
     * @return optimal k.
     */
    public int getK() {
        return k;
    }

    /**
     * Sets all bits to false in the Bloom filter.
     */
    public void clear() {
        bitset.clear();
        numberOfAddedElements = 0;
    }

    /**
     * Adds an object to the Bloom filter. The output from the object's
     * toString() method is used as input to the hash functions.
     * 
     * @param element
     *            is an element to register in the Bloom filter.
     */
    public void add(E element) {
        long hash;
        String valString = element.toString();
        for (int x = 0; x < k; x++) {
            hash = createHash(valString + Integer.toString(x));
            hash = hash % (long) bitSetSize;
            bitset.set(Math.abs((int) hash), true);
        }
        numberOfAddedElements++;
    }

    /**
     * Adds all elements from a Collection to the Bloom filter.
     * 
     * @param c
     *            Collection of elements.
     */
    public void addAll(Collection<? extends E> c) {
        for (E element : c)
            add(element);
    }

    /**
     * Returns true if the element could have been inserted into the Bloom
     * filter. Use getFalsePositiveProbability() to calculate the probability of
     * this being correct.
     * 
     * @param element
     *            element to check.
     * @return true if the element could have been inserted into the Bloom
     *         filter.
     */
    public boolean contains(E element) {
        long hash;
        String valString = element.toString();
        for (int x = 0; x < k; x++) {
            hash = createHash(valString + Integer.toString(x));
            hash = hash % (long) bitSetSize;
            if (!bitset.get(Math.abs((int) hash)))
                return false;
        }
        return true;
    }

    /**
     * Returns true if all the elements of a Collection could have been inserted
     * into the Bloom filter. Use getFalsePositiveProbability() to calculate the
     * probability of this being correct.
     * 
     * @param c
     *            elements to check.
     * @return true if all the elements in c could have been inserted into the
     *         Bloom filter.
     */
    public boolean containsAll(Collection<? extends E> c) {
        for (E element : c)
            if (!contains(element))
                return false;
        return true;
    }

    /**
     * Read a single bit from the Bloom filter.
     * 
     * @param bit
     *            the bit to read.
     * @return true if the bit is set, false if it is not.
     */
    public boolean getBit(int bit) {
        return bitset.get(bit);
    }

    /**
     * Set a single bit in the Bloom filter.
     * 
     * @param bit
     *            is the bit to set.
     * @param value
     *            If true, the bit is set. If false, the bit is cleared.
     */
    public void setBit(int bit, boolean value) {
        bitset.set(bit, value);
    }

    /**
     * Return the bit set used to store the Bloom filter.
     * 
     * @return bit set representing the Bloom filter.
     */
    public BitSet getBitSet() {
        return bitset;
    }

    /**
     * Returns the number of bits in the Bloom filter. Use count() to retrieve
     * the number of inserted elements.
     * 
     * @return the size of the bitset used by the Bloom filter.
     */
    public int size() {
        return this.bitSetSize;
    }

    /**
     * Returns the number of elements added to the Bloom filter after it was
     * constructed or after clear() was called.
     * 
     * @return number of elements added to the Bloom filter.
     */
    public int count() {
        return this.numberOfAddedElements;
    }

    /**
     * Returns the expected number of elements to be inserted into the filter.
     * This value is the same value as the one passed to the constructor.
     * 
     * @return expected number of elements.
     */
    public int getExpectedNumberOfElements() {
        return expectedNumberOfFilterElements;
    }

    /**
     * Get expected number of bits per element when the Bloom filter is full.
     * This value is set by the constructor when the Bloom filter is created.
     * See also getBitsPerElement().
     * 
     * @return expected number of bits per element.
     */
    public double getExpectedBitsPerElement() {
        return this.bitsPerElement;
    }

    /**
     * Get actual number of bits per element based on the number of elements
     * that have currently been inserted and the length of the Bloom filter. See
     * also getExpectedBitsPerElement().
     * 
     * @return number of bits per element.
     */
    public double getBitsPerElement() {
        return this.bitSetSize / (double) numberOfAddedElements;
    }
}

package com.lietu.show;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Properties;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

//提取商品信息
public class ExtractProduct {

    static BloomFilter<String> urlSeen = new BloomFilter<String>(200000, 20000);

    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        String url = "http://www.xiu.com/about/classifymap.shtml";
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("a[href]"); // 带有href属性的a标签
        for (Element link : links) { // 遍历每个链接
            String linkHref = link.attr("href"); // 得到href属性中的值，也就是url地址
            if (linkHref.startsWith("http://list.xiu.com/")
                    || linkHref.startsWith("http://brand.xiu.com/")) {
                linkHref = getListURL(linkHref);
                if (urlSeen.contains(linkHref))
                    continue;
                // System.out.println(linkHref); //输出url地址和锚点上的文字说明
                extractList(linkHref);
            }
        }
    }
    
    public static String getListURL(String oldURL){
        ParseURL splitURL = new ParseURL(oldURL);
        if(splitURL.searchparms==null){
            return splitURL.baseURL;
        }
        //System.out.println(splitURL.baseURL);
        String curPage = splitURL.searchparms.get("currentPage");
        // http://list.xiu.com/20069.html?currentPage=2
        if (curPage != null)
            //System.out.println(splitURL.baseURL + "?currentPage=" + curPage);
            return splitURL.baseURL + "?currentPage=" + curPage;
        else{
            //System.out.println(splitURL.baseURL);
            return splitURL.baseURL;
        }
    }

    //处理目录页
    // http://list.xiu.com/19757.html
    // http://brand.xiu.com/3117.html
    public static void extractList(String url) {
        System.out.println("List url:" + url);
        urlSeen.add(url);
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (Exception e) {
            try {
                doc = Jsoup.connect(url).get();
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
                return;
            }
        }

        Elements links = doc.select("a[href]"); // 带有href属性的a标签
        for (Element link : links) { // 遍历每个链接
            String linkHref = link.attr("href"); // 得到href属性中的值，也就是url地址
            if (linkHref.startsWith("http://list.xiu.com/")
                    || linkHref.startsWith("http://brand.xiu.com/")) {
                linkHref = getListURL(linkHref);
                if (urlSeen.contains(linkHref))
                    continue;
                extractList(linkHref); // 输出url地址和锚点上的文字说明
            } else if (linkHref.startsWith("http://item.xiu.com/product/")) { // "http://item.xiu.com/product/0359097.shtml"
                if (urlSeen.contains(linkHref))
                    continue;
                getProduct(linkHref);
            }
        }
    }

    //处理详细页
    public static void getProduct(String url) {
        System.out.println("Product url:" + url);
        urlSeen.add(url);
        // String url = "http://item.xiu.com/product/0359097.shtml";
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (Exception e) {
            try{
                doc = Jsoup.connect(url).get();
            }
            catch (Exception e2) {
                try {
                    doc = Jsoup.connect(url).get();
                } catch (IOException e1) {
                    // TODO Auto-generated catch block
                    e1.printStackTrace();
                    return;
                }
            }
        }
        Elements links = doc.select("div.p_title"); // 带有href属性的a标签
        String name = links.get(0).childNode(1).childNode(0).toString();
        // System.out.println(name);

        Element link = doc.select("div.conlist").get(2);
        String desc = link.text();
        // System.out.println(link.text());

        Element img = doc.select("#imgPic").first();
        String imgSrc = img.attr("src");
        // System.out.println(img.attr("src"));
        
        Element thumb = doc.select("img[onload]").first();
        String thumbSrc = thumb.attr("src");
        //System.out.println(thumbSrc);
        
        insertDatabase(url,name,desc,imgSrc,thumbSrc);
    }

    public static void insertDatabase(String url, String name, String desc,
            String img,String thumbSrc) {
        Connection conn = null;
        PreparedStatement stmt2 = null;
        try {
            Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
            Properties p = new Properties(); 
            p.put("charSet", "gb2312");
            conn = DriverManager
                    .getConnection("jdbc:odbc:driver={Microsoft Access Driver (*.mdb)};DBQ=good.mdb",p);

            stmt2 = conn
                    .prepareStatement("insert into good(url,productName,productDesc,img,thumb)values(?,?,?,?,?)");
            stmt2.setString(1, url);
            stmt2.setString(2, name);
            stmt2.setString(3, desc);
            stmt2.setString(4, img);
            stmt2.setString(5, thumbSrc);
            stmt2.executeUpdate();
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            try {
                if (stmt2 != null)
                    stmt2.close();
                if (conn != null)
                    conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

给定误算率p，插入元素个数n，那么最优位数组大小为：

Hash函数个数选取最优数目为：


python 实现：

import math

N = 30000000000.0
P = 0.001

def get_len_arr(n, p):
    return -(n * math.log(p) / (math.log(2) ** 2))


def get_k(p):
    return - (math.log(p) / math.log(2))

m = get_len_arr(N, P)

print m, 'bit'

g = m / (8 * 1024 * 1024 * 1024)

print g, 'GB'

k = get_k(P)

print k, 'ge hash hanshu'
=========================
4.31327626982e+11 bit
50.2131445079 GB       300亿，50G内存。
9.96578428466 ge hash hanshu

posted on 2012-12-14 15:39 雨渐渐阅读(1004) 评论(0) 编辑收藏举报

刷新页面返回顶部

雨渐渐

网络爬虫速成指南（四） URL判重

导航

公告

雨渐渐

网络爬虫速成指南 （四） URL判重

导航

公告

网络爬虫速成指南（四） URL判重