// Crawl URLs from a page in Java and de-duplicate them with a Bloom filter.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlSpider {
    /**
     * Simple single-page crawler: downloads one page, extracts cncn.com links
     * with a regex, and prints them after Bloom-filter de-duplication.
     */

    // Page to crawl.
    private static String url = "https://www.cncn.com/";
    // Accumulates the downloaded HTML (shared buffer; one crawl per JVM run).
    private static StringBuffer sb = new StringBuffer("");

    public static void main(String[] args) throws Exception {
        String[] urls = urlSpider(url);
        MyBloomFilter filter = new MyBloomFilter();
        int num = 0;
        for (String s : urls) {
            if (!filter.contain(s)) {
                // BUG FIX: the original never called add(), so the filter stayed
                // empty, contain() always returned false, and nothing was
                // actually de-duplicated.
                filter.add(s);
                System.out.println(s);
                num++;
            }
        }
        System.out.println();
        System.out.println("用布隆算法去重后该页面url的数量为: "+num);

    }

    /**
     * Downloads the page at {@code urlAddress} and returns the URLs found in it.
     *
     * @param urlAddress absolute URL of the page to fetch
     * @return the extracted URLs (may be empty, never null)
     * @throws Exception on any network or I/O failure
     */
    public static String[] urlSpider(String urlAddress) throws Exception {
        URL url = new URL(urlAddress);
        // Open the connection and present a browser User-Agent so the site
        // does not reject the request as a bot.
        URLConnection connection = url.openConnection();
        connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36");
        // BUG FIX: the reader was never closed; try-with-resources guarantees
        // the stream is released even if readLine() throws.
        // The page declares GBK encoding, so decode with that charset.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"))) {
            String line;
            // readLine() returns null at EOF; checking != null avoids the NPE
            // that calling .length() on the result would cause.
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
        }
        return getUrl(sb);
    }

    /**
     * Extracts cncn.com links from the HTML via regex.
     *
     * @param sb the page HTML
     * @return every {@code href} whose anchor text is Chinese (may be empty)
     */
    public static String[] getUrl(StringBuffer sb) {
        // BUG FIX: the original pattern used [(https)|(http)]+ which is a
        // character class (any run of the letters h/t/p/s and '(' ')' '|'),
        // not an alternation, and its unescaped dots matched any character.
        // "https?" with escaped dots matches exactly http:// or https://.
        String regex = "<a href=\"(https?://www\\.cncn\\.com/[^\\\"]+)\"[^>]*>[\\u4e00-\\u9fa5]+</a>";
        Pattern pattern = Pattern.compile(regex);
        Matcher urlMatcher = pattern.matcher(sb);
        // Collect into a List instead of Arrays.copyOf per match (was O(n^2)).
        List<String> urls = new ArrayList<>();
        while (urlMatcher.find()) {
            urls.add(urlMatcher.group(1));
        }
        return urls.toArray(new String[0]);
    }
}

/**
 * Bloom filter for URL de-duplication.
 *
 * <p>Seven independent hash functions (one per prime seed) set/test bits in a
 * 2^26-bit set. {@link #contain(String)} may return false positives (the
 * usual Bloom-filter trade-off) but never false negatives.
 */
class MyBloomFilter {
    // Bit capacity. 2 << 25 == 2^26, comfortably inside int range; it is a
    // power of two so MyHash can reduce with a mask instead of a modulo.
    private final static int DEFAULT_SIZE = 2 << 25;
    // BUG FIX: bitSet and func were static, so every MyBloomFilter instance
    // shared one bit set and "remembered" URLs added through other instances.
    // They are now per-instance state.
    private final BitSet bitSet = new BitSet(DEFAULT_SIZE);
    // Prime seeds; one hash function is built per seed.
    private static final int[] SEEDS = {5, 7, 11, 13, 31, 37, 61};
    // The hash functions, one per seed.
    private final MyHash[] func = new MyHash[SEEDS.length];

    public MyBloomFilter() {
        // Build the hash-function family.
        for (int i = 0; i < SEEDS.length; i++) {
            func[i] = new MyHash(DEFAULT_SIZE, SEEDS[i]);
        }
    }

    /**
     * Marks the URL in the bit set; a null URL is ignored.
     *
     * @param url the URL to record
     */
    public void add(String url) {
        if (url != null) {
            for (MyHash f : func) {
                // Set the bit at each hash position.
                bitSet.set(f.hash(url), true);
            }
        }
    }

    /**
     * Tests whether the URL was (probably) added before.
     *
     * @param url the URL to test; null is never considered present
     * @return true if every hash position is set (possible false positive),
     *         false if any position is clear (definitely never added)
     */
    public boolean contain(String url) {
        if (url == null) {
            return false;
        }
        for (MyHash f : func) {
            // One clear bit proves the URL was never added; stop early.
            // (Same result as the original flag && ... accumulation.)
            if (!bitSet.get(f.hash(url))) {
                return false;
            }
        }
        return true;
    }

    /**
     * One hash function: a polynomial rolling hash over the string's chars
     * with a per-instance seed multiplier, reduced into [0, cap) by masking.
     */
    public static class MyHash {
        // Bit-set capacity; must be a power of two for the (cap - 1) mask.
        private final int cap;
        // Prime multiplier that differentiates this function from its siblings.
        private final int seed;

        public MyHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        public int hash(String url) {
            int h = 0;
            for (int i = 0; i < url.length(); i++) {
                h = h * seed + url.charAt(i);
            }
            // Mask keeps the index non-negative and below cap.
            return (cap - 1) & h;
        }
    }
}

// Source note: posted on 2019-02-28 15:29 by 一支兰州 (blog-scrape footer;
// kept as a comment so it no longer breaks compilation).