Jsoup抓取页面的小技巧

前面写了好多jsoup的例子

 现在写写小技巧吧

(1) 得到 document 的方法

(这个方法有时不一定能得到 document;这时可以改用前面介绍的两个方法中的另一个,也可以把 try 块里的 post() 调用改成 get()。)

    /**
     * Requests {@code url} with an HTTP POST and parses the response into a jsoup
     * {@link Document}.
     *
     * <p>Unknown-host and socket-timeout failures are retried, but only for a fixed number of
     * attempts, so a permanently unreachable host cannot cause endless retrying (or a stack
     * overflow). Any other {@link IOException} ends the attempt immediately.
     *
     * @param url the URL to request
     * @return the parsed document, or {@code null} if the page could not be fetched
     */
    public static Document readUrlFist(String url) {
        // Upper bound on attempts when the failure is one of the retryable kinds.
        final int maxAttempts = 3;
        // Value handed to Connection.timeout(int): 200 * 1000 milliseconds.
        final int timeoutMillis = 200 * 1000;

        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            // Build a fresh connection for every attempt.
            Connection conn = Jsoup.connect(url);
            conn
                    .header(
                            "User-Agent",
                            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2 Googlebot/2.1");
            try {
                return conn.timeout(timeoutMillis).post();
            } catch (IOException e) {
                e.printStackTrace();
                boolean retryable = (e instanceof UnknownHostException)
                        || (e instanceof SocketTimeoutException);
                if (!retryable) {
                    // Not a transient failure: retrying would not help.
                    return null;
                }
            }
        }
        // Every attempt failed with a retryable error.
        return null;
    }

 得到body 的方法

    /**
     * Loads {@code url} through {@code readUrlFist} and selects the {@code body} element(s)
     * of the resulting document.
     *
     * @param url the page to load
     * @return the elements matching the {@code body} selector, or an empty {@code Elements}
     *         when the page could not be fetched
     */
    public static Elements readBody(String url) {
        Document doc = readUrlFist(url);
        if (doc == null) {
            // readUrlFist returns null on failure; avoid dereferencing it.
            return new Elements();
        }
        return doc.select("body");
    }

得到某节点下孩子最多的节点

    /**
     * Returns the direct child of {@code body} under which the selector {@code "a"} matches
     * the most elements (the child presumed to hold the page's links).
     *
     * <p>The same metric (number of {@code "a"} matches) is used for every child, including
     * the first one. When several children share the highest count, the earliest one wins.
     *
     * @param body the element whose direct children are compared
     * @return the child with the most {@code "a"} matches, or {@code null} when {@code body}
     *         has no child elements
     */
    public static Element readChildByMaxNum(Element body) {
        Element bestElement = null;
        // Start below any real count so the first child always becomes the initial candidate.
        int best = -1;
        for (Element child : body.children()) {
            int linkCount = child.select("a").size();
            if (linkCount > best) {
                best = linkCount;
                bestElement = child;
            }
        }
        return bestElement;
    }

得到tagName 最多的节点

// 得到tagName最多的节点
    /**
     * Determines which tag name occurs most often among {@code as}, then returns every
     * element that {@code bestElement.select(...)} finds for that tag name.
     *
     * @param as          the elements whose tag names are counted
     * @param bestElement the element that the winning tag name is selected against
     * @return the elements matched by the most frequent tag name; an empty list when
     *         {@code as} is {@code null} or empty (there is no tag name to select with)
     */
    public static List<Element> takeAparentByTagName(List<Element> as,
            Element bestElement) {
        if (as == null || as.isEmpty()) {
            // Without any element there is no tag name, and an empty selector is invalid.
            return new ArrayList<Element>();
        }

        // Count how many times each tag name appears.
        Map<String, Integer> tagCounts = new HashMap<String, Integer>();
        for (Element element : as) {
            String tag = element.tagName();
            Integer seen = tagCounts.get(tag);
            tagCounts.put(tag, seen == null ? 1 : seen + 1);
        }

        // Keep the tag name with the strictly highest count.
        String mostCommonTag = "";
        int max = 0;
        for (Map.Entry<String, Integer> entry : tagCounts.entrySet()) {
            if (max < entry.getValue()) {
                max = entry.getValue();
                mostCommonTag = entry.getKey();
            }
        }
        return bestElement.select(mostCommonTag);
    }

得到className 的名字和他的数量

// 得到className的名字和他的数量
    /**
     * Builds a map from class name to an accumulated weight over {@code bestElements}.
     *
     * <p>Each element contributes its number of child elements, or 1 when it has none.
     * Elements whose class name is empty are grouped under the placeholder key
     * {@code "iiiuuuzzz"}.
     *
     * @param bestElements the elements to tally
     * @return a map of class name (or the placeholder) to the summed weight
     */
    public static Map<String, Integer> takeMapClass(List<Element> bestElements) {
        Map<String, Integer> weightByClass = new HashMap<String, Integer>();
        for (Element node : bestElements) {
            String key = node.className();
            if (key == null || key.length() == 0) {
                // Placeholder bucket for elements without a class name.
                key = "iiiuuuzzz";
            }
            // A childless element still counts once.
            int weight = Math.max(node.children().size(), 1);
            Integer running = weightByClass.get(key);
            weightByClass.put(key, running == null ? weight : running + weight);
        }
        return weightByClass;
    }

得到所有节点的父亲节点

    /**
     * Collects the parent of every element in {@code elements}, in the same order.
     *
     * <p>Whatever {@code parent()} returns is added as-is, so the result has one entry per
     * input element. NOTE(review): jsoup presumably returns {@code null} for an element
     * without a parent; such an entry would be stored unchanged - confirm against callers.
     *
     * @param elements the elements whose parents are wanted
     * @return a new list holding the parent of each input element
     */
    public static List<Element> gerFaterPonit(Elements elements) {
        List<Element> parents = new ArrayList<Element>();
        for (int i = 0; i < elements.size(); i++) {
            parents.add(elements.get(i).parent());
        }
        return parents;
    }

 

得到className数量最多的节点的名字

/**
 * Returns the class name with the highest weight according to {@code takeMapClass}.
 *
 * @param bestElements the elements to tally
 * @return the class-name key with the strictly largest weight, or {@code ""} when the
 *         tally is empty
 */
public static String takeIndexByClassName(List<Element> bestElements) {
    String topName = "";
    int topWeight = 0;
    for (Map.Entry<String, Integer> entry : takeMapClass(bestElements).entrySet()) {
        int weight = entry.getValue();
        if (weight > topWeight) {
            topWeight = weight;
            topName = entry.getKey();
        }
    }
    return topName;
}

 得到className 数量居第二的名称

// 和className数量次多的节点的索引
    /**
     * Returns the class name with the second-highest weight: the heaviest key in the
     * {@code takeMapClass} tally once the key chosen by {@code takeIndexByClassName} is
     * left out.
     *
     * @param bestElements the elements to tally
     * @return the runner-up class-name key, or {@code ""} when no other key exists
     */
    public static String takeBetterIndexByClassName(List<Element> bestElements) {
        Map<String, Integer> weights = takeMapClass(bestElements);
        String winner = takeIndexByClassName(bestElements);

        String runnerUp = "";
        int runnerUpWeight = 0;
        for (Map.Entry<String, Integer> entry : weights.entrySet()) {
            String name = entry.getKey();
            if (name.equals(winner)) {
                // Skip the overall winner; only the remaining keys compete.
                continue;
            }
            int weight = entry.getValue();
            if (weight > runnerUpWeight) {
                runnerUpWeight = weight;
                runnerUp = name;
            }
        }
        return runnerUp;
    }
// 根据索引得出所要的节点
    /**
     * Filters {@code bestElements} down to those whose class name equals {@code index}.
     *
     * <p>An element with an empty class name is treated as having the placeholder name
     * {@code "iiiuuuzzz"}, so passing that placeholder selects the class-less elements.
     *
     * @param bestElements the candidates to filter
     * @param index        the class name (or placeholder) to keep
     * @return a new list with the matching elements, in their original order
     */
    public static List<Element> getElementByClassName(
            List<Element> bestElements, String index) {
        List<Element> matches = new ArrayList<Element>();
        for (int i = 0; i < bestElements.size(); i++) {
            Element candidate = bestElements.get(i);
            String name = candidate.className();
            String key = (name == null || name.length() == 0) ? "iiiuuuzzz" : name;
            if (index.equals(key)) {
                matches.add(candidate);
            }
        }
        return matches;
    }

// 得到孩子节点

/**
 * Flattens the child elements of every element in {@code bestElements} into one list.
 *
 * @param bestElements the elements whose children are gathered
 * @return a new list with all children, grouped by parent in input order
 */
public static List<Element> takeChildren(List<Element> bestElements) {
    List<Element> collected = new ArrayList<Element>();
    for (Element parent : bestElements) {
        collected.addAll(parent.children());
    }
    return collected;
}
// 得到和自己内容不同的父亲节点
    /**
     * Returns an ancestor of {@code element} whose text differs from the element's own text.
     *
     * <p>When the element has no sibling elements, its direct parent is returned unchanged.
     * Otherwise the method climbs while the current ancestor's text equals the element's
     * text. The climb stops at the topmost ancestor instead of stepping past it, so a chain
     * of ancestors that all share the same text no longer ends in a
     * {@code NullPointerException}.
     *
     * @param element the starting element
     * @return the first ancestor with different text; the topmost ancestor if every
     *         ancestor has the same text; {@code null} if {@code element} has no parent
     */
    public static Element getParent(Element element) {
        Element parent = element.parent();
        if (parent == null) {
            // No parent at all: nothing to climb.
            return null;
        }
        if (element.siblingElements().size() > 0) {
            // The element's text does not change while climbing, so read it once.
            String ownText = element.text();
            while (parent.text().equals(ownText)) {
                Element grandParent = parent.parent();
                if (grandParent == null) {
                    // Reached the top of the tree; keep the last real ancestor.
                    break;
                }
                parent = grandParent;
            }
        }
        return parent;
    }

 

posted @ 2013-01-29 10:34  杨桃  阅读(672)  评论(0)  编辑  收藏  举报