Jsoup抓取页面的小技巧
前面写了好多jsoup的例子
现在写写小技巧吧
(1) 得到 Document 的方法:
(有时候这个方法不一定能得到 Document,这时可以换用前面介绍的两个方法中的另一个,也可以把 try 块里的 post 方法改成 get)
/**
 * Fetches {@code url} with an HTTP POST and parses the response into a document.
 *
 * <p>The request is retried when the host cannot be resolved or the connection times
 * out, but only up to a fixed number of attempts. Retrying by unbounded recursion would
 * end in a {@code StackOverflowError} for a host that never becomes reachable.
 *
 * @param url address of the page to fetch
 * @return the parsed document, or {@code null} if the page could not be fetched
 */
public static Document readUrlFist(String url) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        Connection conn = Jsoup.connect(url);
        conn.header(
                "User-Agent",
                "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2 Googlebot/2.1");
        try {
            return conn.timeout(200 * 1000).post();
        } catch (IOException e) {
            e.printStackTrace();
            // Only name-resolution failures and timeouts are retried;
            // any other I/O failure gives up immediately.
            boolean retryable = (e instanceof UnknownHostException)
                    || (e instanceof SocketTimeoutException);
            if (!retryable) {
                return null;
            }
        }
    }
    // All attempts failed with a retryable error.
    return null;
}
得到body 的方法
// 读取url得到一级节点 public static Elements readBody(String url) { Document doc = readUrlFist(url); Elements body = doc.select("body"); return body; }
得到某节点下包含链接(a 标签)最多的子节点
// 得到该节点下的孩子最多的那个节点(这个节点就是含有Url的节点) public static Element readChildByMaxNum(Element body) { Elements divOne = body.children(); Element bestElement = divOne.get(0); int best = divOne.get(0).children().size(); for (int i = 0; i < divOne.size(); i++) { Elements divTwo = divOne.get(i).select("a"); int temp = divTwo.size(); if (temp > best) { best = temp; bestElement = divOne.get(i); } } // System.out.println(bestElement.attr("id")); return bestElement; }
得到tagName 最多的节点
// 得到tagName最多的节点 public static List<Element> takeAparentByTagName(List<Element> as, Element bestElement) { List<Element> bestElements = new ArrayList<Element>(); Map<String, Integer> aparent = new HashMap<String, Integer>(); String index = ""; for (Element element : as) { String tag = element.tagName(); if (aparent.containsKey(tag)) { aparent.put(tag, aparent.get(tag) + 1); } else { aparent.put(tag, 1); } } Set<String> keys = aparent.keySet(); Iterator<String> iterable = keys.iterator(); int max = 0; String best = ""; while (iterable.hasNext()) { String key = iterable.next(); if (max < aparent.get(key)) { max = aparent.get(key); best = key; } } index = best; bestElements = bestElement.select(index); return bestElements; }
得到className 的名字和他的数量
// 得到className的名字和他的数量 public static Map<String, Integer> takeMapClass(List<Element> bestElements) { Map<String, Integer> myClass = new HashMap<String, Integer>(); for (int i = 0; i < bestElements.size(); i++) { Element element = bestElements.get(i); String temp = element.className(); int sum = element.children().size(); if (sum == 0) { sum = 1; } if (temp == null || temp.equals("")) { temp = "iiiuuuzzz"; } if (myClass.containsKey(temp)) { myClass.put(temp, myClass.get(temp) + sum); } else { myClass.put(temp, sum); } } // System.out.println("myClass.size() "+myClass.size()); return myClass; }
得到所有节点的父亲节点
/**
 * Collects the parent of every element, in the same order as the input.
 *
 * <p>The result has one entry per input element, so a parent shared by several
 * elements appears several times.
 *
 * @param elements elements whose parents are wanted
 * @return list holding {@code element.parent()} for each input element
 */
public static List<Element> gerFaterPonit(Elements elements) {
    List<Element> parents = new ArrayList<Element>();
    Iterator<Element> it = elements.iterator();
    while (it.hasNext()) {
        parents.add(it.next().parent());
    }
    return parents;
}
得到className数量最多的节点的名字
public static String takeIndexByClassName(List<Element> bestElements) { Map<String, Integer> myClass = takeMapClass(bestElements); Set<String> keys = myClass.keySet(); Iterator<String> iterable = keys.iterator(); int max = 0; String best = ""; while (iterable.hasNext()) { String key = iterable.next(); // System.out.println(key+ myClass.get(key)); if (max < myClass.get(key)) { max = myClass.get(key); best = key; } } String index = best; // System.out.println("index :" +index); return index; }
得到数量第二多的 className 的名称
// 和className数量次多的节点的索引 public static String takeBetterIndexByClassName(List<Element> bestElements) { Map<String, Integer> myClass = takeMapClass(bestElements); String index = takeIndexByClassName(bestElements); String index2 = ""; Set<String> keys = myClass.keySet(); Iterator<String> iterable = keys.iterator(); int max = 0; String best = ""; while (iterable.hasNext()) { String key = iterable.next(); if (!key.equals(index)) { if (max < myClass.get(key)) { max = myClass.get(key); best = key; } } } index2 = best; // System.out.println("index2 :" +index2); return index2; }
// 根据索引得出所要的节点 public static List<Element> getElementByClassName( List<Element> bestElements, String index) { List<Element> elementList = new ArrayList<Element>(); for (Element element : bestElements) { String temp = element.className(); if (temp == null || temp.equals("")) { temp = "iiiuuuzzz"; } if (index.equals(temp)) { elementList.add(element); } } return elementList; }
// 得到孩子节点
/**
 * Flattens the direct children of every given element into one list.
 *
 * @param bestElements parent elements
 * @return all child elements, grouped by parent in input order
 */
public static List<Element> takeChildren(List<Element> bestElements) {
    List<Element> allChildren = new ArrayList<Element>();
    for (int i = 0; i < bestElements.size(); i++) {
        allChildren.addAll(bestElements.get(i).children());
    }
    return allChildren;
}
// 得到和自己内容不同的父亲节点 public static Element getParent(Element element) { Element parent = element.parent(); if (element.siblingElements().size() > 0) { while (parent.text().equals(element.text())) { parent = parent.parent(); } } return parent; }