Jsoup应用对比测试
import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import org.jsoup.select.Selector.SelectorParseException; public class ICBCategoryTest { public void print(String s) { System.out.println(s); } public String[] ICB(String URL) throws IOException { String[] result = {"","","","","","","","","","","","",""}; try { Document doc = Jsoup.connect(URL).timeout(120000).get(); // title try { Elements title = doc.select("title"); result[0] = title.text(); // System.out.println(result[0]); } catch (SelectorParseException e) { result[0] = ""; } // descrption try { Elements description = doc.select("meta[name]"); result[1] = description.attr("content"); } catch (SelectorParseException e) { result[1] = ""; } // canonical try { Elements canonical = doc.select("link[rel=canonical]"); result[2] = canonical.attr("href"); result[2] = result[2].replaceAll("www.internetcorkboard.", "staging.internetcorkboard."); } catch (SelectorParseException e) { result[2] = ""; } // og:site_name try { Elements site_name = doc.select("meta[property=og:site_name]"); result[3] = site_name.attr("content"); } catch (SelectorParseException e) { result[3] = ""; } // og:image:width try { Elements image_width = doc.select("meta[property=og:image:width]"); result[4] = image_width.attr("content"); } catch (SelectorParseException e) { result[4] = ""; } // og:image:height try { Elements image_height = doc .select("meta[property=og:image:height]"); result[5] = image_height.attr("content"); } catch (SelectorParseException e) { result[5] = ""; } // og:title try { Elements og_title = doc.select("meta[property=og:title]"); result[6] = og_title.attr("content"); } catch (SelectorParseException e) { result[6] = ""; } // og:description try { Elements og_description = doc.select("meta[property=og:description]"); result[7] = og_description.attr("content"); } catch (SelectorParseException e) { result[7] = ""; } // og:url try{ Elements og_url = doc.select("meta[property=og:description]"); result[8] = og_url.attr("content"); result[8] = result[8].replaceAll("www.", "staging"); }catch(SelectorParseException e){ result[8] = ""; } // og:type try{ Elements og_type = doc.select("meta[property=og:description]"); result[9] = og_type.attr("content"); }catch(SelectorParseException e){ result[9] = ""; } //body try{ Elements body = doc.getElementsByClass("NoAdsBody"); result[10] = body.text(); }catch(SelectorParseException e){ result[10] = ""; } //related articles try{ Elements related = doc.getElementsByClass("relatedarticles"); result[11] = related.text(); }catch(SelectorParseException e){ result[11] = ""; } //you may also like try{ Elements related = doc.getElementsByClass("rgtitle"); result[12] = related.text(); }catch(SelectorParseException e){ result[12] = ""; } return(result); } catch (java.lang.NullPointerException e) { System.out.println("null "+URL); } catch (org.jsoup.HttpStatusException e) { int i = e.getStatusCode(); System.out.println(i+" "+URL); } catch(java.net.ConnectException e){ System.out.println("Time out :"+URL); } return result; } public static void main(String args[]) throws IOException { ArrayList<String[]> a=new ArrayList<String[]>(); ArrayList<String[]> b=new ArrayList<String[]>(); ArrayList<String> CategoryUrl=new ArrayList<String>(); File f1 = new File("C:/ICBTest/CategoryUrl.txt"); File f2 = new File("C:/ICBTest/CategoryError.txt"); String line = ""; String Url=""; FileReader reader = new FileReader(f1); FileWriter writer = new FileWriter(f2, true); BufferedReader br = new BufferedReader(reader); BufferedWriter bw = new BufferedWriter(writer); while ((line = br.readLine()) != null) { CategoryUrl.add(line); Url = "http://www.internetcorkboard.com"+line+"?source=miva"; a.add(new ICBCategoryTest().ICB(Url)); Url = "http://staging.internetcorkboard.com"+line+"?source=miva"; b.add(new ICBCategoryTest().ICB(Url)); } String[] list={"title","descrption","canonical","og:site_name","og:image:width","og:image:height","og:title","og:description","og:url","og:type","body","related articles","you may also like"}; if(a.size()==b.size()){ for(int i=0;i<a.size();i++){ String[] aa=a.get(i); String[] bb=b.get(i); String url=CategoryUrl.get(i); for(int j=0;j<aa.length;j++){ if(aa[j].equals(bb[j])==false){ bw.write("Error:"+"\t"+url+"\t"+list[j]); bw.newLine(); bw.flush(); } } } }else{ System.out.println("总数不一致"); } br.close(); bw.close(); } }