Jsoup应用对比测试

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector.SelectorParseException;

/**
 * Compares SEO-relevant page content between the production and staging
 * deployments of internetcorkboard.com.
 *
 * <p>For every path listed in {@code C:/ICBTest/CategoryUrl.txt} the page is
 * fetched from both hosts, a fixed set of fields is extracted with jsoup, and
 * each field that differs is appended to {@code C:/ICBTest/CategoryError.txt}.
 */
public class ICBCategoryTest {
    /**
     * Labels of the compared fields, index-aligned with the array returned by
     * {@link #ICB(String)}. The labels are written verbatim to the error file,
     * so their spelling (including "descrption") is kept unchanged.
     */
    private static final String[] FIELD_NAMES = {
        "title", "descrption", "canonical", "og:site_name", "og:image:width",
        "og:image:height", "og:title", "og:description", "og:url", "og:type",
        "body", "related articles", "you may also like"
    };

    /** Host prefix that appears in URLs emitted by the production site. */
    private static final String PRODUCTION_HOST = "www.internetcorkboard.";

    /** Host prefix that appears in URLs emitted by the staging site. */
    private static final String STAGING_HOST = "staging.internetcorkboard.";

    /**
     * Prints a line to standard output.
     *
     * @param s the text to print
     */
    public void print(String s) {
        System.out.println(s);
    }

    /**
     * Fetches a page and extracts the fields that are compared between hosts.
     *
     * <p>URL-valued fields (canonical, og:url) have the production host
     * rewritten to the staging host so that pages from both deployments
     * compare equal when they differ only by host name.
     *
     * @param URL absolute URL of the page to fetch
     * @return an array with one entry per label in {@code FIELD_NAMES}; a field
     *         that is missing from the page is the empty string. If the fetch
     *         fails with an HTTP error status, a refused connection or a
     *         timeout, the problem is printed and every entry is empty.
     * @throws IOException on any other I/O failure while fetching the page
     */
    public String[] ICB(String URL) throws IOException {
        String[] result = new String[FIELD_NAMES.length];
        java.util.Arrays.fill(result, "");
        try {
            Document doc = Jsoup.connect(URL).timeout(120000).get();
            result[0] = selectText(doc, "title");
            // Select the description tag specifically; "meta[name]" would match
            // whichever named meta tag happens to come first in the page.
            result[1] = selectAttr(doc, "meta[name=description]", "content");
            result[2] = toStagingHost(selectAttr(doc, "link[rel=canonical]", "href"));
            result[3] = metaProperty(doc, "og:site_name");
            result[4] = metaProperty(doc, "og:image:width");
            result[5] = metaProperty(doc, "og:image:height");
            result[6] = metaProperty(doc, "og:title");
            result[7] = metaProperty(doc, "og:description");
            result[8] = toStagingHost(metaProperty(doc, "og:url"));
            result[9] = metaProperty(doc, "og:type");
            result[10] = doc.getElementsByClass("NoAdsBody").text();
            result[11] = doc.getElementsByClass("relatedarticles").text();
            result[12] = doc.getElementsByClass("rgtitle").text();
        } catch (java.lang.NullPointerException e) {
            // Kept from the original so one odd page cannot abort a long run.
            System.out.println("null   " + URL);
        } catch (org.jsoup.HttpStatusException e) {
            System.out.println(e.getStatusCode() + "  " + URL);
        } catch (java.net.ConnectException e) {
            System.out.println("Time out :" + URL);
        } catch (java.net.SocketTimeoutException e) {
            // Connect/read timeouts surface as SocketTimeoutException, not
            // ConnectException; without this they would abort the whole run.
            System.out.println("Time out :" + URL);
        }
        return result;
    }

    /** Returns the combined text of all elements matching the CSS query, or "" if the query is invalid. */
    private static String selectText(Document doc, String cssQuery) {
        try {
            Elements matches = doc.select(cssQuery);
            return matches.text();
        } catch (SelectorParseException e) {
            return "";
        }
    }

    /** Returns the named attribute of the first match that has it, or "" if none does or the query is invalid. */
    private static String selectAttr(Document doc, String cssQuery, String attribute) {
        try {
            Elements matches = doc.select(cssQuery);
            return matches.attr(attribute);
        } catch (SelectorParseException e) {
            return "";
        }
    }

    /** Returns the content of the {@code <meta property="...">} tag with the given property, or "". */
    private static String metaProperty(Document doc, String property) {
        return selectAttr(doc, "meta[property=" + property + "]", "content");
    }

    /** Rewrites the production host to the staging host using literal (non-regex) replacement. */
    private static String toStagingHost(String url) {
        return url.replace(PRODUCTION_HOST, STAGING_HOST);
    }

    /**
     * Reads the category paths, compares production against staging for each
     * one, and appends a tab-separated "Error:" line per differing field to the
     * error file.
     *
     * @param args unused
     * @throws IOException if the input file cannot be read, the error file
     *         cannot be written, or a page fetch fails with an unhandled I/O error
     */
    public static void main(String args[]) throws IOException {
        File urlFile = new File("C:/ICBTest/CategoryUrl.txt");
        File errorFile = new File("C:/ICBTest/CategoryError.txt");
        ICBCategoryTest tester = new ICBCategoryTest();
        BufferedReader br = null;
        BufferedWriter bw = null;
        try {
            br = new BufferedReader(new FileReader(urlFile));
            bw = new BufferedWriter(new FileWriter(errorFile, true));
            String line;
            while ((line = br.readLine()) != null) {
                String path = line.trim();
                if (path.length() == 0) {
                    // A blank line would otherwise be compared as the site root.
                    continue;
                }
                String[] production =
                        tester.ICB("http://www.internetcorkboard.com" + path + "?source=miva");
                String[] staging =
                        tester.ICB("http://staging.internetcorkboard.com" + path + "?source=miva");
                // Compare right away so page bodies are not held in memory
                // for the whole run.
                for (int j = 0; j < production.length; j++) {
                    if (!production[j].equals(staging[j])) {
                        bw.write("Error:" + "\t" + path + "\t" + FIELD_NAMES[j]);
                        bw.newLine();
                    }
                }
                // Flush per page so findings survive a later failure.
                bw.flush();
            }
        } finally {
            // Close both streams even if the run fails part-way through.
            try {
                if (br != null) {
                    br.close();
                }
            } finally {
                if (bw != null) {
                    bw.close();
                }
            }
        }
    }
}

 

posted on 2013-04-16 14:42  IT Ⅳ  阅读(1063)  评论(0)  编辑  收藏  举报

导航