Jsoup 解析html

Jsoup解析线上html文件 

 1 /**
 2      * 根据url地址获取元素
 3      * 
 4      * @param url
 5      *            要爬虫的url地址
 6      * @return 返回需要继续的元素值
 7      * 
 8      *         这里解析的为的 xy 网页上的应用信息 内容 ul 标签中
 9      * 
10      */
11     public Elements getHtmlRescoureByurl(String url) {
12         Connection con = Jsoup.connect(url).timeout(5000); 
13         try {
14             con.header("Connection", "keep-alive");
15             con.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
16             con.header("User-Agent",
17                     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
18 
19             Document doc = con.get();
20             Elements divs = doc.getElementsByTag("div");
21 
22             Element div = null;
23             for (Element element : divs) {
24                 if (element.attr("class").equals("seventyfive")) {
25                     div = element;
26                     break;
27                 }
28             }
29             if (div != null) {
30                 Elements uls = div.getElementsByTag("ul");
31                 if (uls.size() > 0) {
32                     Element ul = uls.get(0);
33                     return ul.getElementsByTag("li");
34                 }
35             }
36         } catch (IOException e) {
37             System.out.println(e.getMessage());
38         }
39         return null;
40     }
41 
42     
43     /**
44      *    解析返回的xml内容
45      * @param lis
46      * @return
47      */
48     public List<Object[]> parElement(Elements lis , int type) {
49         List<Object[]> list = new ArrayList<Object[]>();
50         int i = 0;
51         String defaulttime = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
52         
53         for (Element li : lis) {
54             String icon = li.getElementsByAttributeValue("class", "xy_img").attr("src"); // icon图标
55             Elements als = li.getElementsByTag("a");
56             String appname = als.get(2).text();                                             // appname
57             String downloaded = li.getElementsByAttributeValue("class","edloaded").text();  // 下载次数
58             String app_size = li.getElementById("app_size_" + i).text();
59             String size = app_size.split("\\|").length > 0? app_size.split("\\|")[0]:"";                                         // 软件大小
60             String updatetime = app_size.split("\\|").length > 1? app_size.split("\\|")[1]:defaulttime;                                     // 更新日期
61             String itunesid = als.get(3).attr("id");                                         // itunesid
62             String version = als.get(3).attr("version");                                     // 版本号
63             String bundleid = als.get(3).attr("bid");                                         // bundleid
64             String ipaurl = als.get(3).attr("resurl");
65             ipaurl = getFromBase64(ipaurl);                                                    //  ipa下载地址
66             i++;
67             
68             //   ipa 下载地址为空   appname太长  更时间为空
69             //  参数顺序   itunesid,appname,icon,version,bundleid,size,downloaded,ipaurl,updatetime  
70             Object[] params = new Object[]{itunesid,appname,icon,version,bundleid,size,downloaded,type,ipaurl,updatetime};
71             list.add(params);
72         }
73         return list;
74     }
75 
76 
77 // post 请求
78   
79     public void testJsop(){
80         try {
81             Connection conn = Jsoup.connect("https://passport.jd.com/new/login.aspx");
82             conn.data("loginname","test1");
83             conn.data("loginpwd","test1");
84             Document doc = conn.post(); 
         // Document doc = con.ignoreContentType(true).get();
85 System.out.println(doc); 86 } catch (IOException e) { 87 e.printStackTrace(); 88 } 89 }

 

 

依赖：需要在项目中引入 jsoup 的 jar 包。

posted @ 2016-01-25 14:20  幸福流浪  阅读(272)  评论(0编辑  收藏  举报