Jsoup 解析html
Jsoup解析线上html文件
1 /** 2 * 根据url地址获取元素 3 * 4 * @param url 5 * 要爬虫的url地址 6 * @return 返回需要继续的元素值 7 * 8 * 这里解析的为的 xy 网页上的应用信息 内容 ul 标签中 9 * 10 */ 11 public Elements getHtmlRescoureByurl(String url) { 12 Connection con = Jsoup.connect(url).timeout(5000); 13 try { 14 con.header("Connection", "keep-alive"); 15 con.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 16 con.header("User-Agent", 17 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"); 18 19 Document doc = con.get(); 20 Elements divs = doc.getElementsByTag("div"); 21 22 Element div = null; 23 for (Element element : divs) { 24 if (element.attr("class").equals("seventyfive")) { 25 div = element; 26 break; 27 } 28 } 29 if (div != null) { 30 Elements uls = div.getElementsByTag("ul"); 31 if (uls.size() > 0) { 32 Element ul = uls.get(0); 33 return ul.getElementsByTag("li"); 34 } 35 } 36 } catch (IOException e) { 37 System.out.println(e.getMessage()); 38 } 39 return null; 40 } 41 42 43 /** 44 * 解析返回的xml内容 45 * @param lis 46 * @return 47 */ 48 public List<Object[]> parElement(Elements lis , int type) { 49 List<Object[]> list = new ArrayList<Object[]>(); 50 int i = 0; 51 String defaulttime = new SimpleDateFormat("yyyy-MM-dd").format(new Date()); 52 53 for (Element li : lis) { 54 String icon = li.getElementsByAttributeValue("class", "xy_img").attr("src"); // icon图标 55 Elements als = li.getElementsByTag("a"); 56 String appname = als.get(2).text(); // appname 57 String downloaded = li.getElementsByAttributeValue("class","edloaded").text(); // 下载次数 58 String app_size = li.getElementById("app_size_" + i).text(); 59 String size = app_size.split("\\|").length > 0? app_size.split("\\|")[0]:""; // 软件大小 60 String updatetime = app_size.split("\\|").length > 1? 
app_size.split("\\|")[1]:defaulttime; // 更新日期 61 String itunesid = als.get(3).attr("id"); // itunesid 62 String version = als.get(3).attr("version"); // 版本号 63 String bundleid = als.get(3).attr("bid"); // bundleid 64 String ipaurl = als.get(3).attr("resurl"); 65 ipaurl = getFromBase64(ipaurl); // ipa下载地址 66 i++; 67 68 // ipa 下载地址为空 appname太长 更时间为空 69 // 参数顺序 itunesid,appname,icon,version,bundleid,size,downloaded,ipaurl,updatetime 70 Object[] params = new Object[]{itunesid,appname,icon,version,bundleid,size,downloaded,type,ipaurl,updatetime}; 71 list.add(params); 72 } 73 return list; 74 } 75 76 77 // post 请求 78 79 public void testJsop(){ 80 try { 81 Connection conn = Jsoup.connect("https://passport.jd.com/new/login.aspx"); 82 conn.data("loginname","test1"); 83 conn.data("loginpwd","test1"); 84 Document doc = conn.post();
// Document doc = con.ignoreContentType(true).get(); 85 System.out.println(doc); 86 } catch (IOException e) { 87 e.printStackTrace(); 88 } 89 }
依赖:需要引入 jsoup 的 jar 包。