// HTML utility class built on jsoup
// <div class="menu-list"> // <div class="margin-auto min-width1200"> // <div class="menu-item active"> // <a href="/">首页</a> // </div> // <div style="width: 145px;" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/5040.jspx">技能提升培训课程 </a> // </div> // <div style="width: 145px;" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/5479.jspx">特色视频展示区 </a> // </div> // <div style="width: 145px;" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/5473.jspx">技能人才招聘专区 </a> // </div> // <div style="width: 145px;" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/5478.jspx">院校及培训机构招生区 </a> // </div> // <div style="" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/5472.jspx">政策通知动态专区 </a> // </div> // <div style="" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/5374.jspx">在线模考 </a> // </div> // <div style="" class="menu-item "> // <a href="http://v.qq.com/vplus/df399a8d1cf80ae06f356522325b0902?page=video" target="_blank">技能体验 </a> // </div> // <div style="" class="menu-item "> // <a href="//zjjnts.ghlearning.com:80/node/4516.jspx">帮助中心 </a> // </div> // </div> // </div> //目标地址 String url = "http://42.51.69.234:8001/"; try { Document document = Jsoup.connect(url).header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36") .get(); Elements nav_com = document.getElementsByClass("menu-list"); ListIterator<Element> listIterator = nav_com.listIterator(); while(listIterator.hasNext()) { Element elementmenulist = listIterator.next(); //System.out.println(next.getElementsByClass("menu-item ")); Elements select = elementmenulist.select(".menu-item a"); ListIterator<Element> aList = select.listIterator(); while (aList.hasNext()) { Element a = (Element) aList.next(); System.out.println(a.text()+"\n"+a.attr("href")); } } } catch (IOException e) { System.out.println("出现错误:" + e.getMessage()); } }
public class HtmlUtil {
// 只有纯文本可以通过
public static String getText(String html) {
if (html == null)
return null;
return Jsoup.clean(html, Whitelist.none());
}
// 以下标签可以通过
// b, em, i, strong, u. 纯文本
public static String getSimpleHtml(String html) {
if (html == null)
return null;
return Jsoup.clean(html, Whitelist.simpleText());
}
// 以下标签可以通过
//a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p, pre, q, small, strike, strong, sub, sup, u, ul
public static String getBasicHtml(String html) {
if (html == null)
return null;
return Jsoup.clean(html, Whitelist.basic());
}
//在basic基础上 增加图片通过
public static String getBasicHtmlandimage(String html) {
if (html == null)
return null;
return Jsoup.clean(html, Whitelist.basicWithImages());
}
// 以下标签可以通过
//a, b, blockquote, br, caption, cite, code, col, colgroup, dd, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, u, ul
public static String getFullHtml(String html) {
if (html == null)
return null;
return Jsoup.clean(html, Whitelist.relaxed());
}
//只允许指定的html标签
public static String clearTags(String html, String ...tags) {
Whitelist wl = new Whitelist();
return Jsoup.clean(html, wl.addTags(tags));
}
// // 对关键字加上颜色
// public static String markKeywods (String keywords, String target) {
// if (StringKit.notBlank(keywords)) {
// String[] arr = keywords.split(" ");
// for (String s : arr) {
// if (StringKit.notBlank(s)) {
// String temp = "<span class=\"highlight\">" + s + "</span>";
// if(temp!=null)
// target = target.replaceAll(s, temp);
// }
// }
// }
// return target;
// }
// 获取文章中的img url
public static String getImgSrc(String html) {
if (html == null)
return null;
Document doc = Jsoup.parseBodyFragment(html);
Element image = doc.select("img").first();
return image == null ? null : image.attr("src");
}