Jsoup爬虫的简单使用
添加POM依赖
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.3</version>
</dependency>
Java代码示例
public static void main(String[] args) throws IOException{ // 天眼查 // String result1= HttpRequest.get("http://open.api.tianyancha.com/services/open/cb/ic/2.0?keyword=XXXX公司").header("Authorization", "").execute().body();; // System.err.println(result1); /*Document doc = Jsoup.connect("https://www.tianyancha.com/search?key=北京百度网讯科技有限公司").timeout(3000).get(); System.err.println(doc.title()); Elements newsHeadlines = doc.select(".cate_menu_lk"); System.err.println(newsHeadlines.size()); for (Element headline : newsHeadlines) { System.err.println( headline.text()); } */ try { Document document = Jsoup.connect("https://www.so.com/s?ie=utf-8&fr=so.com&src=home_so.com&ssid=&q=java") .timeout(5000) .get(); Elements elements = document.select(".res-title a"); elements.forEach(element -> { System.out.println(element.text()); System.err.println(element.attr("href")); }); System.err.println("---------------------"); for(int i=2;i<=10;i++){ Document documentt = Jsoup.connect("https://www.so.com/s?q=java&pn="+i+"&src=srp_paging&fr=so.com") .timeout(5000) .get(); Elements eelements = documentt.select(".res-title a"); eelements.forEach(element -> { System.out.println(element.text()); System.err.println(element.attr("href")); }); System.err.println("---------------------"); } } catch (IOException e) { e.printStackTrace(); } }