爬虫学习-入门
语言:JAVA
软件:eclipse
首先需要到网上下载Jsoup的jar包。
下载地址:http://www.pc6.com/softview/SoftView_541368.html
之后在eclipse中创建项目,把jar包复制到项目的lib文件夹中,然后右键该jar包,选择 Build Path → Add to Build Path,否则代码中的 org.jsoup 导入无法解析。
提取网页中的链接:
import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /* * * 提取链接 */ public class Text_211 { public static void main(String args[]) { String url="http://www.lietu.com"; try { Document doc=Jsoup.connect(url).get(); Elements links=doc.select("a[href]"); for(Element link:links) { String linkHref=link.attr("href"); System.out.println(linkHref); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
提取标题和详细页链接的完整代码
得到网页的所有的标题和链接
import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /* * * 采集新闻 */ public class Text_212 { public static void main(String args[]) { //text_first("http://politics.people.com.cn/GB/1024/"); text_second("http://china.cnr.cn/yaowen/"); } //提取标题和详细页链接的完整代码 public static void text_first(String address) { try { Document document=Jsoup.connect(address).get(); Elements es=document.getElementsByClass("list_16"); Elements links=es.select("a[href]"); for(Element link:links) { String title=link.text(); System.out.println(title); String linkHref=link.attr("href"); System.out.println(linkHref); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //得到网页的所有的标题和链接 public static void text_second(String address) { try { Document document=Jsoup.connect(address).timeout(5000).get(); Element content=document.getElementById("subNav_menu"); Elements es=document.getElementsByClass("text"); for(Element linck:es) { Element alink=linck.getElementsByTag("a").first(); if(alink!=null) { System.out.println(alink.attr("href")); System.out.println(alink.text()); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }