爬虫:HttpClient Jsoup 爬虫学习
maven工程:
log4j.properties
log4j.rootLogger=DEBUG,A1
log4j.logger.org.example=DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
pom.xml
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>org.example</groupId> <artifactId>com.mwq.webmagic</artifactId> <version>1.0-SNAPSHOT</version> <dependencies> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.5.2</version> </dependency> <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 --> <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> <version>1.7.25</version> </dependency> <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency> <!-- https://mvnrepository.com/artifact/junit/junit --> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> <scope>test</scope> </dependency> <!-- https://mvnrepository.com/artifact/commons-io/commons-io --> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.6</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 --> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> <version>3.7</version> </dependency> </dependencies> </project>
一、HttpClient模拟浏览器请求网页
package com.mwq.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Minimal HttpClient example: issues a GET request the way a browser would
 * and prints the returned page.
 */
public class Crawler {

    /**
     * Requests http://www.itcast.cn and prints the response body when the
     * status code is 200.
     *
     * @param args unused
     * @throws Exception if executing the request, reading the body, or
     *                   closing a resource fails
     */
    public static void main(String[] args) throws Exception {
        // "Open the browser": create the HttpClient object.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            // "Type the address": create the HttpGet object for the GET request.
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");

            // "Press enter": execute the request and obtain the response.
            CloseableHttpResponse response = httpClient.execute(httpGet);
            try {
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity httpEntity = response.getEntity();
                    String content = EntityUtils.toString(httpEntity, "utf-8");
                    System.out.println(content);
                }
            } finally {
                // Release the response even when reading the body fails.
                response.close();
            }
        } finally {
            // Release the client on every exit path.
            httpClient.close();
        }
    }
}
二、HttpGet
package com.mwq.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Plain GET request example with explicit cleanup of the response and client.
 */
public class HttpGetTest {

    /**
     * Requests http://www.itcast.cn and prints the length of the response body
     * when the status code is 200. I/O failures are printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpClient object.
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the HttpGet object with the URL to visit.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // Execute the request with the HttpClient.
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity());
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before a response was assigned, so
            // guard against null instead of failing with a NullPointerException.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
三、带参数的HttpGet
package com.mwq.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * GET request example whose query string is assembled with {@link URIBuilder}.
 */
public class HttpGetParamTest {

    /**
     * Requests http://yun.itheima.com/search?keys=Java and prints the length of
     * the response body when the status code is 200. I/O failures are printed,
     * not rethrown.
     *
     * @param args unused
     * @throws URISyntaxException if the request URI cannot be built
     */
    public static void main(String[] args) throws URISyntaxException {
        // Create the HttpClient object.
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Target address: http://yun.itheima.com/search?keys=Java
        // Create the URIBuilder for the base address.
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        // Set the query parameter.
        uriBuilder.setParameter("keys", "Java");

        // Create the HttpGet object with the URL to visit.
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("发起请求的信息:" + httpGet);

        // Execute the request with the HttpClient.
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity());
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before a response was assigned, so
            // guard against null instead of failing with a NullPointerException.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
四、HttpPost
package com.mwq.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Plain POST request example (no form body).
 */
public class HttpPostTest {

    /**
     * Posts to http://www.itcast.cn and prints the length of the response body
     * when the status code is 200. I/O failures are printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpClient object.
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the HttpPost object with the URL to visit.
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");

        // Execute the request with the HttpClient.
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity());
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before a response was assigned, so
            // guard against null instead of failing with a NullPointerException.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
五、带参数的HttpPost
package com.mwq.crawler.test;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

/**
 * POST request example that submits a URL-encoded form.
 */
public class HttpPostParamTest {

    /**
     * Posts the form field {@code keys=Java} to http://yun.itheima.com/search
     * and prints the length of the response body when the status code is 200.
     * I/O failures during the request are printed, not rethrown.
     *
     * @param args unused
     * @throws URISyntaxException declared by the original signature
     * @throws Exception          if the form entity cannot be created
     */
    public static void main(String[] args) throws URISyntaxException, Exception {
        // Create the HttpClient object.
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the HttpPost object with the URL to visit.
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

        // List that holds the form parameters.
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        // Form counterpart of the GET query http://yun.itheima.com/search?keys=Java
        params.add(new BasicNameValuePair("keys", "Java"));

        // Build the form entity: first argument is the form data, second is
        // the encoding.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        httpPost.setEntity(formEntity);
        System.out.println("发起请求的信息:" + httpPost);

        // Execute the request with the HttpClient.
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity());
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before a response was assigned, so
            // guard against null instead of failing with a NullPointerException.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
六、HttpConfig
package com.mwq.crawler.test;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * GET request example that applies per-request timeouts via {@link RequestConfig}.
 */
public class HttpConfigTest {

    /**
     * Requests http://www.itcast.cn with explicit timeouts and prints the
     * length of the response body when the status code is 200. I/O failures
     * are printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpClient object.
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the HttpGet object with the URL to visit.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // Configure the request.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection, in ms
                .setConnectionRequestTimeout(500) // max time to obtain a connection, in ms
                .setSocketTimeout(10 * 1000)      // max time for data transfer, in ms
                .build();

        // Attach the configuration to the request.
        httpGet.setConfig(config);

        // Execute the request with the HttpClient.
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity());
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown (e.g. on a timeout) before a response
            // was assigned, so guard against null instead of failing with a
            // NullPointerException.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
七、HttpClientPool
package com.mwq.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Example of issuing requests through a shared pooling connection manager.
 */
public class HttpClientPoolTest {

    /**
     * Configures a pooling connection manager and issues two GET requests
     * through it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the connection pool manager.
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
        // Maximum number of connections overall.
        pool.setMaxTotal(100);
        // Maximum number of connections per host.
        pool.setDefaultMaxPerRoute(10);

        // Send two requests through the pool.
        for (int i = 0; i < 2; i++) {
            doGet(pool);
        }
    }

    /**
     * Builds a client backed by the given connection manager, requests
     * http://www.itcast.cn and prints the body length on a 200 response.
     * The client itself is intentionally left open: per the original note it
     * must not be closed because its connections are managed by the pool.
     *
     * @param cm the shared connection manager
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet request = new HttpGet("http://www.itcast.cn");

        CloseableHttpResponse response = null;
        try {
            response = client.execute(request);
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                String body = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(body.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeResponse(response);
        }
    }

    /** Closes the response if one was obtained, printing any close failure. */
    private static void closeResponse(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
八、Jsoup
package jsoup;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Set;

/**
 * Jsoup parsing examples.
 *
 * <p>Jsoup is generally not used for the fetching itself, because its support
 * for some techniques is weaker than HttpClient's. Whatever the input (URL,
 * string or file), it is turned into a DOM {@link Document} for parsing; once
 * parsed into elements, data can be read from them. Selectors can locate
 * elements with a single criterion or a combination of criteria.
 */
public class JsoupFirstTest {

    /**
     * Path of the local HTML file used by the file-based examples.
     * Left empty as in the original tutorial; set it to a real file before
     * running those tests.
     */
    private static final String HTML_FILE_PATH = "";

    /** Parses a page straight from a URL and prints its title. */
    @Test
    public void testUrl() throws Exception {
        // First argument is the URL to visit, second is the timeout in milliseconds.
        Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);

        // Use the tag selector to read the content of the <title> tag.
        String title = doc.getElementsByTag("title").first().text();

        System.out.println(title);
    }

    /** Reads a file into a string, parses the string and prints the title. */
    @Test
    public void testString() throws Exception {
        // Read the file into a string with the utility class.
        String content = FileUtils.readFileToString(new File(HTML_FILE_PATH), "utf8");

        // Parse the string.
        Document doc = Jsoup.parse(content);
        String title = doc.getElementsByTag("title").first().text();
        System.out.println(title);
    }

    /** Parses a file directly and prints the title. */
    @Test
    public void testFile() throws Exception {
        Document doc = Jsoup.parse(new File(HTML_FILE_PATH), "utf8");
        String title = doc.getElementsByTag("title").first().text();
        System.out.println(title);
    }

    /** Looks up elements through the DOM-style getters. */
    @Test
    public void testDom() throws Exception {
        // Parse the file to obtain the Document object.
        Document doc = Jsoup.parse(new File(HTML_FILE_PATH), "utf8");

        // Look up an element by id.
        Element element = doc.getElementById("city_di");

        // Look up an element by tag.
        Element element1 = doc.getElementsByTag("span").first();

        // Look up elements by class.
        // NOTE(review): getElementsByClass takes a single class name; a value
        // with a space presumably only matches an element whose entire class
        // attribute equals it - confirm, or use select(".class_a.class_b").
        Elements el = doc.getElementsByClass("class_a class_b");

        // Look up elements by attribute name, and by attribute name + value.
        Elements abc = doc.getElementsByAttribute("abc");
        Element href = doc.getElementsByAttributeValue("href", "http://sh.itcase.cn").first();

        // Print the results.
        System.out.println(element.text());
        System.out.println(element1);
        System.out.println(el);
        System.out.println(href);
    }

    /** Reads id, class names, attributes and text from an element. */
    @Test
    public void testData() throws Exception {
        // Parse the file to obtain the Document object.
        Document doc = Jsoup.parse(new File(HTML_FILE_PATH), "utf8");

        // Look up the element by id.
        Element element = doc.getElementById("test");

        // Read the id and the class attribute from the element
        // (demonstration only; these values are not printed).
        String id = element.id();
        String className = element.className();

        // Read the individual class names.
        Set<String> classSet = element.classNames();
        for (String s : classSet) {
            System.out.println(s);
        }

        // Read a single attribute value (demonstration only).
        String idAttr = element.attr("id");

        // Read all attributes.
        Attributes attributes = element.attributes();
        System.out.println(attributes.toString());

        // Read the text content.
        String text = element.text();
        System.out.println(text);
    }

    /** Looks up elements with single-criterion selectors. */
    @Test
    public void testSelecter() throws Exception {
        // Parse the HTML file to obtain the Document object.
        Document doc = Jsoup.parse(new File(HTML_FILE_PATH), "utf8");

        // Other single-criterion forms: doc.select("span") selects by tag,
        // doc.select("#city_bj") selects by id.

        // Select by class.
        Element e = doc.select(".class_a").first();

        // Select by attribute name.
        Element element = doc.select("[abc]").first();

        // Select by attribute value: [attr=value]. The original selector
        // "[class-s_name]" matched an attribute *named* "class-s_name".
        Elements elements = doc.select("[class=s_name]");
        for (Element element1 : elements) {
            System.out.println(element1.text());
        }
    }

    /** Looks up elements with combined selectors. */
    @Test
    public void testSelector2() throws Exception {
        // Parse the HTML file to obtain the Document object.
        Document doc = Jsoup.parse(new File(HTML_FILE_PATH), "utf8");

        // tag + id
        Element select = doc.select("h3#city_bj").first();
        System.out.println(select.text());

        // tag + class
        select = doc.select("li.class_a").first();

        // tag + attribute name
        select = doc.select("span[abc]").first();

        // arbitrary combination
        select = doc.select("span[abc].s_name").first();

        // "ancestor child": descendants of an element, e.g. ".city_con li".
        Elements select1 = doc.select(".city_con li");
        for (Element element : select1) {
            System.out.println(element.text());
        }

        // "parent > child": direct children only. ".city_con > ul > li" finds
        // the ul elements directly under .city_con, then the li elements
        // directly under those ul elements.
        select1 = doc.select(".city_con > ul >li");

        // "parent > *": all direct children of the parent.
        select1 = doc.select(".city_con > ul >*");
    }
}