爬虫笔记1
目录:
一.get请求,没有参数
二.get请求,有参数
三.post请求,没有参数
四.post请求,有参数
五.连接池创建连接
六.jsou解析
七.使用dom方式遍历文档
1.get请求,没有参数
package Demo1.CrawlerDemo1;

import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.ResponseAuthCache;
import org.apache.http.impl.client.CloseableHttpClient;

/**
 * Minimal HttpClient example: issues a GET request without parameters and
 * prints the response body when the server answers with status 200.
 */
public class Crawler {

    /**
     * Fetches {@code http://www.itcast.cn} and prints the body to standard output.
     * I/O failures are reported via a stack trace rather than propagated.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 2. The request ("the URL typed into the browser").
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // 1. + 3. Create the client ("the browser") and send the request.
        // try-with-resources closes the response first and the client second,
        // and never touches a response that was not created: the previous
        // finally block called response.close() even when execute() had thrown,
        // which raised a NullPointerException and skipped httpClient.close().
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Read the response; only a 200 answer is printed.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(content);
            }
        } catch (ClientProtocolException e) {
            // HTTP protocol violation reported by the client.
            e.printStackTrace();
        } catch (IOException e) {
            // Connection problem, aborted transfer, or failure while closing.
            e.printStackTrace();
        }
    }
}
2.get请求,有参数
package Demo1.CrawlerDemo1;

import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.ResponseAuthCache;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;

/**
 * GET request with query parameters.
 *
 * @author 18430
 */
public class CrawlerParameter {

    /**
     * Requests {@code http://yun.itheima.com/search?keys=java} and prints the
     * body to standard output when the server answers with status 200.
     *
     * @param args unused
     * @throws Exception if the request URI cannot be built
     */
    public static void main(String[] args) throws Exception {
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "java");
        // Several parameters can be set by chaining:
        // uriBuilder.setParameter("k1", "v1").setParameter("k2", "v2");
        HttpGet httpGet = new HttpGet(uriBuilder.build());

        // try-with-resources closes the response and then the client, and never
        // dereferences a null response: the previous finally block called
        // response.close() even when execute() had thrown, which raised a
        // NullPointerException and skipped httpClient.close().
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(content);
            }
        } catch (ClientProtocolException e) {
            // HTTP protocol violation reported by the client.
            e.printStackTrace();
        } catch (IOException e) {
            // Connection problem, aborted transfer, or failure while closing.
            e.printStackTrace();
        }
    }
}
3.post请求不带参数
package Demo1.CrawlerDemo1;

import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.ResponseAuthCache;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;

/**
 * POST request without parameters.
 *
 * @author 18430
 */
public class CrawlerPost {

    /**
     * Sends an empty POST to {@code http://www.itcast.cn} and prints the body
     * to standard output when the server answers with status 200.
     *
     * @param args unused
     * @throws Exception if the request URI cannot be built
     */
    public static void main(String[] args) throws Exception {
        URIBuilder uriBuilder = new URIBuilder("http://www.itcast.cn");
        HttpPost httpPost = new HttpPost(uriBuilder.build());

        // try-with-resources closes the response and then the client, and never
        // dereferences a null response: the previous finally block called
        // response.close() even when execute() had thrown, which raised a
        // NullPointerException and skipped httpClient.close().
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(content);
            }
        } catch (ClientProtocolException e) {
            // HTTP protocol violation reported by the client.
            e.printStackTrace();
        } catch (IOException e) {
            // Connection problem, aborted transfer, or failure while closing.
            e.printStackTrace();
        }
    }
}
4.post请求,带参数
import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; import org.apache.http.NameValuePair; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; public class PostWithParam { public static void main(String[] args) throws Exception { CloseableHttpClient httpClient = HttpClients.createDefault(); HttpPost httpPost = new HttpPost("http://yun.itheima.com/search"); //申明list封装表单中的参数 List<NameValuePair> params = new ArrayList<NameValuePair>(); params.add(new BasicNameValuePair("keys","java")); //创建表单entity对象 UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf-8"); //设置表单entity对象到post请求中 httpPost.setEntity(formEntity); CloseableHttpResponse response = null; try { response = httpClient.execute(httpPost); if(response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(),"utf-8"); System.out.println(content); } } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
5.使用连接池
import java.io.IOException; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.util.EntityUtils; public class ConnectPool { public static void main(String[] args) { //创建连接池管理器 PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(); //设置连接数 cm.setMaxTotal(100); //设置每个主机的最大连接数 cm.setDefaultMaxPerRoute(10); //使用连接池管理器发起请求 doGet(cm); } private static void doGet(PoolingHttpClientConnectionManager cm) { CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build(); HttpGet httpGet = new HttpGet("http://www.itcast.cn"); CloseableHttpResponse response = null; try { response =httpClient.execute(httpGet); if(response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(),"utf-8"); System.out.println(content.length()); } } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { //httpclient有连接池管理,不能关闭 if(response != null) { try { response.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } }
6.Jsoup解析
package Demo1.CrawlerDemo1;

import java.io.File;
import java.net.URL;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

/**
 * Shows three ways of handing input to jsoup: a URL, a string, and a file.
 * Each test prints the text of the document's {@code <title>} element.
 */
public class Jsoup1Test2 {

    /**
     * Parses a page fetched from a URL (timeout 1000 ms) and prints its title.
     *
     * @throws Exception if the page cannot be fetched or has no title
     */
    @Test
    public void testUrl() throws Exception {
        Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);
        System.out.println(titleOf(doc));
    }

    /**
     * Reads the fixture into a string, parses that string and prints its title.
     *
     * @throws Exception if the fixture cannot be read or has no title
     */
    @Test
    public void testString() throws Exception {
        String content = FileUtils.readFileToString(fixtureFile(), "utf-8");
        Document doc = Jsoup.parse(content);
        System.out.println(titleOf(doc));
    }

    /**
     * Parses the fixture file directly and prints its title.
     *
     * @throws Exception if the fixture cannot be read or has no title
     */
    @Test
    public void testFile() throws Exception {
        Document doc = Jsoup.parse(fixtureFile(), "utf-8");
        System.out.println(titleOf(doc));
    }

    /**
     * Locates {@code src/main/java/resources/test.html} under the working
     * directory. The path was previously assembled with hard-coded backslashes
     * and therefore only resolved on Windows; forward slashes are accepted by
     * {@link File} on Windows as well as on Unix-like systems.
     */
    private static File fixtureFile() {
        return new File(System.getProperty("user.dir"), "src/main/java/resources/test.html");
    }

    /**
     * Returns the text of the first {@code <title>} element selected by tag name.
     *
     * @throws IllegalStateException if the document contains no such element,
     *         instead of the bare NullPointerException the unguarded
     *         {@code first().text()} call used to produce
     */
    private static String titleOf(Document doc) {
        Element title = doc.getElementsByTag("title").first();
        if (title == null) {
            throw new IllegalStateException("document has no <title> element");
        }
        return title.text();
    }
}