【apache】Simple web page scraping with HttpClient
package com.lw.httpclient.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpClientTest {

    public static void main(String[] args) throws Exception {
        get1();
        get2();
    }

    /**
     * Fetch the content of the page at the given URL (basic version).
     * @throws Exception
     */
    public static void get1() throws Exception {
        // HttpClient hc = new DefaultHttpClient(); // legacy API, deprecated since HttpClient 4.3
        String url = "http://www.budejie.com";
        url = "http://www.btba.com.cn"; // the site restricts crawlers, so this bare request no longer works there
        HttpGet hg = new HttpGet(url);
        // try-with-resources closes both the client and the response when done
        try (CloseableHttpClient chc = HttpClients.createDefault();
             CloseableHttpResponse chp = chc.execute(hg)) {
            System.out.println(EntityUtils.toString(chp.getEntity(), "UTF-8"));
        }
    }

    /**
     * Fetch the page at the given URL by imitating a browser.
     * @throws Exception
     */
    public static void get2() throws Exception {
        String url = "http://www.btba.com.cn";
        HttpGet httpGet = new HttpGet(url);
        // Set the User-Agent request header so the request looks like a browser visit
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse chr = closeableHttpClient.execute(httpGet)) {
            System.out.println(EntityUtils.toString(chr.getEntity(), "UTF-8"));
        }
    }
}
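Both methods print the response body unconditionally. In practice you usually want to check the status line first and only read the entity on success. A minimal sketch with the same HttpClient 4.x API (the StatusCheckTest class name and the reuse of the same URL are illustrative assumptions):

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class StatusCheckTest {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet("http://www.btba.com.cn"))) {
            // Only read the body when the server answered 200 OK
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                System.out.println(EntityUtils.toString(response.getEntity(), "UTF-8"));
            } else {
                System.out.println("Request failed with HTTP status " + status);
            }
        }
    }
}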
To be continued. A follow-up will cover how to parse the fetched content and extract the parts you want.
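Until then, a minimal sketch of what extracting one piece of a fetched page can look like, using only java.util.regex from the JDK (the TitleParseSketch class and extractTitle helper are hypothetical illustrations, not necessarily the approach the follow-up will take; for real pages a proper HTML parser is the safer choice):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TitleParseSketch {
    // Hypothetical helper: pull the <title> text out of a fetched HTML string
    public static String extractTitle(String html) {
        Matcher m = Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL)
                           .matcher(html);
        return m.find() ? m.group(1).trim() : null;
    }

    public static void main(String[] args) {
        // In the real flow, html would be the string returned by EntityUtils.toString(...)
        String html = "<html><head><title>demo page</title></head><body></body></html>";
        System.out.println(extractTitle(html)); // prints: demo page
    }
}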