Java使用HttpClient爬取数据
1.建立http连接返回html页面:
public static String doRequest(String url,String method) { //创建httpClient对象 CloseableHttpClient client=HttpClientBuilder.create().build(); URIBuilder uriBuilder= null; HttpUriRequest uri=null; CloseableHttpResponse response= null; //获取相应对象 String html=""; //存放响应信息 try { uriBuilder = new URIBuilder(url); switch (method){ case "POST": uri=new HttpPost(uriBuilder.build()); break; case "PUT": uri=new HttpPut(uriBuilder.build()); break; case "DELETE": uri=new HttpDelete(uriBuilder.build()); break; default: uri=new HttpGet(uriBuilder.build()); break; } response = client.execute(uri); if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { // 返回 200 表示成功 html = EntityUtils.toString(response.getEntity(), "utf-8"); // 获取服务器响应实体的内容 } } catch (URISyntaxException e) { e.printStackTrace(); }catch (IOException e) { e.printStackTrace(); }finally { if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } } } return html; }
2.解析页面获取想要的数据:
//解析 筛选网页所需信息 public static void analysisHtml(String html){ //第一步,将字符内容解析成一个Document类 Document document = Jsoup.parse(html); //第二步,根据我们需要得到的标签,选择提取相应标签的内容 Elements elements = document.select("div[class=leftbox]").select("div[class=pr0]"); String name=""; String address=""; for(Element e : elements){ name=e.getElementsByClass("pr2").select("ul").select("li").select("a").first().text(); address=e.getElementsByClass("pr4").text(); Company company=new Company(name,address); companyList.add(company); } }
3.通过启动方法开始爬取:
/**
 * Fetches listing pages 1 through 10, feeds each page's markup to
 * {@code analysisHtml}, then prints and returns {@code companyList}.
 *
 * @return the {@code companyList} that {@code analysisHtml} appends to
 */
public static List<Company> start() {
    final String baseUrl = "http://www.chinawj.com.cn/qiye/wujinjidian/c1_1_0_";
    final int lastPage = 10;

    int page = 1;
    while (page <= lastPage) {
        System.out.println("开始爬取数据[页码:" + page + "]");

        // Page address is the base URL followed by the page number and ".html".
        String pageUrl = baseUrl + page + ".html";
        String pageHtml = doRequest(pageUrl, "GET");
        analysisHtml(pageHtml);

        page++;
    }

    System.out.println(companyList);
    return companyList;
}