March 12, week 4 of class: epidemic visualization plus web-crawler study

Source code, crawler part:

package utils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class HttpClientPool {
    /**
     * This is the HttpClient connection pool.
     */
    public static void HttpClientPool() {
        // create the connection pool manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

        // maximum total number of connections
        cm.setMaxTotal(100);
        // maximum number of connections per host (route)
        cm.setDefaultMaxPerRoute(10);
        // requests are issued through the pooled manager
//        doGet(cm);
//        doPost(cm);
    }

public static String doPost(PoolingHttpClientConnectionManager cm) throws Exception {
    // get an httpClient object backed by the connection pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

          // 2. enter the URL and create the HttpPost object for the request
          HttpPost httpPost = new HttpPost("http://m.sinovision.net/newpneumonia.php");
          System.out.println("Request: " + httpPost);

          // for POST: declare a List to hold the form parameters (this page needs none, so the list stays empty)
          List<NameValuePair> params = new ArrayList<NameValuePair>();
          // params.add(new BasicNameValuePair("name", "value"));

          // create the form Entity: the first argument is the parameter list, the second is the encoding
          UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");

          // attach the form Entity to the POST request
          httpPost.setEntity(formEntity);

          // configure the request
          RequestConfig config = RequestConfig.custom()
                  .setConnectTimeout(1000)            // max time to establish the connection, in ms
                  .setConnectionRequestTimeout(500)   // max time to get a connection from the pool, in ms
                  .setSocketTimeout(10 * 1000)        // max time to wait for data, in ms
                  .build();
          // apply the configuration to the request
          httpPost.setConfig(config);
          
          CloseableHttpResponse response=null;
          String content=null;
          try {
              // 3. send the request with the httpClient object and get the response
              response = httpClient.execute(httpPost);
              // parse the response and extract the data
              // only proceed if the status code is 200
              if (response.getStatusLine().getStatusCode() == 200) {
                  HttpEntity httpEntity = response.getEntity();
                  if (httpEntity != null) {
                      content = EntityUtils.toString(httpEntity, "UTF-8");
                      System.out.println(content.length());
//                      System.out.println(content);
                  }
              } else {
                  System.out.println("Request failed: " + response);
              }
          } catch (Exception e) {
              e.printStackTrace();
          } finally {
              try {
                  // close the response
                  if (response != null) {
                      response.close();
                  }
                  // do NOT close httpClient here: its connection must go back to the pool
                  // httpClient.close();
              } catch (IOException e) {
                  e.printStackTrace();
              }
          }
          return content;              
}

public static String doGet(PoolingHttpClientConnectionManager cm) throws Exception {
    // get an httpClient object backed by the connection pool
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    // create the URIBuilder
      URIBuilder uribuilder = new URIBuilder("http://m.sinovision.net/newpneumonia.php");
      // query parameters (name + value) could be set here; this page needs none
      // uribuilder.setParameter("name", "value");

      // 2. enter the URL and create the HttpGet object for the request
      HttpGet httpGet = new HttpGet(uribuilder.build());
      System.out.println("Request: " + httpGet);

      // configure the request
      RequestConfig config = RequestConfig.custom()
              .setConnectTimeout(1000)            // max time to establish the connection, in ms
              .setConnectionRequestTimeout(500)   // max time to get a connection from the pool, in ms
              .setSocketTimeout(10 * 1000)        // max time to wait for data, in ms
              .build();
      // apply the configuration to the request
      httpGet.setConfig(config);

      CloseableHttpResponse response=null;
      String content=null;
      try {
          // 3. send the request with the httpClient object and get the response
          response = httpClient.execute(httpGet);
          // parse the response and extract the data
          // only proceed if the status code is 200
          if (response.getStatusLine().getStatusCode() == 200) {
              HttpEntity httpEntity = response.getEntity();
              if (httpEntity != null) {
                  content = EntityUtils.toString(httpEntity, "UTF-8");
//                  System.out.println(content.length());
//                  System.out.println(content);
              }
          }
      } catch (Exception e) {
          e.printStackTrace();
      } finally {
          try {
              // close the response
              if (response != null) {
                  response.close();
              }
              // do NOT close httpClient here: its connection must go back to the pool
              // httpClient.close();
          } catch (IOException e) {
              e.printStackTrace();
          }
      }
      return content; 
}
}
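
Before the Jsoup part, here is a minimal sketch of how the pool class above could be driven on its own. The CrawlerMain class is my own illustration and is not part of the original project or repository:

package utils;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

// Hypothetical driver (illustration only): build one shared connection manager
// and fetch the epidemic page once through the pooled client.
public class CrawlerMain {

    public static void main(String[] args) throws Exception {
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(100);          // at most 100 pooled connections in total
        cm.setDefaultMaxPerRoute(10); // at most 10 connections per host

        String html = HttpClientPool.doGet(cm);
        System.out.println(html == null ? "no content" : "fetched " + html.length() + " chars");
    }
}

The Jsoup half of the crawler, which parses this HTML and writes it to the database, is next.
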
package utils;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import dao.dao;
import entity.Info;
public class Jsouputil {

    /**
     * Use Jsoup CSS selectors to pull the per-province figures out of the page.
     */
    public static void testSelector() throws Exception {
        // create the connection pool manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // fetch the page HTML as a string
        String content = HttpClientPool.doGet(cm);

        // parse the string into a Document object
        Document doc = Jsoup.parse(content);
//        System.out.println(doc.toString());

        // [attr=value]: select elements by attribute value
        Elements elements = doc.select("div[class=todaydata]").select("div[class=prod]");
        System.out.println(elements.toString());
        dao dao = new dao();
        for (Element ele : elements) {
            String province = ele.select("span[class=area]").text();
            String Confirmed_num = ele.select("span[class=confirm]").text();
            String Dead_num = ele.select("span[class=dead]").text();
            String Cured_num = ele.select("span[class=cured]").text();
            Info info = new Info(province, Confirmed_num, Dead_num, Cured_num);
            dao.add(info);
        }
    }
}
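
The entity.Info and dao.dao classes used by testSelector are not shown in this post (they live in the GitHub repository linked below). A minimal sketch of what they might look like, assuming a MySQL table named info with plain VARCHAR columns; the JDBC URL, the credentials, and the field/column names are my own guesses for illustration:

// entity/Info.java (hypothetical sketch)
package entity;

public class Info {
    private String province;
    private String confirmedNum;
    private String deadNum;
    private String curedNum;

    public Info(String province, String confirmedNum, String deadNum, String curedNum) {
        this.province = province;
        this.confirmedNum = confirmedNum;
        this.deadNum = deadNum;
        this.curedNum = curedNum;
    }

    public String getProvince() { return province; }
    public String getConfirmedNum() { return confirmedNum; }
    public String getDeadNum() { return deadNum; }
    public String getCuredNum() { return curedNum; }
}

// dao/dao.java (hypothetical sketch; connection details and table layout are assumptions)
package dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

import entity.Info;

public class dao {

    private static final String URL = "jdbc:mysql://localhost:3306/epidemic?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    // insert one province's figures into the assumed info table
    public void add(Info info) {
        String sql = "INSERT INTO info (province, confirmed_num, dead_num, cured_num) VALUES (?, ?, ?, ?)";
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setString(1, info.getProvince());
            ps.setString(2, info.getConfirmedNum());
            ps.setString(3, info.getDeadNum());
            ps.setString(4, info.getCuredNum());
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
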

I mainly referred to https://home.cnblogs.com/u/yeyueweiliang

Here are the problems I ran into along the way:

While looking for a site to crawl, I first picked this one:

https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1

but it turned out NetEase has anti-crawler protection, so I switched to another URL:

http://m.sinovision.net/newpneumonia.php
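
As an aside, a common way to make simple anti-crawler checks less likely to trigger is to send browser-like request headers. My code does not do this (I simply switched sites); the snippet below is only a sketch of how the HttpGet in doGet could be built instead, and the helper class is hypothetical:

package utils;

import org.apache.http.client.methods.HttpGet;

// Hypothetical helper: build an HttpGet that presents itself like a normal browser.
public class BrowserLikeRequest {

    public static HttpGet buildGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0 Safari/537.36");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml");
        return httpGet;
    }
}
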

On that page, the data I need sits inside the div.todaydata block, one div.prod per province, with spans for area, confirm, dead, and cured (the screenshot marking the exact spot is omitted here).

Partway through, the data also failed to import into the database; it turned out the reference project's table structure differed from my existing database, which is why no chart appeared at first.

After that came a series of issues such as Tomcat crashing; they were all resolved by restarting the service, switching the port, or rebooting the machine.

For this crawler exercise, I spent three hours studying, two hours on hands-on practice, and two hours troubleshooting.

The full source code is in my GitHub repository:

https://github.com/diaolingjun/-2
