爬取自己的csdn目录

csdn目录的分页:
https://blog.csdn.net/qq_38366063/article/list/1
URL 末尾的数字就是页码(例如上面的 1 表示第 1 页),简单起见,直接循环递增页码逐页抓取即可。

导包就不细说了:

	<!-- HtmlUnit: https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
		<dependency>
			<groupId>net.sourceforge.htmlunit</groupId>
			<artifactId>htmlunit</artifactId>
			<version>2.35.0</version>
		</dependency>
		<!-- HTML parsing (jsoup) -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.3</version>
		</dependency>
	<!-- XDocReport docx (XWPF) converter -->
	<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
			<version>2.0.1</version>
		</dependency>
		
		<!-- Alibaba JSON parser (fastjson) -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.31</version>
		</dependency>

		<!-- Apache Commons Text: https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
			<version>1.4</version>
		</dependency>

 /**
  * Entry point: walks the paginated article list of one CSDN blog and prints each
  * article as a Markdown link.
  *
  * <p>List pages are addressed as {@code .../article/list/<page>} with page numbers
  * starting at 1, so the loop runs from 1 to the page count inclusive.
  *
  * @param args unused
  */
 public static void main(String[] args) {
		String name = "qq_38366063";
		String url = "https://blog.csdn.net/"+name+"/article/list/";

		// Number of list pages to crawl.
		int pageCount = 14;
		// One list shared by all pages so the collected URLs are not thrown away per page.
		List<String> urlList = new ArrayList<String>();
		for (int page = 1; page <= pageCount; page++) {
			String oneUrl = url + page;
			try {
				getCSDNArticleUrlList2(name, oneUrl, urlList);
			} catch (IOException e) {
				// A failing page must not abort the remaining pages; report it and go on.
				System.err.println("Failed to fetch " + oneUrl);
				e.printStackTrace();
			}
		}

    }

    /**
     * Downloads one article-list page, prints every article that belongs to the given
     * blog as a Markdown link ({@code [title](url)}) and appends its URL to {@code urlList}.
     *
     * <p>Returns without doing anything when the page has no {@code div.article-list}
     * element.
     *
     * @param name    blog user name; only links whose href contains it are kept
     * @param oneUrl  URL of the list page to fetch
     * @param urlList receives the href of every kept article, in page order
     * @throws IOException if fetching or reading the page fails
     */
    public static void getCSDNArticleUrlList2(String name, String oneUrl, List<String> urlList)
            throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String content;
        // Close the response stream as soon as the body has been read.
        try (InputStream inputStream = HttpUtil.doGet(oneUrl)) {
            content = StreamUtil.inputStreamToString(inputStream, "UTF-8");
        }

        Document doc = Jsoup.parse(content);
        Element articleList = doc.select("div.article-list").first();
        if (articleList == null) {
            return;
        }

        for (Element item : articleList.select("div.article-item-box")) {
            Element linkNode = item.select("h4 a").first();
            if (linkNode == null) {
                // Item without a title link: nothing to record.
                continue;
            }
            String href = linkNode.attr("href");
            // For an unknown reason the first entry of every blog list is
            // https://blog.csdn.net/yoyo_liyy/article/details/82762601, so keep only
            // links that belong to the requested blog.
            if (!href.contains(name)) {
                continue;
            }
            // The original code takes the second text node of the link as the title
            // (presumably the first one precedes a badge element; confirm against the
            // page markup). Fall back to the full link text when it is missing.
            List<TextNode> textNodes = linkNode.textNodes();
            String title = textNodes.size() > 1
                    ? String.valueOf(textNodes.get(1))
                    : linkNode.text();
            System.out.println("[" + title + "](" + href + ")");
            urlList.add(href);
        }
    }

用到的两个工具方法:HttpUtil 中的 doGet,以及把输入流转换成字符串的 inputStreamToString:

 /**
  * Opens an HTTP GET connection to {@code urlstr} and returns the response body stream.
  *
  * <p>The request carries a desktop-Chrome {@code User-Agent}, a browser-like
  * {@code accept} header, any caller-supplied headers (which may override those two),
  * and an {@code x-forwarded-for} header holding a random dotted address whose four
  * parts are each in the range 100..199.
  *
  * @param urlstr  absolute URL to request
  * @param headers extra request headers, or {@code null} for none
  * @return the response body stream; the caller is responsible for closing it
  * @throws IOException if the URL is malformed, the connection fails or times out
  */
 public static InputStream doGet(String urlstr, Map<String, String> headers) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // The default timeout of 0 means "wait forever"; bound both phases so a stalled
        // server cannot hang the crawler.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 " +
                "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
        conn.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp," +
                "image/apng,*/*;q=0" +
                ".8");

        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                conn.setRequestProperty(header.getKey(), header.getValue());
            }
        }

        Random random = new Random();
        StringBuilder ip = new StringBuilder();
        for (int part = 0; part < 4; part++) {
            if (part > 0) {
                ip.append('.');
            }
            ip.append(random.nextInt(100) + 100);
        }
        conn.setRequestProperty("x-forwarded-for", ip.toString());
        return conn.getInputStream();
    }

  /**
   * Reads {@code is} to the end and decodes all of its bytes with {@code charset}.
   *
   * <p>The bytes are collected first and decoded once. Decoding each read chunk on its
   * own would corrupt any multi-byte character that is split across two chunks.
   *
   * @param is      stream to drain; it is not closed by this method
   * @param charset name of the charset used to decode the bytes, e.g. {@code "UTF-8"}
   * @return the decoded content, or an empty string if the stream has no data
   * @throws IOException if reading fails or the charset name is not supported
   */
  public static String inputStreamToString(InputStream is, String charset) throws IOException {
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int read;
        while ((read = is.read(chunk)) != -1) {
            collected.write(chunk, 0, read);
        }
        return collected.toString(charset);
    }

爬取结果:
在这里插入图片描述
然后直接拷到csdn写文章里面去就可以了:
https://blog.csdn.net/qq_38366063/article/details/101760545

posted @ 2019-09-30 11:46  你就像甜甜的益达  阅读(80)  评论(0)  编辑  收藏  举报