2021.11.2

今天对中图分类号网站进行了解析,按照层级对分类进行遍历。由于普通类中无法使用 @Autowired 注入依赖,这里改用带 @Test 注解的测试方法来运行。

/**
 * Fetches a single category page from clcindex.com and walks its item rows.
 *
 * <p>For every element whose {@code name} attribute is {@code item-row}, the
 * second {@code <td>} is stored as the entry name and the third as its content;
 * the link found in the third cell is turned into an absolute URL and pushed
 * onto a stack of child pages. The parent code is derived from the page URL by
 * stripping the category prefix and a trailing slash.
 *
 * <p>NOTE(review): the populated {@code CLCNumber} objects and the stack of
 * child URLs are not consumed anywhere in this method; persisting them and
 * visiting the child pages is presumably done elsewhere - confirm.
 *
 * @throws Exception if the HTTP request or reading the response body fails
 */
@Test
    public void testCode() throws Exception {
        final String siteRoot = "https://www.clcindex.com";
        final String categoryRoot = "https://www.clcindex.com/category/";
        String url = "https://www.clcindex.com/category/T/";
        Stack<String> stack = new Stack<>();
        System.out.println(url);

        // The parent code and the root check depend only on the page URL,
        // so compute them once instead of once per row.
        String parent = url.replace(categoryRoot, "");
        if (parent.endsWith("/")) {
            parent = parent.substring(0, parent.length() - 1);
        }
        boolean isRootPage = url.equals(categoryRoot);

        HttpGet httpGet = new HttpGet(url);
        // try-with-resources closes the response first and then the client,
        // so the connection is released even if parsing throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() != 200) {
                return;
            }
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            Document document = Jsoup.parse(content);

            // Jsoup returns an empty Elements (never null) when nothing matches.
            Elements rows = document.getElementsByAttributeValue("name", "item-row");
            for (Element row : rows) {
                Elements tds = row.select("td");
                CLCNumber clcNumber = new CLCNumber();

                // Second cell: entry name.
                if (tds.size() > 1) {
                    String name = tds.get(1).text();
                    clcNumber.setName(name);
                    System.out.println(name);
                }

                // Third cell: entry content plus the link to the child page.
                if (tds.size() > 2) {
                    Element contentCell = tds.get(2);
                    clcNumber.setContent(contentCell.text());
                    System.out.println("href:");
                    System.out.println(contentCell);
                    // Percent-encode brackets and braces that appear in the
                    // link so the resulting URL can be requested later.
                    String href = contentCell.select("a").attr("href")
                            .replace("[", "%5B")
                            .replace("]", "%5D")
                            .replace("{", "%7B")
                            .replace("}", "%7D");
                    System.out.println(href);
                    System.out.println(siteRoot + href);
                    // attr() yields "" when the cell has no link; pushing the
                    // bare site root would not be a child page, so skip it.
                    if (!href.isEmpty()) {
                        stack.push(siteRoot + href);
                    }
                    System.out.println(contentCell.text());
                }

                // Only rows listed on the root category page get label "1".
                if (isRootPage) {
                    clcNumber.setLabel("1");
                }
                clcNumber.setParent(parent);
            }
        }
    }
posted @ 2021-11-02 21:58  夜月薇凉映银弩  阅读(45)  评论(0)  编辑  收藏  举报