java提取(获取)博客信息(内容)

package com.wbg.my.service;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes the post list of the cnblogs blog "weibanggang" and exports a
 * single HTML index page listing every post (link, title, publication date).
 *
 * <p>Workflow of {@link #main(String[])}: collect the detail-page URLs from
 * the paginated list, download each detail page to extract its title and
 * date, then write an HTML file whose embedded script renders the table.
 *
 * <p>The class keeps its working state in static fields and is therefore
 * not safe for concurrent use.
 *
 * @author Jack Chen
 */
public class BlogUtil {

    /** Base URL of the paginated post list; the page number is appended. */
    public static final String URL_PAGE = "https://www.cnblogs.com/weibanggang/default.html?page=";

    /**
     * Regular expression for a post detail URL. Group 1 captures the file
     * name, e.g. {@code 10019253.html}. The dots are escaped so that they
     * only match a literal '.' and not an arbitrary character.
     */
    public static final String URL_PAGE_DETAIL = "https://www\\.cnblogs\\.com/weibanggang/p/([0-9]+\\.html)";

    /** Number of list pages to scan (pages 1..PAGE_COUNT). */
    public static final int PAGE_COUNT = 20;

    /** All detail-page URLs found so far; a sorted set, so duplicates are dropped. */
    public static Set<String> urlLists = new TreeSet<String>();

    /** Compiled form of {@link #URL_PAGE_DETAIL}. */
    public static final Pattern p = Pattern.compile(URL_PAGE_DETAIL);

    /**
     * Path of the exported HTML file.
     * NOTE(review): "d:index.html" is drive-relative on Windows (current
     * directory of drive D) - possibly "d:/index.html" was intended; confirm.
     */
    public static String file="d:index.html";

    /** One JavaScript array literal per post, filled by {@link #createFile(String)}. */
    static String  [] arr=null;

    /** Index of the next free slot in {@link #arr}; advanced by {@link #main(String[])}. */
    static int sun=0;

    /** Separator put in front of the next entry: empty for the first one, "," afterwards. */
    static String fh="";

    /** Character set of the downloaded pages and of the exported file. */
    private static final String CHARSET = "utf-8";

    /** Literal prefix of every post detail URL. */
    private static final String DETAIL_PREFIX = "https://www.cnblogs.com/weibanggang/p/";

    /** Suffix that the blog appends to every page title. */
    private static final String TITLE_SUFFIX = "- 韦邦杠 - 博客园";

    /**
     * Runs the complete export: collects the post URLs, downloads every
     * post, and writes the HTML index to {@link #file}.
     *
     * @param args unused
     * @throws Exception if a page cannot be downloaded or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Reset the accumulators so that a second call in the same JVM starts clean.
        sun = 0;
        fh = "";

        for (int page = 1; page <= PAGE_COUNT; page++) {
            getUrls(page);
        }

        System.out.println("开始获取内容!");
        arr = new String[urlLists.size()];
        for (String url : urlLists) {
            createFile(url);
            sun++;
        }
        System.out.println("获取内容完毕!");

        System.out.println("开始写入文件!");
        StringBuffer stringBuffer = new StringBuffer(kais());
        for (int i = 0; i < arr.length; i++) {
            stringBuffer.append(arr[i]);
        }
        stringBuffer.append(jiehun());
        System.out.println("写入文件完毕!");

        System.out.println("开始导出文件!");
        createFile(file, stringBuffer);
        System.out.println("导出文件完毕!");
        System.out.println("输出文件地址为:" + file);
    }

    /**
     * Writes the generated page to disk, replacing any existing file.
     *
     * <p>The content is encoded as UTF-8 because the generated page declares
     * {@code <meta charset="utf-8">}; the platform default charset would
     * corrupt non-ASCII titles on systems where it differs. Failures are
     * propagated instead of being printed and ignored, so a failed export is
     * never reported as finished.
     *
     * @param file   path of the file to write
     * @param buffer complete content of the file
     * @throws IOException if the file cannot be created or written
     */
    private static void createFile(String file, StringBuffer buffer) throws IOException {
        Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), CHARSET));
        try {
            out.write(buffer.toString());
        } finally {
            out.close();
        }
    }

    /**
     * Returns the first part of the generated page: the HTML head, the empty
     * table, and the opening of the script up to {@code var sum=[}. The post
     * entries and then {@link #jiehun()} must be appended to it.
     *
     * @return the page header
     */
    public static String kais(){
        return "<!DOCTYPE html>\n" +
                "<html>\n" +
                "<head>\n" +
                "    <meta charset=\"utf-8\">\n" +
                "    <title>weibanggang.github.io</title>\n" +
                "    <meta name=\"renderer\" content=\"webkit\">\n" +
                "    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\">\n" +
                "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1, maximum-scale=1\">\n" +
                "    <style>\n" +
                "        html,body{width:100%;height: 100%}\n" +
                "        table{width: 1150px;height:500px;margin: auto}\n" +
                "        table,td,th{border: 1px solid #e6e6e6;border-collapse:collapse; }\n" +
                "        body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat}         body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat}\n" +
                "        * { margin: 0; padding: 0; }\n" +
                "        table { border-collapse: collapse; text-align: center;  }\n" +
                "        /*关键设置 tbody出现滚动条*/\n" +
                "        table tbody {\n" +
                "            display: block;\n" +
                "            height: 500px;\n" +
                "            overflow-y: scroll;overflow-x:hidden;\n" +
                "        }\n" +
                "  table thead,  tbody tr { display: table;width: 100%; table-layout: fixed;  }\n" +
                "        table thead th {  height: 40px  }\n" +
                "        table tbody td {height: 30px }\n" +
                "    </style>\n" +
                "</head>\n" +
                "\n" +
                "<body>\n" +
                "<marquee><h1 style=\"color:white;\">本网页仅作为参考博客、github等地址</h1></marquee>\n" +
                "<table width=\"80%\" border=\"1\">\n" +
                "    <thead>\n" +
                "    <tr>\n" +
                "        <th style=\"width:230px\">序号</th>\n" +
                "        <th style=\"width:231px\">标题链接</th>\n" +
                "        <th style=\"width:231px\">时间</th>\n" +
                "        <th style=\"width:231px\">来源</th>\n" +
                "        <th style=\"width:249px\">备注</th>\n" +
                "    </tr>\n" +
                "    </thead>\n" +
                "    <tbody>\n" +
                "\n" +
                "    </tbody>\n" +
                "</table>\n" +
                "</body>\n" +
                "<script src=\"js/jquery.js\"></script>\n" +
                "<script>\n" +
                "    var sum=[";
    }

    /**
     * Returns the last part of the generated page: it closes the
     * {@code sum} array opened by {@link #kais()} and contains the script
     * that turns every entry into a table row.
     *
     * @return the page footer
     */
    public static String jiehun(){
        return " ];\n" +
                "    \n" +
                "    for(var i=0;i<sum.length;i++){\n" +
                "        var tr=$(\"<tr/>\");\n" +
                "            //序号\n" +
                "            $(\"<td/>\").html(i+1).appendTo(tr);\n" +
                "            //标题链接\n" +
                "            var a=\"<a href='\"+sum[i][0]+\"' target='_blank'>\"+sum[i][1]+\"</a>\"\n" +
                "            $(\"<td/>\").html(a).appendTo(tr);\n" +
                "            //时间\n" +
                "            $(\"<td/>\").html(sum[i][2]).appendTo(tr);\n" +
                "            //来源\n" +
                "            $(\"<td/>\").html(sum[i][3]).appendTo(tr);\n" +
                "            //备注\n" +
                "            $(\"<td/>\").html(sum[i][4]).appendTo(tr);\n" +
                "            $(\"table tbody\").append(tr);\n" +
                "    }\n" +
                "</script>\n" +
                "</html>";
    }

    /**
     * Downloads one post and stores its table entry in {@code arr[sun]}.
     *
     * <p>Despite its name this method creates no file: it builds the
     * JavaScript array literal {@code ["href","title","date","博客","正常"]}
     * for the post, prefixed with {@link #fh}, and afterwards sets
     * {@link #fh} to "," so that later entries are comma separated. The
     * caller is responsible for advancing {@link #sun}.
     *
     * @param url URL of the post detail page; must match {@link #p}
     * @throws IllegalArgumentException if {@code url} is not a post detail URL
     * @throws Exception if the page cannot be downloaded
     */
    private static void createFile(String url) throws Exception {
        Matcher m = p.matcher(url);
        if (!m.find()) {
            // Matcher.group() would otherwise fail with an opaque IllegalStateException.
            throw new IllegalArgumentException("Not a post detail URL: " + url);
        }
        String fileName = m.group(1);

        // The lines are joined without separators before title and date are extracted.
        StringBuffer s = new StringBuffer();
        for (String line : readLines(url)) {
            s.append(line);
        }

        String href = DETAIL_PREFIX + fileName;
        // Title and date are embedded in a JavaScript string literal, so
        // quotes and backslashes must be escaped to keep the script valid.
        String title = escapeJs(getTitle(s));
        String data = escapeJs(getDate(s));
        arr[sun] = fh + "[\"" + href + "\",\"" + title + "\",\"" + data + "\",\"博客\",\"正常\"]";
        fh = ",";
    }

    /**
     * Extracts the publication date, i.e. the text inside
     * {@code <span id="post-date">...</span>}.
     *
     * @param sb HTML of a post detail page
     * @return the date text, or an empty string if {@code sb} is
     *         {@code null} or contains no complete post-date span
     */
    public static String getDate(StringBuffer sb){
        final String open = "<span id=\"post-date\">";
        if (sb == null) {
            return "";
        }
        int start = sb.indexOf(open);
        if (start < 0) {
            return "";
        }
        start += open.length();
        int end = sb.indexOf("</span>", start);
        if (end < 0) {
            return "";
        }
        return sb.substring(start, end);
    }

    /**
     * Extracts the post title, i.e. the content of the {@code <title>}
     * element without the blog suffix "- 韦邦杠 - 博客园" and without
     * surrounding whitespace.
     *
     * @param sb HTML of a post detail page
     * @return the title; the whole element text if the suffix is absent; an
     *         empty string if {@code sb} is {@code null} or has no complete
     *         title element
     */
    public static String getTitle(StringBuffer sb){
        final String open = "<title>";
        if (sb == null) {
            return "";
        }
        int start = sb.indexOf(open);
        if (start < 0) {
            return "";
        }
        start += open.length();
        // Search for the closing tag only behind the opening tag.
        int end = sb.indexOf("</title>", start);
        if (end < 0) {
            return "";
        }
        String title = sb.substring(start, end);
        int suffix = title.lastIndexOf(TITLE_SUFFIX);
        if (suffix >= 0) {
            title = title.substring(0, suffix);
        }
        return title.trim();
    }

    /**
     * Scans one page of the post list and adds every post detail URL found
     * on it to {@link #urlLists}.
     *
     * @param idx 1-based number of the list page
     * @throws Exception if the page cannot be downloaded
     */
    private static void getUrls(int idx) throws Exception{
        for (String line : readLines(URL_PAGE + idx)) {
            Matcher m = p.matcher(line);
            // A single line may contain several links, so collect all matches.
            while (m.find()) {
                urlLists.add(m.group());
            }
        }
    }

    /**
     * Downloads a page and returns its lines. The reader and the connection
     * are released even if reading fails.
     */
    private static List<String> readLines(String address) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(address).openConnection();
        try {
            conn.connect();
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), CHARSET));
            try {
                List<String> lines = new ArrayList<String>();
                String line;
                while ((line = reader.readLine()) != null) {
                    lines.add(line);
                }
                return lines;
            } finally {
                reader.close();
            }
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Escapes text for use inside a double-quoted JavaScript string literal
     * that is embedded in a script element.
     */
    private static String escapeJs(String text) {
        StringBuilder out = new StringBuilder(text.length() + 8);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                    out.append("\\\\");
                    break;
                case '"':
                    out.append("\\\"");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '/':
                    // Write "</" as "<\/" so the text can never close the script element.
                    if (i > 0 && text.charAt(i - 1) == '<') {
                        out.append("\\/");
                    } else {
                        out.append('/');
                    }
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

}

 

posted @ 2018-11-26 11:37  韦邦杠  阅读(753)  评论(0)  编辑  收藏  举报