爬虫--cnblogs采集程序

  小偷采集,有经验的猿猿应该都会做,我借此做一个回忆。

  2013年我就职盛大文学,当时因文学版权、流量等问题,做了一套监控系统,用来监控当时的创世、龙空、纵横等等比较知名中文网站。对于监控,我还可以自吹一下经验满满。

  当下社会,正是监控系统兴风作浪的大好时机,而且有利可图。

  举例说明一下:

  1、公共wifi,收集用户的部分信息,地理坐标,连接时长,搜索内容等等,通过数据分析给用户贴上标签[吃货]、[美娇娘]、[剁手党] 等等,然后将信息打包贩卖,基本能买到好几块一条呐。

  2、通过采集获得大量的优秀文章,将其精修后可作为一些书籍的底稿来用,而且这些底稿可以换钱滴。

  3、搜索大网站的用户信息,比如就搜索cnblogs的推荐博客博主,把他们的信息down下来,整理分析,贴上[淫才]标签,打包贩卖,至少得30块一条吧。

  4、如果能有幸通过一些高级酒店的网站,搜索到一些零碎的用户信息,并将其整理、拼接、合成比较完整的用户信息,这些皇冠级用户信息拿到4S、售楼处贩卖,怎么着一条信息也得小一百吧。

...

  不用再写下去了,监控能创造出巨额利益。

  开始教做最简单的小偷。当然这篇小偷偷的就是cnblogs,学会了可不要乱搞,搞坏了谁负责?

java编写,需要引入绿色框的包 mysql-connector-java-5.1.13.jar,用来连接mysql数据库,如果采集信息不入库,则可以不用下载此包。
jdk1.8.0_112

黄色框内是方法:

downhtml 下载HTML内容

downImages 下载图片

GetDocument 下载博客日志

GetList 下载精华区列表

InsertMysql 博客入库

main 程序入口

 

蓝色框内是正则匹配式和一些配置信息:

contectPattern 正则匹配出内容

imgPattern 正则匹配出图片

listPattern 正则匹配出精华区列表

localPath 下载的图片本地存放路径

pickFormat 精华区url Formatter

webSite 你的网站

 

主要结构

蓝色框总剖析

    // Format string for a page of the "pick" (featured posts) list; %s is the page number.
    private static final String pickFormat = "https://www.cnblogs.com/pick/%s/";

    // Matches a double-quoted http(s) image URL ending in png/jpg/gif; group "tp" is the extension.
    // The dot before the extension is escaped so that it only matches a literal '.'.
    private static final Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+)\\.(?<tp>PNG|png|JPG|jpg|GIF|gif)\"");

    // Matches one entry of the featured list; groups: url, title, connect (summary), ping, yue.
    private static final Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>");

    // Matches the post body; group "conect" is the HTML between the body div and the signature div.
    private static final Pattern contectPattern = Pattern.compile("<div id=\"cnblogs_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>");

    // Public URL prefix under which the downloaded images are served.
    private static final String webSite = "http://www.Website.com/loadimages/";

    // Local directory the downloaded images are written to.
    private static final String localPath = "D:\\WWW\\loadimages\\";

 

DownHtml剖析

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * <p>The response is decoded as UTF-8 and read line by line; the lines are
     * joined with "\r\n" and no trailing line break is appended.
     *
     * @param  url
     *         address of the page to download
     * @return the downloaded text (empty if the response contains no lines)
     * @throws UnsupportedEncodingException if the "utf-8" charset is not available
     * @throws IOException if the URL is malformed or the transfer fails or times out
     */
    static String downhtml(String url) throws UnsupportedEncodingException, IOException {
        URLConnection conn = new URL(url).openConnection();
        // Bound both phases so a stalled server cannot hang the crawl forever.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);

        StringBuilder html = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even if a read fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (html.length() > 0) {
                    html.append("\r\n");
                }
                html.append(line);
            }
        }
        return html.toString();
    }

 

DownImages剖析

    /**
     * Downloads a remote image and writes it to a local file.
     *
     * <p>Failures are reported on stderr and otherwise ignored, so the caller
     * continues with the next image.
     *
     * @param  imgUrl
     *         URL of the image to download
     * @param  imgName
     *         full path of the local file to write
     */
    static void downImages(String imgUrl, String imgName) {
        System.out.println("downfile --> " + imgName + "\t" + imgUrl);
        try {
            URLConnection conn = new URL(imgUrl).openConnection();
            conn.setConnectTimeout(10000);
            conn.setReadTimeout(30000);
            // Both streams are closed automatically, including when the copy fails midway.
            try (InputStream inStream = conn.getInputStream();
                 FileOutputStream fs = new FileOutputStream(imgName)) {
                byte[] buffer = new byte[8192];
                int byteread;
                while ((byteread = inStream.read(buffer)) != -1) {
                    fs.write(buffer, 0, byteread);
                }
            }
        } catch (IOException e) {
            // Printing e.getStackTrace() would only show the array's identity, not the trace.
            e.printStackTrace();
        }
    }

 

GetList剖析

    /**
     * Walks the featured-post list pages 1 to 79 and downloads every post found on them.
     *
     * <p>The crawl pauses 7777 ms before each list page and after each post to
     * keep the request rate low. A failure on one page is reported on stderr
     * and the crawl continues with the next page; an interrupt stops the crawl.
     */
    static void GetList() {
        for (int i = 1; i < 80; i++) {
            try {
                Thread.sleep(7777);
                // Page 1 resolves to https://www.cnblogs.com/pick/1/
                String pageUrl = String.format(pickFormat, i);
                String html = downhtml(pageUrl);
                Matcher listMatcher = listPattern.matcher(html);
                while (listMatcher.find()) {
                    String title = listMatcher.group("title");
                    String postUrl = listMatcher.group("url");
                    System.out.println(title + "\t" + postUrl);
                    GetDocument(postUrl, title);
                    Thread.sleep(7777);
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop instead of silently carrying on.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Best effort: report the failure and move on to the next list page.
                e.printStackTrace();
            }
        }
    }


GetDocument剖析

    /**
     * Downloads one blog post, mirrors its images locally and stores the post.
     *
     * <p>Every distinct image URL found in the post body is downloaded once to
     * {@code localPath} under a random file name, and its occurrences in the
     * body are replaced with the matching {@code webSite} URL. Nothing is
     * stored when the body pattern does not match the page.
     *
     * @param url   address of the blog post
     * @param title title of the post, stored alongside the body
     */
    static void GetDocument(String url, String title) {
        try {
            String html = downhtml(url);
            System.out.println("html.length --> " + html.length());
            Matcher contectMatcher = contectPattern.matcher(html);
            if (!contectMatcher.find()) {
                return;
            }
            String content = contectMatcher.group("conect");

            // Original image URL -> replacement URL; also prevents downloading duplicates.
            HashMap<String, String> map = new HashMap<String, String>();
            Matcher imgMatcher = imgPattern.matcher(content);
            while (imgMatcher.find()) {
                String matVal = imgMatcher.group();
                // Strip the surrounding double quotes captured by the pattern.
                String webUrl = matVal.substring(1, matVal.length() - 1);
                if (map.containsKey(webUrl)) {
                    continue;
                }

                String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");
                String fullName = localPath + fileName;
                String webFileName = webSite + fileName;
                downImages(webUrl, fullName);
                map.put(webUrl, webFileName);
                try {
                    Thread.sleep(333);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the caller; finish the current post first.
                    Thread.currentThread().interrupt();
                }
            }

            // Point the image references at the mirrored copies.
            for (Entry<String, String> entry : map.entrySet()) {
                content = content.replace(entry.getKey(), entry.getValue());
            }

            // Bodies shorter than 50 characters must not break the preview.
            System.out.println("Match content --> " + content.substring(0, Math.min(50, content.length())));

            InsertMysql(title, content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

 

InsertMysql剖析

        //博客入库
    /**
     * Inserts one post into the {@code wordpress.wp_posts} table as a published post.
     *
     * <p>Title and body are bound as statement parameters. Failures are
     * reported on stderr and otherwise ignored.
     *
     * @param title   value for the post_title column
     * @param content value for the post_content column
     */
    static void InsertMysql(String title, String content) {
        String url = "jdbc:mysql://localhost:3306/wordpress";
        String user = "root";
        String password = "root";

        String sql = "INSERT INTO wordpress.wp_posts "
                + "(post_author, post_date, post_date_gmt, post_content, "
                + "post_title, post_excerpt, post_status, comment_status, "
                + "ping_status, post_password, post_name, to_ping, pinged, "
                + "post_modified, post_modified_gmt, post_content_filtered, "
                + "post_parent, guid, menu_order, post_type, post_mime_type, "
                + "comment_count)"
                + "VALUES"
                + "(1, now(), now(), ?, ?, '', 'publish', 'open', 'open', "
                + "'', '', '', '', now(), now(), '', 0, '', 0, 'post', '', 0);";

        try {
            Class.forName("com.mysql.jdbc.Driver");
            // Statement and connection are closed automatically, in that order.
            try (Connection connection = DriverManager.getConnection(url, user, password);
                 PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
                preparedStatement.setString(1, content);
                preparedStatement.setString(2, title);
                System.out.println(preparedStatement.executeUpdate() + " " + title);
            }
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
    }

 

完整代码(本人喜欢随性乱涂,代码基本无注释,抱歉)

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Command-line crawler that walks the cnblogs featured-post list, downloads
 * each post together with its images, and stores the posts in a WordPress
 * database.
 *
 * <p>Check the target site's terms of use before running a crawler against it.
 */
public class appCnblogsCollect {

    // Format string for a page of the "pick" (featured posts) list; %s is the page number.
    private static final String pickFormat = "https://www.cnblogs.com/pick/%s/";

    // Matches a double-quoted http(s) image URL ending in png/jpg/gif; group "tp" is the extension.
    // The dot before the extension is escaped so that it only matches a literal '.'.
    private static final Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+)\\.(?<tp>PNG|png|JPG|jpg|GIF|gif)\"");

    // Matches one entry of the featured list; groups: url, title, connect (summary), ping, yue.
    private static final Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>");

    // Matches the post body; group "conect" is the HTML between the body div and the signature div.
    private static final Pattern contectPattern = Pattern.compile("<div id=\"cnblogs_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>");

    // Public URL prefix under which the downloaded images are served.
    private static final String webSite = "http://www.Website.com/loadimages/";

    // Local directory the downloaded images are written to.
    private static final String localPath = "D:\\WWW\\loadimages\\";

    /** Program entry point: runs the whole crawl once. */
    public static void main(String[] args) throws Exception {
        GetList();
    }

    /**
     * Walks the featured-post list pages 1 to 79 and downloads every post found on them.
     *
     * <p>The crawl pauses 7777 ms before each list page and after each post to
     * keep the request rate low. A failure on one page is reported on stderr
     * and the crawl continues with the next page; an interrupt stops the crawl.
     */
    static void GetList() {
        for (int i = 1; i < 80; i++) {
            try {
                Thread.sleep(7777);
                // Page 1 resolves to https://www.cnblogs.com/pick/1/
                String pageUrl = String.format(pickFormat, i);
                String html = downhtml(pageUrl);
                Matcher listMatcher = listPattern.matcher(html);
                while (listMatcher.find()) {
                    String title = listMatcher.group("title");
                    String postUrl = listMatcher.group("url");
                    System.out.println(title + "\t" + postUrl);
                    GetDocument(postUrl, title);
                    Thread.sleep(7777);
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop instead of silently carrying on.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Best effort: report the failure and move on to the next list page.
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads one blog post, mirrors its images locally and stores the post.
     *
     * <p>Every distinct image URL found in the post body is downloaded once to
     * {@code localPath} under a random file name, and its occurrences in the
     * body are replaced with the matching {@code webSite} URL. Nothing is
     * stored when the body pattern does not match the page.
     *
     * @param url   address of the blog post
     * @param title title of the post, stored alongside the body
     */
    static void GetDocument(String url, String title) {
        try {
            String html = downhtml(url);
            System.out.println("html.length --> " + html.length());
            Matcher contectMatcher = contectPattern.matcher(html);
            if (!contectMatcher.find()) {
                return;
            }
            String content = contectMatcher.group("conect");

            // Original image URL -> replacement URL; also prevents downloading duplicates.
            HashMap<String, String> map = new HashMap<String, String>();
            Matcher imgMatcher = imgPattern.matcher(content);
            while (imgMatcher.find()) {
                String matVal = imgMatcher.group();
                // Strip the surrounding double quotes captured by the pattern.
                String webUrl = matVal.substring(1, matVal.length() - 1);
                if (map.containsKey(webUrl)) {
                    continue;
                }

                String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");
                String fullName = localPath + fileName;
                String webFileName = webSite + fileName;
                downImages(webUrl, fullName);
                map.put(webUrl, webFileName);
                try {
                    Thread.sleep(333);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the caller; finish the current post first.
                    Thread.currentThread().interrupt();
                }
            }

            // Point the image references at the mirrored copies.
            for (Entry<String, String> entry : map.entrySet()) {
                content = content.replace(entry.getKey(), entry.getValue());
            }

            // Bodies shorter than 50 characters must not break the preview.
            System.out.println("Match content --> " + content.substring(0, Math.min(50, content.length())));

            InsertMysql(title, content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * <p>The response is decoded as UTF-8 and read line by line; the lines are
     * joined with "\r\n" and no trailing line break is appended.
     *
     * @param  url
     *         address of the page to download
     * @return the downloaded text (empty if the response contains no lines)
     * @throws UnsupportedEncodingException if the "utf-8" charset is not available
     * @throws IOException if the URL is malformed or the transfer fails or times out
     */
    static String downhtml(String url) throws UnsupportedEncodingException, IOException {
        URLConnection conn = new URL(url).openConnection();
        // Bound both phases so a stalled server cannot hang the crawl forever.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);

        StringBuilder html = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even if a read fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (html.length() > 0) {
                    html.append("\r\n");
                }
                html.append(line);
            }
        }
        return html.toString();
    }

    /**
     * Downloads a remote image and writes it to a local file.
     *
     * <p>Failures are reported on stderr and otherwise ignored, so the caller
     * continues with the next image.
     *
     * @param  imgUrl
     *         URL of the image to download
     * @param  imgName
     *         full path of the local file to write
     */
    static void downImages(String imgUrl, String imgName) {
        System.out.println("downfile --> " + imgName + "\t" + imgUrl);
        try {
            URLConnection conn = new URL(imgUrl).openConnection();
            conn.setConnectTimeout(10000);
            conn.setReadTimeout(30000);
            // Both streams are closed automatically, including when the copy fails midway.
            try (InputStream inStream = conn.getInputStream();
                 FileOutputStream fs = new FileOutputStream(imgName)) {
                byte[] buffer = new byte[8192];
                int byteread;
                while ((byteread = inStream.read(buffer)) != -1) {
                    fs.write(buffer, 0, byteread);
                }
            }
        } catch (IOException e) {
            // Printing e.getStackTrace() would only show the array's identity, not the trace.
            e.printStackTrace();
        }
    }

    /**
     * Inserts one post into the {@code wordpress.wp_posts} table as a published post.
     *
     * <p>Title and body are bound as statement parameters. Failures are
     * reported on stderr and otherwise ignored.
     *
     * @param title   value for the post_title column
     * @param content value for the post_content column
     */
    static void InsertMysql(String title, String content) {
        String url = "jdbc:mysql://localhost:3306/wordpress";
        String user = "root";
        String password = "root";

        String sql = "INSERT INTO wordpress.wp_posts "
                + "(post_author, post_date, post_date_gmt, post_content, "
                + "post_title, post_excerpt, post_status, comment_status, "
                + "ping_status, post_password, post_name, to_ping, pinged, "
                + "post_modified, post_modified_gmt, post_content_filtered, "
                + "post_parent, guid, menu_order, post_type, post_mime_type, "
                + "comment_count)"
                + "VALUES"
                + "(1, now(), now(), ?, ?, '', 'publish', 'open', 'open', "
                + "'', '', '', '', now(), now(), '', 0, '', 0, 'post', '', 0);";

        try {
            Class.forName("com.mysql.jdbc.Driver");
            // Statement and connection are closed automatically, in that order.
            try (Connection connection = DriverManager.getConnection(url, user, password);
                 PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
                preparedStatement.setString(1, content);
                preparedStatement.setString(2, title);
                System.out.println(preparedStatement.executeUpdate() + " " + title);
            }
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
    }
}
完整代码


以上是一个不需要用户身份验证的例子,在某些特殊的情况下,需要身份验证怎么办?不要着急,我下面还有一个例子,下面的例子是年末统计全年工作日报,而且这个工作日报还影响绩效和考核,可是,我一年没填了,从头写是不可能了,只能做个小工具,于是就有了它,随便乱弹的,看着丑就丑吧。

package ebooks;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;

import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;

public class appTimeSheet {
    public static void main(String[] args) throws Exception {
        String username = "username";
        String password = "yourpassword";
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
        String urlLogin = "http://10.54.11.37:8989/login";
        String urlSave = "http://10.54.11.37:8989/agenda/submission/saveTimereg.json";
        DefaultHttpClient client = new DefaultHttpClient(new PoolingClientConnectionManager());
        HttpPost loginHttpPost = new HttpPost(urlLogin);
        List<NameValuePair> loginPairs = new ArrayList<NameValuePair>();
        loginPairs.add(new BasicNameValuePair("username", username));
        loginPairs.add(new BasicNameValuePair("password", password));
        loginHttpPost.setEntity(new UrlEncodedFormEntity(loginPairs, "utf-8"));
        HttpResponse response = client.execute(loginHttpPost);
        System.out.println(response.getStatusLine());

        Calendar calendar = Calendar.getInstance();
        calendar.clear();
        calendar.set(2017, 11, 24);
        long end = calendar.getTimeInMillis();
        calendar.clear();
        calendar.set(2017, 11, 4);
        
        for (; calendar.getTimeInMillis() < end; calendar.add(Calendar.DATE, 1)) {
            if (calendar.get(Calendar.DAY_OF_WEEK) == Calendar.SUNDAY
                    || calendar.get(Calendar.DAY_OF_WEEK) == Calendar.SATURDAY) {
                continue;
            }

            HttpPost savePost = new HttpPost(urlSave);
            List<NameValuePair> savePairs = new ArrayList<NameValuePair>();
            savePairs.add(new BasicNameValuePair("timereg", "监控室"));
            savePairs.add(new BasicNameValuePair("timereg", "研发项目"));
            savePairs.add(new BasicNameValuePair("timereg", "云平台智能服务技术的研究和应用"));
            savePairs.add(new BasicNameValuePair("timereg", "全部(共享类)"));

            switch (calendar.get(Calendar.DAY_OF_WEEK)) {
            case Calendar.MONDAY:
                savePairs.add(new BasicNameValuePair("timereg", "8,0,0,0,0,0,0"));
                System.out.println("星期一");
                break;
            case Calendar.TUESDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,8,0,0,0,0,0"));
                System.out.println("星期二");
                break;
            case Calendar.WEDNESDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,0,8,0,0,0,0"));
                System.out.println("星期三");
                break;
            case Calendar.THURSDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,0,0,8,0,0,0"));
                System.out.println("星期四");
                break;
            case Calendar.FRIDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,0,0,0,8,0,0"));
                System.out.println("星期五");
                break;
            }
            savePairs.add(new BasicNameValuePair("timereg", "监控系统开发与应用"));
            savePairs.add(new BasicNameValuePair("timereg", "8"));
            savePairs
                    .add(new BasicNameValuePair("theweek", Integer.toString(calendar.get(Calendar.WEEK_OF_YEAR) + 1)));
            System.out.println("第" + (calendar.get(Calendar.WEEK_OF_YEAR) + 1) + "周");

            savePairs.add(new BasicNameValuePair("starttime", simpleDateFormat.format(calendar.getTime())));
            System.out.println("starttime=" + simpleDateFormat.format(calendar.getTime()));

            savePost.setEntity(new UrlEncodedFormEntity(savePairs, "utf-8"));
            HttpResponse saveResponse = client.execute(savePost);
            System.out.println(saveResponse.getStatusLine());
            System.out.println();
            Thread.sleep(7777);
        }
    }
}
送一个需要帐号密码登录的例子

 

posted @ 2018-01-12 15:33  牧师/preacher  阅读(1744)  评论(1)  编辑  收藏  举报