正在看的故事,下载

View Code
package com.chen.Test;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Small scraper that walks index pages of a story site, extracts chapter
 * links between configurable marker tags, downloads each chapter page and
 * saves its tag-stripped text as a GBK encoded {@code .txt} file.
 *
 * <p>Pages are decoded as GBK. Instances are not thread-safe:
 * {@link #doMain(String)} and {@link #doMain2(String)} reassign
 * {@link #saveRoot} while they run.
 */
public class NetDetailHtml
{
    /** Character set used to decode downloaded pages and to encode saved files. */
    private static final String PAGE_CHARSET = "gbk";

    /** Maximum time to wait for a connection to be established. */
    private static final int CONNECT_TIMEOUT_MILLIS = 15000;

    /** Maximum time to wait for data once connected. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Line break inserted after each extracted section and in place of four-nbsp indents. */
    private static final String SECTION_BREAK = "    \r\n";

    /** Directory that {@link #saveFile(String, String)} writes into. */
    public String saveRoot = "D:" + File.separator + "download" + File.separator;

    /** Base URL that every relative link is appended to. */
    private String httpUrl0 = "http://www.yi-see.com/";

    /** Creates a downloader using the default save directory and base URL. */
    public NetDetailHtml()
    {
    }

    /**
     * Creates a downloader with an explicit save directory and base URL.
     *
     * @param saveRoot directory that saved files are written into
     * @param httpUrl0 base URL that relative links are appended to
     */
    public NetDetailHtml(String saveRoot, String httpUrl0)
    {
        this.saveRoot = saveRoot;
        this.httpUrl0 = httpUrl0;
    }

    /**
     * Downloads a listing page and extracts the links it contains.
     *
     * <p>The text between {@code areaStartTag} and the first following
     * {@code areaEndTag} is cut into segments at every occurrence of
     * {@code saveFileTitleStartTag} (matched literally, not as a regular
     * expression). A segment yields an entry when it contains
     * {@code hrefStartTag}, a following {@code hrefEndTag} and
     * {@code saveFileTitleEndTag}; other segments are skipped.
     *
     * @author chen_weixian
     * @param parentUrl absolute URL of the listing page
     * @param areaStartTag marker preceding the listing area
     * @param areaEndTag marker terminating the listing area
     * @param hrefStartTag text immediately before a link target
     * @param hrefEndTag text immediately after a link target
     * @param saveFileTitleStartTag separator between listing entries
     * @param saveFileTitleEndTag marker that must be present in an entry
     * @return list of {@code {url, title}} pairs, where the title is the
     *         entry with all tags removed; empty (never {@code null}) when
     *         the listing area cannot be located
     * @throws Exception if the page cannot be downloaded
     */
    public List<String[]> downLoadMessage(String parentUrl, String areaStartTag, String areaEndTag, String hrefStartTag, String hrefEndTag, String saveFileTitleStartTag, String saveFileTitleEndTag)
            throws Exception
    {
        List<String[]> resultList = new ArrayList<String[]>();
        String htmlContent = this.getContentByUrl(parentUrl);

        // Locate the listing area; without it there is nothing to parse.
        int areaStart = htmlContent.indexOf(areaStartTag);
        if (areaStart < 0)
        {
            return resultList;
        }
        areaStart += areaStartTag.length();
        int areaEnd = htmlContent.indexOf(areaEndTag, areaStart);
        if (areaEnd < 0)
        {
            return resultList;
        }
        String area = htmlContent.substring(areaStart, areaEnd);

        // Quote the separator so that tags containing regex metacharacters split literally.
        String[] segments = area.split(Pattern.quote(saveFileTitleStartTag));
        for (int i = 0; i < segments.length; i++)
        {
            String segment = segments[i];
            // A marker at index 0 is treated like a missing marker (long-standing behavior).
            if (segment.indexOf(hrefEndTag) <= 0 || segment.indexOf(saveFileTitleEndTag) <= 0)
            {
                continue;
            }
            int hrefStart = segment.indexOf(hrefStartTag);
            if (hrefStart < 0)
            {
                continue;
            }
            hrefStart += hrefStartTag.length();
            int hrefEnd = segment.indexOf(hrefEndTag, hrefStart);
            if (hrefEnd < 0)
            {
                continue;
            }

            String[] entry = new String[2];
            entry[0] = segment.substring(hrefStart, hrefEnd);
            entry[1] = this.clearHtml(segment);
            resultList.add(entry);
        }

        return resultList;
    }

    /**
     * Downloads a detail page and returns its title, author and content as
     * plain text.
     *
     * <p>Each section runs from its start tag to the first following end tag
     * and is followed by a line break. A section is skipped when either of
     * its tags is {@code null} or blank. Runs of four {@code &nbsp;} entities
     * become line breaks and all HTML tags are removed from the result.
     *
     * @author chen_weixian
     * @param urlString absolute URL of the detail page
     * @param titleStartTag marker starting the title section
     * @param titleEndTag marker ending the title section
     * @param authorSt1artTag marker starting the author section
     * @param authorEndTag marker ending the author section
     * @param contentStartTag marker starting the content section
     * @param contentEndTag marker ending the content section
     * @return the extracted text, or an empty string when the page is empty
     * @throws IllegalStateException if a requested section's tags are not found in the page
     * @throws Exception if the page cannot be downloaded
     */
    public String downStart(String urlString, String titleStartTag, String titleEndTag, String authorSt1artTag, String authorEndTag, String contentStartTag, String contentEndTag) throws Exception
    {
        String htmlContent = this.getContentByUrl(urlString);
        if (htmlContent.length() == 0)
        {
            return "";
        }

        StringBuilder sections = new StringBuilder(htmlContent.length());
        this.appendSection(sections, htmlContent, titleStartTag, titleEndTag);
        this.appendSection(sections, htmlContent, authorSt1artTag, authorEndTag);
        this.appendSection(sections, htmlContent, contentStartTag, contentEndTag);

        // The site indents paragraphs with four non-breaking spaces; turn them into line breaks.
        String text = sections.toString().replace("&nbsp;&nbsp;&nbsp;&nbsp;", SECTION_BREAK);
        return this.clearHtml(text);
    }

    /**
     * Appends the part of {@code html} that starts at {@code startTag}
     * (inclusive) and ends before the next {@code endTag}, followed by a line
     * break. Does nothing when either tag is blank.
     */
    private void appendSection(StringBuilder out, String html, String startTag, String endTag)
    {
        if (this.isEmpty(startTag) || this.isEmpty(endTag))
        {
            return;
        }
        int start = html.indexOf(startTag);
        if (start < 0)
        {
            throw new IllegalStateException("Start tag not found in page: " + startTag);
        }
        // The search deliberately begins at the start tag itself, not after it.
        int end = html.indexOf(endTag, start);
        if (end < 0)
        {
            throw new IllegalStateException("End tag not found in page: " + endTag);
        }
        out.append(html, start, end).append(SECTION_BREAK);
    }

    /**
     * Downloads the resource at the given URL and returns it as one string,
     * decoded as GBK, with all line terminators removed.
     *
     * <p>NOTE(review): this sets the JVM-wide legacy proxy properties to a
     * hard-coded proxy on every call; confirm this is still wanted.
     *
     * @author chen_weixian
     * @param urlString absolute URL to read
     * @return the page text without line breaks
     * @throws Exception if the URL is malformed or the resource cannot be read
     */
    public String getContentByUrl(String urlString) throws Exception
    {
        // Route outbound requests through the proxy (host name rather than IP).
        System.setProperty("proxySet", "true");
        System.setProperty("proxyHost", "pascproxy1.pasc.com.cn");
        System.setProperty("proxyPort", "8080");

        URLConnection connection = new URL(urlString).openConnection();
        // Without timeouts a stalled server would block the whole crawl forever.
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);

        StringBuilder htmlContent = new StringBuilder();
        // Decode the bytes as GBK directly instead of going through the platform charset.
        BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
        try
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                htmlContent.append(line);
            }
        }
        finally
        {
            reader.close();
        }
        return htmlContent.toString();
    }

    /**
     * Saves text as {@code <saveRoot>/<title>.txt}, encoded as GBK.
     *
     * <p>Characters that are not valid in file names (path separators,
     * {@code : * ? " < > |} and control characters) are replaced with
     * {@code _} because titles come from downloaded pages.
     *
     * @author chen_weixian
     * @param contentString text to write
     * @param title file name without extension
     * @throws IOException if the directory or file cannot be written
     */
    public void saveFile(String contentString, String title) throws IOException
    {
        String savepath = saveRoot + File.separator + this.sanitizeFileName(title) + ".txt";
        this.WriteToFile(savepath, contentString, PAGE_CHARSET);
    }

    /**
     * Crawls a two-level index: {@code parentUrl} lists categories, each
     * category page lists detail pages, and every detail page is saved under
     * {@code D:\download\<category title>}.
     *
     * <p>NOTE(review): the output directory is hard-coded and overrides the
     * {@code saveRoot} passed to the constructor; kept for compatibility.
     *
     * @param parentUrl path of the top-level index, relative to the base URL
     */
    public void doMain(String parentUrl)
    {
        // Detail page markers.
        String titleStartTag1 = "<B>";
        String titleEndTag1 = "</B>";
        String authorStartTag1 = "</B><BR>";
        String authorEndTag1 = "</a><br>";
        String contentStartTag1 = "<TD CLASS=ART>";
        String contentEndTag1 = "</TD>";
        // Markers for the list of detail links on a category page.
        String areaStartTag2 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding=\"0\" cellspacing=\"0\" border=\"0\">";
        String areaEndTag2 = "</TABLE>";
        String hrefStartTag2 = "<a href='";
        String hrefEndTag2 = "' >";
        String saveFileTitleStartTag2 = "<BR>";
        String saveFileTitleEndTag2 = "</A>";
        // Markers for the list of category links on the top-level page.
        String areaStartTag3 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding='0' cellspacing='0'>";
        String areaEndTag3 = "</TABLE>";
        String hrefStartTag3 = "<a href='";
        String hrefEndTag3 = "'";
        String saveFileTitleStartTag3 = "<TR>";
        String saveFileTitleEndTag3 = "</TR>";

        try
        {
            List<String[]> firstList = this.downLoadMessage(this.httpUrl0 + parentUrl, areaStartTag3, areaEndTag3, hrefStartTag3, hrefEndTag3, saveFileTitleStartTag3, saveFileTitleEndTag3);
            if (firstList == null || firstList.isEmpty())
            {
                return;
            }
            System.out.println("firstList.size()=" + firstList.size());
            for (int i = 0; i < firstList.size(); i++)
            {
                String[] category = firstList.get(i);
                // One directory per category; the title is scraped, so strip illegal characters.
                this.saveRoot = "D:" + File.separator + "download" + File.separator + this.sanitizeFileName(category[1]);
                List<String[]> lastList = this.downLoadMessage(this.httpUrl0 + category[0], areaStartTag2, areaEndTag2, hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
                this.downloadPages(lastList, i + "\t", category[1], false, titleStartTag1, titleEndTag1, authorStartTag1, authorEndTag1, contentStartTag1, contentEndTag1);
            }
        } catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Crawls a single chapter index page and saves every chapter under
     * {@code D:\story\官道之色戒}, prefixing each file name with the chapter's
     * position in the list.
     *
     * <p>NOTE(review): the book title and output directory are hard-coded
     * and override the {@code saveRoot} passed to the constructor; kept for
     * compatibility.
     *
     * @param parentUrl path of the chapter index, relative to the base URL
     */
    public void doMain2(String parentUrl)
    {
        // Detail page markers.
        String titleStartTag1 = "<span id=\"htmltimu\">";
        String titleEndTag1 = "</span>";
        String authorStartTag1 = "</span> <span>";
        String authorEndTag1 = "</a></span>";
        String contentStartTag1 = "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" align=\"center\" >";
        String contentEndTag1 = "<div class=\"button_con\">";
        // Markers for the list of chapter links.
        String areaStartTag2 = "<div id=\"htmlList\" class=\"insert_list\">";
        String areaEndTag2 = "</div>";
        String hrefStartTag2 = "<strong><a href=\"";
        String hrefEndTag2 = "\">";
        String saveFileTitleStartTag2 = "<li>";
        String saveFileTitleEndTag2 = "</a>";

        try
        {
            String bookTitle = "官道之色戒";
            this.saveRoot = "D:" + File.separator + "story" + File.separator + bookTitle;
            List<String[]> lastList = this.downLoadMessage(this.httpUrl0 + parentUrl, areaStartTag2, areaEndTag2, hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
            this.downloadPages(lastList, "", bookTitle, true, titleStartTag1, titleEndTag1, authorStartTag1, authorEndTag1, contentStartTag1, contentEndTag1);
        } catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Downloads and saves every page in {@code pages}, printing one progress
     * line per page. A failure on one page is reported and does not stop the
     * remaining downloads.
     *
     * @param pages {@code {url, title}} pairs; {@code null} or empty means nothing to do
     * @param logPrefix text placed at the start of every progress line
     * @param groupTitle name of the category or book, used in progress lines
     * @param indexFileNames whether to prefix each file name with the page's list position
     */
    private void downloadPages(List<String[]> pages, String logPrefix, String groupTitle, boolean indexFileNames,
            String titleStartTag, String titleEndTag, String authorStartTag, String authorEndTag, String contentStartTag, String contentEndTag)
    {
        if (pages == null || pages.isEmpty())
        {
            return;
        }
        System.out.println("lastList.size()=" + pages.size());
        for (int j = 0; j < pages.size(); j++)
        {
            try
            {
                String[] page = pages.get(j);
                String contentString = this.downStart(this.httpUrl0 + page[0], titleStartTag, titleEndTag, authorStartTag, authorEndTag, contentStartTag, contentEndTag);
                String fileTitle = indexFileNames ? j + page[1] : page[1];
                this.saveFile(contentString, fileTitle);
                System.out.println(logPrefix + groupTitle + "\t" + j + "\t" + page[1] + "\t" + page[0]);
            } catch (Exception e)
            {
                // Best effort: report the failed page and carry on with the next one.
                System.out.println(logPrefix + "信息异常:\t" + groupTitle + "\t" + j + "\n" + e);
            }
        }
    }

    /**
     * Writes text to a file in the given encoding, creating missing parent
     * directories and replacing any existing file.
     *
     * @param savepath path of the file to write
     * @param contentString text to write
     * @param code name of the character set to encode with
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    private void WriteToFile(String savepath, String contentString, String code) throws IOException
    {
        File file = new File(savepath);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs() && !parent.isDirectory())
        {
            throw new IOException("Cannot create directory: " + parent);
        }

        FileOutputStream stream = new FileOutputStream(file);
        try
        {
            // Let the writer do the encoding; a byte round trip through the platform charset corrupts text.
            OutputStreamWriter writer = new OutputStreamWriter(stream, code);
            writer.write(contentString);
            writer.close();
        }
        finally
        {
            // Also covers the case where the writer could not be created or the write failed.
            stream.close();
        }
    }

    /**
     * Replaces characters that are unsafe in a file name with {@code _} and
     * trims surrounding whitespace. A {@code null} name becomes {@code "null"}.
     */
    private String sanitizeFileName(String name)
    {
        return String.valueOf(name).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
    }

    /**
     * Tells whether a string is {@code null} or contains only whitespace.
     *
     * @author : EX-CHENWEIXIAN001 陈惟鲜
     * @create_date :2013-3-20 上午09:06:03
     * @param str string to test
     * @return {@code true} if {@code str} is {@code null} or blank
     */
    private boolean isEmpty(String str)
    {
        return str == null || str.trim().length() == 0;
    }

    /**
     * Removes everything that looks like an HTML tag ({@code <...>}).
     *
     * @param htmlString text that may contain tags
     * @return the text with all tags removed
     */
    private String clearHtml(String htmlString)
    {
        return htmlString.replaceAll("<[^>]*>", "");
    }

    /**
     * Downloads the book whose chapter index is {@code index.html} under the
     * configured base URL.
     *
     * @author chen_weixian
     * @param args unused
     */
    public static void main(String[] args)
    {
        String saveRoot = "D:" + File.separator + "download" + File.separator;
        String httpUrl0 = "http://www.chkee.com/chkbook/0/554/";

        NetDetailHtml downMessage = new NetDetailHtml(saveRoot, httpUrl0);
        String parentUrl = "index.html";
        downMessage.doMain2(parentUrl);
    }

}

 

posted on 2013-04-02 17:03  陈惟鲜的博客  阅读(180)  评论(0)  编辑  收藏  举报

导航