java类实现 指定网站信息采集

话不多说,直接给出实现过程。如果需要通过代理访问外网,请先设置代理:

//  // 设置代理上外网
//  System.getProperties().put("proxySet", "true");
//  System.getProperties().put("proxyHost", "172.31.170.14");
//  System.getProperties().put("proxyPort", "8080");

 

View Code
package com.chen.downMessage;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import com.chen.system.util.ChenFile;
import com.chen.system.util.ChenTools;

/**
 * Scrapes a three-level site (index page -> per-category article list ->
 * article page) by plain string matching on HTML markers, and stores the text
 * of each article as a file below {@link #saveRoot}.
 *
 * <p>HTML clean-up and file writing are delegated to the project helpers
 * {@code ChenTools} and {@code ChenFile}; their exact behavior is not visible
 * from this class.
 *
 * @author chen_weixian
 */
public class DownMessage
{
    /** Charset used to decode downloaded pages and passed to the file writer. */
    private static final String CHARSET = "gbk";

    /** Characters that must not reach a file or directory name built from scraped text. */
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    // Markers used on an article (detail) page.
    private static final String ARTICLE_TITLE_START = "<B>";
    private static final String ARTICLE_TITLE_END = "</B>";
    private static final String ARTICLE_AUTHOR_START = "</B><BR>";
    private static final String ARTICLE_AUTHOR_END = "</a><br>";
    private static final String ARTICLE_CONTENT_START = "<TD CLASS=ART>";
    private static final String ARTICLE_CONTENT_END = "</TD>";

    // Markers used on a category's article-list page.
    private static final String LIST_AREA_START = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding=\"0\" cellspacing=\"0\" border=\"0\">";
    private static final String LIST_AREA_END = "</TABLE>";
    private static final String LIST_HREF_START = "<a href='";
    private static final String LIST_HREF_END = "' >";
    private static final String LIST_ITEM_START = "<BR>";
    private static final String LIST_ITEM_END = "</A>";

    // Markers used on the index page that links to the categories.
    private static final String INDEX_AREA_START = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding='0' cellspacing='0'>";
    private static final String INDEX_AREA_END = "</TABLE>";
    private static final String INDEX_HREF_START = "<a href='";
    private static final String INDEX_HREF_END = "'";
    private static final String INDEX_ITEM_START = "<TR>";
    private static final String INDEX_ITEM_END = "</TR>";

    /**
     * Directory that {@link #saveFile(String, String)} writes into. While
     * {@link #doMain(String)} runs it temporarily points at the directory of
     * the category being processed.
     */
    public String saveRoot = "F:" + File.separator + "download" + File.separator;

    /** Base URL that relative links found on the pages are appended to. */
    private String httpUrl0 = "http://www.yi-see.com/";

    /** Creates a downloader with the default output directory and site URL. */
    public DownMessage()
    {
    }

    /**
     * Creates a downloader with a custom output directory and site URL.
     *
     * @param saveRoot directory the downloaded files are stored under
     * @param httpUrl0 base URL relative links are appended to
     */
    public DownMessage(String saveRoot, String httpUrl0)
    {
        this.saveRoot = saveRoot;
        // The original assigned the field to itself and ignored the second argument.
        this.httpUrl0 = httpUrl0;
    }

    /**
     * Downloads a page and extracts the links found inside one region of it.
     *
     * <p>The region is the text between the first {@code areaStartTag} and the
     * next {@code areaEndTag}. It is cut into segments at every literal
     * occurrence of {@code saveFileTitleStartTag}; a segment yields a result
     * only if it contains {@code hrefStartTag}, {@code hrefEndTag} and
     * {@code saveFileTitleEndTag}.
     *
     * @param parentUrl absolute URL of the page to download
     * @param areaStartTag marker that opens the region holding the links
     * @param areaEndTag marker that closes that region
     * @param hrefStartTag text immediately preceding a link target
     * @param hrefEndTag text immediately following a link target
     * @param saveFileTitleStartTag literal separator between link entries
     * @param saveFileTitleEndTag marker that must be present in a valid entry
     * @return one {@code String[2]} per entry: {@code [0]} is the link target,
     *         {@code [1]} is the entry passed through {@code ChenTools.clearHtml};
     *         empty (never {@code null}) when the region markers are not found
     * @throws Exception if the page cannot be downloaded
     */
    public List<String[]> downLoadMessage(String parentUrl, String areaStartTag, String areaEndTag, String hrefStartTag, String hrefEndTag, String saveFileTitleStartTag, String saveFileTitleEndTag) throws Exception
    {
        List<String[]> resultList = new ArrayList<String[]>();
        String htmlContent = this.getContentByUrl(parentUrl);

        // Locate the region; a page without it simply contributes no links.
        int areaStart = htmlContent.indexOf(areaStartTag);
        if (areaStart < 0)
        {
            return resultList;
        }
        areaStart += areaStartTag.length();
        int areaEnd = htmlContent.indexOf(areaEndTag, areaStart);
        if (areaEnd < 0)
        {
            return resultList;
        }
        String area = htmlContent.substring(areaStart, areaEnd);

        // The separator is a literal tag, so quote it before handing it to the regex-based split.
        String[] segments = area.split(Pattern.quote(saveFileTitleStartTag));
        for (int i = 0; i < segments.length; i++)
        {
            String segment = segments[i];
            // Same filter as before: both markers must occur, and not at the very start of the segment.
            if (segment.indexOf(hrefEndTag) <= 0 || segment.indexOf(saveFileTitleEndTag) <= 0)
            {
                continue;
            }
            int hrefStart = segment.indexOf(hrefStartTag);
            if (hrefStart < 0)
            {
                continue;
            }
            hrefStart += hrefStartTag.length();
            int hrefEnd = segment.indexOf(hrefEndTag, hrefStart);
            if (hrefEnd < 0)
            {
                continue;
            }
            try
            {
                String[] entry = new String[2];
                entry[0] = segment.substring(hrefStart, hrefEnd);
                entry[1] = ChenTools.clearHtml(segment);
                resultList.add(entry);
            }
            catch (Exception ignored)
            {
                // Best effort, as in the original: an entry that cannot be cleaned is skipped.
                continue;
            }
        }

        return resultList;
    }

    /**
     * Downloads an article page and returns its title, author and body as text.
     *
     * <p>Each section is the text from its start tag (inclusive) up to the next
     * occurrence of its end tag (exclusive). A section whose start or end tag
     * is empty according to {@code ChenTools.isEmpty} is left out. Every
     * section is followed by a line break, literal {@code <br>} is turned into
     * a line break, and the result is passed through
     * {@code ChenTools.clearHtml}.
     *
     * @param urlString absolute URL of the article page
     * @param titleStartTag marker opening the title section
     * @param titleEndTag marker closing the title section
     * @param authorSt1artTag marker opening the author section
     * @param authorEndTag marker closing the author section
     * @param contentStartTag marker opening the body section
     * @param contentEndTag marker closing the body section
     * @return the cleaned article text
     * @throws StringIndexOutOfBoundsException if a requested section's markers
     *         are not present in the page
     * @throws Exception if the page cannot be downloaded
     */
    public String downStart(String urlString, String titleStartTag, String titleEndTag, String authorSt1artTag, String authorEndTag, String contentStartTag, String contentEndTag) throws Exception
    {
        String result = "";
        String htmlContent = this.getContentByUrl(urlString);
        if (htmlContent.length() > 0)
        {
            String br = "    \r\n";
            StringBuilder text = new StringBuilder(100);
            appendSection(text, htmlContent, titleStartTag, titleEndTag, br);
            appendSection(text, htmlContent, authorSt1artTag, authorEndTag, br);
            appendSection(text, htmlContent, contentStartTag, contentEndTag, br);
            result = text.toString().replace("<br>", br);
        }
        return ChenTools.clearHtml(result);
    }

    /**
     * Appends the text from {@code startTag} up to the next {@code endTag},
     * followed by {@code lineBreak}; does nothing when either tag is empty.
     */
    private static void appendSection(StringBuilder out, String html, String startTag, String endTag, String lineBreak)
    {
        if (ChenTools.isEmpty(startTag) || ChenTools.isEmpty(endTag))
        {
            return;
        }
        int start = html.indexOf(startTag);
        if (start < 0)
        {
            // Same exception type the unchecked substring used to raise, now with a usable message.
            throw new StringIndexOutOfBoundsException("start tag not found: " + startTag);
        }
        int end = html.indexOf(endTag, start);
        if (end < 0)
        {
            throw new StringIndexOutOfBoundsException("end tag not found: " + endTag);
        }
        out.append(html.substring(start, end)).append(lineBreak);
    }

    /**
     * Downloads a URL and returns its body decoded as GBK, with all lines
     * concatenated and the line terminators dropped.
     *
     * @param urlString absolute HTTP URL to fetch
     * @return the page text without line terminators
     * @throws Exception if the URL is malformed or the download fails
     */
    public String getContentByUrl(String urlString) throws Exception
    {
        StringBuilder htmlContent = new StringBuilder();
        HttpURLConnection httpcon = (HttpURLConnection) new URL(urlString).openConnection();
        BufferedReader reader = null;
        try
        {
            // Decode the bytes as GBK directly. Decoding with the platform charset
            // and re-encoding afterwards corrupts the text on non-GBK systems.
            reader = new BufferedReader(new InputStreamReader(httpcon.getInputStream(), CHARSET));
            String line;
            while ((line = reader.readLine()) != null)
            {
                htmlContent.append(line);
            }
        }
        finally
        {
            if (reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException ignored)
                {
                    // A failure while closing must not hide an earlier read error.
                }
            }
        }
        return htmlContent.toString();
    }

    /**
     * Stores text in {@code <saveRoot>/<title>.txt}.
     *
     * <p>The title comes from scraped HTML, so characters that are illegal in
     * file names or could leave the target directory are replaced first.
     *
     * @param contentString text to store
     * @param title name of the file, without extension
     * @throws IOException if the file cannot be created or written
     */
    public void saveFile(String contentString, String title) throws IOException
    {
        String savepath = saveRoot + File.separator + sanitizeFileName(title) + ".txt";
        ChenFile.unExitCreate(savepath);
        ChenFile.WriteToFile(savepath, contentString, CHARSET, true);
    }

    /**
     * Turns scraped text into a single safe path component: path separators,
     * reserved and control characters become {@code _}, surrounding whitespace
     * is removed, and an empty or dots-only name becomes {@code _}.
     */
    private static String sanitizeFileName(String name)
    {
        String cleaned = ILLEGAL_FILE_CHARS.matcher(String.valueOf(name)).replaceAll("_").trim();
        if (cleaned.matches("\\.*"))
        {
            return "_";
        }
        return cleaned;
    }

    /** Writes a message to the log file and echoes it on the console. */
    private static void writeLog(String logFile, String message)
    {
        try
        {
            ChenFile.WriteToFile(logFile, message, CHARSET, true);
        }
        catch (Exception e)
        {
            // A broken log file must not stop the crawl.
            System.err.println("cannot write log file " + logFile + ": " + e);
        }
        System.out.println(message);
    }

    /**
     * Crawls the site starting from an index page: every category linked from
     * the index gets its own directory below {@link #saveRoot}, and every
     * article linked from the category's list page is saved there.
     *
     * <p>Progress and per-article failures are appended to a log file in the
     * output directory. A failure in one article or category is logged and the
     * crawl continues with the next one.
     *
     * @param parentUrl path of the index page, relative to the site base URL
     */
    public void doMain(String parentUrl)
    {
        // Remember the configured root; saveRoot is repointed per category below.
        final String baseRoot = this.saveRoot;
        String logFile = new File(baseRoot, ChenTools.getCurrDatetime(2) + "下载日志.log").getPath();

        try
        {
            List<String[]> firstList = this.downLoadMessage(this.httpUrl0 + parentUrl, INDEX_AREA_START, INDEX_AREA_END, INDEX_HREF_START, INDEX_HREF_END, INDEX_ITEM_START, INDEX_ITEM_END);
            if (firstList.size() > 0)
            {
                System.out.println("firstList.size()=" + firstList.size());
            }
            for (int i = 0; i < firstList.size(); i++)
            {
                String[] category = firstList.get(i);
                try
                {
                    this.saveRoot = new File(baseRoot, sanitizeFileName(category[1])).getPath();
                    List<String[]> lastList = this.downLoadMessage(this.httpUrl0 + category[0], LIST_AREA_START, LIST_AREA_END, LIST_HREF_START, LIST_HREF_END, LIST_ITEM_START, LIST_ITEM_END);
                    if (lastList.size() > 0)
                    {
                        System.out.println("lastList.size()=" + lastList.size());
                    }
                    for (int j = 0; j < lastList.size(); j++)
                    {
                        try
                        {
                            String[] article = lastList.get(j);
                            String contentString = this.downStart(this.httpUrl0 + article[0], ARTICLE_TITLE_START, ARTICLE_TITLE_END, ARTICLE_AUTHOR_START, ARTICLE_AUTHOR_END, ARTICLE_CONTENT_START, ARTICLE_CONTENT_END);
                            this.saveFile(contentString, article[1]);
                            writeLog(logFile, i + "\t" + category[1] + "\t" + j + "\t" + article[1] + "\t" + article[0]);
                        }
                        catch (Exception e)
                        {
                            writeLog(logFile, i + "\t" + "信息异常:\t" + category[1] + "\t" + j + "\n" + e);
                        }
                    }
                }
                catch (Exception e)
                {
                    // The category's list page could not be fetched; move on to the next category.
                    writeLog(logFile, i + "\t" + "信息异常:\t" + category[1] + "\n" + e);
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            this.saveRoot = baseRoot;
        }
    }

    /**
     * Runs a crawl of the default site starting at {@code artlist_3.html}.
     *
     * @author chen_weixian
     * @param args unused
     */
    public static void main(String[] args)
    {
        DownMessage downMessage = new DownMessage();
        String parentUrl = "artlist_3.html";
        downMessage.doMain(parentUrl);
    }

}

 

 

posted on 2013-03-15 19:54  陈惟鲜的博客  阅读(285)  评论(0)  编辑  收藏  举报

导航