正在看的故事,下载
View Code
package com.chen.Test;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Marker-based page scraper: downloads a list page, extracts links that sit
 * between caller-supplied marker strings, downloads every linked page, strips
 * the HTML tags and stores the remaining text as a {@code .txt} file.
 *
 * <p>Pages are decoded as GBK and files are written as GBK.
 *
 * @author chen_weixian
 */
public class NetDetailHtml {

    /** Charset used to decode downloaded pages and to encode saved files. */
    private static final String PAGE_CHARSET = "gbk";

    /** Matches one HTML tag; compiled once instead of on every call. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

    /** Characters that are path separators, illegal in Windows file names, or control characters. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /** Directory that {@link #saveFile(String, String)} writes into. */
    public String saveRoot = "D:" + File.separator + "download" + File.separator;

    /** Base URL that relative links found on the pages are appended to. */
    private String httpUrl0 = "http://www.yi-see.com/";

    /** Creates a scraper with the default save directory and base URL. */
    public NetDetailHtml() {
    }

    /**
     * Creates a scraper with an explicit save directory and base URL.
     *
     * @param saveRoot directory for saved files
     * @param httpUrl0 base URL prepended to relative links
     */
    public NetDetailHtml(String saveRoot, String httpUrl0) {
        this.saveRoot = saveRoot;
        this.httpUrl0 = httpUrl0;
    }

    /**
     * Downloads {@code parentUrl} and extracts the links found inside one area of the page.
     *
     * <p>The area is the text between {@code areaStartTag} and the first following
     * {@code areaEndTag}. It is cut into entries at every literal occurrence of
     * {@code saveFileTitleStartTag}. An entry yields a result when it contains
     * {@code hrefStartTag}, a following {@code hrefEndTag} and {@code saveFileTitleEndTag};
     * entries that do not match are skipped.
     *
     * @param parentUrl             absolute URL of the list page
     * @param areaStartTag          marker that precedes the link area
     * @param areaEndTag            marker that ends the link area
     * @param hrefStartTag          marker that directly precedes a link target
     * @param hrefEndTag            marker that directly follows a link target
     * @param saveFileTitleStartTag literal separator between entries
     * @param saveFileTitleEndTag   marker that must be present in an entry (not at its very start)
     * @return one {@code String[2]} per entry: {@code [0]} the link target, {@code [1]} the
     *         entry text with all HTML tags removed; never {@code null}
     * @throws Exception if the page cannot be downloaded or the area markers are not found
     */
    public List<String[]> downLoadMessage(String parentUrl, String areaStartTag, String areaEndTag,
            String hrefStartTag, String hrefEndTag, String saveFileTitleStartTag,
            String saveFileTitleEndTag) throws Exception {
        List<String[]> resultList = new ArrayList<String[]>();
        String htmlContent = this.getContentByUrl(parentUrl);

        // Fail with a clear message instead of slicing from a bogus "-1 + length" offset.
        int areaStart = htmlContent.indexOf(areaStartTag);
        if (areaStart < 0) {
            throw new IOException("Area start marker not found in " + parentUrl + ": " + areaStartTag);
        }
        areaStart += areaStartTag.length();
        int areaEnd = htmlContent.indexOf(areaEndTag, areaStart);
        if (areaEnd < 0) {
            throw new IOException("Area end marker not found in " + parentUrl + ": " + areaEndTag);
        }
        String area = htmlContent.substring(areaStart, areaEnd);

        // The separator is a literal tag, so it must not be interpreted as a regular expression.
        String[] entries = area.split(Pattern.quote(saveFileTitleStartTag));
        for (int i = 0; i < entries.length; i++) {
            String entry = entries[i];
            if (entry.indexOf(hrefEndTag) <= 0 || entry.indexOf(saveFileTitleEndTag) <= 0) {
                continue;
            }
            int hrefStart = entry.indexOf(hrefStartTag);
            if (hrefStart < 0) {
                continue;
            }
            hrefStart += hrefStartTag.length();
            int hrefEnd = entry.indexOf(hrefEndTag, hrefStart);
            if (hrefEnd < 0) {
                continue;
            }
            String url = entry.substring(hrefStart, hrefEnd);
            String title = this.clearHtml(entry);
            resultList.add(new String[] { url, title });
        }
        return resultList;
    }

    /**
     * Downloads a detail page and returns its title, author and content sections as plain text.
     *
     * <p>A section is extracted only when both of its markers are non-blank. Each section runs
     * from the start of its start marker up to the next end marker that follows the start marker,
     * and is followed by a line break. Afterwards every space is replaced by a line break and all
     * HTML tags are removed.
     *
     * @param urlString       absolute URL of the detail page
     * @param titleStartTag   marker that begins the title section
     * @param titleEndTag     marker that ends the title section
     * @param authorSt1artTag marker that begins the author section
     * @param authorEndTag    marker that ends the author section
     * @param contentStartTag marker that begins the content section
     * @param contentEndTag   marker that ends the content section
     * @return the extracted text, or an empty string when the page is empty
     * @throws Exception if the page cannot be downloaded or a requested marker is not found
     */
    public String downStart(String urlString, String titleStartTag, String titleEndTag, String authorSt1artTag,
            String authorEndTag, String contentStartTag, String contentEndTag) throws Exception {
        String htmlContent = this.getContentByUrl(urlString);
        if (htmlContent.length() == 0) {
            return "";
        }

        String br = " \r\n";
        StringBuilder sections = new StringBuilder(100);
        this.appendSection(sections, htmlContent, urlString, titleStartTag, titleEndTag, br);
        this.appendSection(sections, htmlContent, urlString, authorSt1artTag, authorEndTag, br);
        this.appendSection(sections, htmlContent, urlString, contentStartTag, contentEndTag, br);

        // NOTE(review): every plain space becomes a line break; this may originally have
        // targeted an HTML entity or a different whitespace character - confirm.
        String result = sections.toString().replace(" ", br);
        return this.clearHtml(result);
    }

    /**
     * Appends the part of {@code html} delimited by the two markers, followed by a line break.
     * Does nothing when either marker is blank.
     *
     * @throws IOException if a non-blank marker cannot be found in {@code html}
     */
    private void appendSection(StringBuilder target, String html, String pageUrl, String startTag, String endTag,
            String lineBreak) throws IOException {
        if (this.isEmpty(startTag) || this.isEmpty(endTag)) {
            return;
        }
        int start = html.indexOf(startTag);
        if (start < 0) {
            throw new IOException("Start marker not found in " + pageUrl + ": " + startTag);
        }
        // Search behind the start marker so the end marker cannot match inside it.
        int end = html.indexOf(endTag, start + startTag.length());
        if (end < 0) {
            throw new IOException("End marker not found in " + pageUrl + ": " + endTag);
        }
        target.append(html, start, end).append(lineBreak);
    }

    /**
     * Downloads a page and returns its text decoded as GBK, with the line terminators removed.
     *
     * @param urlString absolute HTTP URL of the page
     * @return the page text; lines are concatenated without separators
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getContentByUrl(String urlString) throws Exception {
        // Route outgoing requests through the proxy, addressed by host name.
        // NOTE(review): the proxy is hard-coded and set JVM-wide on every call - confirm it is still wanted.
        System.getProperties().put("proxySet", "true");
        System.getProperties().put("proxyHost", "pascproxy1.pasc.com.cn");
        System.getProperties().put("proxyPort", "8080");

        URL urlObj = new URL(urlString);
        HttpURLConnection httpcon = (HttpURLConnection) urlObj.openConnection();
        StringBuilder htmlContent = new StringBuilder();
        InputStream in = httpcon.getInputStream();
        try {
            // Decode the bytes as GBK while reading; re-decoding already decoded text
            // corrupts it whenever the platform charset is not GBK.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                htmlContent.append(line);
            }
        } finally {
            in.close();
        }
        return htmlContent.toString();
    }

    /**
     * Saves text as {@code <saveRoot>/<title>.txt}, encoded as GBK.
     *
     * <p>Path separators, characters that are illegal in Windows file names and control
     * characters in {@code title} are replaced by {@code _}, because titles come from
     * downloaded pages.
     *
     * @param contentString text to store
     * @param title         file name without extension
     * @throws IOException if the file cannot be written
     */
    public void saveFile(String contentString, String title) throws IOException {
        String savepath = saveRoot + File.separator + this.toSafeFileName(title) + ".txt";
        this.WriteToFile(savepath, contentString, PAGE_CHARSET);
    }

    /**
     * Crawls a two-level site: {@code parentUrl} lists categories, every category page lists
     * articles, and every article is saved into a directory named after its category.
     * Progress and per-article failures are printed to standard output; a failure while
     * reading a list page ends the crawl and prints the stack trace.
     *
     * @param parentUrl path of the top-level list page, relative to the base URL
     */
    public void doMain(String parentUrl) {
        // Markers on an article page.
        String titleStartTag1 = "<B>";
        String titleEndTag1 = "</B>";
        String authorSt1artTag1 = "</B><BR>";
        String authorEndTag1 = "</a><br>";
        String contentStartTag1 = "<TD CLASS=ART>";
        String contentEndTag1 = "</TD>";
        // Markers on a category page (list of article links).
        String areaStartTag2 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding=\"0\" cellspacing=\"0\" border=\"0\">";
        String areaEndTag2 = "</TABLE>";
        String hrefStartTag2 = "<a href='";
        String hrefEndTag2 = "' >";
        String saveFileTitleStartTag2 = "<BR>";
        String saveFileTitleEndTag2 = "</A>";
        // Markers on the top-level page (list of category links).
        String areaStartTag3 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding='0' cellspacing='0'>";
        String areaEndTag3 = "</TABLE>";
        String hrefStartTag3 = "<a href='";
        String hrefEndTag3 = "'";
        String saveFileTitleStartTag3 = "<TR>";
        String saveFileTitleEndTag3 = "</TR>";

        try {
            List<String[]> firstList = this.downLoadMessage(this.httpUrl0 + parentUrl, areaStartTag3, areaEndTag3,
                    hrefStartTag3, hrefEndTag3, saveFileTitleStartTag3, saveFileTitleEndTag3);
            if (firstList == null || firstList.isEmpty()) {
                return;
            }
            System.out.println("firstList.size()=" + firstList.size());
            for (int i = 0; i < firstList.size(); i++) {
                String[] array = firstList.get(i);
                // One directory per category; the name comes from the page, so make it file-system safe.
                // NOTE(review): the base directory is hard-coded and replaces the configured saveRoot.
                this.saveRoot = "D:" + File.separator + "download" + File.separator
                        + this.toSafeFileName(array[1]);

                List<String[]> lastList = this.downLoadMessage(this.httpUrl0 + array[0], areaStartTag2,
                        areaEndTag2, hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
                if (lastList == null || lastList.isEmpty()) {
                    continue;
                }
                System.out.println("lastList.size()=" + lastList.size());
                for (int j = 0; j < lastList.size(); j++) {
                    try {
                        String[] array1 = lastList.get(j);
                        String contentString = this.downStart(this.httpUrl0 + array1[0], titleStartTag1,
                                titleEndTag1, authorSt1artTag1, authorEndTag1, contentStartTag1, contentEndTag1);
                        this.saveFile(contentString, array1[1]);
                        String message = i + "\t" + array[1] + "\t" + j + "\t" + array1[1] + "\t" + array1[0];
                        System.out.println(message);
                    } catch (Exception e) {
                        // A single broken article must not stop the rest of the category.
                        String message = i + "\t" + "信息异常:\t" + array[1] + "\t" + j + "\n" + e;
                        System.out.println(message);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls a single book: {@code parentUrl} lists the chapters, and every chapter is saved as
     * {@code <index><title>.txt}. Progress and per-chapter failures are printed to standard
     * output; a failure while reading the chapter list prints the stack trace.
     *
     * @param parentUrl path of the chapter list page, relative to the base URL
     */
    public void doMain2(String parentUrl) {
        // Markers on a chapter page.
        String titleStartTag1 = "<span id=\"htmltimu\">";
        String titleEndTag1 = "</span>";
        String authorSt1artTag1 = "</span> <span>";
        String authorEndTag1 = "</a></span>";
        String contentStartTag1 = "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" align=\"center\" >";
        String contentEndTag1 = "<div class=\"button_con\">";
        // Markers on the chapter list page.
        String areaStartTag2 = "<div id=\"htmlList\" class=\"insert_list\">";
        String areaEndTag2 = "</div>";
        String hrefStartTag2 = "<strong><a href=\"";
        String hrefEndTag2 = "\">";
        String saveFileTitleStartTag2 = "<li>";
        String saveFileTitleEndTag2 = "</a>";

        try {
            // [0] = list page path, [1] = book title, which is also the output directory name.
            String[] array = { parentUrl, "官道之色戒" };
            // NOTE(review): the base directory is hard-coded and replaces the configured saveRoot.
            this.saveRoot = "D:" + File.separator + "story" + File.separator + array[1];

            List<String[]> lastList = this.downLoadMessage(this.httpUrl0 + array[0], areaStartTag2, areaEndTag2,
                    hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
            if (lastList == null || lastList.isEmpty()) {
                return;
            }
            System.out.println("lastList.size()=" + lastList.size());
            for (int j = 0; j < lastList.size(); j++) {
                try {
                    String[] array1 = lastList.get(j);
                    String contentString = this.downStart(this.httpUrl0 + array1[0], titleStartTag1, titleEndTag1,
                            authorSt1artTag1, authorEndTag1, contentStartTag1, contentEndTag1);
                    // The index prefix keeps the files in chapter order.
                    this.saveFile(contentString, j + array1[1]);
                    String message = array[1] + "\t" + j + "\t" + array1[1] + "\t" + array1[0];
                    System.out.println(message);
                } catch (Exception e) {
                    // A single broken chapter must not stop the rest of the book.
                    String message = "信息异常:\t" + array[1] + "\t" + j + "\n" + e;
                    System.out.println(message);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code contentString} to {@code savepath} in the given charset, creating missing
     * parent directories and replacing an existing file.
     *
     * @param savepath      target file path
     * @param contentString text to write
     * @param code          charset name used to encode the text
     * @throws IOException if the file cannot be written or the charset is unsupported
     */
    private void WriteToFile(String savepath, String contentString, String code) throws IOException {
        File file = new File(savepath);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        FileOutputStream out = new FileOutputStream(file);
        try {
            // Let the writer do the encoding; a bytes-to-String round trip through the
            // platform charset does not produce output in the requested charset.
            OutputStreamWriter writer = new OutputStreamWriter(out, code);
            writer.write(contentString);
            writer.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Tells whether a string is {@code null}, empty or whitespace only.
     *
     * @param str string to check
     * @return {@code true} when there is no non-whitespace character
     */
    private boolean isEmpty(String str) {
        return str == null || str.trim().length() == 0;
    }

    /**
     * Removes everything that looks like an HTML tag, i.e. {@code <} up to the next {@code >}.
     *
     * @param htmlString text that may contain tags
     * @return the text without tags
     */
    private String clearHtml(String htmlString) {
        return HTML_TAG.matcher(htmlString).replaceAll("");
    }

    /** Replaces characters that cannot be part of a single file name by {@code _}. */
    private String toSafeFileName(String name) {
        return ILLEGAL_FILE_NAME_CHARS.matcher(String.valueOf(name)).replaceAll("_");
    }

    /**
     * Downloads one book from a fixed site.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String saveRoot = "D:" + File.separator + "download" + File.separator;
        String httpUrl0 = "http://www.chkee.com/chkbook/0/554/";
        NetDetailHtml downMessage = new NetDetailHtml(saveRoot, httpUrl0);
        String parentUrl = "index.html";
        downMessage.doMain2(parentUrl);
    }
}