java类实现 指定网站信息采集
话不多说,直接写出实现过程。若是需要代理,就设置代理:
// // 设置代理上外网 // System.getProperties().put("proxySet", "true"); // System.getProperties().put("proxyHost", "172.31.170.14"); // System.getProperties().put("proxyPort", "8080");
View Code
package com.chen.downMessage;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import com.chen.system.util.ChenFile;
import com.chen.system.util.ChenTools;

/**
 * Simple tag-based scraper: downloads a listing page, follows the links found in it
 * and stores the text of every detail page as a {@code .txt} file.
 *
 * <p>Pages are located by literal start/end marker strings rather than by an HTML parser.
 * All pages are decoded, and all files are written, using the GBK charset.
 *
 * <p>To reach the site through an HTTP proxy, set the standard {@code http.proxyHost} /
 * {@code http.proxyPort} system properties before calling any method of this class.
 *
 * <p>Not thread-safe: {@link #doMain(String)} temporarily reassigns {@link #saveRoot}.
 *
 * @author chen_weixian
 */
public class DownMessage {

    /** Charset used both to decode downloaded pages and to write output/log files. */
    private static final String CHARSET = "gbk";

    /** Characters that must not appear in a single file or directory name. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /** Directory that downloaded files are written to. */
    public String saveRoot = "F:" + File.separator + "download" + File.separator;

    /** Base URL that relative links found in the pages are appended to. */
    private String httpUrl0 = "http://www.yi-see.com/";

    public DownMessage() {
    }

    /**
     * Creates a downloader with a custom output directory and site base URL.
     *
     * @param saveRoot directory that downloaded files are written to
     * @param httpUrl0 base URL of the site; when {@code null} or empty the default base URL is kept
     */
    public DownMessage(String saveRoot, String httpUrl0) {
        this.saveRoot = saveRoot;
        // The original code assigned the field to itself, silently ignoring this argument.
        if (httpUrl0 != null && httpUrl0.length() > 0) {
            this.httpUrl0 = httpUrl0;
        }
    }

    /**
     * Downloads a page and collects the links found inside one area of it.
     *
     * <p>The area between {@code areaStartTag} and the first following {@code areaEndTag} is split
     * on {@code saveFileTitleStartTag}; every resulting segment that contains both
     * {@code hrefEndTag} and {@code saveFileTitleEndTag} contributes one entry.
     *
     * @author chen_weixian
     * @param parentUrl absolute URL of the page to scan
     * @param areaStartTag literal marker that precedes the link area
     * @param areaEndTag literal marker that ends the link area
     * @param hrefStartTag literal marker that precedes a link target
     * @param hrefEndTag literal marker that follows a link target
     * @param saveFileTitleStartTag literal marker used to split the area into segments
     * @param saveFileTitleEndTag literal marker a segment must contain to be considered
     * @return one {@code String[2]} per link: index 0 is the link target as written in the page,
     *         index 1 is the segment passed through {@code ChenTools.clearHtml}
     * @throws IllegalStateException if the area markers are not present in the page
     * @throws Exception if the page cannot be downloaded
     */
    public List<String[]> downLoadMessage(String parentUrl, String areaStartTag, String areaEndTag,
            String hrefStartTag, String hrefEndTag, String saveFileTitleStartTag, String saveFileTitleEndTag)
            throws Exception {
        List<String[]> resultList = new ArrayList<String[]>();
        String htmlContent = this.getContentByUrl(parentUrl);

        // Fail loudly when the area is missing instead of parsing from a bogus offset.
        int areaStart = htmlContent.indexOf(areaStartTag);
        if (areaStart < 0) {
            throw new IllegalStateException("Area start tag not found in " + parentUrl + ": " + areaStartTag);
        }
        String contentString = htmlContent.substring(areaStart + areaStartTag.length());
        int areaEnd = contentString.indexOf(areaEndTag);
        if (areaEnd < 0) {
            throw new IllegalStateException("Area end tag not found in " + parentUrl + ": " + areaEndTag);
        }
        contentString = contentString.substring(0, areaEnd);

        // String.split takes a regex; quote the tag so it is always matched literally.
        String[] titleArray = contentString.split(Pattern.quote(saveFileTitleStartTag));
        for (String segment : titleArray) {
            try {
                if (segment.indexOf(hrefEndTag) <= 0 || segment.indexOf(saveFileTitleEndTag) <= 0) {
                    continue;
                }
                int hrefStart = segment.indexOf(hrefStartTag);
                if (hrefStart < 0) {
                    // No link in this segment; previously this produced a garbage URL.
                    continue;
                }
                String url = segment.substring(hrefStart + hrefStartTag.length());
                int hrefEnd = url.indexOf(hrefEndTag);
                if (hrefEnd < 0) {
                    continue;
                }
                url = url.substring(0, hrefEnd);
                String title = ChenTools.clearHtml(segment);
                resultList.add(new String[] { url, title });
            } catch (Exception ignored) {
                // Best effort: a segment that cannot be processed is skipped, the rest are kept.
            }
        }
        return resultList;
    }

    /**
     * Downloads a detail page and extracts its title, author and content sections as plain text.
     *
     * <p>A section is skipped when either of its two tags is empty according to
     * {@code ChenTools.isEmpty}. Each extracted section runs from its start tag (inclusive) to the
     * first following end tag (exclusive) and is followed by a line break. The combined text is
     * finally passed through {@code ChenTools.clearHtml} to remove the HTML.
     *
     * @author chen_weixian
     * @param urlString absolute URL of the detail page
     * @param titleStartTag literal marker that starts the title section
     * @param titleEndTag literal marker that ends the title section
     * @param authorSt1artTag literal marker that starts the author section
     * @param authorEndTag literal marker that ends the author section
     * @param contentStartTag literal marker that starts the content section
     * @param contentEndTag literal marker that ends the content section
     * @return the extracted text; the result of cleaning an empty string when the page is empty
     * @throws IllegalStateException if a requested section's tags are not present in the page
     * @throws Exception if the page cannot be downloaded
     */
    public String downStart(String urlString, String titleStartTag, String titleEndTag, String authorSt1artTag,
            String authorEndTag, String contentStartTag, String contentEndTag) throws Exception {
        String result = "";
        String htmlContent = this.getContentByUrl(urlString);
        if (htmlContent.length() > 0) {
            StringBuilder resultString = new StringBuilder(100);
            String br = " \r\n";
            // Title
            if (!ChenTools.isEmpty(titleStartTag) && !ChenTools.isEmpty(titleEndTag)) {
                resultString.append(sliceFromTag(htmlContent, titleStartTag, titleEndTag)).append(br);
            }
            // Author
            if (!ChenTools.isEmpty(authorSt1artTag) && !ChenTools.isEmpty(authorEndTag)) {
                resultString.append(sliceFromTag(htmlContent, authorSt1artTag, authorEndTag)).append(br);
            }
            // Content
            if (!ChenTools.isEmpty(contentStartTag) && !ChenTools.isEmpty(contentEndTag)) {
                resultString.append(sliceFromTag(htmlContent, contentStartTag, contentEndTag)).append(br);
            }
            // Turn explicit line-break tags into real line breaks before the markup is removed.
            result = resultString.toString().replace("<br>", br);
        }
        // Remove the HTML
        return ChenTools.clearHtml(result);
    }

    /**
     * Returns the part of {@code html} from the first {@code startTag} (inclusive) up to the first
     * {@code endTag} found at or after it (exclusive).
     *
     * @throws IllegalStateException if either tag cannot be found
     */
    private static String sliceFromTag(String html, String startTag, String endTag) {
        int start = html.indexOf(startTag);
        if (start < 0) {
            throw new IllegalStateException("Start tag not found: " + startTag);
        }
        int end = html.indexOf(endTag, start);
        if (end < 0) {
            throw new IllegalStateException("End tag not found: " + endTag);
        }
        return html.substring(start, end);
    }

    /**
     * Downloads the body of a URL as text.
     *
     * <p>The response is decoded as GBK. Lines are concatenated without their line terminators,
     * so the result is a single-line string.
     *
     * @author chen_weixian
     * @param urlString absolute URL to fetch
     * @return the response body with line terminators removed
     * @throws Exception if the URL is malformed or the download fails
     */
    public String getContentByUrl(String urlString) throws Exception {
        StringBuilder htmlContent = new StringBuilder();
        URL urlObj = new URL(urlString);
        HttpURLConnection httpcon = (HttpURLConnection) urlObj.openConnection();
        // Decode with the page charset directly. Decoding with the platform charset and then
        // re-encoding/decoding as GBK corrupts the text whenever the platform charset is not GBK.
        BufferedReader reader = new BufferedReader(new InputStreamReader(httpcon.getInputStream(), CHARSET));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                htmlContent.append(line);
            }
        } finally {
            reader.close();
        }
        return htmlContent.toString();
    }

    /**
     * Writes text to {@code <saveRoot>/<title>.txt}.
     *
     * <p>Characters in {@code title} that are path separators, reserved in file names or control
     * characters are replaced with {@code _}, because titles come from downloaded pages.
     * The actual file handling is delegated to {@code ChenFile}
     * (NOTE(review): {@code unExitCreate} presumably creates the file when it does not exist and
     * the final {@code true} presumably means "append" - confirm against ChenFile).
     *
     * @author chen_weixian
     * @param contentString text to write
     * @param title file name without extension
     * @throws IOException if the file cannot be created or written
     */
    public void saveFile(String contentString, String title) throws IOException {
        String savepath = saveRoot + File.separator + toSafeFileName(title) + ".txt";
        ChenFile.unExitCreate(savepath);
        ChenFile.WriteToFile(savepath, contentString, CHARSET, true);
    }

    /** Replaces characters that are unsafe inside a single file or directory name with {@code _}. */
    private static String toSafeFileName(String name) {
        return ILLEGAL_FILE_NAME_CHARS.matcher(String.valueOf(name)).replaceAll("_");
    }

    /** Appends a message to the log file and echoes it to standard output; never throws. */
    private static void log(String logFile, String message) {
        try {
            ChenFile.WriteToFile(logFile, message, CHARSET, true);
        } catch (Exception e) {
            // A broken log file must not abort the crawl.
            System.err.println("Failed to write log file " + logFile + ": " + e);
        }
        System.out.println(message);
    }

    /**
     * Crawls the site starting from a listing page.
     *
     * <p>The listing page yields category links; every category page yields article links; every
     * article is downloaded and saved below {@code <saveRoot>/<category title>/}. Progress and
     * failures are appended to a log file in the save root and echoed to standard output.
     * A failing category or article is logged and skipped. {@link #saveRoot} is reassigned while
     * the crawl runs and restored to its initial value before this method returns.
     *
     * @param parentUrl listing page, relative to the site base URL
     */
    public void doMain(String parentUrl) {
        // Markers on the article (detail) page
        String titleStartTag1 = "<B>";
        String titleEndTag1 = "</B>";
        String authorSt1artTag1 = "</B><BR>";
        String authorEndTag1 = "</a><br>";
        String contentStartTag1 = "<TD CLASS=ART>";
        String contentEndTag1 = "</TD>";
        // Markers on the category page (list of article links)
        String areaStartTag2 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding=\"0\" cellspacing=\"0\" border=\"0\">";
        String areaEndTag2 = "</TABLE>";
        String hrefStartTag2 = "<a href='";
        String hrefEndTag2 = "' >";
        String saveFileTitleStartTag2 = "<BR>";
        String saveFileTitleEndTag2 = "</A>";
        // Markers on the top-level listing page (list of category links)
        String areaStartTag3 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding='0' cellspacing='0'>";
        String areaEndTag3 = "</TABLE>";
        String hrefStartTag3 = "<a href='";
        String hrefEndTag3 = "'";
        String saveFileTitleStartTag3 = "<TR>";
        String saveFileTitleEndTag3 = "</TR>";

        // Use the configured root instead of a hard-coded drive, and remember it so that it can be
        // restored after the per-category reassignments below.
        final String baseRoot = this.saveRoot;
        // Log file
        String logFile = new File(baseRoot, ChenTools.getCurrDatetime(2) + "下载日志.log").getPath();
        try {
            // Category links from the top-level listing page
            List<String[]> firstList = this.downLoadMessage(this.httpUrl0 + parentUrl, areaStartTag3, areaEndTag3,
                    hrefStartTag3, hrefEndTag3, saveFileTitleStartTag3, saveFileTitleEndTag3);
            if (firstList != null && firstList.size() > 0) {
                System.out.println("firstList.size()=" + firstList.size());
                for (int i = 0; i < firstList.size(); i++) {
                    String[] array = firstList.get(i);
                    // Articles of this category go into their own directory.
                    this.saveRoot = new File(baseRoot, toSafeFileName(array[1])).getPath();

                    // Article links from the category page
                    List<String[]> lastList;
                    try {
                        lastList = this.downLoadMessage(this.httpUrl0 + array[0], areaStartTag2, areaEndTag2,
                                hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
                    } catch (Exception e) {
                        // One unreadable category must not abort the remaining ones.
                        log(logFile, i + "\t" + "信息异常:\t" + array[1] + "\n" + e);
                        continue;
                    }
                    if (lastList == null || lastList.size() == 0) {
                        continue;
                    }
                    System.out.println("lastList.size()=" + lastList.size());
                    for (int j = 0; j < lastList.size(); j++) {
                        try {
                            String[] array1 = lastList.get(j);
                            String contentString = this.downStart(this.httpUrl0 + array1[0], titleStartTag1,
                                    titleEndTag1, authorSt1artTag1, authorEndTag1, contentStartTag1,
                                    contentEndTag1);
                            this.saveFile(contentString, array1[1]);
                            log(logFile, i + "\t" + array[1] + "\t" + j + "\t" + array1[1] + "\t" + array1[0]);
                        } catch (Exception e) {
                            log(logFile, i + "\t" + "信息异常:\t" + array[1] + "\t" + j + "\n" + e);
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            this.saveRoot = baseRoot;
        }
    }

    /**
     * Runs the crawl for the default listing page.
     *
     * @author chen_weixian
     * @param args unused
     */
    public static void main(String[] args) {
        DownMessage downMessage = new DownMessage();
        String parentUrl = "artlist_3.html";
        downMessage.doMain(parentUrl);
    }
}