MinerUtil.java 爬虫工具类
package com.iteye.injavawetrust.miner;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Static helper methods for the crawler: string and URL checks, link
 * extraction, file-name handling, date formatting and configuration
 * validation.
 *
 * @author InJavaWeTrust
 */
public class MinerUtil {

	private static final Log LOG = LogFactory.getLog(MinerUtil.class);

	/**
	 * Not read or written inside this class.
	 * NOTE(review): presumably the crawl start timestamp in milliseconds,
	 * set by the caller - confirm against the code that uses it.
	 */
	public static long starTime = 0;

	/** Any single character allowed inside a file name (no whitespace, none of \/:*?"<>|). */
	private static final String NAME_CHAR = "[^\\s\\\\/:\\*\\?\\\"<>\\|]";

	/** Same as {@link #NAME_CHAR} but additionally excludes '.', used for the final character. */
	private static final String NAME_LAST_CHAR = "[^\\s\\\\/:\\*\\?\\\"<>\\|\\.]";

	/**
	 * Optional prefix (first character plus any mix of spaces and name
	 * characters) followed by one mandatory final character. Making the
	 * prefix optional lets one-character names such as "a" pass.
	 */
	private static final Pattern VALID_FILE_NAME = Pattern.compile(
			"(?:" + NAME_CHAR + "(?:\\x20|" + NAME_CHAR + ")*)?" + NAME_LAST_CHAR);

	/** The characters that {@link #fileName(String)} strips: \/:*?"<>| */
	private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

	/**
	 * Tells whether a string is null, empty or whitespace only.
	 *
	 * @param param string to test, may be null
	 * @return true if blank; false otherwise
	 */
	public static boolean isBlank(String param) {
		return null == param || "".equals(param.trim());
	}

	/**
	 * Tells whether a URL ends with the {@code .html} extension
	 * (case-sensitive).
	 *
	 * @param url URL to test, may be null
	 * @return true if the URL ends with ".html"; false otherwise, including
	 *         for null
	 */
	public static boolean checkURL(String url) {
		// endsWith needs the dot, so a bare "html" without extension is rejected.
		return url != null && url.endsWith(".html");
	}

	/**
	 * Tells whether {@code key} contains at least one of the given keywords
	 * as a substring.
	 *
	 * @param key  text to inspect (for example a URL), may be null
	 * @param keys keywords to look for, may be null; null elements are skipped
	 * @return true if any keyword occurs in {@code key}; false otherwise
	 */
	public static boolean checkKeys(String key, List<String> keys) {
		if (key == null || keys == null) {
			return false;
		}
		for (String keyword : keys) {
			if (keyword != null && key.contains(keyword)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Validates a file name: at most 255 characters, none of
	 * {@code \/:*?"<>|}, no whitespace other than inner spaces, and the last
	 * character must not be a dot or a space.
	 *
	 * @param fileName candidate file name, may be null
	 * @return true if the name is acceptable; false otherwise (including
	 *         null and the empty string)
	 */
	public static boolean isValidFileName(String fileName) {
		if (fileName == null || fileName.length() > 255) {
			return false;
		}
		return VALID_FILE_NAME.matcher(fileName).matches();
	}

	/**
	 * Downloads a page and collects the {@code href} value of every anchor
	 * that has one.
	 * <p>
	 * Any failure (bad URL, timeout, HTTP error) is logged and the links
	 * gathered so far - usually none - are returned, so one bad page does not
	 * stop the crawl.
	 *
	 * @param url page to fetch
	 * @return the href values found; never null
	 */
	public static Set<String> getAllUrl(String url) {
		Set<String> urls = new HashSet<String>();
		try {
			Connection conn = Jsoup.connect(url);
			// Present a browser User-Agent to the server.
			conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");
			Document document = conn.timeout(5000).get();
			Elements hrefs = document.select("a[href]");
			for (Element href : hrefs) {
				// NOTE(review): this is the raw attribute, so relative links stay
				// relative; "abs:href" would resolve them - confirm what callers expect.
				urls.add(href.attr("href"));
			}
		} catch (Exception e) {
			LOG.info("获取URL出现异常,异常URL[" + url + "]");
			LOG.info("异常信息[" + e.getMessage() + "]");
		}
		return urls;
	}

	/**
	 * Formats a duration in milliseconds as {@code hh:mm:ss}.
	 * <p>
	 * Fractions of a second are dropped. The hour field is not limited to 24,
	 * so 25 hours renders as {@code 25:00:00}. Negative durations are
	 * rendered with a leading '-'.
	 *
	 * @param ms duration in milliseconds
	 * @return the duration as hh:mm:ss
	 */
	public static String msToss(long ms) {
		long totalSeconds = ms / 1000L;
		StringBuilder text = new StringBuilder(9);
		if (totalSeconds < 0) {
			text.append('-');
			totalSeconds = -totalSeconds;
		}
		appendTwoDigits(text, totalSeconds / 3600L);
		text.append(':');
		appendTwoDigits(text, (totalSeconds % 3600L) / 60L);
		text.append(':');
		appendTwoDigits(text, totalSeconds % 60L);
		return text.toString();
	}

	/** Appends a non-negative value, zero-padded to at least two digits. */
	private static void appendTwoDigits(StringBuilder target, long value) {
		if (value < 10) {
			target.append('0');
		}
		target.append(value);
	}

	/**
	 * Writes a page to a UTF-8 file inside today's storage directory.
	 * <p>
	 * The file name is the map's "title" entry with the characters
	 * {@code \/:*?"<>|} removed; the content is the map's "html" entry.
	 * If the map, the html, or a usable title is missing, nothing is written
	 * and a warning is logged. I/O failures are logged, not thrown.
	 *
	 * @param map page data holding the keys "title" and "html"
	 */
	public static void getHtmlToLocal(Map<String, String> map) {
		if (map == null) {
			LOG.warn("Skipping page store: page map is null");
			return;
		}
		String html = map.get("html");
		// Strip separator and other illegal characters so the title cannot
		// escape the storage directory or produce an unopenable path.
		String name = fileName(map.get("title"));
		if (html == null || isBlank(name)) {
			LOG.warn("Skipping page store: missing html or usable title, title[" + map.get("title") + "]");
			return;
		}

		String path = todayDir();
		makeDir(path);
		File target = new File(path, name);

		Writer writer = null;
		try {
			writer = new OutputStreamWriter(new FileOutputStream(target), "UTF-8");
			writer.write(html);
			writer.flush();
		} catch (FileNotFoundException e) {
			LOG.error("Cannot open file for writing, file[" + target + "]", e);
		} catch (IOException e) {
			LOG.error("Failed to write html, file[" + target + "]", e);
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					LOG.error("Failed to close file[" + target + "]", e);
				}
			}
		}
	}

	/**
	 * Removes the characters that may not appear in a file name:
	 * {@code \/:*?"<>|}
	 *
	 * @param title page title, may be null
	 * @return the title without those characters; the empty string for null
	 */
	public static String fileName(String title) {
		if (title == null) {
			return "";
		}
		return ILLEGAL_FILE_CHARS.matcher(title).replaceAll("");
	}

	/**
	 * Returns the current date formatted by {@link #format(Date)}.
	 *
	 * @return today's date as yyyyMMdd
	 */
	public static String getToday() {
		return format(new Date());
	}

	/**
	 * Formats a date as {@code yyyyMMdd} in the JVM's default time zone.
	 *
	 * @param date date to format
	 * @return the date as yyyyMMdd
	 */
	public static String format(Date date) {
		// A new formatter per call: SimpleDateFormat must not be shared between threads.
		SimpleDateFormat fmt = new SimpleDateFormat("yyyyMMdd");
		return fmt.format(date);
	}

	/**
	 * Creates a storage directory, including missing parents, if it does not
	 * exist yet.
	 *
	 * @param path directory to create
	 */
	public static void makeDir(String path) {
		File file = new File(path);
		if (file.exists()) {
			return;
		}
		if (file.mkdirs()) {
			LOG.info("创建存储目录[" + path + "]");
		} else if (!file.isDirectory()) {
			// mkdirs() also returns false when another thread created the
			// directory in the meantime; only a still-missing directory is a failure.
			LOG.warn("Failed to create storage directory[" + path + "]");
		}
	}

	/**
	 * Validates the crawler configuration before a run. The first failed
	 * check is logged and ends validation.
	 * <p>
	 * Checks, in order: config is non-null, keywords are non-null and
	 * non-empty, store type is non-null, max depth is at least 1, and each of
	 * the three thread counts is at least 1.
	 *
	 * @param config configuration to validate, may be null
	 * @return true if every check passes; false otherwise
	 */
	public static boolean checkBeforeStart(MinerConfig config) {
		if (null == config) {
			LOG.info("config未配置!!!");
			return false;
		}
		if (null == config.getKeys() || 0 == config.getKeys().size()) {
			LOG.info("包含关键字未配置!!!");
			return false;
		}
		if (null == config.getStoreType()) {
			LOG.info("存储方式未配置!!!");
			return false;
		}
		if (config.getMaxDepth() < 1) {
			LOG.info("爬取页面最大深度配置错误!!!");
			return false;
		}
		if (config.getMinerHtmlThreadNum() < 1) {
			LOG.info("下载页面线程数配置错误!!!");
			return false;
		}
		if (config.getMiseringThreadNum() < 1) {
			LOG.info("分析页面线程数配置错误!!!");
			return false;
		}
		if (config.getMinserStoreThreadNum() < 1) {
			LOG.info("存储线程数配置错误!!!");
			return false;
		}
		return true;
	}

	/**
	 * Manual entry point: creates today's storage directory.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		makeDir(todayDir());
	}

	/**
	 * Builds the path of today's storage directory under
	 * {@code MinerConstanits.HTMLPATH}. Going through
	 * {@link File#File(String, String)} yields the same directory whether or
	 * not the base path ends with a separator.
	 */
	private static String todayDir() {
		return new File(MinerConstanits.HTMLPATH, getToday()).getPath();
	}

}