crawler_工具类_RegexUtils_正则帮助类
package com.cph.crawler.core.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Static regular-expression helpers for extracting strings and integers from text.
 *
 * <p>The single-value methods return the <em>last</em> match found in the input; the
 * list methods return every match in order. When nothing is found, an error is logged
 * and a sentinel is returned ({@code ""}, {@code " "}, {@code -1} or an empty list,
 * depending on the method).
 *
 * <p>The class keeps no mutable state: every call works on its own {@link Matcher}.
 * The overloads that take a regex as a {@code String} compile it on every call, so
 * callers in hot loops should compile a {@link Pattern} once and reuse it.
 *
 * <p>Created 2013-09-09.
 *
 * @author cphmvp
 */
public final class RegexUtils {

    private static final Log logger = LogFactory.getLog(RegexUtils.class);

    /** Message logged whenever an extraction finds nothing. */
    private static final String NOT_MATCHER_DATA = "没有匹配到对应数据";

    /** Matches an opening or closing markup tag. */
    private static final Pattern TAG = Pattern.compile("</?[^>]+>");

    /** Matches a single whitespace character. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /** Six digits, the first of which is not zero. */
    private static final Pattern POSTCODE = Pattern.compile("[1-9]\\d{5}");

    private RegexUtils() {
    }

    /**
     * Extracts the given group of the last match and strips markup and whitespace from it.
     *
     * <p>Clean-up applied to the extracted text, in order: markup tags are removed, all
     * whitespace characters are removed, the literal text {@code ^p} is removed, every
     * {@code ⊙} character is turned into a space, and the result is trimmed.
     *
     * @param input   text to search; {@code null} is treated as "no match"
     * @param pattren compiled pattern to apply
     * @param group   index of the capturing group to extract (0 = whole match)
     * @return the cleaned text of the last match, or {@code ""} when nothing matched
     * @throws IndexOutOfBoundsException if the pattern has no group with that index
     */
    public static String getString(String input, Pattern pattren, int group) {
        String last = lastGroup(input, pattren, group);
        if (last == null) {
            return "";
        }
        // NOTE(review): the previous version also ran replaceAll(">", ">") and
        // replaceAll(" ", " "), which are no-ops as written. They may have been
        // HTML-entity decoding that got lost in transit - confirm before restoring.
        String cleaned = TAG.matcher(last).replaceAll("");
        // Removing \s already covers "\r\n", "\n" and "\t".
        cleaned = WHITESPACE.matcher(cleaned).replaceAll("");
        cleaned = cleaned.replace("^p", "");
        // '⊙' served as an internal separator in the previous version; occurrences
        // inside the matched text still become spaces, as before.
        cleaned = cleaned.replace('⊙', ' ');
        return cleaned.trim();
    }

    /**
     * Extracts the given group of the last match of {@code regex} in {@code input}.
     *
     * @param input text to search; {@code null} is treated as "no match"
     * @param regex regular expression to compile and apply
     * @param group index of the capturing group to extract (0 = whole match)
     * @return the trimmed text of the last match, or a single space ({@code " "}) when
     *         nothing matched
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with that index
     */
    public static String getString(String input, String regex, int group) {
        String last = lastGroup(input, getPattern(regex), group);
        String result = (last == null) ? " " : last;
        logIfNoMatch(result);
        return result;
    }

    /**
     * Compiles {@code regex} and returns a fresh matcher over {@code input}.
     *
     * @param input text to search; must not be {@code null}
     * @param regex regular expression to compile
     * @return a new matcher, owned by the caller
     * @throws NullPointerException if {@code input} or {@code regex} is {@code null}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static Matcher getMatcher(String input, String regex) {
        return getPattern(regex).matcher(input);
    }

    /**
     * Compiles {@code regex} into a pattern.
     *
     * @param regex regular expression to compile
     * @return the compiled pattern
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static Pattern getPattern(String regex) {
        return Pattern.compile(regex);
    }

    /**
     * Collects the given group of every match of {@code regex} in {@code input}.
     *
     * <p>Matches in which the requested group did not participate are skipped.
     *
     * @param input text to search; {@code null} is treated as "no match"
     * @param regex regular expression to compile and apply
     * @param group index of the capturing group to extract (0 = whole match)
     * @return the trimmed group text of each match, in order; empty when nothing matched
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with that index
     */
    public static List<String> getStringList(String input, String regex, int group) {
        List<String> resultList = allGroups(input, getPattern(regex), group);
        if (resultList.isEmpty()) {
            logger.error(NOT_MATCHER_DATA);
        }
        return resultList;
    }

    /**
     * Collects the given group of every match of {@code regex} in {@code input},
     * parsed as integers.
     *
     * <p>Matches in which the requested group did not participate are skipped.
     *
     * @param input text to search; {@code null} is treated as "no match"
     * @param regex regular expression to compile and apply
     * @param group index of the capturing group to extract (0 = whole match)
     * @return the parsed value of each match, in order; empty when nothing matched
     * @throws NumberFormatException if a matched text is not a valid {@code int}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with that index
     */
    public static List<Integer> getIntList(String input, String regex, int group) {
        List<Integer> resultList = new ArrayList<Integer>();
        for (String value : allGroups(input, getPattern(regex), group)) {
            resultList.add(Integer.valueOf(value));
        }
        if (resultList.isEmpty()) {
            logger.error(NOT_MATCHER_DATA);
        }
        return resultList;
    }

    /**
     * Extracts the whole text of the last match of {@code regex} in {@code input}.
     *
     * @param input text to search; {@code null} is treated as "no match"
     * @param regex regular expression to compile and apply
     * @return the trimmed text of the last match, or a single space ({@code " "}) when
     *         nothing matched
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static String getString(String input, String regex) {
        return getString(input, regex, 0);
    }

    /**
     * Extracts the given group of the last match of {@code regex} and parses it as an int.
     *
     * @param input text to search; {@code null} is treated as "no match"
     * @param regex regular expression to compile and apply
     * @param group index of the capturing group to extract (0 = whole match)
     * @return the parsed value of the last match, or {@code -1} when nothing matched
     * @throws NumberFormatException if the matched text is not a valid {@code int}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with that index
     */
    public static int getInt(String input, String regex, int group) {
        String last = lastGroup(input, getPattern(regex), group);
        int result = (last == null) ? -1 : Integer.parseInt(last);
        logIfNoMatch(result);
        return result;
    }

    /**
     * Extracts the whole text of the last match of {@code regex} and parses it as an int.
     *
     * @param input text to search; {@code null} is treated as "no match"
     * @param regex regular expression to compile and apply
     * @return the parsed value of the last match, or {@code -1} when nothing matched
     * @throws NumberFormatException if the matched text is not a valid {@code int}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static int getInt(String input, String regex) {
        return getInt(input, regex, 0);
    }

    /**
     * Checks whether the text is exactly six digits with a non-zero first digit
     * (the Chinese postal code format).
     *
     * @param postcode text to validate; {@code null} is rejected
     * @return {@code true} if the whole text matches, {@code false} otherwise
     */
    public static boolean checkPostcode(String postcode) {
        return postcode != null && POSTCODE.matcher(postcode).matches();
    }

    /** Returns the trimmed group text of the last match, or null when there is none. */
    private static String lastGroup(String input, Pattern pattern, int group) {
        if (input == null) {
            return null;
        }
        Matcher matcher = pattern.matcher(input);
        String last = null;
        while (matcher.find()) {
            // group(int) yields null when an optional group took no part in the match.
            String value = matcher.group(group);
            if (value != null) {
                last = value.trim();
            }
        }
        return last;
    }

    /** Returns the trimmed group text of every match, in order of occurrence. */
    private static List<String> allGroups(String input, Pattern pattern, int group) {
        List<String> values = new ArrayList<String>();
        if (input == null) {
            return values;
        }
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            String value = matcher.group(group);
            if (value != null) {
                values.add(value.trim());
            }
        }
        return values;
    }

    /** Logs the "no match" error when the string result is blank. */
    private static void logIfNoMatch(String result) {
        if (result.trim().equals("")) {
            logger.error(NOT_MATCHER_DATA);
        }
    }

    /** Logs the "no match" error when the int result equals the -1 sentinel. */
    private static void logIfNoMatch(int result) {
        if (result == -1) {
            logger.error(NOT_MATCHER_DATA);
        }
    }
}
Created by cphmvp
email:cphmvp@163.com
爬虫技术交流_crawler QQ群 :167047843