A Simple Simulated Web Crawler in Java
The class below fetches a page over HTTP, scans each line of the returned source with regular expressions to pull out URLs (and, optionally, email addresses), and drops links that point at binary resources such as images and archives.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author 园长 & doie.net
 */
public class Test {

    /**
     * Send a request and scan each line of the response for URLs.
     * (The original name said "Post", but the connection is never switched
     * away from HttpURLConnection's default method, so this is a GET.)
     *
     * @param host the URL to fetch
     */
    public static void sendGetRequest(String host) {
        try {
            URL url = new URL(host); // target of the request
            URLConnection conn = url.openConnection();
            HttpURLConnection httpUrlConnection = (HttpURLConnection) conn;
            httpUrlConnection.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // Read the response line by line; name the charset explicitly rather
            // than relying on the platform default, and close the reader when done.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                String temp;
                while ((temp = br.readLine()) != null) {
                    // getSourceCode(temp); // alternative: also extract email addresses
                    List<String> ls = getSearchURL(temp);
                    for (int i = 0; i < ls.size(); i++) {
                        System.out.println(ls.get(i));
                    }
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawl URLs out of a piece of source code.
     *
     * @param sourceCode one line of page source
     * @return the distinct URLs found, with binary resources filtered out
     */
    public static List<String> getSearchURL(String sourceCode) {
        List<String> rs = new ArrayList<String>();
        Pattern p1 = Pattern.compile(
                "(http|ftp|https)://[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?"); // match URLs
        Matcher getUrl = p1.matcher(sourceCode);
        while (getUrl.find()) {
            String candidate = getSuffix(getUrl.group().trim());
            // getSuffix() returns null for blacklisted suffixes; skip those and
            // duplicates here instead of filtering in a second pass.
            if (candidate != null && !rs.contains(candidate)) {
                rs.add(candidate);
            }
        }
        return rs;
    }

    /**
     * Regex matching: collect email addresses and URLs from source code.
     *
     * @param sourceCode one line of page source
     * @return a map of the URLs found
     */
    public static Map<String, Object> getSourceCode(String sourceCode) {
        Pattern pattern = Pattern.compile(
                "[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+"); // match email addresses
        Pattern pattern2 = Pattern.compile(
                "(http|ftp|https)://[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?"); // match URLs
        Matcher matcher = pattern.matcher(sourceCode);
        Map<String, String> emailMap = new HashMap<String, String>();
        Matcher matcher2 = pattern2.matcher(sourceCode);
        Map<String, Object> httpMap = new HashMap<String, Object>();
        List<String> rs = new ArrayList<String>();
        while (matcher2.find()) {
            String candidate = getSuffix(matcher2.group().trim());
            if (candidate != null && !rs.contains(candidate)) {
                rs.add(candidate); // distinct, non-binary URLs (debug output below)
            }
            httpMap.put(matcher2.group().trim(), matcher2.group().trim());
        }
        // for (int i = 0; i < rs.size(); i++) {
        //     System.out.println(rs.get(i));
        // }
        while (matcher.find()) {
            emailMap.put(matcher.group(), matcher.group()); // map key collapses duplicates
        }
        Iterator<String> iterator = emailMap.values().iterator();
        while (iterator.hasNext()) {
            String str = iterator.next();
            System.out.println("str:" + str);
        }
        return httpMap;
    }

    /**
     * Get the region of the content that sits between two markers.
     *
     * @param content      the text to search
     * @param strAreaBegin the marker before the region
     * @param strAreaEnd   the marker after the region
     * @return the substring between the markers, or null if either is missing
     */
    public static String getArea(String content, String strAreaBegin, String strAreaEnd) {
        int a1 = content.indexOf(strAreaBegin);
        int a2 = content.indexOf(strAreaEnd);
        if (a1 < 0 || a2 < 0) {
            return null; // guard: substring() would throw on a missing marker
        }
        return content.substring(a1 + strAreaBegin.length(), a2);
    }

    /**
     * Filter a URL by its suffix: return null if it points at a binary or
     * otherwise non-HTML resource, otherwise return the URL unchanged.
     *
     * @param url the URL to test
     * @return the URL, or null if its suffix is blacklisted
     */
    public static String getSuffix(String url) {
        String[] suffix = { "exe", "css", "js", "zip", "rar", "mid", "tar", "gif",
                "jpeg", "jpg", "bmp", "avi", "mp3", "swf", "rm", "3gp", "mp4",
                "wma", "wav", "rmvb", "ram", "key", "png", "psd", "pdf", "doc",
                "mdb", "xls", "ppt", "docx", "pptx", "wps", "iso", "wmv", "img",
                "flv", "fla" };
        String s = url.substring(url.lastIndexOf(".") + 1);
        for (int i = 0; i < suffix.length; i++) {
            if (suffix[i].equalsIgnoreCase(s)) {
                return null; // blacklisted: no need to keep scanning
            }
        }
        return url;
    }

    public static void main(String[] args) {
        sendGetRequest("http://www.baidu.com/s?wd=%E8%A6%81%E7%9A%84%E7%95%99%E9%82%AE%E7%AE%B1");
    }
}