废话不多说,直接上代码(亲测可用):
1 package plugins.timer.surfacewater; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.InputStream; 7 import java.io.InputStreamReader; 8 import java.util.ArrayList; 9 import java.util.List; 10 11 import org.xml.sax.SAXException; 12 import plugins.timer.station.SurfaceWaterModel; 13 14 public class JSUtil { 15 16 // 如果要更换运行环境,请注意exePath最后的phantom.exe需要更改。因为这个只能在window版本上运行。前面的路径名 17 // 也需要和exePath里面的保持一致。否则无法调用 18 private static String projectPath = System.getProperty("catalina.home"); 19 private static String jsPath = projectPath + File.separator+ "webapps"+File.separator+"syads"+File.separator+"huicong.js"; 20 private static String exePath = projectPath + File.separator+"webapps"+File.separator+"syads"+File.separator+ "phantomjs" 21 + File.separator + "bin" + File.separator + "phantomjs.exe"; 22 /*private static String projectPath = System.getProperty("user.dir"); 23 private static String jsPath = projectPath + File.separator+"WebRoot"+File.separator+"huicong.js"; 24 private static String exePath = projectPath +File.separator+"WebRoot"+File.separator+ "phantomjs" 25 + File.separator + "bin" + File.separator + "phantomjs.exe";*/ 26 27 28 32 public String JSUtil() throws IOException, SAXException { 33 34 // 测试调用,传入url即可,第二个参数是要读取的字节数 35 String str = getParseredHtml2("http://123.127.175.45:8082/",30000); 36 37 System.out.println("str:"+str); 38 return str; 39 40 41 } 42 43 // 调用phantomjs程序,并传入js文件,并通过流拿回需要的数据。 44 public static String getParseredHtml2(String url,int num) throws IOException { 45 Runtime rt = Runtime.getRuntime(); 46 Process p = rt.exec(exePath + " " + jsPath + " " + url); 47 InputStream is = p.getInputStream(); 48 BufferedReader br = new BufferedReader(new InputStreamReader(is,"utf-8")); 49 StringBuffer sbf = new StringBuffer(); 50 String tmp = ""; 51 while ((tmp = br.readLine()) != null) { 52 sbf.append(tmp); 53 } 54 System.out.println("sbf:"+sbf); 55 //得到整个网页的字符串 56 String tableStr = 
sbf.toString(); 57 return tableStr; 115 } 116 /** 117 * 返回给定地址中的数据 118 * @param url 119 * @return 120 */ 121 public static String getData(String url){ 122 int start = url.indexOf("%\">")+3; 123 int end = url.indexOf("<",start); 124 return url.substring(start,end); 125 } 126 127 }
注意:
- 程序中需要用到两个工具,分别是huicong.js和phantomjs.exe。这两个工具我已经打包放到了百度云盘里:
微信关注公众号“姓氏头像宝典”,回复“爬虫”即可获取下载链接。
- syads是我的项目名称
- http://123.127.175.45:8082/是测试的网站
- 30000是第二个参数num,本意是要读取的字节数;但上面贴出的getParseredHtml2并没有使用这个参数,实际返回的是整个页面的内容