Java读取HTML文件，截取<body>标签中的内容
1 public String readfile(String filePath){ 2 File file = new File(filePath); 3 InputStream input = null; 4 try { 5 input = new FileInputStream(file); 6 } catch (FileNotFoundException e) { 7 e.printStackTrace(); 8 } 9 StringBuffer buffer = new StringBuffer(); 10 byte[] bytes = new byte[1024]; 11 try { 12 for(int n ; (n = input.read(bytes))!=-1 ; ){ 13 buffer.append(new String(bytes,0,n,"GBK")); 14 } 15 } catch (IOException e) { 16 e.printStackTrace(); 17 } 18 // System.out.println(buffer); 19 return buffer.toString(); 20 } 21 22 public String getBody(String val) { 23 String start = "<body>"; 24 String end = "</body>"; 25 int s = val.indexOf(start) + start.length(); 26 int e = val.indexOf(end); 27 return val.substring(s, e); 28 } 29
1 public static void main(String [] args){ 2 OaDao m = new OaDao(); 3 // String sql = "SELECT sth,xdh FROM TK_ST_0331 where sth='022012050101131000100' and rownum <=10"; 4 String sql = "select t.sth , t.stgjz ,t.stly, x.mc from TK_ST_0331 t ,TK_STK_ST_0331 k,TK_TX X where t.sth = k.sth AND X.BH = t.tx and rownum <10 "; 5 List<OaVo> datalist= m.findAll(sql); 6 for(OaVo vo : datalist){ 7 System.out.println(vo.getVal1()+"///"+vo.getVal2()); 8 9 // String sth = "022012010100000100100"; 10 String sth = vo.getVal1(); 11 String kmh = sth.substring(0, 2); //科目号 12 String nf = sth.substring(2, 6); //年份 13 String yf = sth.substring(6,10); //月份 14 String serialno = sth.substring(10, 16); //序列号 15 String stxl = sth.substring(16, 19); //题型 16 String path ="/"+kmh+"/"+nf+"/"+yf+"/"+serialno+"/"+stxl+"/"; 17 18 String tm_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_tm.htm"; 19 String da_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_da.htm"; 20 String jx_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_jx.htm"; 21 22 // String path = "H:/tk_source/02/0101/000001/001/022012010100000100100_da.htm"; 23 24 String tm = m.getBody(m.readfile(tm_path)); 25 System.out.println("----------------------题目------------------------------"); 26 System.out.println(tm); 27 28 String da = m.getBody(m.readfile(da_path)); 29 System.out.println("----------------------答案------------------------------"); 30 System.out.println(da); 31 32 33 String jx = m.getBody(m.readfile(da_path)); 34 System.out.println("----------------------解析------------------------------"); 35 System.out.println(jx); 36 } 37 }
1 /** 2 * 从HTML源码中提取图片路径,最后以一个 String 类型的 List 返回,如果不包含任何图片,则返回一个 size=0 的List 3 * 需要注意的是,此方法只会提取以下格式的图片:.jpg|.bmp|.eps|.gif|.mif|.miff|.png|.tif|.tiff|.svg|.wmf|.jpe|.jpeg|.dib|.ico|.tga|.cut|.pic 4 * @param htmlCode HTML源码 5 * @return <img>标签 src 属性指向的图片地址的List集合 6 * @author Carl He 7 */ 8 public static List<String> getImageSrc(String htmlCode) { 9 List<String> imageSrcList = new ArrayList<String>(); 10 Pattern p = Pattern.compile("<img//b[^>]*//bsrc//b//s*=//s*('|/")?([^'/"/n/r/f>]+(//.jpg|//.bmp|//.eps|//.gif|//.mif|//.miff|//.png|//.tif|//.tiff|//.svg|//.wmf|//.jpe|//.jpeg|//.dib|//.ico|//.tga|//.cut|//.pic)//b)[^>]*>", Pattern.CASE_INSENSITIVE); 11 Matcher m = p.matcher(htmlCode); 12 String quote = null; 13 String src = null; 14 while (m.find()) { 15 quote = m.group(1); 16 src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("//s+")[0] : m.group(2); 17 imageSrcList.add(src); 18 } 19 return imageSrcList; 20 }
HLb`s签名:给自己一个拼搏的理由,好好的坚持下去。