基于jsoup的Java服务端http(s)代理程序-代理服务器Demo
亲爱的开发者朋友们,你们知道百度的网址翻译功能吗?它为什么能够翻译源网页呢?由于浏览器的同源策略,页面脚本无法跨域操作 iframe 中的内容,因此可以通过服务端代理把目标网页转换成同域内容来实现。直接上代码:
本 Demo 是基于 MVC 编写的,非常简单,复制过去稍作修改就可以使用。
//Action层 /** * 网址翻译代理服务器接口层 * @Description: 此接口层可完成对所请求网址的代理,实现同域访问 * @author zhanglongping * @CreateDate: 2016-8-23 上午10:52:49 */ @At("/proxy") public class ProxyModule { /** * 获取网页 * @return * @author zhanglongping * @date 2016-8-23 上午10:54:13 */ @At("/gethtml") @Ok("Raw") @Authority("") public Object gethtml(@Param("yeekit_proxy_url") String url,HttpServletRequest request, HttpServletResponse response){ try { String path = request.getContextPath(); String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; String html = new ProxyUtils().getUrlMap(url,basePath); // return html; InputStream is = new StringInputStream(html); BufferedReader in = new BufferedReader(new InputStreamReader(is,"UTF-8")); String line; PrintWriter out = response.getWriter(); while ((line = in.readLine()) != null) { out.println(line); } out.flush(); in.close(); } catch (Exception e) { e.printStackTrace(); } return null; } /** * 使用GET提交到目标服务器。 * * @param request * @param response * @param targetUrl * @throws IOException */ @At("/forward") @Ok("Raw") @Authority("") public Object urlRedirect(@Param("yeekit_proxy_url") String targetUrl,HttpServletRequest request, HttpServletResponse response) throws IOException { if(targetUrl.endsWith(".htm") || targetUrl.endsWith(".html") || targetUrl.endsWith(".shtml")){ try { String path = request.getContextPath(); String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; String html = new ProxyUtils().getUrlMap(targetUrl,basePath); // return html; InputStream is = new StringInputStream(html); BufferedReader in = new BufferedReader(new InputStreamReader(is,"UTF-8")); String line; PrintWriter out = response.getWriter(); while ((line = in.readLine()) != null) { out.println(line); } out.flush(); in.close(); } catch (Exception e) { e.printStackTrace(); // return null; } }else if(targetUrl.endsWith(".css") || targetUrl.endsWith(".js") || targetUrl.endsWith(".jpg")|| 
targetUrl.endsWith(".png") || targetUrl.endsWith(".svg") || targetUrl.endsWith(".gif")){ String fileName = targetUrl.split("/")[targetUrl.split("/").length-1]; // response.setHeader("Content-Disposition", "attachment; filename=" // + java.net.URLEncoder.encode(fileName, "UTF-8")); //图片的名称 String imgName = fileName; //名称转码,避免中文乱码 imgName = new String(imgName.getBytes("iso8859-1"),"UTF-8"); //图片的资源地址,http://10.80.3.229:8081/mediaserver/574fe515e30ab97c9068d2e1 //这是媒体服务器返回的地址,因为是网络地址,所以需要使用HttpURLConnection去获取图片 String imgUrl = targetUrl; //输入流,用来读取图片 InputStream ins = null; HttpURLConnection httpURL = null; try{ URL url = new URL(imgUrl); //打开一个网络连接 httpURL = (HttpURLConnection)url.openConnection(); //设置网络连接超时时间 httpURL.setConnectTimeout(3000); //设置应用程序要从网络连接读取数据 httpURL.setDoInput(true); //设置请求方式 httpURL.setRequestMethod("GET"); //获取请求返回码 int responseCode = httpURL.getResponseCode(); if(responseCode == 200){ //如果响应为“200”,表示成功响应,则返回一个输入流 ins = httpURL.getInputStream(); //设置response响应头 //encodeChineseDownloadFileName()用来解决文件名为中文的问题,方法体在下面 if(fileName.indexOf(".css")>-1){ response.setContentType("text/css"); } response.setHeader("content-disposition", "attachment;filename="+ ProxyUtils.encodeChineseDownloadFileName(request,imgName)); //输出流到response中 byte[] data = new byte[1024]; int len = 0; //输出流 OutputStream out = response.getOutputStream(); while((len = ins.read(data)) > 0){ out.write(data, 0, len); } out.flush(); ins.close(); } }catch(Exception e){ System.out.println("下载附件图片出错!"+targetUrl); e.printStackTrace(); } } return null; }
工具类
import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.servlet.http.HttpServletRequest; import org.apache.commons.codec.binary.Base64; import org.apache.commons.lang3.StringUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; /** * 译库网址翻译代理服务工具类 * * @Description:包含:提取HTML中网址,并转换为代理的网址服务地址; * @author zhanglongping * @CreateDate: 2016-8-23 上午10:15:08 * @UpdateUser: zhanglongping * @UpdateDate: 2016-8-23 上午10:15:08 * @UpdateRemark: 说明本次修改内容 */ public class ProxyUtils { // public static void main(String[] args) throws IOException { //// ProxyUtils pu = new ProxyUtils(); //// pu.getUrlMap("http://english.cas.cn"); // Connection conn = Jsoup.connect("http://www.bbc.com"); // Document doc_one = conn.get(); // System.out.println(doc_one); // } /** * 获取url哈希:key:源url value:代理url * @param url * @author zhanglongping * @date 2016-8-23 上午10:42:41 */ public String getUrlMap(String url,String basePath){ // String url_protocol = "",url_host = ""; try { //特殊网址转换 url = transformation(url); URL urlcurr = new URL(url); // url_protocol = urlcurr.getProtocol(); // url_host = urlcurr.getHost(); String hostname = urlcurr.getProtocol()+"://"+urlcurr.getHost(); // String proxyHost = basePath; // String proxyHostName = proxyHost+"proxy/forward?yeekit_proxy_url="; Document doc_one; Connection conn = Jsoup.connect(hostname); doc_one = conn.get(); doc_one.setBaseUri(hostname); // Elements links = doc_one.select("a[href]"); // Elements media = doc_one.select("[src]"); // Elements imports = doc_one.select("link[href]"); Elements head = doc_one.select("meta"); head.get(0).before("<base href=\""+hostname+"/"+"\" />"); //鼠标悬停翻译js脚本注入 //悬停脚本引用 String hover_js = "<script src=\""+basePath+"/yeekit_translate_url/js/yeekit_hover_trans.js\" 
type=\"text/javascript\"></script>"; String jquery_js = "<script src=\"http://cdn.bootcss.com/jquery/3.1.0/jquery.min.js\" type=\"text/javascript\"></script>"; head.get(0).after(jquery_js + hover_js); // for (Element src : media) { // String key = src.attr("abs:src"); // src.attr("src", proxyHostName+key); // } // // for (Element link : imports) { // String key = link.attr("abs:href"); // link.attr("href", proxyHostName+key); // } // // for (Element link : links) { // String key = link.attr("abs:href"); // link.attr("href", proxyHostName+key); // } String dochtml = doc_one.html().toString(); //增强型处理 - 处理js脚本里静态资源地址引用 // List<String> list_src_img = getImgSrc(dochtml); // for(String src:list_src_img){ // if(src.indexOf("./") > -1){ // dochtml = dochtml.replaceAll(src, proxyHostName+hostname+src.substring(1)); // } // } // System.out.println(dochtml); return dochtml; } catch (IOException e) { e.printStackTrace(); return null; } } /** * 内容获取 * @return * @author zhanglongping * @throws IOException * @date 2016-8-30 下午5:44:31 */ public String get_https_html(String url) throws IOException{ URL urlcurr = new URL(url); String hostname = urlcurr.getProtocol()+"://"+urlcurr.getHost(); Document doc_one; Connection conn = Jsoup.connect(hostname); doc_one = conn.post(); doc_one.setBaseUri(hostname); Elements head = doc_one.select("meta"); head.get(0).before("<base href=\""+hostname+"/"+"\" />"); String dochtml = doc_one.html().toString(); return dochtml; } /* * 解决文件为中文名的乱码问题 */ public static String encodeChineseDownloadFileName(HttpServletRequest request, String pFileName) throws UnsupportedEncodingException{ String filename = null; //获取请求头中的浏览器标识 String agent = request.getHeader("USER-AGENT"); if(agent != null){ if(agent.indexOf("Firefox") != -1){ //Firefox filename = "=?UTF-8?B?" 
+ (new String(Base64.encodeBase64(pFileName.getBytes("UTF-8")))) + "?="; }else if(agent.indexOf("Chrome") != -1){ //Chrome filename = new String(pFileName.getBytes(), "ISO8859-1"); }else{ //IE7+ filename = URLEncoder.encode(pFileName, "UTF-8"); //替换空格 filename = StringUtils.replace(filename, "+", "%20"); } }else{ filename = pFileName; } return filename; } /** * 获取img标签中的src值 * @param content * @return */ public List<String> getImgSrc(String content){ List<String> list = new ArrayList<String>(); //目前img标签标示有3种表达式 //<img alt="" src="1.jpg"/> <img alt="" src="1.jpg"></img> <img alt="" src="1.jpg"> //开始匹配content中的<img />标签 Pattern p_img = Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)"); Matcher m_img = p_img.matcher(content); boolean result_img = m_img.find(); if (result_img) { while (result_img) { //获取到匹配的<img />标签中的内容 String str_img = m_img.group(2); //开始匹配<img />标签中的src Pattern p_src = Pattern.compile("(src|SRC)=(\"|\')(.*?)(\"|\')"); Matcher m_src = p_src.matcher(str_img); if (m_src.find()) { String str_src = m_src.group(3); list.add(str_src); } //结束匹配<img />标签中的src //匹配content中是否存在下一个<img />标签,有则继续以上步骤匹配<img />标签中的src result_img = m_img.find(); } } return list; } /** * 特殊网址转换 * @param url * @return * @author zhanglongping * @date 2016-8-30 下午6:18:48 */ public String transformation(String url){ //百度的二级域名www.baidu.com重定向存在问题 if(url.equals("http://www.baidu.com")){ url = "http://baidu.com"; } return url; } }