Java HttpURLConnection 抓取网页内容 解析gzip格式输入流数据并转换为String格式字符串
最近GFW为了刷存在感,搞得大家是头晕眼花,修改hosts 几乎成了每日必备工作。
索性写了一个小程序,分享给办公室的同事们,其中有一项功能就是抓取网络上的hosts,费了一些周折。
我是在一个博客上抓取的。这位朋友的博客应该做了防盗链,不过他的方式比较简单,就是一个5位数的整型随机数,这里折腾一下就OK了。
要命的是,他这个链接返回的流居然是gzip压缩的。这个问题让我郁闷了好久,一直以为是编码格式导致解析不出结果,后来才发现是gzip搞的。
主要的一段代码做个记录吧。
1 /** 2 * 网络工具类 用于抓取http://serve.netsh.org上的hosts数据 3 * 4 * @author tone 5 */ 6 public class NetUtil { 7 8 private final static String ENCODING = "UTF-8"; 9 private final static String GZIPCODING = "gzip"; 10 private final static String HOST = "http://serve.netsh.org/pub/hosts.php"; 11 private final static String COOKIE = "hostspasscode=%s; Hm_lvt_e26a7cd6079c926259ded8f19369bf0b=1421846509,1421846927,1421847015,1421849633; Hm_lpvt_e26a7cd6079c926259ded8f19369bf0b=1421849633"; 12 private final static String OFF = "off"; 13 private final static String ON = "on"; 14 private final static int RANDOM = 100000; 15 private static String hostspasscode = null; 16 private static NetUtil instance; 17 18 public static NetUtil getInstance() { 19 if (instance == null) { 20 instance = new NetUtil(); 21 } 22 return instance; 23 } 24 25 private NetUtil() { 26 hostspasscode = createRandomCookies(); 27 } 28 29 /** 30 * 获取html内容 31 * 32 * @param gs 33 * @param wk 34 * @param twttr 35 * @param fb 36 * @param flkr 37 * @param dpbx 38 * @param odrvB 39 * @param yt 40 * @param nohl 41 * @return 42 */ 43 public String getHtmlInfo(boolean gs, boolean wk, boolean twttr, boolean fb, 44 boolean flkr, boolean dpbx, boolean odrv, 45 boolean yt, boolean nohl) throws Exception { 46 HttpURLConnection conn = null; 47 48 String result = ""; 49 50 //String cookie = "hostspasscode="+hostspasscode+"; Hm_lvt_e26a7cd6079c926259ded8f19369bf0b=1421846509,1421846927,1421847015,1421849633; Hm_lpvt_e26a7cd6079c926259ded8f19369bf0b=1421849633"; 51 String cookie = String.format(COOKIE, hostspasscode); 52 53 //URL url = new URL("http://serve.netsh.org/pub/hosts.php?passcode=13008&gs=on&wk=on&twttr=on&fb=on&flkr=on&dpbx=on&odrv=on&yt=on&nolh=on"); 54 URL url = new URL(createUrl(hostspasscode, gs, wk, twttr, fb, flkr, dpbx, odrv, yt, nohl)); 55 //System.out.println(cookie); 56 // System.out.println(url.toString()); 57 58 conn = (HttpURLConnection) url.openConnection(); 59 60 conn.setConnectTimeout(5 * 1000); 61 
conn.setDoOutput(true); 62 //get方式提交 63 conn.setRequestMethod("GET"); 64 //凭借请求头文件 65 conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); 66 conn.setRequestProperty("Accept-Encoding", "gzip, deflate"); 67 conn.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"); 68 conn.setRequestProperty("Connection", "keep-alive"); 69 conn.setRequestProperty("Cookie", cookie); 70 conn.setRequestProperty("Host", "serve.netsh.org"); 71 conn.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"); 72 73 // conn.setRequestProperty("Referer", "http://serve.netsh.org/pub/gethosts.php"); 74 // conn.setRequestProperty("X-Requested-With", "XMLHttpRequest"); 75 76 conn.connect(); 77 78 String encoding = conn.getContentEncoding(); 79 80 result = readStream(conn.getInputStream(), encoding); 81 //测试进度条显示 82 // result = readStream(new FileInputStream(new File("/home/tone/Resident.Evil.Damnation.2012.1080p.BluRay.x264.DTS-WiKi.mkv")), "11"); 83 84 conn.disconnect(); 85 if (nohl) { 86 result=getLocalHost()+result; 87 } 88 89 return result; 90 } 91 92 /** 93 * 读取将InputStream中的字节读以字符的形式取到字符串中,如果encoding是gzip,那么需要先有GZIPInputStream进行封装 94 * 95 * @param inputStream InputStream字节流 96 * @param encoding 编码格式 97 * @return String类型的形式 98 * @throws IOException IO异常 99 */ 100 private String readStream(InputStream inputStream, String encoding) throws Exception { 101 StringBuffer buffer = new StringBuffer(); 102 ProgressMonitorInputStream pmis = null; 103 104 InputStreamReader inputStreamReader = null; 105 GZIPInputStream gZIPInputStream = null; 106 if (GZIPCODING.equals(encoding)) { 107 gZIPInputStream = new GZIPInputStream(inputStream); 108 inputStreamReader = new InputStreamReader(ProgressUtil.getMonitorInputStream(gZIPInputStream, "获取网络数据"), ENCODING); 109 110 } else { 111 112 inputStreamReader = new InputStreamReader(ProgressUtil.getMonitorInputStream(inputStream, "获取网络数据"), 
ENCODING); 113 } 114 115 116 char[] c = new char[1024]; 117 118 int lenI; 119 while ((lenI = inputStreamReader.read(c)) != -1) { 120 121 buffer.append(new String(c, 0, lenI)); 122 123 } 124 if (inputStream != null) { 125 inputStream.close(); 126 } 127 if (gZIPInputStream != null) { 128 gZIPInputStream.close(); 129 } 130 if (pmis!=null) { 131 gZIPInputStream.close(); 132 } 133 134 135 return buffer.toString(); 136 137 138 } 139 140 /** 141 * 生成随机Cookies数组 142 * 143 * @return 五位随机数字 144 */ 145 private String createRandomCookies() { 146 147 return String.valueOf(Math.random() * RANDOM).substring(0, 5); 148 149 } 150 151 /** 152 * 生成链接字符串 153 * 154 * @param hostspasscode 155 * @param gs 156 * @param wk 157 * @param twttr 158 * @param fb 159 * @param flkr 160 * @param dpbx 161 * @param odrvB 162 * @param yt 163 * @param nohl 164 * @return 165 */ 166 private String createUrl(String hostspasscode, boolean gs, boolean wk, boolean twttr, boolean fb, 167 boolean flkr, boolean dpbx, boolean odrv, 168 boolean yt, boolean nohl) { 169 StringBuffer buffer = new StringBuffer(); 170 buffer.append(HOST); 171 buffer.append("?passcode=" + hostspasscode); 172 if (gs) { 173 buffer.append("&gs=" + ON); 174 } else { 175 buffer.append("&gs=" + OFF); 176 } 177 if (wk) { 178 buffer.append("&wk=" + ON); 179 } else { 180 buffer.append("&wk=" + OFF); 181 } 182 if (twttr) { 183 buffer.append("&twttr=" + ON); 184 } else { 185 buffer.append("&twttr=" + OFF); 186 } 187 if (fb) { 188 buffer.append("&fb=" + ON); 189 } else { 190 buffer.append("&fb=" + OFF); 191 } 192 if (flkr) { 193 buffer.append("&flkr=" + ON); 194 } else { 195 buffer.append("&flkr=" + OFF); 196 } 197 if (dpbx) { 198 buffer.append("&dpbx=" + ON); 199 } else { 200 buffer.append("&dpbx=" + OFF); 201 } 202 if (odrv) { 203 buffer.append("&odrv=" + ON); 204 } else { 205 buffer.append("&odrv=" + OFF); 206 } 207 if (yt) { 208 buffer.append("&yt=" + ON); 209 } else { 210 buffer.append("&yt=" + OFF); 211 } 212 if (nohl) { 213 
buffer.append("&nohl=" + ON); 214 } else { 215 buffer.append("&nohl=" + OFF); 216 } 217 return buffer.toString(); 218 } 219 220 private String getLocalHost() throws Exception { 221 222 StringBuffer buffer=new StringBuffer(); 223 String hostName=OSUtil.getInstance().getLocalhostName(); 224 buffer.append("#LOCALHOST begin"+"\n"); 225 buffer.append("127.0.0.1\tlocalhost"+"\n"); 226 if (hostName!=null&&!"".equals(hostName)) { 227 buffer.append("127.0.1.1\t"+hostName+"\n"); 228 } 229 230 buffer.append("#LOCALHOST end"+"\n"); 231 return buffer.toString(); 232 233 234 235 } 236 237 }
转载请注明出处:http://www.cnblogs.com/bcsflilong/