crawler_httpurlconnection_自动编码识别
核心思想:
1:从响应头中读取 【命中解流准确率最高】
2:如果响应头中没有,打开流从源码中读取。【取舍:如果有,一般出现在前30行,因此只在前100行中寻找】
3:如果还没有,根据字节流开头的字节序标记(BOM)识别。【根据前三个字节推测】
4:最终依旧没有命中,采用大陆国标编码【概率接近于0 ,gb2312】
综合效果,尚无测试到编码有问题的站点。
1 /** 2 * @declare:下载 自动识别编码 3 * @param url 4 * @return 5 * @author cphmvp 6 */ 7 public static StringBuffer downloadHtmlAutoCode(String url) { 8 StringBuffer sb = new StringBuffer(); 9 BufferedReader bufferReader = null; 10 InputStream inputStream = null; 11 BufferedInputStream bufferedInputStream = null; 12 int tryNum = 0; 13 while (true) { 14 try { 15 if (tryNum > 1) { 16 String ecodingUrl = encodParamters(url); 17 urlModel = new URL(ecodingUrl); 18 } else { 19 urlModel = new URL(url); 20 } 21 httpURLConnection = (HttpURLConnection) urlModel 22 .openConnection(); 23 httpURLConnection.setConnectTimeout(connectTimeout); 24 httpURLConnection.setReadTimeout(readTimeout); 25 // httpURLConnection.setInstanceFollowRedirects(false); 26 // httpURLConnection.setFollowRedirects(true); 27 httpURLConnection 28 .setRequestProperty("User-Agent", 29 "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"); 30 String redirectUrl = httpURLConnection.getURL().toString(); 31 if (!redirectUrl.equals(url)) { 32 LOG.info(url + "重定向后为" + redirectUrl); 33 } 34 // 得到响应流 35 inputStream = httpURLConnection.getInputStream(); 36 if (null == inputStream) 37 continue; 38 String charSetHeader = httpURLConnection 39 .getHeaderField("Content-Type"); 40 bufferedInputStream = new BufferedInputStream(inputStream); 41 String charSet = null; 42 // 第一步先从响应头header判断 43 if (charSetHeader != null) { 44 Pattern p = Pattern.compile("charset=[\"']?(.*)['\"]?"); 45 Matcher m = p.matcher(charSetHeader); 46 if (m.find()) { 47 charSet = m.group(1).trim(); 48 } 49 } 50 // System.out.println(bufferedInputStream.available() > 0); 51 // System.out.println(bufferedInputStream.markSupported()); 52 // 第二步 从源码中【meta http-equiv="content-type" 】判断 53 // if (null == charSet) { 54 // charSet = getEncode(bufferedInputStream); 55 // System.out.println("---->charSet: 读流识别出来的编码" + charSet); 56 // } 57 58 // 排除非html格式 只有一两行的状况 59 if (null == charSet 60 && charSetHeader.toLowerCase().contains("html")) { 61 // 缓冲区设置大些, 
read走的信息小于 这个值,就能reset 回来。 62 bufferedInputStream.mark(102400); 63 bufferReader = new BufferedReader(new InputStreamReader( 64 bufferedInputStream)); 65 int lineNum = 1; 66 String inputLine; 67 // reset 在读至流的末尾是无法生效,故限制前100行找,找不到 放弃 68 while ((inputLine = bufferReader.readLine()) != null 69 && lineNum < 100) { 70 if (inputLine.toLowerCase().contains("charset")) { 71 charSet = RegexUtils.getString(inputLine, 72 "charset=[\"']?(.*?)[\"']", 1); 73 LOG.info("自动识别出编码:" + charSet); 74 // 第一次匹配到后 ,不再往下判断,减少判断行数,及误判概率 75 break; 76 } 77 lineNum++; 78 inputLine = null; 79 } 80 // 第三步奏 穿插补录步奏 81 if (null == charSet) { 82 byte[] head = new byte[3]; 83 bufferedInputStream.read(head); 84 if (head[0] == -1 && head[1] == -2) 85 charSet = "UTF-16"; 86 if (head[0] == -2 && head[1] == -1) 87 charSet = "Unicode"; 88 if (head[0] == -17 && head[1] == -69 && head[2] == -65) 89 charSet = "UTF-8"; 90 } 91 92 // 通道回溯 93 bufferedInputStream.reset(); 94 } 95 96 // 第四步奏指向默认 utf-8 97 charSet = (charSet == null ? defaultEncoding : charSet); 98 // 第五步奏按照正确编码解码响应流 99 bufferReader = new BufferedReader(new InputStreamReader( 100 bufferedInputStream, charSet)); 101 String inputLine; 102 while ((inputLine = bufferReader.readLine()) != null) { 103 sb.append(inputLine + "\n"); 104 inputLine = null; 105 } 106 if (bufferReader != null) 107 try { 108 bufferReader.close(); 109 } catch (IOException e) { 110 LOG.error(e); 111 } 112 if (httpURLConnection != null) 113 httpURLConnection.disconnect(); 114 break; 115 } catch (Exception e) { 116 if (tryNum++ == 3) { 117 LOG.error("download page error [ " + urlModel + " ] "); 118 return null; 119 } 120 LOG.warn(tryNum + "次下载失败"); 121 } 122 } 123 return sb; 124 125 }
create by cphmvp
email:cphmvp@163.com
爬虫技术交流_crawler QQ群 :167047843