解决 Java 通过 URL 读取 HTML 源码时的乱码问题
1、首先调用 getContentType() 取得响应的 Content-Type 头,其中通常带有网页的编码方式(charset):
// Parse the address and open a connection to it.
pageUrl=new URL(urlString);
HttpURLConnection uc = (HttpURLConnection) pageUrl.openConnection();
// At this point "encoding" holds the whole Content-Type value, not just the
// charset name; the charset part is cut out in the next step.
// NOTE(review): getContentType() is documented to return null when the type
// is unknown - confirm callers can tolerate that.
String encoding=uc.getContentType();
2、再提取 charset 子串(这里直接查找 "charset=";但 Content-Type 中的参数名大小写不敏感,而且响应头可能根本不带 charset,此时 indexOf 返回 -1,会截出错误的编码名,所以最好改用忽略大小写的正则表达式,并在取不到时使用默认编码)
// Keep everything that follows the literal "charset=" (8 characters long).
// The search is case-sensitive; when the literal is absent indexOf returns -1,
// so the substring then starts at index 7 of the Content-Type value.
encoding=encoding.substring(encoding.indexOf("charset=")+8).trim();
//System.out.println("+"+encoding+"+");
// Create the network stream, decoding bytes with the extracted encoding name.
// Note that this reads from pageUrl.openStream(), not from the "uc" connection.
BufferedReader reader=
new BufferedReader(new InputStreamReader(pageUrl.openStream(),encoding));
3、下面是完整的源代码,注释得很清楚:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the text behind a URL and decodes it with the charset announced in
 * the response's Content-Type, so that non-ASCII pages are not garbled.
 *
 * <p>The page is fetched once, inside the constructor. Failures (malformed URL,
 * I/O error) are reported with {@code printStackTrace()} and leave the buffer
 * with whatever was read before the failure, possibly nothing.
 *
 * <p>The connection is handled as a plain {@link URLConnection}; it does not
 * have to be an {@link HttpURLConnection}, so {@code file:} URLs work as well.
 */
public class PageString {

    /** Charset used when the Content-Type is missing, has no charset, or names an unknown one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

    /**
     * Matches the {@code charset} parameter of a Content-Type value. The parameter
     * name is case-insensitive, the value may be quoted, and further parameters may
     * follow after a semicolon.
     */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Page text; every line read is stored followed by {@code "\t\n"}. */
    private final StringBuffer strBuf = new StringBuffer();

    /** The parsed URL, or {@code null} if {@code urlString} was malformed. */
    private URL pageUrl = null;

    /**
     * Fetches and decodes the page at {@code urlString}.
     *
     * @param urlString absolute URL of the page to read
     */
    public PageString(String urlString) {
        try {
            pageUrl = new URL(urlString);
            URLConnection uc = pageUrl.openConnection();
            Charset charset = resolveCharset(uc.getContentType());
            // Read from the same connection that supplied the Content-Type instead of
            // opening the URL a second time; try-with-resources closes the stream.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(uc.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    strBuf.append(line).append("\t\n");
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the buffer holding the downloaded text.
     *
     * @return the internal buffer (not a copy); empty if nothing could be read
     * @throws UnsupportedEncodingException never thrown by this implementation; kept
     *         in the signature so existing callers continue to compile
     */
    public StringBuffer getStrBuf() throws UnsupportedEncodingException {
        return this.strBuf;
    }

    /**
     * Picks the charset named in a Content-Type value.
     *
     * @param contentType raw Content-Type value; may be {@code null}
     * @return the named charset, or {@link #DEFAULT_CHARSET} when none is named or
     *         the name is illegal or unsupported on this JVM
     */
    private static Charset resolveCharset(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        Matcher m = CHARSET_PARAM.matcher(contentType);
        if (!m.find()) {
            return DEFAULT_CHARSET;
        }
        try {
            return Charset.forName(m.group(1));
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            return DEFAULT_CHARSET;
        }
    }
}