【Java】抓取页面内容并提取链接(此方法适用于无需账号密码的 HTTP GET 请求)
package 网络编程;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Downloads the page at {@code http://www.baidu.com} and saves it as
 * {@code baidu.html} in the current working directory.
 */
public class TestBaidu {

    /**
     * Fetches the page, decodes it as UTF-8 and writes it line by line to
     * {@code baidu.html}, also encoded as UTF-8. Every line is terminated with
     * the platform line separator.
     *
     * @param args unused
     * @throws IOException if the page cannot be read or the file cannot be written
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.baidu.com");

        // The stream is decoded through a Reader with an explicit charset. An earlier
        // attempt that printed new String(buffer, 0, len) for raw 1024-byte chunks
        // produced garbled text -- presumably because that constructor uses the
        // platform default charset and a chunk boundary can split a multi-byte
        // character.
        //
        // try-with-resources closes (and therefore flushes) both streams even when
        // reading or writing fails part-way through.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(url.openStream(), StandardCharsets.UTF_8));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream("baidu.html"), StandardCharsets.UTF_8))) {
            String str;
            while ((str = br.readLine()) != null) {
                bw.write(str);
                bw.newLine();
            }
        }
    }
}
package 网络编程;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the page at {@code http://www.163.com} and prints every
 * double-quoted {@code http://} link found in it, one per line.
 */
public class Get163URL {

    /**
     * Matches a double-quoted string that starts with {@code http://}; group 1 is
     * the text between the quotes. Links using any other scheme (for example
     * {@code https://}) are not matched.
     */
    private static final Pattern QUOTED_HTTP_URL = Pattern.compile("\"(http:\\/\\/.+?)\"");

    /**
     * Fetches the page, decodes it as GBK and prints each matched link to
     * standard output.
     *
     * @param args unused
     * @throws IOException if the page cannot be read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.163.com");
        String page = readPage(url, Charset.forName("gbk"));
        printLinks(page);
    }

    /**
     * Reads the whole response into one string. Line terminators are dropped,
     * so the lines are joined with nothing between them.
     */
    private static String readPage(URL url, Charset charset) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources releases the connection stream even if reading fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream(), charset))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                sb.append(tmp);
            }
        }
        return sb.toString();
    }

    /** Prints group 1 of every {@link #QUOTED_HTTP_URL} match in {@code page}. */
    private static void printLinks(CharSequence page) {
        Matcher m = QUOTED_HTTP_URL.matcher(page);
        while (m.find()) {
            System.out.println(m.group(1));
        }
    }
}
/**
 * Logs in to a wiki through its {@code login.action} form and prints the body
 * of the response that comes back.
 *
 * <p>A default {@link CookieManager} is installed so that cookies set by the
 * login response are kept for the rest of the JVM's lifetime.
 */
public class WikiDownload {
    // NOTE(review): placeholder credentials compiled into the class; they are sent
    // over plain http:// below. Confirm this is only meant for a trusted network.
    static final String name = "username";
    static final String pwd = "password";

    /** Charset name used for the form body, URL-encoding and response decoding. */
    private static final String ENCODING = "utf-8";

    /**
     * Posts the login form and prints the response body to standard output.
     * Any failure is reported by printing the stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        CookieManager manager = new CookieManager();
        CookieHandler.setDefault(manager);
        String wikiUrl = "http://wiki.xxxxx.org/pages/viewpage.action?pageId=71709153";
        String loginUrl = "http://wiki.xxxxx.org/login.action?os_destination=%2Fpages%2Fviewpage.action%3FpageId%3D71709153";
        try {
            URL url = new URL(loginUrl);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");
            connection.setDoInput(true);
            connection.setDoOutput(true);
            connection.setUseCaches(false);
            connection.setRequestMethod("POST");

            // Explicit charset: without it the writer falls back to the platform default.
            try (OutputStreamWriter writer = new OutputStreamWriter(connection.getOutputStream(), ENCODING)) {
                writer.write(buildLoginForm(name, pwd, wikiUrl));
            }

            // Assumes the server answers in UTF-8 -- TODO confirm against the
            // response Content-Type header.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), ENCODING))) {
                StringBuilder result = new StringBuilder();
                String line;
                while ((line = in.readLine()) != null) {
                    // Each line is preceded by a newline, so the output starts with a blank line.
                    result.append("\n");
                    result.append(line);
                }
                System.out.println(result);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the {@code application/x-www-form-urlencoded} body for the login form.
     *
     * <p>{@code os_destination} is the path and query of {@code pageUrl}. The
     * previous code took element {@code [0]} of
     * {@code pageUrl.split("http://wiki.xxxxx.org")}, which is the empty string
     * before the host prefix, so the destination was always sent empty. The
     * user name and password are URL-encoded so that characters such as
     * {@code &} or {@code =} cannot corrupt the form.
     *
     * @param user     value for {@code os_username}
     * @param password value for {@code os_password}
     * @param pageUrl  absolute URL of the page to open after logging in
     * @return the encoded form body
     * @throws java.io.IOException if {@code pageUrl} is not a valid URL or the
     *                             encoding is unsupported
     */
    static String buildLoginForm(String user, String password, String pageUrl) throws java.io.IOException {
        // URL.getFile() is the path plus "?query", e.g. "/pages/viewpage.action?pageId=1".
        String destination = new URL(pageUrl).getFile();
        return "os_username=" + URLEncoder.encode(user, ENCODING)
                + "&os_password=" + URLEncoder.encode(password, ENCODING)
                + "&login=%E7%99%BB%E5%BD%95&os_destination="
                + URLEncoder.encode(destination, ENCODING);
    }
}