一个爬取喜马拉雅音频的示例
不废话了，直接上代码：
import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.net.URL; import java.net.URLConnection; import com.yeepay.g3.utils.common.json.JSONUtils; public class MyHttpClient { public static void main(String[] args) throws ClientProtocolException, IOException { HttpClient hClient = new DefaultHttpClient(); //http://www.ximalaya.com/32160470/album/2881558/ //http://www.ximalaya.com/tracks/61185755.json String mainU = "http://www.ximalaya.com/32160470/album/2881558?page="; for(int i=1;i<=1;i++){ int m = mainU.indexOf("page="); mainU = mainU.substring(0,m+5); mainU = mainU+i; System.out.println(mainU); HttpGet hget = new HttpGet(mainU); HttpResponse response = hClient.execute(hget); // EntityUtils工具类把网页实体转换成字符串 String content = EntityUtils.toString(response.getEntity(), "utf-8"); Document doc = Jsoup.parse(content); Elements elements=doc.select("a[class='title']"); for(Element ele:elements){ String dateStr = ele.nextElementSibling().text(); System.out.println(dateStr); String dirName = "/Users/yp-tc-m-2777/Desktop/testNewP/"+dateStr.substring(0,7); System.out.println(dirName); File filed = new File(dirName); if(!filed.exists()){ filed.mkdir(); } String id = ele.attr("href").split("/")[3]; System.out.println(id); // /32160470/sound/68215809/ String url = "http://www.ximalaya.com/tracks/{id}.json"; url = url.replace("{id}", id); 
System.out.println(url); System.out.println(ele.text()); System.out.println(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())); String fileName = ele.text(); File file = new File(dirName+File.separator+dateStr+" "+fileName+".m4a"); if(file.exists()){ continue; } //hClient = new DefaultHttpClient(); HttpGet r = new HttpGet(url); HttpResponse r1 = hClient.execute(r); String r2 = EntityUtils.toString(r1.getEntity(), "utf-8"); Map<String,String> map = JSONUtils.jsonToMap(r2, String.class, String.class); String downUrl = map.get("play_path_64"); URL url1 = new URL(downUrl);; URLConnection urlconn = url1.openConnection(); InputStream inputStream = urlconn.getInputStream(); //获取自己数组 byte[] getData = readInputStream(inputStream); FileOutputStream fos = new FileOutputStream(file); fos.write(getData); if(fos!=null){ fos.close(); } if(inputStream!=null){ inputStream.close(); } } } // System.out.println(content); } public static byte[] readInputStream(InputStream inputStream) throws IOException { byte[] buffer = new byte[1024]; int len = 0; ByteArrayOutputStream bos = new ByteArrayOutputStream(); while((len = inputStream.read(buffer)) != -1) { bos.write(buffer, 0, len); } bos.close(); return bos.toByteArray(); } }