抓网页_面包网_javaSE
1、http://ajax.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-1.html
2、每一个 <li/>里面的内容 分类:
2.1、详细页面 链接
2.2、图片 (图片的地址 有些不对,需要做一些处理)
2.3、评分
2.4、影片名称
2.5、主演 (这里有一个HTML字符“·”)(HTML转义字符对照表 http://tool.oschina.net/commons?type=2)
2.6、类型
2.7、更新时间
2.8、剧情
2.9、状态
3、代码:
3.1、测试类
package test; import mianBao.TmovieMianBao; import z_utils.TzHttpClient; public class Ttest01 { public static void main(String[] args) throws Exception { String strHtml = TzHttpClient.GetZ("http://ajax.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-4.html"); //System.out.println(strHtml); TmovieMianBao movie = new TmovieMianBao(); movie.FstrJson = strHtml; movie.JsonZ(); } }
3.2、TzHttpClient
package z_utils; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; @SuppressWarnings("deprecation") public class TzHttpClient { public static void main(String[] args) throws Exception { String strRtn = PostZ( "http://ajax.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-2.html", null, true); System.out.println(strRtn); } // *** public static String PostZ(String _strUrl, String _strParam, boolean _bNeedResponse) throws Exception { DefaultHttpClient httpClient = null; try { //post请求返回结果 httpClient = new DefaultHttpClient(); HttpPost method = new HttpPost(_strUrl); if (null != _strParam) { //解决中文乱码问题 StringEntity entity = new StringEntity(_strParam, "utf-8"); entity.setContentEncoding("UTF-8"); entity.setContentType("application/json"); method.setEntity(entity); } HttpResponse result = httpClient.execute(method); /**请求发送成功,并得到响应**/ if (result.getStatusLine().getStatusCode() == 200) { if (! _bNeedResponse) return null; String str = EntityUtils.toString(result.getEntity()); //System.out.println(str); return str; } return null; } finally { if (httpClient != null) httpClient.close(); } } public static String GetZ(String _strUrl) throws Exception { DefaultHttpClient client = null; try { client = new DefaultHttpClient(); //发送get请求 HttpGet request = new HttpGet(_strUrl); HttpResponse response = client.execute(request); /**请求发送成功,并得到响应**/ if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { /**读取服务器返回过来的json字符串数据**/ String strResult = EntityUtils.toString(response.getEntity()); //System.out.println(strResult); return strResult; } System.out.println("get请求提交失败:" + _strUrl); return null; } finally { if (client != null) client.close(); } } }
3.3、面包网电影信息
package mianBao;

/**
 * Plain data holder for one movie entry scraped from a mianbao99 listing page.
 * Every field starts out as {@code null} until a parser fills it in.
 */
public class TinfoMianBaoMovie
{
    /** Link to the movie's "details" page. */
    public String Fstr01_XiangQingLink;

    /** Link to the poster image. */
    public String Fstr02_PicLink;

    /** Score the site gives this movie. */
    public String Fstr03_Score;

    /** Movie title. */
    public String Fstr04_MovieName;

    /** Name(s) of the leading actor(s). */
    public String Fstr05_ZhuYan;

    /** Genre(s) of the movie. */
    public String Fstr06_Type;

    /**
     * Time the site last updated this entry. This is not the release date,
     * which is only shown on the movie's "details" page.
     */
    public String Fstr07_UpdateDateTime;

    /** Plot summary (possibly truncated). */
    public String Fstr08_JuQing;

    /** Availability state of the movie (no address yet / BD / HD / TC, etc.). */
    public String Fstr09_State;
}
3.4、面包网 电影信息 获取
package mianBao; import java.util.ArrayList; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.parser.Parser; import org.jsoup.select.Elements; import net.sf.json.JSONObject; public class TmovieMianBao { // 站内搜索(搜索界面) // http://find.mianbao99.com/vod-search // 站内搜索的网址(最后的"AAA"是搜索的内容) // http://zhannei.baidu.com/cse/search?s=15458377474113538507&isNeedCheckDomain=1&jump=1&q=AAA // ZC疑问:百度有向外提供搜索引擎的服务?通过s来区分不同的用户? // 相关教程:http://jingyan.baidu.com/article/7908e85c8f0c36af481ad22d.html // 动作片:http://www.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-1.html // 喜剧片:http://www.mianbao99.com/vod-showlist-id-9-order-time-c-4058-p-1.html // 爱情片:http://www.mianbao99.com/vod-showlist-id-10-order-time-c-2059-p-1.html // 科幻片:http://www.mianbao99.com/vod-showlist-id-11-order-time-c-1098-p-1.html // 恐怖片:http://www.mianbao99.com/vod-showlist-id-12-order-time-c-3110-p-1.html // 战争片:http://www.mianbao99.com/vod-showlist-id-13-order-time-c-736-p-1.html // 剧情片:http://www.mianbao99.com/vod-showlist-id-14-order-time-c-8713-p-1.html // 纪录片:http://www.mianbao99.com/vod-showlist-id-23-order-time-c-1549-p-1.html // 粤语片:http://www.mianbao99.com/vod-showlist-id-27-order-time-c-208-p-1.html // 动画电影:http://www.mianbao99.com/vod-showlist-id-31-order-time-c-658-p-1.html public static void main(String[] args) { } // *** public String FstrJson = null; String FstrPagesx = null; String FstrAjaxtxt = null; public int JsonZ() { JSONObject jsoupObj = JSONObject.fromObject(FstrJson); {/* Iterator<String> itKeys = jsoupObj.keys(); while (itKeys.hasNext()) { String strKey = itKeys.next(); if (! strKey.equalsIgnoreCase("ajaxtxt")) { String strValue = jsoupObj.getString(strKey); System.out.println("\n\nValue["+ strKey +"] : "); System.out.println(strValue); } } //*/} //* if (! 
jsoupObj.containsKey("ajaxtxt")) return -1; FstrPagesx = jsoupObj.getString("pagesx"); FstrAjaxtxt = jsoupObj.getString("ajaxtxt"); Json_pagesx(); Json_ajaxtxt(); return 0; } public String FstrPageIdxNow = null; public String FstrPageIdxTotal = null; public int Json_pagesx() { Document doc = Jsoup.parse(FstrPagesx, "", Parser.xmlParser()); Element eleLabel = doc.child(0); String str = eleLabel.text(); int iIdx = str.indexOf('/'); FstrPageIdxNow = str.substring(0, iIdx); FstrPageIdxTotal = str.substring(iIdx+1, str.length()); //System.out.println("当前页序号 : "+FstrPageIdxNow); //System.out.println("一共"+FstrPageIdxTotal+"页"); return 0; } public ArrayList<TinfoMianBaoMovie> FlistMovieInfo = null; public int Json_ajaxtxt() { Document doc = Jsoup.parse(FstrAjaxtxt, "", Parser.xmlParser()); Elements children = doc.children(); if ( (children == null) || (children.size() == 0) ) { System.out.println("Document has no child ."); return -1; } if (FlistMovieInfo == null) FlistMovieInfo = new ArrayList<TinfoMianBaoMovie>(); else FlistMovieInfo.clear(); for (int i=0; i<children.size(); i++) { TinfoMianBaoMovie movieInfo = new TinfoMianBaoMovie(); FlistMovieInfo.add(movieInfo); Element eleLi = children.get(i); Elements children01 = eleLi.children(); if (children01 == null) { System.out.println("<li>["+i+"] has no child ."); continue; } else if (children01.size() != 2) { System.out.println("<li>["+i+"] children's num is not 2 : "+children01.size()+" ."); continue; } Element eleA = children01.get(0); Element eleDiv = children01.get(1); Info1_3(movieInfo, eleA); Info4_9(movieInfo, eleDiv); } /* for (int i=0; i<FlistMovieInfo.size(); i++) { TinfoMianBaoMovie movieInfo = FlistMovieInfo.get(i); System.out.println(movieInfo.Fstr04_MovieName); System.out.println("\t“详情”页面 的链接 : "+movieInfo.Fstr01_XiangQingLink); System.out.println("\t图片的链接 : "+movieInfo.Fstr02_PicLink); System.out.println("\t评分 : "+movieInfo.Fstr03_Score); System.out.println("\t主演 : "+movieInfo.Fstr05_ZhuYan); 
System.out.println("\t类型 : "+movieInfo.Fstr06_Type); System.out.println("\t更新时间 : "+movieInfo.Fstr07_UpdateDateTime); System.out.println("\t剧情 : "+movieInfo.Fstr08_JuQing); System.out.println("\t状态 : "+movieInfo.Fstr09_State); } //*/ return 0; } public static int Info1_3(TinfoMianBaoMovie _movieInfo, Element _eleA) { _movieInfo.Fstr01_XiangQingLink = _eleA.attr("href"); Element eleImg = _eleA.child(0); if (eleImg != null) { _movieInfo.Fstr02_PicLink = eleImg.attr("src"); // 图片链接,需要做处理 PicLink(_movieInfo); } Element eleLabel = _eleA.child(1); if (eleLabel != null) _movieInfo.Fstr03_Score = eleLabel.text(); return 0; } public static int PicLink(TinfoMianBaoMovie _movieInfo) { // http://img.mianbao99.com:88/thumb/580360837a4e0.jpg String strUrl = "img.mianbao99.com:88"; int iIdx01 = _movieInfo.Fstr02_PicLink.indexOf("//"); int iIdx02 = _movieInfo.Fstr02_PicLink.indexOf('/', iIdx01+2); if (iIdx02 <= iIdx01) return -1; String str01 = _movieInfo.Fstr02_PicLink.substring(0, iIdx01+2); //String str02 = _movieInfo.Fstr02_PicLink.substring(iIdx01+2, iIdx02); String str03 = _movieInfo.Fstr02_PicLink.substring(iIdx02); // System.out.println(iIdx01+" --> "+iIdx02); // System.out.println("\t"+str01); // System.out.println("\t"+str02); // System.out.println("\t"+str03); _movieInfo.Fstr02_PicLink = str01 + strUrl + str03; //System.out.println(_movieInfo.Fstr02_PicLink); return 0; } public static int Info4_9(TinfoMianBaoMovie _movieInfo, Element _eleDiv) { { Element eleH5 = _eleDiv.child(0); Element eleA = eleH5.child(0); _movieInfo.Fstr04_MovieName = eleA.text(); } { Element eleP_ZhuYan = _eleDiv.child(1); _movieInfo.Fstr05_ZhuYan = ""; Elements chiledren = eleP_ZhuYan.children(); for (int i=1; i<chiledren.size(); i++) { Element child = chiledren.get(i); _movieInfo.Fstr05_ZhuYan += child.text(); if (i != (chiledren.size()-1)) _movieInfo.Fstr05_ZhuYan += " "; } } { Element eleP_Type = _eleDiv.child(2); _movieInfo.Fstr06_Type = ""; Elements chiledren = eleP_Type.children(); for (int 
i=1; i<chiledren.size(); i++) { Element child = chiledren.get(i); _movieInfo.Fstr06_Type += child.text(); if (i != (chiledren.size()-1)) _movieInfo.Fstr06_Type += " "; } } { Element eleP_UpdateDateTime = _eleDiv.child(3); _movieInfo.Fstr07_UpdateDateTime = eleP_UpdateDateTime.ownText(); } { Element eleP_JuQing = _eleDiv.child(4); _movieInfo.Fstr08_JuQing = eleP_JuQing.ownText(); } { Element eleP_State = _eleDiv.child(5); String str = eleP_State.ownText(); char c = (char)160; str = str.replace(c, ' '); _movieInfo.Fstr09_State = str; } return 0; } }
4、
5、