抓网页_面包网_javaSE

1、http://ajax.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-1.html

2、每一个 <li/>里面的内容 分类:

  2.1、详细页面 链接

  2.2、图片  (图片的地址 有些不对,需要做一些处理)

  2.3、评分

  2.4、影片名称

  2.5、主演  (这里有一个HTML字符“&middot;”)(HTML转义字符对照表 http://tool.oschina.net/commons?type=2)

  2.6、类型

  2.7、更新时间

  2.8、剧情

  2.9、状态

3、代码:

  3.1、测试类

package test;

import mianBao.TmovieMianBao;
import z_utils.TzHttpClient;

/**
 * Manual test driver: downloads one listing page and feeds the response body
 * to {@link TmovieMianBao} for parsing.
 */
public class Ttest01
{
    /**
     * Fetches page 4 of the listing and parses it.
     *
     * @param args unused
     * @throws Exception whatever the HTTP helper or the parser throws
     */
    public static void main(String[] args) throws Exception
    {
        String strHtml = TzHttpClient.GetZ("http://ajax.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-4.html");
        if (strHtml == null)
        {
            // GetZ returns null (after printing its own message) when the
            // response status is not 200; there is nothing to parse then.
            return;
        }

        TmovieMianBao movie = new TmovieMianBao();
        movie.FstrJson = strHtml;
        int iRtn = movie.JsonZ();
        if (iRtn != 0)
            System.out.println("JsonZ failed : " + iRtn);
    }

}

 

  3.2、TzHttpClient

package z_utils;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

/**
 * Small static helpers for issuing HTTP GET / POST requests with Apache
 * HttpClient and returning the response body as a String.
 */
@SuppressWarnings("deprecation")
public class TzHttpClient
{
    // Charset used to decode a response body when the server does not declare
    // one. Without it EntityUtils falls back to ISO-8859-1, which garbles
    // Chinese text.
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Manual smoke test: POSTs (without a body) to one listing page and prints
     * whatever comes back.
     */
    public static void main(String[] args) throws Exception
    {
        String strRtn = PostZ(
            "http://ajax.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-2.html",
            null,
            true);
        System.out.println(strRtn);
    }
    
// ***

    /**
     * Sends a POST request.
     *
     * @param _strUrl         target URL
     * @param _strParam       request body sent as UTF-8 "application/json"; null sends no body
     * @param _bNeedResponse  when false the response body is not read and null is returned
     * @return the response body, or null when the status is not 200, when the
     *         body was not requested, or when the response carries no entity
     * @throws Exception on any I/O or protocol error raised by HttpClient
     */
    public static String PostZ(String _strUrl, String _strParam, boolean _bNeedResponse) throws Exception
    {
        DefaultHttpClient httpClient = null;
        try
        {
            httpClient = new DefaultHttpClient();
            HttpPost method = new HttpPost(_strUrl);
            if (null != _strParam)
            {
                // Encode the body as UTF-8 so non-ASCII (Chinese) text is not garbled.
                StringEntity entity = new StringEntity(_strParam, "utf-8");
                // NOTE(review): this sets the Content-Encoding header, which normally
                // names a compression coding rather than a charset. Left unchanged;
                // confirm the server actually needs it.
                entity.setContentEncoding("UTF-8");
                entity.setContentType("application/json");
                method.setEntity(entity);
            }
            HttpResponse result = httpClient.execute(method);
            if (result.getStatusLine().getStatusCode() != HttpStatus.SC_OK)
            {
                // Same failure reporting as GetZ, so a failed POST is not silent.
                System.out.println("post请求提交失败:" + _strUrl);
                return null;
            }
            if (! _bNeedResponse)
                return null;
            return ReadBody(result);
        }
        finally
        {
            if (httpClient != null)
                httpClient.close();
        }
    }

    /**
     * Sends a GET request.
     *
     * @param _strUrl target URL
     * @return the response body, or null when the status is not 200 or the
     *         response carries no entity
     * @throws Exception on any I/O or protocol error raised by HttpClient
     */
    public static String GetZ(String _strUrl) throws Exception
    {
        DefaultHttpClient client = null;
        try
        {
            client = new DefaultHttpClient();
            HttpGet request = new HttpGet(_strUrl);
            HttpResponse response = client.execute(request);
            if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK)
            {
                System.out.println("get请求提交失败:" + _strUrl);
                return null;
            }
            return ReadBody(response);
        }
        finally
        {
            if (client != null)
                client.close();
        }
    }

    /**
     * Reads the response entity into a String, decoding with the charset the
     * server declared and falling back to UTF-8 when it declared none.
     *
     * @return the body text, or null when the response has no entity
     */
    private static String ReadBody(HttpResponse _response) throws Exception
    {
        if (_response.getEntity() == null)
            return null;
        return EntityUtils.toString(_response.getEntity(), DEFAULT_CHARSET);
    }
}

 

  3.3、面包网电影信息

package mianBao;

/**
 * Information about one movie on the MianBao site.
 * A plain holder: every field is a public String that is null until assigned.
 */
public class TinfoMianBaoMovie
{
    /** Link to the movie's "details" page. */
    public String Fstr01_XiangQingLink;

    /** Link to the picture. */
    public String Fstr02_PicLink;

    /** The site's own score for this movie. */
    public String Fstr03_Score;

    /** Movie name. */
    public String Fstr04_MovieName;

    /** Name(s) of the leading actor(s). */
    public String Fstr05_ZhuYan;

    /** Movie type(s). */
    public String Fstr06_Type;

    /**
     * Update time. This is when the site updated the movie, not the release
     * date (that has to be looked up on the movie's "details" page).
     */
    public String Fstr07_UpdateDateTime;

    /** Plot summary (possibly incomplete). */
    public String Fstr08_JuQing;

    /** State of the movie (no address yet / BD / HD / TC, etc.). */
    public String Fstr09_State;
}

 

  3.4、面包网 电影信息 获取

package mianBao;

import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;

import net.sf.json.JSONObject;

/**
 * Parses the JSON payload of a MianBao listing request.
 *
 * The payload is expected to carry two HTML fragments: "pagesx" (paging label,
 * text of the form "current/total") and "ajaxtxt" (one element per movie).
 * Usage: assign {@link #FstrJson}, call {@link #JsonZ()}, then read
 * {@link #FstrPageIdxNow}, {@link #FstrPageIdxTotal} and {@link #FlistMovieInfo}.
 */
public class TmovieMianBao
{
    // Site search (search page)
    // http://find.mianbao99.com/vod-search
    // Site-search URL (the trailing "AAA" is the search text)
    // http://zhannei.baidu.com/cse/search?s=15458377474113538507&isNeedCheckDomain=1&jump=1&q=AAA
    // ZC question: does Baidu offer its search engine as a service to other sites? Is "s" what tells the customers apart?
    //    Related tutorial: http://jingyan.baidu.com/article/7908e85c8f0c36af481ad22d.html
    
    // Action:         http://www.mianbao99.com/vod-showlist-id-8-order-time-c-3719-p-1.html
    // Comedy:         http://www.mianbao99.com/vod-showlist-id-9-order-time-c-4058-p-1.html
    // Romance:        http://www.mianbao99.com/vod-showlist-id-10-order-time-c-2059-p-1.html
    // Sci-fi:         http://www.mianbao99.com/vod-showlist-id-11-order-time-c-1098-p-1.html
    // Horror:         http://www.mianbao99.com/vod-showlist-id-12-order-time-c-3110-p-1.html
    // War:            http://www.mianbao99.com/vod-showlist-id-13-order-time-c-736-p-1.html
    // Drama:          http://www.mianbao99.com/vod-showlist-id-14-order-time-c-8713-p-1.html
    // Documentary:    http://www.mianbao99.com/vod-showlist-id-23-order-time-c-1549-p-1.html
    // Cantonese:      http://www.mianbao99.com/vod-showlist-id-27-order-time-c-208-p-1.html
    // Animated films: http://www.mianbao99.com/vod-showlist-id-31-order-time-c-658-p-1.html
        
    public static void main(String[] args)
    {
    }
    
// ***

    /** Raw JSON text to parse; must be assigned before calling {@link #JsonZ()}. */
    public String FstrJson = null;
    /** Value of the "pagesx" key, filled by {@link #JsonZ()}. */
    String FstrPagesx = null;
    /** Value of the "ajaxtxt" key, filled by {@link #JsonZ()}. */
    String FstrAjaxtxt = null;
    
    /**
     * Parses {@link #FstrJson} and runs both sub-parsers.
     *
     * Both sub-parsers are always run, so the movie list is still filled when
     * only the paging label is malformed.
     * NOTE(review): malformed JSON is not caught here; whatever the JSON
     * library throws propagates to the caller.
     *
     * @return 0 on success; -1 when FstrJson is null, when either the
     *         "ajaxtxt" or the "pagesx" key is missing, or when either
     *         sub-parser reports a failure
     */
    public int JsonZ()
    {
        if (FstrJson == null)
            return -1;

        JSONObject jsonObj = JSONObject.fromObject(FstrJson);
        // Both keys are read below, so both must be present.
        if ( (! jsonObj.containsKey("ajaxtxt")) || (! jsonObj.containsKey("pagesx")) )
            return -1;
        
        FstrPagesx = jsonObj.getString("pagesx");
        FstrAjaxtxt = jsonObj.getString("ajaxtxt");

        int iRtnPages = Json_pagesx();
        int iRtnMovies = Json_ajaxtxt();
        return ( (iRtnPages == 0) && (iRtnMovies == 0) ) ? 0 : -1;
    }
    
    /** Index of the current page (text before the '/' in the paging label). */
    public String FstrPageIdxNow = null;
    /** Total number of pages (text after the '/' in the paging label). */
    public String FstrPageIdxTotal = null;
    
    /**
     * Splits the text of the first element of {@link #FstrPagesx} at '/' into
     * {@link #FstrPageIdxNow} and {@link #FstrPageIdxTotal}.
     *
     * @return 0 on success; -1 (with both fields reset to null) when the
     *         fragment is null, has no element, or its text contains no '/'
     */
    public int Json_pagesx()
    {
        // Reset first so a failed parse never leaves values from an earlier page.
        FstrPageIdxNow = null;
        FstrPageIdxTotal = null;
        if (FstrPagesx == null)
            return -1;

        Document doc = Jsoup.parse(FstrPagesx, "", Parser.xmlParser());
        Elements children = doc.children();
        if (children.size() == 0)
            return -1;

        String str = children.get(0).text();
        int iIdx = str.indexOf('/');
        if (iIdx < 0)
            return -1;

        FstrPageIdxNow = str.substring(0, iIdx);
        FstrPageIdxTotal = str.substring(iIdx+1);
        return 0;
    }
    
    /** Movies parsed by the last {@link #Json_ajaxtxt()} call; null until the first successful parse. */
    public ArrayList<TinfoMianBaoMovie> FlistMovieInfo = null;
    
    /**
     * Parses {@link #FstrAjaxtxt} into {@link #FlistMovieInfo}.
     *
     * Every top-level element is expected to have exactly two children: an
     * anchor (link / picture / score) followed by a div (name, cast, ...).
     * Elements that do not match are reported on stdout and skipped, so the
     * list only ever contains fully parsed entries.
     *
     * @return 0 when the fragment contained at least one top-level element,
     *         -1 when it is null or empty
     */
    public int Json_ajaxtxt()
    {
        // Drop results of a previous call up front so the early returns below
        // cannot leave stale entries behind.
        if (FlistMovieInfo != null)
            FlistMovieInfo.clear();

        if (FstrAjaxtxt == null)
            return -1;

        Document doc = Jsoup.parse(FstrAjaxtxt, "", Parser.xmlParser());
        Elements children = doc.children();
        if (children.size() == 0)
        {
            System.out.println("Document has no child .");
            return -1;
        }
        
        if (FlistMovieInfo == null)
            FlistMovieInfo = new ArrayList<TinfoMianBaoMovie>();

        for (int i=0; i<children.size(); i++)
        {
            Elements children01 = children.get(i).children();
            if (children01.size() != 2)
            {
                System.out.println("<li>["+i+"] children's num is not 2 : "+children01.size()+" .");
                continue;
            }
            
            TinfoMianBaoMovie movieInfo = new TinfoMianBaoMovie();
            Info1_3(movieInfo, children01.get(0));
            if (Info4_9(movieInfo, children01.get(1)) != 0)
            {
                System.out.println("<li>["+i+"] has an unexpected <div> layout .");
                continue;
            }
            FlistMovieInfo.add(movieInfo);
        }
        return 0;
    }
    
    
    /**
     * Fills fields 01..03 (details link, picture link, score) from the anchor.
     *
     * The first child is expected to be the image (its "src" is taken) and the
     * second the score label. A missing child leaves the corresponding field
     * untouched instead of throwing.
     *
     * @return always 0
     */
    public static int Info1_3(TinfoMianBaoMovie _movieInfo, Element _eleA)
    {
        _movieInfo.Fstr01_XiangQingLink = _eleA.attr("href");

        // Element.child(n) throws when the child is missing, so test the size
        // instead of null-checking its result.
        Elements children = _eleA.children();
        if (children.size() > 0)
        {
            _movieInfo.Fstr02_PicLink = children.get(0).attr("src");
            // The picture link needs its host replaced.
            PicLink(_movieInfo);
        }
        if (children.size() > 1)
            _movieInfo.Fstr03_Score = children.get(1).text();
        return 0;
    }
    
    /**
     * Replaces the host part of {@code Fstr02_PicLink} with
     * "img.mianbao99.com:88", keeping everything up to and including "//" and
     * everything from the first '/' after the host.
     * Example result: http://img.mianbao99.com:88/thumb/580360837a4e0.jpg
     *
     * @return 0 when the link was rewritten; -1 (link left unchanged) when it
     *         is null, contains no "//", or has no '/' after the host
     */
    public static int PicLink(TinfoMianBaoMovie _movieInfo)
    {
        String strUrl = "img.mianbao99.com:88";

        String strLink = _movieInfo.Fstr02_PicLink;
        if (strLink == null)
            return -1;

        int iIdx01 = strLink.indexOf("//");
        if (iIdx01 < 0)
            return -1;    // no "scheme://" part, so there is no host to replace
        int iIdx02 = strLink.indexOf('/', iIdx01+2);
        if (iIdx02 < 0)
            return -1;    // host only, no path

        String strPrefix = strLink.substring(0, iIdx01+2);
        String strPath = strLink.substring(iIdx02);
        _movieInfo.Fstr02_PicLink = strPrefix + strUrl + strPath;
        return 0;
    }
    
    /**
     * Fills fields 04..09 from the div. Its children are read by position:
     * 0 = heading whose first child holds the movie name, 1 = cast,
     * 2 = types, 3 = update time, 4 = plot, 5 = state.
     *
     * @return 0 on success; -1 (no field touched) when the div has fewer than
     *         six children or the heading has no child
     */
    public static int Info4_9(TinfoMianBaoMovie _movieInfo, Element _eleDiv)
    {
        // Validate the layout before writing anything, so a failure never
        // leaves a half-filled record.
        Elements children = _eleDiv.children();
        if (children.size() < 6)
            return -1;
        Elements childrenH5 = children.get(0).children();
        if (childrenH5.size() == 0)
            return -1;

        _movieInfo.Fstr04_MovieName = childrenH5.get(0).text();
        _movieInfo.Fstr05_ZhuYan = JoinTextSkipFirst(children.get(1));
        _movieInfo.Fstr06_Type = JoinTextSkipFirst(children.get(2));
        _movieInfo.Fstr07_UpdateDateTime = children.get(3).ownText();
        _movieInfo.Fstr08_JuQing = children.get(4).ownText();
        // U+00A0 (no-break space) is normalised to an ordinary space.
        _movieInfo.Fstr09_State = children.get(5).ownText().replace('\u00A0', ' ');
        return 0;
    }

    /**
     * Joins the text of every child element except the first with single
     * spaces. The first child is skipped (presumably it is the field's label;
     * verify against the page markup).
     */
    private static String JoinTextSkipFirst(Element _eleP)
    {
        Elements children = _eleP.children();
        StringBuilder sb = new StringBuilder();
        for (int i=1; i<children.size(); i++)
        {
            if (i > 1)
                sb.append(' ');
            sb.append(children.get(i).text());
        }
        return sb.toString();
    }

}

 

4、

5、

 

posted @ 2016-10-18 11:47  CodeHouse  阅读(542)  评论(0编辑  收藏  举报