Java Jsoup 抓取页面数据

List<ImageBean> imgList = new ArrayList<ImageBean>();
        ImageBean image = null;
        String imageTime = "";
        String imageName = "";
        String url = "";
        // Each map entry pairs a page URL (key) with a delimiter-separated config string (value).
        // For every page: fetch it, scan all <script> elements with the configured regex,
        // parse each captured group as JSON and turn it into one ImageBean.
        for (Map.Entry<String, String> entry : map.entrySet()) {
            try {
                Document doc = Jsoup.connect(entry.getKey()).get();
                Elements scripts = doc.select("script");

                // Config fields used below, by index after splitting on this.split:
                //   [1] prefix prepended to the image path
                //   [3] regex whose capture group 1 is parsed as a JSON object
                //   [4] JSON key whose value is the image path
                //   [5] value passed to setImageType
                String[] datas = entry.getValue().split(this.split);

                // Compile once per page: the pattern does not depend on the individual script element.
                Pattern imagePattern = Pattern.compile(datas[3]);

                for (Element script : scripts) {
                    Matcher m = imagePattern.matcher(script.html());
                    while (m.find()) {
                        JSONObject obj = JSONObject.parseObject(m.group(1));
                        String imagePath = obj.getString(datas[4]);
                        if (imagePath == null) {
                            // The matched JSON has no image path; skip it instead of
                            // producing a URL that ends in the literal text "null".
                            continue;
                        }
                        url = datas[1] + imagePath;
                        imageTime = getImageTime(url);

                        image = new ImageBean();
                        image.setUrl(url);
                        // The bean's name is whatever getImageTime derives from the URL.
                        image.setName(imageTime);
                        // NOTE(review): this stores the regex (datas[3]) as the type, which looks
                        // unintended; datas[2] or datas[0] may have been meant. Confirm before changing.
                        image.setType(datas[3]);
                        image.setImageType(datas[5]);
                        imgList.add(image);
                    }
                }
            } catch (IOException e) {
                // Best effort: a page that cannot be fetched is skipped, but report which one.
                System.err.println("Failed to fetch page: " + entry.getKey());
                e.printStackTrace();
            }
        }
<!-- Key: URL of the page to fetch. Value: config fields joined by '~'
     (presumably the delimiter held in this.split of the Java code; confirm).
     Field order: label ~ image URL prefix ~ type ~ regex whose group 1 captures the
     JSON object passed to data.push(...) ~ JSON key of the image path ~ image type id.
     NOTE(review): fields 0 and 2 are not read by the Java fragment shown with this entry. -->
<entry key="http://www.nmc.cn/publish/nwp/t639/ea/500hPa-hgt.html">
                    <value>高度场~http://image.nmc.cn~type~data.push\((\{*.*?\})\)~img_path~nmc_fore_t639_hgt</value>
                </entry>

 

posted @ 2016-12-08 18:10  JackGIS  阅读(1405)  评论(0)  编辑  收藏  举报