爬虫获取数据
2017-08-01 10:21 sihao560 阅读(544) 评论(0) 编辑 收藏 举报
1.pom.xml文件
<!--
  Build descriptor for the crawler demo. Dependencies: logging (log4j/slf4j),
  crawling (WebCollector, selenium, phantomjsdriver), persistence (druid, mysql),
  Spring, Quartz and several JSON libraries.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>test01</groupId>
  <artifactId>test01</artifactId>
  <version>1.0</version>
  <packaging>jar</packaging>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!--
      The sources use try-with-resources (Java 7) and mysql-connector-java 6.x
      needs Java 8, so pin the compiler level instead of relying on the
      maven-compiler-plugin default.
    -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <target.version>1.0</target.version>
    <spring.version>4.2.3.RELEASE</spring.version>
    <quartz.version>1.8.6</quartz.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.5</version>
    </dependency>

    <!-- WebCollector dependency -->
    <dependency>
      <groupId>cn.edu.hfut.dmic.webcollector</groupId>
      <artifactId>WebCollector</artifactId>
      <version>2.09</version>
    </dependency>

    <!-- selenium -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>2.44.0</version>
    </dependency>

    <!-- phantomjsdriver (third-party selenium webdriver support) -->
    <dependency>
      <groupId>com.github.detro</groupId>
      <artifactId>phantomjsdriver</artifactId>
      <version>1.2.0</version>
    </dependency>

    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>druid</artifactId>
      <version>1.0.31</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>6.0.6</version>
    </dependency>

    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-context</artifactId>
      <version>${spring.version}</version>
      <exclusions>
        <!-- Exclude Commons Logging in favor of SLF4j -->
        <exclusion>
          <groupId>commons-logging</groupId>
          <artifactId>commons-logging</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

    <!-- jsonpath -->
    <dependency>
      <groupId>net.minidev</groupId>
      <artifactId>json-smart</artifactId>
      <version>2.2.1</version>
    </dependency>
    <dependency>
      <groupId>com.jayway.jsonpath</groupId>
      <artifactId>json-path</artifactId>
      <version>2.2.0</version>
    </dependency>

    <dependency><!-- this artifact does not exist in Spring 3.0.7 -->
      <groupId>org.springframework</groupId>
      <artifactId>spring-context-support</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-webmvc</artifactId>
      <version>${spring.version}</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-orm</artifactId>
      <version>${spring.version}</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-test</artifactId>
      <version>${spring.version}</version>
      <type>jar</type>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.quartz-scheduler</groupId>
      <artifactId>quartz</artifactId>
      <version>${quartz.version}</version>
    </dependency>
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.16.sec01</version>
    </dependency>
  </dependencies>

  <build>
    <finalName>test01</finalName>
  </build>
</project>
2.测试文件
package test01;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Template crawler: downloads the zhibo8 NBA rolling-news page through the
 * configured HTTP proxy and turns every list item into a key/value record.
 */
public class test {

    /**
     * Configures the JVM-wide HTTP/HTTPS proxy, runs the crawl once and prints
     * a marker when it is done.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("http.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("http.proxyPort", "8080");
        System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("https.proxyPort", "8080");
        getCountry();
        System.out.println(111);
    }

    /**
     * Template scraper for the news list.
     *
     * <p>Each returned map contains the keys {@code fromStation}, {@code channel},
     * {@code colImg}, {@code title}, {@code time}, {@code ReferenceSource} and
     * {@code newsURL}. The list is also printed to standard output.
     *
     * @return one map per list item; empty when the page cannot be fetched or
     *         does not contain the {@code boxlist} element
     */
    public static List<Map<String, Object>> getCountry() {
        List<Map<String, Object>> list = new ArrayList<>();
        try {
            Document doc = Jsoup
                    .connect("https://news.zhibo8.cc/nba/more.htm")
                    .timeout(3000)
                    .get();
            // getElementById returns null when the id is absent; treat that as "no items".
            Element box = doc.getElementById("boxlist");
            Elements items = (box == null) ? new Elements() : box.select("div.dataList ul li");
            for (Element item : items) {
                Map<String, Object> map = new HashMap<>();
                // Source site
                map.put("fromStation", "直播吧");
                // Crawled channel (previously stored under "fromStation", which
                // overwrote the site name above)
                map.put("channel", "NBA新闻滚动");
                // List thumbnail: none on this page
                map.put("colImg", "无");
                // Title
                map.put("title", item.select(".articleTitle a").html());
                // Publication time
                map.put("time", item.select(".postTime").html());
                // Referenced source
                map.put("ReferenceSource", item.select(".source").html());
                // Article URL
                map.put("newsURL", item.select(".articleTitle a").attr("href"));
                list.add(map);
            }
        } catch (IOException e) {
            // Best-effort crawl: report the failure and return what was collected.
            e.printStackTrace();
        }
        System.out.println(list);
        return list;
    }
}
package test01;

import java.io.IOException;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.suning.web.service.NewerService;
import com.suning.web.util.JDBCUtil;
import com.suning.web.util.JsonpUntil;

/**
 * Crawler for the ttplus.cn ("体坛+") sports site: list pages, article detail
 * pages, article comments and the 24-hour feed.
 *
 * <p>Every record is a {@code Map<String, Object>}. Multi-part article bodies are
 * flattened into one string whose parts (text or image URL) each end with
 * {@code "@/"}.
 */
public class SportsTest {

    public static JDBCUtil jdbcutil;
    public static NewerService newerService = new NewerService();

    private static final String SITE_URL = "http://www.ttplus.cn/";
    private static final String SOURCE_NAME = "体坛+";
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";
    private static final String PART_SEPARATOR = "@/";

    /**
     * Configures the JVM-wide HTTP/HTTPS proxy and crawls one sample article.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("http.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("http.proxyPort", "8080");
        System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("https.proxyPort", "8080");

        /*
        Runnable runnable1 = new Runnable() {
            public void run() {
                String[] keyword = {"day.html","interfb.html","innerfb.html","nba.html","cba.html","sports.html"};
                for (String key : keyword) {
                    getSportsList(key);
                }
            }
        };
        ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
        // 2nd argument: delay before the first run; 3rd argument: interval between runs
        service.scheduleAtFixedRate(runnable1, 0, 86400, TimeUnit.SECONDS);
        */

        // getSportsList("day.html");
        // Home page detail
        // getMainContent("http://resource.ttplus.cn/publish/app/data/2017/07/20/67522/share1.html");
        // News detail
        getSportContent("http://www.ttplus.cn/publish/app/data/2017/07/20/67559/share1.html");
        // getRealTime();
    }

    /**
     * Fetches the 24-hour feed and converts every entry into a list record.
     *
     * <p>The response used to be processed twice (once via JsonPath, once via
     * fastjson), which added every entry to the result twice; it is now processed
     * once. The raw response, its {@code type} field and the result are printed.
     *
     * @return one record per feed entry; empty when the request fails, returns
     *         404, or the response {@code type} is not {@code "success"}
     */
    @SuppressWarnings("unchecked")
    private static List<Map<String, Object>> getRealTime() {
        List<Map<String, Object>> list = new ArrayList<>();
        SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_PATTERN);
        String getUrl = "http://www.ttplus.cn/24h?lastid=";
        String key = "";
        try {
            // encode() returns null for a 404 response.
            StringWriter response = JsonpUntil.encode(getUrl, key);
            if (response != null) {
                String commentDe = response.toString();
                System.out.println(commentDe);
                Map<String, Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
                // Status of the returned data
                Object type = (parseData == null) ? null : parseData.get("type");
                System.out.println(type);
                if (type != null && "success".equals(type.toString())) {
                    for (Map<String, Object> comm : parseObjectList(parseData.get("content"))) {
                        list.add(buildRealTimeEntry(comm, formatter));
                    }
                }
            }
        } catch (Exception e) {
            // Best-effort crawl: report the failure and return what was collected.
            e.printStackTrace();
        }
        System.out.println(list);
        return list;
    }

    /** Builds the list record (with its nested article record) for one 24-hour feed entry. */
    private static Map<String, Object> buildRealTimeEntry(Map<String, Object> comm,
            SimpleDateFormat formatter) {
        String title = (String) comm.get("title");
        String author = (String) comm.get("authorName");
        String time = formatMillis(comm.get("newstime"), formatter);

        // Body: every image URL, then the text content, each terminated by "@/".
        StringBuilder detail = new StringBuilder();
        for (Map<String, Object> img : parseObjectList(comm.get("img"))) {
            detail.append((String) img.get("imgurl")).append(PART_SEPARATOR);
        }
        detail.append((String) comm.get("content")).append(PART_SEPARATOR);

        // Nested article record
        Map<String, Object> article = new HashMap<>();
        article.put("title", title);
        article.put("author", author);
        article.put("article_info", time);
        article.put("tags", "");
        article.put("detail", detail.toString());
        article.put("commentsList", new ArrayList<Map<String, Object>>());
        List<Map<String, Object>> commentsList = new ArrayList<>();
        commentsList.add(article);

        Map<String, Object> entry = new HashMap<>();
        entry.put("title", title);
        // Source site
        entry.put("fromStation", SOURCE_NAME);
        // Crawled channel
        entry.put("channel", "24H");
        entry.put("author", author);
        entry.put("time", time);
        entry.put("newsURL", "");
        entry.put("imgUrl", "");
        entry.put("commentsNumber", "");
        // Keyword, used to store channels separately
        entry.put("keyword", "");
        entry.put("commentsList", commentsList);
        return entry;
    }

    /**
     * Crawls one ttplus.cn list page: the carousel slides and the news list.
     *
     * <p>A fresh map is created for every item; previously one map was shared by
     * all carousel items and one by all list items, so every entry of the result
     * showed the data of the last item.
     *
     * @param val page name appended to the site URL, e.g. {@code "nba.html"}
     * @return carousel records (article under {@code detail}) followed by news
     *         list records (article under {@code commentsList}); empty when the
     *         page cannot be fetched
     */
    public static List<Map<String, Object>> getSportsList(String val) {
        List<Map<String, Object>> list = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(SITE_URL + val).timeout(3000).get();

            // Carousel: only slides linking to a published article are kept.
            for (Element li : doc.select("#swiper-wrapper .swiper-slide")) {
                String newsURL = li.select("a").attr("href");
                if (!newsURL.contains("http://resource.ttplus.cn/publish/app/data/")) {
                    continue;
                }
                Map<String, Object> slide = new HashMap<>();
                slide.put("title", li.select("a p").text());
                slide.put("fromStation", SOURCE_NAME);
                slide.put("channel", "首页滚动");
                slide.put("author", "");
                slide.put("time_info", "");
                // List thumbnail
                slide.put("imgUrl", li.select("a img").attr("src"));
                slide.put("commentsNumber", "");
                slide.put("keyword", "main");
                slide.put("newsURL", newsURL);
                // Carousel article detail
                slide.put("detail", getSportContent(newsURL));
                list.add(slide);
            }

            // News list: video entries are skipped.
            for (Element li : doc.select("#newsListBox #newsList li")) {
                String newsURL = li.select("a").attr("href");
                if (newsURL.contains("video.html")) {
                    continue;
                }
                // Spans in page order: author, time, comment count.
                Elements deta = li.select("a .newsBox-bd p span");
                Map<String, Object> item = new HashMap<>();
                item.put("title", li.select("a .newsBox-bd h3").text());
                item.put("fromStation", SOURCE_NAME);
                item.put("channel", "首页滚动");
                item.put("author", textAt(deta, 0));
                item.put("time", textAt(deta, 1));
                item.put("newsURL", newsURL);
                item.put("imgUrl", li.select("a .newsBox-hd img").attr("src"));
                item.put("commentsNumber", textAt(deta, 2));
                // Keyword, used to store channels separately
                item.put("keyword", val);
                item.put("commentsList", getSportContent(newsURL));
                list.add(item);
            }
        } catch (IOException e) {
            // Best-effort crawl: report the failure and return what was collected.
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Crawls one article page together with its comments.
     *
     * <p>The record holds {@code title}, {@code author}, {@code article_info},
     * {@code tags} (only when the header layout is recognised), {@code detail},
     * {@code commentsList} and, when there are comments, {@code commentNumber}.
     * Comment loading is best-effort: a failure there no longer discards the
     * article.
     *
     * @param newsURL absolute article URL
     * @return a list with the single article record, or an empty list when the
     *         page cannot be fetched or parsed
     */
    public static List<Map<String, Object>> getSportContent(String newsURL) {
        List<Map<String, Object>> list = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(newsURL).timeout(3000).get();
            Map<String, Object> map = new HashMap<>();
            StringBuilder detail = new StringBuilder();

            readHeader(doc, map, detail);
            appendBody(doc, detail);
            map.put("detail", detail.toString());

            List<Map<String, Object>> commentList = fetchComments(newsURL);
            if (!commentList.isEmpty()) {
                map.put("commentNumber", commentList.size());
            }
            map.put("commentsList", commentList);
            list.add(map);
        } catch (IOException | RuntimeException e) {
            // Callers rely on this method never throwing; report and return empty.
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Reads title, author, time and tags from the article header. The id of the
     * {@code #author_id h6} element selects one of four header layouts; for the
     * {@code pubtime4} layout the title image is also appended to {@code detail}.
     */
    private static void readHeader(Document doc, Map<String, Object> map, StringBuilder detail) {
        String authorSelector =
                "#author_id #authorMass .m-detail-source-cnt .m-detail-source-cnt-inner span";
        String pubtime = doc.select("#author_id h6").attr("id");
        // NOTE(review): the time text is prefixed with the CURRENT year, presumably
        // because the page shows no year; this is wrong for older articles. Confirm.
        int year = Calendar.getInstance().get(Calendar.YEAR);

        if ("pubtime3".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            map.put("author", doc.select(authorSelector).text());
            map.put("article_info", year + "-" + doc.select("#author_id #pubtime3 .pull-left").text());
            map.put("tags", doc.select("#author_id #pubtime3 .original").text());
        } else if ("pubtime1".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            map.put("author", doc.select(authorSelector).text());
            // NOTE(review): selects "#pubtime" although this layout's id is
            // "pubtime1"; kept as in the original. Confirm against the page.
            map.put("article_info", year + "-" + doc.select("#author_id #pubtime").text());
            map.put("tags", "");
        } else if ("pubtime".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            Elements spans = doc.select("#author_id #pubtime span");
            map.put("article_info", year + "-" + textAt(spans, 1));
            map.put("author", textAt(spans, 0));
            map.put("tags", "");
        } else if ("pubtime4".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            Elements spans = doc.select("#author_id #pubtime4 span");
            map.put("article_info", year + "-" + textAt(spans, 1));
            map.put("author", textAt(spans, 0));
            String tags = textAt(spans, 2);
            String tag = doc.select(".m-detail .m-detail-hd-ft .m-detail-type span").text();
            if (!tag.isEmpty()) {
                tags = tags + ";" + tag;
            }
            map.put("tags", tags);
            // Title image
            String titleImg = doc.select(".m-detail .m-detail-hd img").attr("src");
            if (!titleImg.isEmpty()) {
                detail.append(titleImg).append(PART_SEPARATOR);
            }
        }
    }

    /**
     * Appends one part per body paragraph: the paragraph text when it has no
     * image, otherwise the {@code <strong>} caption when present, otherwise the
     * image URL. (The original caption test was always true, so the image URL
     * was never recorded.)
     */
    private static void appendBody(Document doc, StringBuilder detail) {
        for (Element p : doc.select(".m-detail-bd p")) {
            String imgSrc = p.select("img").attr("src");
            String caption = p.select("strong").text();
            if (imgSrc.isEmpty()) {
                detail.append(p.text());
            } else if (!caption.isEmpty()) {
                detail.append(caption);
            } else {
                detail.append(imgSrc);
            }
            detail.append(PART_SEPARATOR);
        }
    }

    /**
     * Loads the comments of an article from the JSONP comment endpoint. The
     * article id is the tenth {@code '/'}-separated segment of the URL. The
     * unwrapped response is printed.
     *
     * @return one map per comment ({@code comment_user}, {@code comment_time},
     *         {@code comment_content}); empty on any failure
     */
    @SuppressWarnings("unchecked")
    private static List<Map<String, Object>> fetchComments(String newsURL) {
        List<Map<String, Object>> commentList = new ArrayList<>();
        String[] segments = newsURL.split("/");
        if (segments.length <= 9) {
            return commentList;
        }
        // Id of the current article
        String aid = segments[9];
        String getUrl = "http://app.ttplus.cn:1102/v2/commpent/news/www/" + aid + "/0";
        String key = "callback=callback_cmt&_=" + System.currentTimeMillis();
        try {
            // encode() returns null for a 404 response.
            StringWriter response = JsonpUntil.encode(getUrl, key);
            if (response == null) {
                return commentList;
            }
            String commentDe = unwrapJsonp(response.toString());
            System.out.println(commentDe);

            Map<String, Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
            if (parseData == null) {
                return commentList;
            }
            Object count = JSON.parse(String.valueOf(parseData.get("count")));
            if (!(count instanceof Number) || ((Number) count).intValue() <= 0) {
                return commentList;
            }
            SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_PATTERN);
            for (Map<String, Object> comm : parseObjectList(parseData.get("comment"))) {
                Map<String, Object> commentMap = new HashMap<>();
                // Commenter
                commentMap.put("comment_user", (String) comm.get("username"));
                // Comment time
                commentMap.put("comment_time", formatMillis(comm.get("time"), formatter));
                // Comment text
                commentMap.put("comment_content", (String) comm.get("content"));
                commentList.add(commentMap);
            }
        } catch (Exception e) {
            // Comments are optional; keep whatever was parsed so far.
            e.printStackTrace();
        }
        return commentList;
    }

    /**
     * Strips a JSONP wrapper such as {@code callback_cmt({...});} and returns the
     * payload between the first {@code '('} and the last {@code ')'}. Text that
     * already starts with a JSON object or array is returned unchanged.
     */
    private static String unwrapJsonp(String text) {
        String trimmed = text.trim();
        if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
            return trimmed;
        }
        int open = trimmed.indexOf('(');
        int close = trimmed.lastIndexOf(')');
        return (open >= 0 && close > open) ? trimmed.substring(open + 1, close) : trimmed;
    }

    /**
     * Re-parses a JSON value (already-parsed array or JSON text) as a list of
     * objects; returns an empty list for {@code null} or non-array values.
     */
    @SuppressWarnings("unchecked")
    private static List<Map<String, Object>> parseObjectList(Object raw) {
        if (raw == null) {
            return new ArrayList<>();
        }
        Object parsed = JSON.parse(raw.toString());
        if (parsed instanceof List) {
            return (List<Map<String, Object>>) parsed;
        }
        return new ArrayList<>();
    }

    /**
     * Formats a JSON number as a timestamp. JSON parsers return Integer or Long
     * depending on magnitude, so the value is read as a {@link Number}; anything
     * else yields an empty string.
     */
    private static String formatMillis(Object millis, SimpleDateFormat formatter) {
        if (!(millis instanceof Number)) {
            return "";
        }
        return formatter.format(new Date(((Number) millis).longValue()));
    }

    /** Returns the text of the element at {@code index}, or {@code ""} when there is none. */
    private static String textAt(Elements elements, int index) {
        return index < elements.size() ? elements.get(index).text() : "";
    }
}
package test01;

import java.io.IOException;
import java.io.StringWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
import com.suning.web.util.JsonpUntil;
import com.suning.web.util.StringUtil;

/**
 * Crawler for the OnFire basketball site (bbonfire.com): channel list pages,
 * article detail pages and article comments.
 *
 * <p>Every record is a {@code Map<String, Object>}. Article bodies are flattened
 * into one string whose parts (text or image URL) each end with {@code "@/"}.
 */
public class OnFiresTest {

    private static final String SITE_URL = "http://www.bbonfire.com";
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";
    private static final String PART_SEPARATOR = "@/";

    /**
     * Configures the JVM-wide HTTP/HTTPS proxy and crawls one sample article.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("http.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("http.proxyPort", "8080");
        System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("https.proxyPort", "8080");
        System.out.println("onfire");

        // Crawl the OnFire basketball app
        // Set aids = new HashSet();
        // getOnFireList(1, aids);
        getContent("http://www.bbonfire.com/news/detail?p=pc&aid=56374");

        /*
        Runnable runnable1 = new Runnable() {
            Set aids = new HashSet();
            public void run() {
                getOnFireList(1, aids);
                // System.out.println(aids);
            }
        };
        Runnable runnable2 = new Runnable() {
            Set aids = new HashSet();
            public void run() {
                getOnFireList(2, aids);
                getOnFireList(3, aids);
            }
        };
        ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
        // 2nd argument: delay before the first run; 3rd argument: interval between runs
        service.scheduleAtFixedRate(runnable1, 0, 1800, TimeUnit.SECONDS);
        service.scheduleAtFixedRate(runnable2, 0, 86400, TimeUnit.SECONDS);
        */
    }

    /**
     * Crawls one channel list page of the OnFire site.
     *
     * <p>Channel {@code 1} is "recommended" (meant to be crawled every 30 minutes),
     * {@code 2} is "columns" and any other value is "translations" (both meant to
     * be crawled every 24 hours). Articles whose id is already in {@code aids} are
     * skipped; ids of newly seen articles are added to it. Entries whose detail
     * page is not a text/image article are dropped. The result is printed.
     *
     * @param i    channel number sent as the {@code c} query parameter
     * @param aids ids of articles already stored; updated in place. {@code null}
     *             is treated as an empty set
     * @return one record per new article; empty when the page cannot be fetched
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static List<Map<String, Object>> getOnFireList(int i, Set aids) {
        List<Map<String, Object>> list = new ArrayList<>();
        Set seen = (aids != null) ? aids : new HashSet();
        try {
            Document doc = Jsoup
                    .connect(SITE_URL + "/news/index?c=" + i + "&p=pc")
                    .timeout(3000)
                    .get();
            for (Element item : doc.select(".news-list .news-item")) {
                String href = item.select(".news-title a").attr("href");
                // Id of the article in the source site's database
                String aid = StringUtil.getNumbers(href);
                // add() returns false when the id is already known.
                if (!seen.add(aid)) {
                    continue;
                }
                // Detail page; an empty result means "not a text/image article".
                List<Map<String, Object>> content = getContent(SITE_URL + href);
                if (content.isEmpty()) {
                    continue;
                }

                Map<String, Object> map = new HashMap<>();
                map.put("title", item.select(".news-title a").text());
                // Source site
                map.put("fromStation", "OnFire");
                // Crawled channel
                map.put("channel", channelName(i));
                map.put("author", "");
                map.put("time_info", item.select(".news-info .time-info").text());
                map.put("newsURL", SITE_URL + href);
                map.put("aid", aid);
                // Title image URL
                map.put("imgUrl", item.select(".news-thumb a img").attr("src"));
                // Comment count with the "评论" label and spaces removed
                map.put("commentsNumber", item.select(".news-rel .news-comment").text()
                        .replace("评论", "").replace(" ", ""));
                // Keyword, used to store channels separately
                map.put("keyword", "");
                map.put("content", content);
                list.add(map);
            }
        } catch (IOException e) {
            // Best-effort crawl: report the failure and return what was collected.
            e.printStackTrace();
        }
        System.out.println(list);
        return list;
    }

    /** Maps the channel number to its display name. */
    private static String channelName(int channel) {
        if (channel == 1) {
            return "推荐";
        }
        if (channel == 2) {
            return "专栏";
        }
        return "精译";
    }

    /**
     * Crawls one article detail page together with its comments.
     *
     * <p>Pages containing an {@code <embed wmode="transparent">} element are not
     * treated as text/image articles and yield an empty list. Comment loading is
     * best-effort: a failure there no longer discards the article. The result is
     * printed.
     *
     * @param contentUrl absolute URL of the detail page
     * @return a list with the single article record, or an empty list
     */
    private static List<Map<String, Object>> getContent(String contentUrl) {
        List<Map<String, Object>> list = new ArrayList<>();
        try {
            Document doc = Jsoup
                    .connect(contentUrl)
                    .timeout(3000)
                    .get();
            if (!"transparent".equals(doc.select("embed").attr("wmode"))) {
                Map<String, Object> map = new HashMap<>();
                map.put("title", doc.select(".article h1").text());
                map.put("article_info", normalizeArticleTime(doc.select(".article-info .time").text()));
                map.put("author", doc.select(".article-info .author").text());

                // Body: image URL for paragraphs with a lazy-loaded image, text otherwise.
                StringBuilder detail = new StringBuilder();
                for (Element p : doc.select(".article-content p")) {
                    String dataSrc = p.select("img").attr("data-src");
                    detail.append(dataSrc.isEmpty() ? p.text() : dataSrc).append(PART_SEPARATOR);
                }
                map.put("detail", detail.toString());

                // Tags, each followed by ';'
                StringBuilder tags = new StringBuilder();
                for (Element span : doc.select(".article-tag span")) {
                    tags.append(span.text()).append(';');
                }
                map.put("tags", tags.toString());

                // Comments are stored only when there is at least one.
                List<Map<String, Object>> commentList =
                        fetchComments(doc.select("#commentHTML").attr("data-articleid"));
                if (!commentList.isEmpty()) {
                    map.put("commentsList", commentList);
                    map.put("commentNumber", commentList.size());
                }
                list.add(map);
            }
        } catch (IOException | RuntimeException e) {
            // Callers treat an empty list as "skip this article".
            e.printStackTrace();
        }
        System.out.println(list);
        return list;
    }

    /**
     * Converts the page's {@code yyyy年MM月dd日 HH:mm} time into
     * {@code yyyy-MM-dd HH:mm:ss}; returns the raw text when it cannot be parsed.
     */
    private static String normalizeArticleTime(String raw) {
        try {
            SimpleDateFormat pageFormat = new SimpleDateFormat("yyyy年MM月dd日 HH:mm");
            return new SimpleDateFormat(TIMESTAMP_PATTERN).format(pageFormat.parse(raw));
        } catch (ParseException e) {
            return raw;
        }
    }

    /**
     * Loads the comments of an article from the site's list API.
     *
     * @param aid article id read from the page's {@code data-articleid} attribute
     * @return one map per comment ({@code comment_user}, {@code comment_time},
     *         {@code comment_content}); empty on any failure
     */
    @SuppressWarnings("unchecked")
    private static List<Map<String, Object>> fetchComments(String aid) {
        List<Map<String, Object>> commentList = new ArrayList<>();
        String getUrl = "http://www.bbonfire.com/api/list";
        String key = "p=comment&isjs=1&articleid=" + aid + "&len=15&hotlen=5";
        try {
            // encode() returns null for a 404 response.
            StringWriter response = JsonpUntil.encode(getUrl, key);
            if (response == null) {
                return commentList;
            }
            Map<String, Object> parseData = (Map<String, Object>) JSON.parse(response.toString());
            Object data = (parseData == null) ? null : parseData.get("data");
            if (data == null) {
                return commentList;
            }
            List<Map<String, Object>> pData = (List<Map<String, Object>>) JSON.parse(data.toString());

            SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_PATTERN);
            // ctime is parsed with the pattern "EEE MMM dd HH:mm:ss Z yyyy" (US locale).
            SimpleDateFormat ctimeFormat =
                    new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.US);
            for (Map<String, Object> comm : pData) {
                Map<String, Object> commentMap = new HashMap<>();
                // Commenter
                Map<String, Object> user = (Map<String, Object>) comm.get("userInfo");
                commentMap.put("comment_user", user.get("screen_name").toString());
                // Comment time
                commentMap.put("comment_time",
                        formatter.format(ctimeFormat.parse((String) comm.get("ctime"))));
                // Comment text
                commentMap.put("comment_content", (String) comm.get("content"));
                commentList.add(commentMap);
            }
        } catch (Exception e) {
            // Comments are optional; keep whatever was parsed so far.
            e.printStackTrace();
        }
        return commentList;
    }
}
3.ajax请求
package com.suning.web.util;

import java.io.StringWriter;

import org.apache.commons.codec.Charsets;
import org.apache.commons.io.output.WriterOutputStream;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;

/**
 * Helper that performs an HTTP GET through the fixed proxy and returns the
 * response body as UTF-8 text. Used for the sites' AJAX/JSONP endpoints.
 */
public class JsonpUntil {

    private static final String PROXY_HOST = "10.19.110.55";
    private static final int PROXY_PORT = 8080;
    private static final int STATUS_NOT_FOUND = 404;

    // NOTE(review): hard-coded session cookie containing what look like auth
    // tokens (authId, secureToken). It is sent to every URL passed to encode(),
    // including third-party hosts. Confirm whether it is needed; if so, load it
    // from configuration instead of source code.
    private static final String COOKIE =
            "_snma=1%7C149567342565754882%7C1495673425657%7C1495673446005%7C1495714227730%7C3%7C3; idsLoginUserIdLastTime=16030136; authId=si9343022161FCD46A3745D6F3A1BCB180; secureToken=5E769A7ADD32F1977AC2104266C010F3";

    /**
     * Sends a GET request and returns the response body.
     *
     * <p>The client is now always shut down so its connection is released, the
     * 404 check uses the status code instead of the exact status-line text, and
     * a response without a body yields an empty writer instead of a
     * {@code NullPointerException}.
     *
     * @param url request URL
     * @param key query string appended as {@code url + "?" + key}; ignored when
     *            {@code null} or empty
     * @return the body decoded as UTF-8, or {@code null} when the server answers
     *         404 (the URL and a notice are printed in that case)
     * @throws Exception when the request cannot be executed or the body cannot be read
     */
    public static StringWriter encode(String url, String key) throws Exception {
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpHost proxy = new HttpHost(PROXY_HOST, PROXY_PORT);
            httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);

            if (key != null && !"".equals(key)) {
                url = url + "?" + key;
            }
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader(new BasicHeader("Cookie", COOKIE));

            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == STATUS_NOT_FOUND) {
                System.out.println(url);
                System.out.println("此条信息异常!");
                return null;
            }

            StringWriter sw = new StringWriter();
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Closing the stream flushes the decoded characters into sw.
                try (WriterOutputStream out = new WriterOutputStream(sw, Charsets.UTF_8)) {
                    entity.writeTo(out);
                }
            }
            return sw;
        } finally {
            // One client per call: release its connection on every path.
            httpClient.getConnectionManager().shutdown();
        }
    }
}