Java爬虫入门--用jsoup爬取汽车之家的新闻
概述
使用jsoup来进行网页数据爬取。jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
详细
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
jsoup的主要功能如下:
1. 从一个URL,文件或字符串中解析HTML;
2. 使用DOM或CSS选择器来查找、取出数据;
3. 可操作HTML元素、属性、文本。
jsoup是基于MIT协议发布的,可放心使用于商业项目。
第一步:项目预览
第二步:代码实现
主程序为GrapNews类,实现了从汽车网摘取相关内容的功能。GrapNews有main函数,执行即可。
package net.sinolbs.ycd.news;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo crawler built on jsoup: downloads list pages from a few sites, follows each
 * entry to its detail page, cleans the article HTML and wraps it in a {@link News}.
 *
 * <p>All crawl methods are best-effort: any exception is printed and the items
 * collected so far are returned.
 */
public class GrapNews {

    /** Matches one character in the CJK range used by the original check. */
    private static final Pattern CHINESE_CHAR = Pattern.compile("[\u4e00-\u9fa5]");

    /** Connect/read timeout passed to every jsoup request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    private static final String PAGE_HEAD =
            "<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">"
            + "</head><body>";

    private static final String PAGE_TAIL = "</body></html>";

    /**
     * Tells whether the string contains at least one Chinese character.
     *
     * @param str text to inspect; {@code null} is treated as containing none
     * @return {@code true} if a character in the checked range is present
     */
    public static boolean isContainChinese(String str) {
        if (str == null) {
            return false;
        }
        Matcher m = CHINESE_CHAR.matcher(str);
        return m.find();
    }

    /**
     * Crawls jokes from a list page: every {@code <li>} under the first element with the
     * given class is followed to its detail page, whose content element is stripped of
     * images, links and bracketed text.
     *
     * @param size                 stop after this many items have been collected
     * @param baseUrl              URL of the list page
     * @param domainName           prefix prepended to each entry's {@code href}
     * @param newsListClassOrId    class name of the element holding the list
     * @param newsULIndex          not used by this implementation
     * @param newsContentClassOrId id of the content element on the detail page
     * @param titleTagOrClass      tag name, looked up inside the entry's first link, whose text is the title
     * @param dateTag              when non-null, the text of the entry's {@code <i>} elements is stored as
     *                             the date (NOTE(review): the value itself is not used as a selector)
     * @return the collected items; empty when nothing could be read
     */
    public static ArrayList<News> getNewsFromJokeji(int size, String baseUrl, String domainName,
            String newsListClassOrId, int newsULIndex, String newsContentClassOrId,
            String titleTagOrClass, String dateTag) {
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).timeout(TIMEOUT_MILLIS).get();
            Element listRoot = doc.getElementsByClass(newsListClassOrId).first();
            if (listRoot == null) {
                return newsList;
            }
            for (Element ele : listRoot.getElementsByTag("li")) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                News news = new News();
                news.setTitle(title.getElementsByTag(titleTagOrClass).text());
                if (news.getTitle() == null || news.getTitle().equals("")) {
                    continue;
                }
                news.setHref(domainName + title.attr("href"));
                if (dateTag != null) {
                    news.setDate(ele.select("i").text());
                }
                Document newsDoc = Jsoup.connect(encodeIfChinese(news.getHref()))
                        .timeout(TIMEOUT_MILLIS).get();
                Element contentElement = newsDoc.getElementById(newsContentClassOrId);
                if (contentElement == null) {
                    // Detail page without the expected container: skip it, keep crawling.
                    continue;
                }
                String text = deleteA(deleteImg(contentElement.html()));
                news.setContent(wrapInHtmlPage(deleteSource(text)));
                printNews(news);
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }

    /**
     * Crawls news from the autohome mobile site: each child of the list element is followed
     * to its detail page; articles containing a "no reprint"-style marker are skipped.
     *
     * @param size             stop after this many items have been collected
     * @param baseUrl          URL of the list page
     * @param domainName       prefix prepended to each entry's {@code href}
     * @param newsListId       id of the element whose children are the entries
     * @param newsContentClass class name of the content element(s) on the detail page
     * @param titleTag         tag name, looked up inside the entry's first link, whose text is the title
     * @param dateTag          when non-null, the text of the entry's {@code <i>} elements is stored as
     *                         the date (NOTE(review): the value itself is not used as a selector)
     * @param needDeleteAlt    {@code alt} value to strip from images, or {@code null}
     * @return the collected items; empty when nothing could be read
     */
    public static ArrayList<News> getNewsFromCarHome(int size, String baseUrl, String domainName,
            String newsListId, String newsContentClass, String titleTag, String dateTag,
            String needDeleteAlt) {
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).timeout(TIMEOUT_MILLIS).get();
            Element listRoot = doc.getElementById(newsListId);
            if (listRoot == null) {
                return newsList;
            }
            for (Element ele : listRoot.children()) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                News news = new News();
                news.setTitle(title.getElementsByTag(titleTag).text());
                if (news.getTitle() == null || news.getTitle().equals("")) {
                    continue;
                }
                news.setHref(domainName + title.attr("href"));
                if (dateTag != null) {
                    news.setDate(ele.select("i").text());
                }
                Document newsDoc = Jsoup.connect(encodeIfChinese(news.getHref()))
                        .timeout(TIMEOUT_MILLIS).get();
                String text = newsDoc.getElementsByClass(newsContentClass).html();
                // contains() rather than indexOf() > 0, so a marker at position 0 also counts.
                if (containsAny(text, "余下全文", "未经许可", "禁止转载", "公众号", "公众账号")) {
                    continue;
                }
                text = replaceImgSrcFromDataSrc(text, true, needDeleteAlt);
                // Drop everything from the last "(" onwards.
                int index = text.lastIndexOf("(");
                if (index > 0) {
                    text = text.substring(0, index);
                }
                news.setContent(wrapInHtmlPage(deleteSource(text)));
                printNews(news);
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }

    /**
     * Downloads a page and extracts the video URL from its HTML.
     *
     * @param baseUrl URL of the video detail page
     * @return the extracted URL, or {@code null} if the page does not contain the markers
     * @throws Exception if the page cannot be downloaded
     */
    public static String getVideoFromMiaoPai(String baseUrl) throws Exception {
        Document doc = Jsoup.connect(baseUrl).timeout(TIMEOUT_MILLIS).get();
        String html = doc.html().trim();
        return getUrlFromMiaoPaiHtml(html);
    }

    /**
     * Extracts the video URL located between the {@code videoSrc} and {@code poster}
     * markers of the page source.
     *
     * @param html page source
     * @return the text starting 11 characters after {@code videoSrc}, cut at the first
     *         double quote; {@code null} when either marker is missing or out of order
     */
    public static String getUrlFromMiaoPaiHtml(String html) {
        if (html == null) {
            return null;
        }
        int startIndex = html.indexOf("videoSrc");
        int endIndex = html.indexOf("poster");
        if (startIndex < 0 || endIndex < 0 || startIndex + 11 > endIndex + 5) {
            return null;
        }
        String videoUrl = html.substring(startIndex + 11, endIndex + 5);
        int index = videoUrl.indexOf('"');
        if (index > 0) {
            return videoUrl.substring(0, index);
        }
        return videoUrl;
    }

    /**
     * Extracts the cover image URL: the text from 9 characters after the {@code poster}
     * marker up to and including the next {@code jpg}.
     *
     * @param html page source (also echoed to standard output)
     * @return the image URL, or {@code null} when the markers are missing
     */
    public static String getVideoPhotoFromMiaoPaiHtml(String html) {
        System.out.println(html);
        if (html == null) {
            return null;
        }
        int startIndex = html.indexOf("poster");
        if (startIndex < 0) {
            return null;
        }
        int jpgIndex = html.indexOf("jpg", startIndex);
        if (jpgIndex < 0 || startIndex + 9 > jpgIndex + 3) {
            return null;
        }
        return html.substring(startIndex + 9, jpgIndex + 3);
    }

    public static void main(String[] args) throws Exception {
        getNewsFromCarHome(2, "http://m.autohome.com.cn/channel", "http://m.autohome.com.cn",
                "list", "details", "h4", "time", "汽车之家");
        getNewsFromJokeji(3, "http://www.jokeji.cn/list.htm", "http://www.jokeji.cn",
                "list_title", 1, "text110", "a", "i");
        getNewsFromSouHu(20, "http://m.sohu.com/c/1592/", "a", null, null);
    }

    /**
     * Crawls videos from a miaopai list page: for every {@code videoCont} element the
     * player's id and cover are read, the detail page is downloaded, and a small HTML
     * page embedding the video is stored as the item content.
     *
     * @param size    stop after this many items have been collected
     * @param baseUrl URL of the list page
     * @return the collected items; empty when nothing could be read
     */
    public static ArrayList<News> getVideoFromMiaopai(int size, String baseUrl) {
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document listDoc = Jsoup.connect(baseUrl).timeout(TIMEOUT_MILLIS).get();
            for (Element ele : listDoc.getElementsByClass("videoCont")) {
                Element player = ele.getElementsByClass("MIAOPAI_player").first();
                if (player == null) {
                    continue;
                }
                String videoId = player.attr("data-scid");
                String videoPhotoUrl = player.attr("data-img");
                Element about = ele.getElementsByClass("viedoAbout").first();
                String videoTitle = about == null ? "" : about.getElementsByTag("p").text();
                System.out.println("视频id" + videoId);
                System.out.println("视频封面url" + videoPhotoUrl);
                System.out.println("视频标题" + videoTitle);
                // attr() returns "" (never null) for a missing attribute.
                if (!isNotBlank(videoId)) {
                    continue;
                }
                String videoDetailUrl = "http://www.miaopai.com/show/" + videoId + ".html";
                Document detailDoc = Jsoup.connect(videoDetailUrl).timeout(TIMEOUT_MILLIS).get();
                System.out.println("视频详情url========" + videoDetailUrl);
                // Per-iteration local, so a previous entry's URL can never leak into this one.
                String videoUrl = getUrlFromMiaoPaiHtml(detailDoc.html());
                if (videoUrl == null) {
                    continue;
                }
                News news = new News();
                news.setTitle(videoTitle);
                news.setHref("http://m.miaopai.com/show/" + videoId);
                news.setPhotoUrl(videoPhotoUrl);
                news.setContent(createVideoHtml(videoUrl, videoPhotoUrl));
                System.out.println("视频url=====" + videoUrl);
                System.out.println("视频html======" + news.getContent());
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }

    /**
     * Builds a standalone HTML page containing a centred, auto-playing {@code <video>}.
     *
     * @param videoUrl      value for the video's {@code src}
     * @param videoPhotoUrl value for the video's {@code poster}
     * @return the serialized page
     */
    public static String createVideoHtml(String videoUrl, String videoPhotoUrl) {
        Document doc = Jsoup.parse(wrapInHtmlPage("<div align=\"center\"> <video></video> </div>"));
        doc.getElementsByTag("body").attr("style", "height:400px;");
        doc.getElementsByTag("video").attr("style", "width:100%;max-height:400px;")
                .attr("poster", videoPhotoUrl).attr("autoplay", "autoplay")
                .attr("controls", "controls").attr("src", videoUrl);
        return doc.toString();
    }

    /**
     * Crawls news from the sohu mobile site: entries are the children of the first
     * {@code headlines} element inside the third {@code <section>} of the list page.
     *
     * @param size          stop after this many items have been collected
     * @param baseUrl       URL of the list page
     * @param titleTag      tag name, looked up inside the entry's first link, whose text is the title
     * @param dateTag       selector for the entry's date element, or {@code null} to skip the date
     * @param needDeleteAlt not used by this implementation
     * @return the collected items; empty when nothing could be read
     */
    public static ArrayList<News> getNewsFromSouHu(int size, String baseUrl, String titleTag,
            String dateTag, String needDeleteAlt) {
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).timeout(TIMEOUT_MILLIS).get();
            Elements sections = doc.getElementsByTag("section");
            if (sections.size() < 3) {
                return newsList;
            }
            Element headlines = sections.get(2).getElementsByClass("headlines").first();
            if (headlines == null) {
                return newsList;
            }
            for (Element ele : headlines.children()) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                News news = new News();
                news.setTitle(title.getElementsByTag(titleTag).text());
                String titleText = news.getTitle();
                // contains() so that titles *starting* with the marker are skipped too.
                if (titleText == null || titleText.equals("")
                        || containsAny(titleText, "广告", "视频")) {
                    continue;
                }
                news.setHref("https://m.sohu.com" + title.attr("href"));
                if (dateTag != null) {
                    Element dateElement = ele.select(dateTag).first();
                    if (dateElement != null) {
                        news.setDate(dateElement.text());
                    }
                }
                Document newsDoc = Jsoup.connect(encodeIfChinese(news.getHref()))
                        .timeout(TIMEOUT_MILLIS).get();
                String text = newsDoc.getElementsByTag("article").html();
                if (containsAny(text, "未经许可", "禁止转载", "公众号", "公众账号")) {
                    continue;
                }
                // Keep the part from the first paragraph up to the "expand" block; a missing
                // start marker means "from the beginning" instead of substring(-1, ...).
                int start = Math.max(text.indexOf("<p class=\"para\">"), 0);
                int end = text.indexOf("<div class=\"expend-wp\"> ");
                text = end > start ? text.substring(start, end) : text.substring(start);
                text = replaceImgSrcFromDataSrc(text, true, null);
                if (text == null || text.length() == 0) {
                    continue;
                }
                news.setContent(wrapInHtmlPage(deleteSource(text)));
                printNews(news);
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }

    /** Removes every {@code <img ...>} tag. */
    private static String deleteImg(String text) {
        return text.replaceAll("<img [^>]*>", "");
    }

    /** Removes every {@code <a>...</a>} element, including its text. */
    private static String deleteA(String text) {
        return text.replaceAll("<a[^>]*>(.*?)</a>", "");
    }

    /** Removes text enclosed in {@code (...)} or {@code [...]}, brackets included. */
    private static String deleteSource(String text) {
        return text.replaceAll("\\(.*?\\)|\\[.*?]", "");
    }

    /**
     * Removes the {@code href} attribute from every link in the given HTML.
     *
     * @param content HTML to process
     * @return the re-serialized document
     */
    public static String removeHref(String content) {
        Document document = Jsoup.parse(content);
        for (Element el : document.select("a[href]")) {
            el.removeAttr("href");
        }
        return document.html();
    }

    /**
     * Rewrites the {@code src} of every {@code <img>} from its lazy-loading attribute
     * ({@code data-src}, {@code original}, {@code original-hidden}; the last one present wins).
     * Images without any of these keep their {@code src}. If the input has no image at all
     * it is returned unchanged.
     *
     * @param htmlBody                    HTML to process
     * @param imgUrlNeedAddProtocolPrefix whether {@code src="//..."} should become {@code src="http://..."}
     * @param needDeleteAlt               {@code alt} value to strip from images, or {@code null}
     * @return the processed HTML: the whole document if the input contained {@code <html>},
     *         otherwise only the children of {@code <body>}
     */
    public static String replaceImgSrcFromDataSrc(String htmlBody,
            boolean imgUrlNeedAddProtocolPrefix, String needDeleteAlt) {
        Document document = Jsoup.parseBodyFragment(htmlBody);
        List<Element> images = document.select("img");
        if (images.isEmpty()) {
            return htmlBody;
        }
        for (Element img : images) {
            String dataSrc = img.attr("data-src");
            if (isNotBlank(dataSrc)) {
                img.attr("src", dataSrc);
                img.removeAttr("data-src");
            }
            String originalSrc = img.attr("original");
            if (isNotBlank(originalSrc)) {
                img.attr("src", withHttpScheme(originalSrc));
                // The attribute that was read is "original"; removing "originalSrc" was a no-op.
                img.removeAttr("original");
            }
            String originalHiddenSrc = img.attr("original-hidden");
            if (isNotBlank(originalHiddenSrc)) {
                img.attr("src", withHttpScheme(originalHiddenSrc));
                img.removeAttr("original-hidden");
            }
            // jsoup serializes attribute values in double quotes, so the textual
            // "alt=" + value replacement below cannot match an unquoted value.
            if (needDeleteAlt != null && needDeleteAlt.equals(img.attr("alt"))) {
                img.removeAttr("alt");
            }
        }
        String result = htmlBody.contains("<html>")
                ? document.toString()
                : document.select("body>*").toString();
        if (imgUrlNeedAddProtocolPrefix) {
            result = result.replace("src=\"//", "src=\"http://");
        }
        if (needDeleteAlt != null) {
            // Kept for callers that pass the value exactly as it appears in the markup.
            result = result.replace("alt=" + needDeleteAlt, "");
        }
        return result;
    }

    private static boolean isNotBlank(String str) {
        return str != null && str.trim().length() > 0;
    }

    /** Prepends {@code http:} unless the URL already carries an http(s) scheme. */
    private static String withHttpScheme(String url) {
        if (url.startsWith("http://") || url.startsWith("https://")) {
            return url;
        }
        return "http:" + url;
    }

    /** Returns {@code true} if {@code text} contains at least one of the given fragments. */
    private static boolean containsAny(String text, String... fragments) {
        for (String fragment : fragments) {
            if (text.contains(fragment)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Percent-encodes a URL that contains Chinese characters, then restores {@code :} and
     * {@code /}. URLs without Chinese characters are returned untouched.
     * NOTE(review): the encoded URL is lower-cased as a whole and other reserved characters
     * stay encoded; confirm this is acceptable for the target sites.
     */
    private static String encodeIfChinese(String url) throws UnsupportedEncodingException {
        if (!isContainChinese(url)) {
            return url;
        }
        return URLEncoder.encode(url, "utf-8")
                .toLowerCase().replace("%3a", ":").replace("%2f", "/");
    }

    /** Wraps an HTML fragment in a minimal UTF-8 page. */
    private static String wrapInHtmlPage(String bodyHtml) {
        return new StringBuilder(PAGE_HEAD.length() + bodyHtml.length() + PAGE_TAIL.length())
                .append(PAGE_HEAD).append(bodyHtml).append(PAGE_TAIL).toString();
    }

    /** Echoes a collected item to standard output. */
    private static void printNews(News news) {
        System.out.println("标题=====" + news.getTitle());
        System.out.println("href=====" + news.getHref());
        System.out.println("content=====" + news.getContent());
    }
}
还有一个载体类News,用于把爬取下来的网页内容封装到一个类里面。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | package net.sinolbs.ycd.news; /** * 新闻数据载体 */ public class News { private int id; private String title; private String href; private String content; private String date; private String photoUrl; public News() { } public News(String title, String href, String content, int id) { this .title = title; this .content = content; this .href = href; this .id = id; } public int getId() { return id; } public void setId( int id) { this .id = id; } public String getTitle() { return title; } public void setTitle(String title) { this .title = title; } public String getHref() { return href; } public void setHref(String href) { this .href = href; } public String getContent() { return content; } public void setContent(String content) { this .content = content; } public String getDate() { return date; } public void setDate(String date) { this .date = date; } public String getPhotoUrl() { return photoUrl; } public void setPhotoUrl(String photoUrl) { this .photoUrl = photoUrl; } } |
第三步:运行效果
运行GrapNews类(有main方法)。
注:本文著作权归作者,由demo大师发表,拒绝转载,转载需要作者授权
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· 分享 3 个 .NET 开源的文件压缩处理库,助力快速实现文件压缩解压功能!
· Ollama——大语言模型本地部署的极速利器
· 使用C#创建一个MCP客户端
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· Windows编程----内核对象竟然如此简单?