java爬虫入门--用jsoup爬取汽车之家的新闻

概述

使用jsoup来进行网页数据爬取。jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

详细

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

jsoup的主要功能如下:

1. 从一个URL,文件或字符串中解析HTML;

2. 使用DOM或CSS选择器来查找、取出数据;

3. 可操作HTML元素、属性、文本;

jsoup是基于MIT协议发布的,可放心使用于商业项目

第一步:项目预览

blob.png

第二步:代码实现

主程序为GrapNews类,实现了从汽车之家等网站抓取新闻内容的功能。GrapNews类带有main方法,直接运行即可。

 

GrapNews.java的完整代码如下:
package net.sinolbs.ycd.news;
 
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
/**
 * TODO
 * 2017年5月21日上午12:25:30
 */
public class GrapNews {
     
    public static boolean isContainChinese(String str) {
        Pattern p = Pattern.compile("[\u4e00-\u9fa5]");
        Matcher m = p.matcher(str);
        if (m.find()) {
            return true;
        }
        return false;
    }
     
    /**
     * 从笑话集抓取笑话
     * @param size
     * @param baseUrl
     * @param domainName
     * @param newsListClassOrId
     * @param classOrId
     * @param newsULIndex
     * @param newsContentClassOrId
     * @param titleTagOrClass
     * @param dateTag
     * @return
     */
    public static ArrayList<News> getNewsFromJokeji(int size,String baseUrl,String domainName,
            String newsListClassOrId,int newsULIndex,
            String newsContentClassOrId,String titleTagOrClass,String dateTag){
        ArrayList<News> newsList = new ArrayList<News>();
        Document doc;
        Element element =null;
        Element title =null;
        News news = null;
        try {
            doc = Jsoup.connect(baseUrl).timeout(10000).get();
            element = (Element) doc.getElementsByClass(newsListClassOrId).first();
            Elements elements = element.getElementsByTag("li");
            if(elements!=null&&elements.size()>0){
                for(Element ele:elements){
                    news = new News();
                    title = ele.select("a").first();
                    if(title==null){
                        continue;
                    }
                    news.setTitle(title.getElementsByTag(titleTagOrClass).text());
                    if(news.getTitle()==null||news.getTitle().equals("")){
                        continue;
                    }
                    news.setHref(domainName+title.attr("href"));
                    if(dateTag!=null){
                        String date=ele.select("i").text();
                        news.setDate(date);
                    }
                    String newsUrl =news.getHref();
                    if (isContainChinese(news.getHref())) {
                        newsUrl = URLEncoder.encode(news.getHref(), "utf-8")
                                .toLowerCase().replace("%3a", ":").replace("%2f", "/");
                    }
                    Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
                    String text=newsDoc.getElementById(newsContentClassOrId).html();
                    text=deleteImg(text);
                    text=deleteA(text);
                     StringBuffer textBuffer=new StringBuffer(5);
                     textBuffer.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                     textBuffer.append("</head><body>");
                     textBuffer.append(deleteSource(text));
                     textBuffer.append("</body></html>");
                     news.setContent(textBuffer.toString());
                     news.setContent(textBuffer.toString());
                     System.out.println("标题====="+news.getTitle());
                     System.out.println("href====="+news.getHref());
                     System.out.println("content====="+news.getContent());
                     newsList.add(news);
                    if(newsList.size()==size){
                        break;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }
 
    /**
     * 从汽车之家抓新闻
     * @param size
     * @param baseUrl
     * @param domainName
     * @param newsListId
     * @param newsContentClass
     * @param titleTagOrClass
     * @param limitHref
     * @param dateTag
     * @param needDeleteAlt
     * @return
     */
    public static ArrayList<News> getNewsFromCarHome(int size,String baseUrl,String domainName,String newsListId,
            String newsContentClass,String titleTag,String dateTag,String needDeleteAlt){
        ArrayList<News> newsList = new ArrayList<News>();
        Document doc;
        Elements elements =null;
        Element title =null;
        News news = null;
        try {
            doc = Jsoup.connect(baseUrl).timeout(10000).get();
            elements = (Elements) doc.getElementById(newsListId).children();
            if(elements!=null&&elements.size()>0){
                for(Element ele:elements){
                    news = new News();
                    title = ele.select("a").first();
                    if(title==null){
                        continue;
                    }
                    news.setTitle(title.getElementsByTag(titleTag).text());
                    if(news.getTitle()==null||news.getTitle().equals("")){
                        continue;
                    }
                    news.setHref(domainName+title.attr("href"));
                    if(dateTag!=null){
                        String date=ele.select("i").text();
                        news.setDate(date);
                    }
                    String newsUrl =news.getHref();
                    if (isContainChinese(news.getHref())) {
                        newsUrl = URLEncoder.encode(news.getHref(), "utf-8")
                                .toLowerCase().replace("%3a", ":").replace("%2f", "/");
                    }
                    Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
                    String text=newsDoc.getElementsByClass(newsContentClass).html();
                    if(text.indexOf("余下全文")>0||text.indexOf("未经许可")>0
                            ||text.indexOf("禁止转载")>0||text.indexOf("公众号")>0||text.indexOf("公众账号")>0){
                        continue;
                    }
                     text=replaceImgSrcFromDataSrc(text,true,needDeleteAlt);
                     int index=text.lastIndexOf("(");
                     if(index>0){
                         text=text.substring(0,index);
                     }
                     StringBuffer textBuffer=new StringBuffer(5);
                     textBuffer.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                     textBuffer.append("</head><body>");
                     textBuffer.append(deleteSource(text));
                     textBuffer.append("</body></html>");
                     news.setContent(textBuffer.toString());
                     news.setContent(textBuffer.toString());
                     System.out.println("标题====="+news.getTitle());
                     System.out.println("href====="+news.getHref());
                     System.out.println("content====="+news.getContent());
                     newsList.add(news);
                    if(newsList.size()==size){
                        break;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }
     
     
     
    public static String getVideoFromMiaoPai(String baseUrl) throws Exception{
        Document doc= Jsoup.connect(baseUrl).timeout(10000).get();
        String html=doc.html().trim();
        return getUrlFromMiaoPaiHtml(html);
    }
     
    public static String getUrlFromMiaoPaiHtml(String html){
        int startIndex=html.indexOf("videoSrc");
        int endIndex=html.indexOf("poster");
        String videoUrl=html.substring(startIndex+11,endIndex+5);
        int index=videoUrl.indexOf('"');
        if(index>0){
            return videoUrl.substring(0, index);
        }
        return videoUrl;
    }
     
    public static String getVideoPhotoFromMiaoPaiHtml(String html){
        System.out.println(html);
        int startIndex=html.indexOf("poster");
        int index=html.substring(startIndex).indexOf("jpg");
        return html.substring(startIndex+9,startIndex+index+3);
    }
     
    public static void main(String[] args) throws Exception{
        getNewsFromCarHome(2,"http://m.autohome.com.cn/channel","http://m.autohome.com.cn","list","details","h4","time","汽车之家");
        getNewsFromJokeji(3,"http://www.jokeji.cn/list.htm","http://www.jokeji.cn","list_title",1,"text110","a","i");
        getNewsFromSouHu(20,"http://m.sohu.com/c/1592/","a",null,null);
    }
     /**
     * 从秒拍抓视频
     * @param size
     * @param baseUrl
     * @param domainName
     * @param newsListId
     * @param newsContentClass
     * @param titleTagOrClass
     * @param limitHref
     * @param dateTag
     * @param needDeleteAlt
     * @return
     */
    public static ArrayList<News> getVideoFromMiaopai(int size,String baseUrl){
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            News news = null;
            Element videoEmement=null;
            Document doc = null;
            String videoUrl=null;
            doc = Jsoup.connect(baseUrl).timeout(10000).get();
            Elements    elements = doc.getElementsByClass("videoCont");
            String videoDetailUrl="";
            if(elements!=null&&elements.size()>0){
                for(Element ele:elements){
                videoEmement=ele.getElementsByClass("MIAOPAI_player").first();
                String videoId=videoEmement.attr("data-scid").toString();
                String videoPhotoUrl=videoEmement.attr("data-img").toString();
                String videoTitle=ele.getElementsByClass("viedoAbout").first().getElementsByTag("p").text();
                System.out.println("视频id"+videoId);
                System.out.println("视频封面url"+videoPhotoUrl);
                System.out.println("视频标题"+videoTitle);
                news = new News();
                if(videoId!=null){
                    news.setTitle(videoTitle);
                    videoDetailUrl="http://www.miaopai.com/show/"+videoId+".html";
                    doc = Jsoup.connect("http://www.miaopai.com/show/"+videoId+".html").timeout(10000).get();
                    System.out.println("视频详情url========"+videoDetailUrl);
                    news.setHref("http://m.miaopai.com/show/"+videoId);
                    news.setPhotoUrl(videoPhotoUrl);
                }
                if(doc!=null){
                     videoUrl=getUrlFromMiaoPaiHtml(doc.html());
                }
                if(videoUrl!=null){
                     news.setContent(createVideoHtml(videoUrl, videoPhotoUrl));
                     System.out.println("视频url====="+videoUrl);
                     System.out.println("视频html======"+news.getContent());
                     newsList.add(news);
                }
                
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }
 
    public static String createVideoHtml(String videoUrl,String videoPhotoUrl) {
        Document doc;
        StringBuffer textBuffer = new StringBuffer(5);
        textBuffer.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
        textBuffer.append("</head><body>");
        textBuffer.append("<div align=\"center\">");
        textBuffer.append(" <video></video> </div>");
        textBuffer.append("</body></html>");
        doc = Jsoup.parse(textBuffer.toString());
        doc.getElementsByTag("body").attr("style", "height:400px;");
        doc.getElementsByTag("video").attr("style", "width:100%;max-height:400px;")
            .attr("poster", videoPhotoUrl).attr("autoplay", "autoplay")
                .attr("controls", "controls").attr("src", videoUrl);
        return doc.toString();
    }
     
    /**
     * 从搜狐抓新闻
     * @param size
     * @param baseUrl
     * @param domainName
     * @param newsListId
     * @param newsContentClass
     * @param titleTagOrClass
     * @param limitHref
     * @param dateTag
     * @param needDeleteAlt
     * @return
     */
    public static ArrayList<News> getNewsFromSouHu(int size,String baseUrl,
            String titleTag,String dateTag,String needDeleteAlt){
        ArrayList<News> newsList = new ArrayList<News>();
        Document doc;
        Element element =null;
        Element title =null;
        News news = null;
        try {
            doc = Jsoup.connect(baseUrl).timeout(10000).get();
            element =doc.getElementsByTag("section").get(2);
            element = element.getElementsByClass("headlines").get(0);
             Elements elements=element.children();
            if(elements!=null&&elements.size()>0){
                for(Element ele:elements){
                    news = new News();
                    title = ele.select("a").first();
                    if(title==null){
                        continue;
                    }
                    news.setTitle(title.getElementsByTag(titleTag).text());
                    if(news.getTitle()==null||news.getTitle().equals("")
                            ||news.getTitle().indexOf("广告")>0||news.getTitle().indexOf("视频")>0){
                        continue;
                    }
                    news.setHref("https://m.sohu.com"+title.attr("href"));
                    if(dateTag!=null){
                        String dateStr=ele.select(dateTag).first().text();
                        news.setDate(dateStr);
                    }
                    String newsUrl =news.getHref();
                    if (isContainChinese(news.getHref())) {
                        newsUrl = URLEncoder.encode(news.getHref(), "utf-8")
                                .toLowerCase().replace("%3a", ":").replace("%2f", "/");
                    }
                    Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
                    String text=newsDoc.getElementsByTag("article").html();
                    if(text.indexOf("未经许可")>0||text.indexOf("禁止转载")>0
                            ||text.indexOf("公众号")>0||text.indexOf("公众账号")>0){
                        continue;
                    }
                    int index=text.indexOf("<p class=\"para\">");
                    int lastIndex=text.indexOf("<div class=\"expend-wp\"> ");
                    if(lastIndex>0){
                         text=text.substring(index,lastIndex);
                    }else if(index>0){
                        text=text.substring(index,text.length());
                    }
                    text=replaceImgSrcFromDataSrc(text,true,null);
                    if(text==null||text.length()==0){
                        continue;
                    }
                     StringBuffer textBuffer=new StringBuffer(5);
                     textBuffer.append("<!DOCTYPE html><html><head>"
                            + "<meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                     textBuffer.append("</head><body>");
                     textBuffer.append(deleteSource(text));
                     textBuffer.append("</body></html>");
                     news.setContent(textBuffer.toString());
                     news.setContent(textBuffer.toString());
                     System.out.println("标题====="+news.getTitle());
                     System.out.println("href====="+news.getHref());
                     System.out.println("content====="+news.getContent());
                     newsList.add(news);
                    if(newsList.size()==size){
                        break;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsList;
    }
     
    private static String deleteImg(String text) {
        return text.replaceAll("<img [^>]*>", "");
    }
     
    private static String deleteA(String text) {
        return text.replaceAll("<a[^>]*>(.*?)</a>", "");
    }
     
    private static String deleteSource(String text) {
        return text.replaceAll("\\(.*?\\)|\\[.*?]", "");
    }
    /**
     * 删除a标签中的href
     * @param content
     * @return
     */
    public static String removeHref(String content){ 
        Document document = Jsoup.parse(content); 
        Elements elements = document.select("a[href]"); 
        for(Element el:elements){ 
            el.removeAttr("href"); 
        
        return document.html(); 
    
     
     
    /**
     * 将htmlBody中所有img标签中的src内容替换为原data-src的内容, <br/>
     * 如果不报含data-src,则src的内容不会被替换 <br/>
     * @param htmlBody html内容
     * @param needDeleteAlt 需要剔除的图片的alt信息
     * @param imgUrlNeedAddProtocolPrefix 图片的url是否需要添加http协议前缀
     * @return 返回替换后的内容
     */
    public static String replaceImgSrcFromDataSrc(String htmlBody,
            boolean imgUrlNeedAddProtocolPrefix,String needDeleteAlt) {
        Document document = Jsoup.parseBodyFragment(htmlBody);
        List<Element> nodes = document.select("img");
        int nodeLenth = nodes.size();
        if(nodeLenth==0){
            return htmlBody;
        }
        for (int i = 0; i < nodeLenth; i++) {
            Element e = nodes.get(i);
            String dataSrc = e.attr("data-src");
            if (isNotBlank(dataSrc)) {
                e.attr("src", dataSrc);
                e.removeAttr("data-src");
            }
            String originalSrc = e.attr("original");
            if (isNotBlank(originalSrc)) {
                e.attr("src", "http:"+originalSrc);
                e.removeAttr("originalSrc");
            }
            String originalHiddenSrc = e.attr("original-hidden");
            if (isNotBlank(originalHiddenSrc)) {
                e.attr("src", "http:"+originalHiddenSrc);
                e.removeAttr("original-hidden");
            }
        }
        if (htmlBody.contains("<html>")) {
            if(needDeleteAlt==null&&!imgUrlNeedAddProtocolPrefix){
                return document.toString();
            }else if(needDeleteAlt==null&&imgUrlNeedAddProtocolPrefix){
                return document.toString().replace("src=\"//", "src=\"http://");
            }else if(needDeleteAlt!=null&&imgUrlNeedAddProtocolPrefix){
                return document.toString().replace("src=\"//", "src=\"http://")
                        .replace("alt="+needDeleteAlt, "");
            }
            return document.toString().replace("alt="+needDeleteAlt, "");
        } else {
            if(needDeleteAlt==null&&!imgUrlNeedAddProtocolPrefix){
                return document.select("body>*").toString();
            }else if(needDeleteAlt==null&&imgUrlNeedAddProtocolPrefix){
                return document.select("body>*").toString().replace("src=\"//", "src=\"http://");
            }else if(needDeleteAlt!=null&&imgUrlNeedAddProtocolPrefix){
                return document.select("body>*").
                        toString().replace("src=\"//", "src=\"http://").replace("alt="+needDeleteAlt, "");
            }
            return document.select("body>*").toString().replace("alt="+needDeleteAlt, "");
        }
     
    }
     
     
    private static boolean isNotBlank(String str){
        if(str == null)
            return false;
        else if(str.trim().length() == 0)
            return false;
        else
            return true;
    }
}

还有一个载体类News,用于封装爬取下来的网页内容。News.java的完整代码如下:
package net.sinolbs.ycd.news;
 
/**
 * Data carrier for one scraped item. A plain mutable bean: every property has a
 * getter and a setter, and no validation is performed.
 */
public class News {

    private int id;
    private String title;
    private String href;
    private String content;
    private String date;
    private String photoUrl;

    /** Creates an empty item; the id is {@code 0} and every string property is {@code null}. */
    public News() {
    }

    /**
     * Creates an item with the given values; date and photo URL stay {@code null}.
     *
     * @param title the title
     * @param href the link
     * @param content the content
     * @param id the id
     */
    public News(String title, String href, String content, int id) {
        this.id = id;
        this.title = title;
        this.href = href;
        this.content = content;
    }

    // ---- id ----

    /** @return the id */
    public int getId() {
        return this.id;
    }

    /** @param id the new id */
    public void setId(int id) {
        this.id = id;
    }

    // ---- title ----

    /** @return the title, possibly {@code null} */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title */
    public void setTitle(String title) {
        this.title = title;
    }

    // ---- href ----

    /** @return the link, possibly {@code null} */
    public String getHref() {
        return this.href;
    }

    /** @param href the new link */
    public void setHref(String href) {
        this.href = href;
    }

    // ---- content ----

    /** @return the content, possibly {@code null} */
    public String getContent() {
        return this.content;
    }

    /** @param content the new content */
    public void setContent(String content) {
        this.content = content;
    }

    // ---- date ----

    /** @return the date text, possibly {@code null} */
    public String getDate() {
        return this.date;
    }

    /** @param date the new date text */
    public void setDate(String date) {
        this.date = date;
    }

    // ---- photoUrl ----

    /** @return the photo URL, possibly {@code null} */
    public String getPhotoUrl() {
        return this.photoUrl;
    }

    /** @param photoUrl the new photo URL */
    public void setPhotoUrl(String photoUrl) {
        this.photoUrl = photoUrl;
    }
}

第三步:运行效果

运行GrapNews类(有main方法)。

blob.png

 

注:本文著作权归作者所有,由demo大师发表;未经作者授权,请勿转载。

 

posted on   demo例子集  阅读(2098)  评论(0)  编辑  收藏  举报

(评论功能已被禁用)
编辑推荐:
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
阅读排行:
· 分享 3 个 .NET 开源的文件压缩处理库,助力快速实现文件压缩解压功能!
· Ollama——大语言模型本地部署的极速利器
· 使用C#创建一个MCP客户端
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· Windows编程----内核对象竟然如此简单?

导航

< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示