Java使用Jsoup获得新闻联播所有文字稿

Jsoup的maven坐标:

		<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
		<dependency>
		    <groupId>org.jsoup</groupId>
		    <artifactId>jsoup</artifactId>
		    <version>1.11.3</version>
		</dependency>

Java代码:

package com.zifeiy.test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads the text transcripts linked from the listing pages of
 * www.xwlbo.com and concatenates them into a single local HTML file.
 *
 * <p>The program first collects article URLs from every listing page
 * ({@link #getUrlList()}), then fetches each article and appends its title
 * and paragraphs to the output file ({@link #solve(String)}).
 */
public class XinwenGetter {
	
	/** Index of the last numbered listing page ({@code txt_44.html}). */
	private static final int LAST_LIST_PAGE = 44;
	
	/** Article URLs collected from the listing pages, in discovery order, without duplicates. */
	private static List<String> urlList = new ArrayList<String>();
	
	/** Writer for the output file; assigned in {@link #main(String[])} before any article is handled. */
	private static OutputStreamWriter out;
	
	/**
	 * Fills {@link #urlList} with the article links found on every listing page.
	 *
	 * <p>Page 0 is the un-numbered index page {@code txt.html}; pages 1 to
	 * {@link #LAST_LIST_PAGE} are {@code txt_<i>.html}. The original loop
	 * started at 1, which made the page-0 branch unreachable.
	 *
	 * @throws IOException if a listing page cannot be fetched
	 */
	private static void getUrlList() throws IOException {
		for (int i = 0; i <= LAST_LIST_PAGE; i ++) {
			String url;
			if (i == 0) {
				url = "http://www.xwlbo.com/txt.html";
			} else {
				url = "http://www.xwlbo.com/txt_" + i + ".html";
			}
			Document doc = Jsoup.connect(url).get();
			Elements xwlistElements = doc.getElementsByClass("xwlist");
			if (xwlistElements.isEmpty()) {
				// A page without the expected container should not abort the whole crawl.
				System.err.println("no \"xwlist\" element on " + url + ", skipped");
				continue;
			}
			Elements aElements = xwlistElements.get(0).select("a");
			for (Element element : aElements) {
				// Resolve against the page's base URI so relative hrefs stay fetchable.
				String resUrl = element.absUrl("href");
				if (resUrl.length() == 0) {
					// Anchor without a usable href; Jsoup.connect would reject it.
					continue;
				}
				// The index page and the numbered pages may list the same article
				// (NOTE(review): overlap between txt.html and txt_1.html is assumed, not verified).
				if (!urlList.contains(resUrl)) {
					urlList.add(resUrl);
				}
			}
		}
	}
	
	/**
	 * Fetches one article and appends it to the output file as an
	 * {@code <h3>} title, the article's {@code <p>} elements and a
	 * trailing {@code <hr>}.
	 *
	 * @param url absolute URL of the article page
	 * @throws IOException if the page cannot be fetched or the file cannot be written
	 */
	private static void solve(String url) throws IOException {
		Document doc = Jsoup.connect(url).get();
		System.out.println("handling " + doc.title() + " ...");
		Elements textElements = doc.getElementsByClass("text_content");
		if (textElements.isEmpty()) {
			// Checked before anything is written, so a page without a body leaves no stray heading.
			System.err.println("no \"text_content\" element on " + url + ", skipped");
			return;
		}
		out.write("<h3>" + doc.title() + "</h3>\r\n");
		Elements pElements = textElements.get(0).select("p");
		for (Element pElement : pElements) {
			out.write(pElement.toString() + "\r\n");
		}
		out.write("<hr>\r\n");
	}
	
	/**
	 * Entry point: collects all article URLs, then writes every article to
	 * {@code D:/新闻联播大全.html}, replacing any previous content of that file.
	 *
	 * @param args unused
	 * @throws IOException if a page cannot be fetched or the file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		
		getUrlList();
		
		File file = new File("D:/新闻联播大全.html");
		// Non-append mode truncates an existing file, replacing the former
		// delete-then-append sequence that kept stale content when delete failed.
		out = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
		try {
			for (String url: urlList) {
				solve(url);
			}
		} finally {
			// Always flush and release the file, even when an article fails midway.
			out.close();
		}
		
	}
	
}

posted @ 2019-03-14 00:06  zifeiy  阅读(462)  评论(0)  编辑  收藏  举报