Java获取新闻联播每天的内容简介

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.DatePattern;
import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil;
import cn.hutool.poi.excel.ExcelUtil;
import cn.hutool.poi.excel.ExcelWriter;
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.*;

/**
 * Scrapes the daily summary of the CCTV "Xinwen Lianbo" news programme for every day of
 * one calendar year and writes the result to an Excel sheet.
 *
 * <p>The year processed is the calendar year that contains "today minus 60 months".
 * For each day the listing page {@code <PREFIX_URL><yyyyMMdd><END_URL>} is loaded, the first
 * link on it is followed, and the summary text is read from that page. Days that cannot be
 * fetched or parsed keep an empty {@code content} cell and are collected in a skip list that
 * is printed at the end.
 */
public class Test {

    /** Prefix of the per-day listing page, e.g. https://tv.cctv.com/lm/xwlb/day/20240101.shtml */
    private static final String PREFIX_URL = "https://tv.cctv.com/lm/xwlb/day/";

    /** Suffix of the per-day listing page. */
    private static final String END_URL = ".shtml";

    /**
     * Entry point: scrapes one year of summaries and writes them to the Excel file.
     *
     * @param args unused
     * @throws IOException kept for signature compatibility; per-day fetch errors are handled
     *                     inside the loop and do not abort the run
     */
    public static void main(String[] args) throws IOException {
        // Days listed here are not fetched; they still get a row with empty content.
        List<String> skipList = new ArrayList<>();
        skipList.add("20191204");
        List<Map<String, Object>> rows = new ArrayList<>();

        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            configureClient(webClient);

            DateTime lastYear = DateUtil.offsetMonth(new Date(), -60);
            DateTime beginDay = DateUtil.beginOfYear(lastYear);
            System.out.println(beginDay);

            // betweenDay returns the difference in days between Jan 1 and Dec 31
            // (364, or 365 in a leap year), so the loop bound below must be inclusive.
            long days = DateUtil.betweenDay(beginDay, DateUtil.endOfYear(lastYear), true);
            System.out.println(days);

            for (int i = 0; i <= days; i++) {
                DateTime dateTime = DateUtil.offsetDay(beginDay, i);
                String dayQuery = DateUtil.format(dateTime, DatePattern.PURE_DATE_PATTERN);

                // Every day gets exactly one row; "content" is filled in only on success.
                Map<String, Object> row = newRow(dateTime, dayQuery);
                rows.add(row);
                if (skipList.contains(dayQuery)) {
                    continue;
                }

                // Tracks the URL currently being fetched so error messages name the right page.
                String url = PREFIX_URL + dayQuery + END_URL;
                try {
                    HtmlPage listingPage = webClient.getPage(url);
                    Elements links = Jsoup.parse(listingPage.asXml()).select("a");
                    if (CollUtil.isEmpty(links)) {
                        System.out.println(dayQuery + "未找到节目链接:" + url);
                        skipList.add(dayQuery);
                        continue;
                    }

                    url = links.get(0).attr("href");
                    System.out.println(url);
                    HtmlPage contentPage = webClient.getPage(url);
                    row.put("content", extractSummary(Jsoup.parse(contentPage.asXml())));
                } catch (SocketTimeoutException e) {
                    System.out.println(dayQuery + "查询超时:" + url);
                    skipList.add(dayQuery);
                } catch (FailingHttpStatusCodeException e) {
                    // Print the real status code instead of always claiming 404.
                    System.out.println(dayQuery + "查询" + e.getStatusCode() + ":" + url);
                    skipList.add(dayQuery);
                } catch (Exception e) {
                    // Best-effort scraping: report the cause and move on to the next day.
                    System.out.println(dayQuery + "查询异常:" + url + " (" + e + ")");
                    skipList.add(dayQuery);
                }
            }
        }

        // NOTE(review): the file name is hard-coded to 2019 while the year scraped is derived
        // from today's date; confirm whether the name should follow the computed year.
        // try-with-resources closes the writer (flushing the file) even if write() fails.
        try (ExcelWriter writer = ExcelUtil.getWriter("d:/2019.xlsx")) {
            // Write all rows in one go with the default style, forcing a header row.
            writer.write(rows, true);
        }
        System.out.println(skipList);
    }

    /** Applies the browser options used for scraping to the given client. */
    private static void configureClient(WebClient webClient) {
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        // Skip SSL certificate verification.
        webClient.getOptions().setUseInsecureSSL(true);
        // Enable JavaScript but do not fail on script errors.
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Ignore CSS errors and skip CSS processing altogether.
        webClient.setCssErrorHandler(new SilentCssErrorHandler());
        webClient.getOptions().setCssEnabled(false);
        // Timeout in milliseconds.
        webClient.getOptions().setTimeout(3000);
        // Follow redirects and keep cookies.
        webClient.getOptions().setRedirectEnabled(true);
        webClient.getCookieManager().setCookiesEnabled(true);
    }

    /** Builds the spreadsheet row for one day with an initially empty {@code content} cell. */
    private static Map<String, Object> newRow(DateTime dateTime, String dayQuery) {
        Map<String, Object> row = new LinkedHashMap<>();
        row.put("日期", DateUtil.format(dateTime, DatePattern.CHINESE_DATE_PATTERN));
        row.put("date", dayQuery);
        row.put("week", DateUtil.dayOfWeekEnum(dateTime).toChinese());
        row.put("content", "");
        return row;
    }

    /**
     * Reads the programme summary from a content page.
     *
     * <p>Uses the {@code og:description} meta tag when its content starts with the expected
     * prefix; otherwise falls back to the own text of the first {@code <p>} inside the first
     * element with class {@code nrjianjie_shadow}.
     *
     * @param docContent parsed content page
     * @return the summary, or an empty string when the page has no {@code og:description} meta
     * @throws IllegalStateException if the fallback is needed but the expected elements are missing
     */
    private static String extractSummary(Document docContent) {
        String summary = "";
        for (Element meta : docContent.select("meta")) {
            if (!"og:description".equals(meta.attr("property"))) {
                continue;
            }
            String description = meta.attr("content");
            if (description.startsWith("本期节目主要内容")) {
                summary = description;
                continue;
            }
            Elements boxes = docContent.getElementsByClass("nrjianjie_shadow");
            if (boxes.isEmpty()) {
                throw new IllegalStateException("no element with class nrjianjie_shadow");
            }
            Elements paragraphs = boxes.get(0).getElementsByTag("p");
            if (paragraphs.isEmpty()) {
                throw new IllegalStateException("no <p> inside the nrjianjie_shadow element");
            }
            summary = paragraphs.get(0).ownText();
        }
        return summary;
    }
}
posted @ 2024-11-27 10:59  华格瑞沙  阅读(4)  评论(0)  编辑  收藏  举报