TS爬虫爬取博客园博客列表

/**
 * Scrapes pages 1 through 5 of the cnblogs.com aggregated home-page post list
 * and returns one row per post.
 *
 * Each page is fetched by POSTing to the AggSitePostList endpoint; the returned
 * HTML fragment is parsed with Jsoup and every `.post-item` element is turned
 * into a row of the form:
 *
 *   [id, title, summary, authorUrl, authorId, authorName,
 *    createDate, diggCount, commentCount, viewCount]
 *
 * Relies on `Java`, `post` and `print` being provided by the script host.
 *
 * @param context Script-node context passed in by the host; not read by this function.
 * @returns The collected rows, in page order and then document order.
 */
function onProcessData(context: IDataFlowScriptNodeContext): DbTableInfo | any[][] {
    const Jsoup = Java.type("org.jsoup.Jsoup");
    // Site root. Author profile links are assumed to look like `<url><authorId>/`.
    const url = "https://www.cnblogs.com/";
    const data: any[][] = [];

    for (let i = 1; i <= 5; i++) {
        const res: string = post({
            url: "https://www.cnblogs.com/AggSite/AggSitePostList",
            data: {
                "CategoryType": "SiteHome",
                "ParentCategoryId": 0,
                "CategoryId": 808,
                "PageIndex": i.toFixed(),
                "TotalPostCount": 4000,
                "ItemListActionName": "AggSitePostList"
            },
            headers: {
                "Content-Type": "application/json"
            }
        }).responseText;

        const document = Jsoup.parse(res);
        const elements = document.getElementsByClass("post-item");

        for (const element of elements) {
            const titleElement = element.getElementsByClass("post-item-title");
            const summaryElement = element.getElementsByClass("post-item-summary");
            const authorElement = element.select(".post-item-foot > .post-item-author");
            const createDateElement = element.select("section > footer > span.post-meta-item > span");
            const commentCountElement = element.select("section > footer > a:nth-child(4) > span");
            const viewCountElement = element.select("section > footer > a:nth-child(5) > span");

            // Post ID
            const id = element.attr("data-post-id");
            // Post title
            const title = titleElement.text();
            // Summary: own text only, so text of nested child elements is excluded
            const summary = summaryElement.get(0).ownText();
            // Author profile URL. Coerced to a JS string so that `.length` below is
            // a numeric property no matter how the host surfaces Java strings
            // (the previous `authorUrl.length()` call throws on a JS string).
            const authorUrl = String(authorElement.select("a").get(0).attr("href"));
            // Author ID: the URL with the site-root prefix and the final character
            // (expected to be the trailing "/") removed
            const authorId = authorUrl.substring(url.length, authorUrl.length - 1);
            // Author display name
            const authorName = authorElement.select("span").get(0).text();
            // Creation time
            const createDate = createDateElement.text();
            // Like ("digg") count
            const diggCount = element.getElementById("digg_count_" + id).text();
            // Comment count
            const commentCount = commentCountElement.text();
            // View count
            const viewCount = viewCountElement.text();

            const row = [
                id,
                title,
                summary,
                authorUrl,
                authorId,
                authorName,
                createDate,
                diggCount,
                commentCount,
                viewCount
            ];

            print("-------------------------------------------")
            print("row======" + row);
            data.push(row);
        }
    }
    return data;
}

posted on   会更好aaa  阅读(34)  评论(0)  编辑  收藏  举报

相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

统计

点击右上角即可分享
微信分享提示