【Java】爬取澳门区划信息

官网地址:

1
https://macaostreets.iam.gov.mo/zh_mo/freguesiaindex.html

  

大区部分是在页面展示的

 

点击发现并没有请求网络,所以数据是js中存在的

 

找到了展示街道方法,这一段:

该方法根据大区id匹配对应的街道数据变量(例如 fatima 对应 fatimaStreets):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
function showStreets(freguesia){
       var freguesiaStreets;
       switch(freguesia){
           case "fatima":
               freguesiaStreets = fatimaStreets;
               break;
           case "lourenco":
               freguesiaStreets = lourencoStreets;
               break;
           case "lazaro":
               freguesiaStreets = lazaroStreets;
               break;
           case "carno":
               freguesiaStreets = carnoStreets;
               break;
           case "se":
               freguesiaStreets = seStreets;
               break;
           case "antonio":
               freguesiaStreets = antionioStreets;
               break;
           case "xavier":
               freguesiaStreets = xavierStreets;
               break;
       }

  

可以发现,这一段是js代码实现的,我可以通过Jsoup解析文档获取js的源代码部分

但是要怎么在Java读取JS变量呢?

意外发现JDK8自带了【JS解析引擎】(Nashorn)的API

但是有使用限制:只支持ES5的语法,并且不支持任何第三方库的语法内容

1
2
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;

网上的资料不多, 要边写边看才知道

 

1
2
3
4
5
6
7
8
9
10
11
12
@SneakyThrows
public static void main(String[] args) {
    ScriptEngineManager manager = new ScriptEngineManager();
    ScriptEngine engine = manager.getEngineByName("Nashorn");
    engine.eval("var obj = { a: 100, b: true, c: 'hello js-engine'}");
    ScriptObjectMirror jsObject = (ScriptObjectMirror) engine.get("obj");
    String[] ownKeys = jsObject.getOwnKeys(false); /* false表示只需要一般属性, true表示全部属性,包括对象原型的属性 */
    for (String ownKey : ownKeys) {
        Object o = jsObject.get(ownKey);
        System.out.println(o); /* 按具体类型强转即可。集合、对象类型,还是强转为ScriptObjectMirror来读取 */
    }
}

 

db.setting数据源配置文件:

复制代码
## db.setting文件

url = jdbc:mysql://localhost:3308/my-info?serverTimezone=Asia/Shanghai
user = root
pass = 123456

## 可选配置
# 是否在日志中显示执行的SQL
showSql = true
# 是否格式化显示的SQL
formatSql = false
# 是否显示SQL参数
showParams = true
# 打印SQL的日志等级,默认debug,可以是info、warn、error
sqlLevel = debug

#----------------------------------------------------------------------------------------------------------------
## 连接池配置项
#————————————————
#版权声明:本文为CSDN博主「soulCoke」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
#原文链接:https://blog.csdn.net/qq_36328170/article/details/105687633

## ---------------------------------------------------- Druid
# 初始化时建立物理连接的个数。初始化发生在显式调用init方法,或者第一次getConnection时
initialSize = 1
# 最大连接池数量
maxActive = 8
# 最小连接池数量
minIdle = 0
# 获取连接时最大等待时间,单位毫秒。配置了maxWait之后, 缺省启用公平锁,并发效率会有所下降, 如果需要可以通过配置useUnfairLock属性为true使用非公平锁。
maxWait = 0
# 是否缓存preparedStatement,也就是PSCache。 PSCache对支持游标的数据库性能提升巨大,比如说oracle。 在mysql5.5以下的版本中没有PSCache功能,建议关闭掉。作者在5.5版本中使用PSCache,通过监控界面发现PSCache有缓存命中率记录, 应该是支持PSCache的。
poolPreparedStatements = false
# 要启用PSCache,必须配置大于0,当大于0时, poolPreparedStatements自动触发修改为true。 在Druid中,不会存在Oracle下PSCache占用内存过多的问题, 可以把这个数值配置大一些,比如说100
maxOpenPreparedStatements = -1
# 用来检测连接是否有效的sql,要求是一个查询语句。 如果validationQuery为null,testOnBorrow、testOnReturn、 testWhileIdle都不会起作用。
validationQuery = SELECT 1
# 申请连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能。
testOnBorrow = true
# 归还连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能
testOnReturn = false
# 建议配置为true,不影响性能,并且保证安全性。 申请连接的时候检测,如果空闲时间大于 timeBetweenEvictionRunsMillis,执行validationQuery检测连接是否有效。
testWhileIdle = false
# 有两个含义: 1) Destroy线程会检测连接的间隔时间 2) testWhileIdle的判断依据,详细看testWhileIdle属性的说明
timeBetweenEvictionRunsMillis = 60000
# 物理连接初始化的时候执行的sql
connectionInitSqls = SELECT 1
# 属性类型是字符串,通过别名的方式配置扩展插件, 常用的插件有: 监控统计用的filter:stat  日志用的filter:log4j 防御sql注入的filter:wall
# filters = stat
# 类型是List<com.alibaba.druid.filter.Filter>, 如果同时配置了filters和proxyFilters, 是组合关系,并非替换关系
# proxyFilters =
复制代码

 

工具类代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
package cn.cloud9.chinese.mazu;
 
import cn.hutool.core.date.DateUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import jdk.nashorn.api.scripting.ScriptObjectMirror;
import lombok.Builder;
import lombok.Data;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.time.LocalDateTime;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.Map;
 
public class MaZuRegionUtil {
    private static final Db db = Db.use();
    private static String currentTableName = "";
 
    /* 首页 */
    private static final String INDEX_PAGE = "https://macaostreets.iam.gov.mo/zh_mo/freguesiaindex.html";
 
    /* 官网根地址 */
    private static final String ROOT_PATH = "https://macaostreets.iam.gov.mo";
    @Data
    @Builder
    public static final class MaZuRegionPO {
        private Integer id;
        private Integer parentId;
        private String name;
        private Integer level;
        private String link;
        private String fullPath;
        private String description;
        private byte[] image;
        private LocalDateTime genTime;
    }
 
    /* js 事件匹配数据对象 */
    private static Map<String, String> REGION_MAP = new LinkedHashMap<String, String>(){{
        this.put("fatima", "fatimaStreets");
        this.put("lourenco", "lourencoStreets");
        this.put("lazaro", "lazaroStreets");
        this.put("carno", "carnoStreets");
        this.put("se", "seStreets");
        this.put("antonio", "antionioStreets");
        this.put("xavier", "xavierStreets");
    }};
 
    @SneakyThrows
    private static void writeToDB(MaZuRegionPO po) {
        db.insert(
                Entity.create(currentTableName)
                        .set("id", po.getId())
                        .set("parent_id", po.getParentId())
                        .set("full_path", po.getFullPath())
                        .set("name", po.getName())
                        .set("link", po.getLink())
                        .set("level", po.getLevel())
                        .set("description", po.getDescription())
                        .set("image", po.getImage())
                        .set("gen_time", LocalDateTime.now())
        );
    }
 
    @SneakyThrows
    public static void initialTableSpace() {
        String format = "`macao-region-" + DateUtil.format(new Date(), "yyyyMMddHHmmss") + "`";
        final String SQL =
                "CREATE TABLE IF NOT EXISTS " + format + " (" +
                        "  `id` int NOT NULL AUTO_INCREMENT COMMENT '主键'," +
                        "  `parent_id` int NOT NULL COMMENT '上级id'," +
                        "  `name` varchar(64) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称'," +
                        "  `link` varchar(192) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '链接'," +
                        "  `full_path` varchar(128) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '完整路径'," +
                        "  `level` varchar(12) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '层级'," +
                        "  `description` varchar(2048) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '描述'," +
                        "  `image` MEDIUMBLOB DEFAULT NULL COMMENT '图片'," +
                        "  `gen_time` datetime DEFAULT NULL COMMENT '记录创建时间'," +
                        "  PRIMARY KEY (`id`)" +
                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='澳门区域表';";
        db.execute(SQL, null);
        currentTableName = format;
    }
 
    /**
     * 解析script标签中的js代码
     * @param targetScript
     * @return
     */
    @SneakyThrows
    private static ScriptEngine readJavaScriptData(String targetScript) {
        String targetJavaScriptCode = targetScript;
        // System.out.println(targetJavaScriptCode);
        /* 只保留前段变量代码 */
        int index = targetJavaScriptCode.indexOf("function");
        targetJavaScriptCode = targetJavaScriptCode.substring(0, index);
 
        ScriptEngineManager manager = new ScriptEngineManager();
        ScriptEngine engine = manager.getEngineByName("Nashorn");
        engine.eval(targetJavaScriptCode);
        return engine;
    }
 
    public static void goRead() {
        initialTableSpace();
        String string = HttpUtil.get(INDEX_PAGE);
        // System.out.println(string);
 
        Document document = Jsoup.parse(string);
 
        /* 先解析目标JS代码变量 */
        Elements scriptList = document.getElementsByTag("script");
        Element targetScript = scriptList.get(9);
        ScriptEngine scriptEngine = readJavaScriptData(targetScript.html());
 
        /*  提取页面区域部分 */
        Elements childList = document.select("#FreguesiaSectionText > [id]");
        int idx = 100;
        for (Element eachSector : childList) {
            String idKey = eachSector.id();
            MaZuRegionPO sector = MaZuRegionPO.builder()
                    .id(idx)
                    .name(eachSector.ownText())
                    .fullPath(eachSector.ownText())
                    .level(1)
                    .link(INDEX_PAGE)
                    .parentId(0)
                    .build();
            // System.out.println(sector);
            writeToDB(sector);
            String jsDataKey = REGION_MAP.get(idKey);
            ScriptObjectMirror streetsList = (ScriptObjectMirror) scriptEngine.get(jsDataKey);
            String[] ownKeys = streetsList.getOwnKeys(false);
            int c1 = 1;
            for (String key : ownKeys) {
                ScriptObjectMirror jsObject = (ScriptObjectMirror)streetsList.get(key);
                String name = jsObject.get("name").toString();
                String link = jsObject.get("link").toString(); // https://macaostreets.iam.gov.mo/ + link
                link = ROOT_PATH + link;
 
                String detailPage = HttpUtil.get(link);
                Document detailDoc = Jsoup.parse(detailPage);
 
                /* 读取简介 */
                Element descElement = detailDoc.select(".SpotInfo > .SpotInfoText").get(0);
                String text = descElement.html();
                /* 读取目标图片 */
                Element linkA = detailDoc.select("a[data-caption='" + name + "']").get(0);
                String href = linkA.attr("href");
                href = ROOT_PATH + href;
 
                HttpResponse execute = HttpUtil.createGet(href).execute();
                byte[] bodyBytes = execute.bodyBytes();
 
                MaZuRegionPO street = MaZuRegionPO.builder()
                        .id(idx + c1)
                        .parentId(sector.getId())
                        .name(name)
                        .fullPath(sector.getFullPath() + " -> " + name)
                        .level(2)
                        .description(text)
                        .image(bodyBytes)
                        .link(link)
                        .build();
                writeToDB(street);
                c1 += 1;
            }
            idx += 100;
        }
    }
 
    @SneakyThrows
    public static void main(String[] args) {
        goRead();
    }
}

  

posted @   emdzz  阅读(31)  评论(0编辑  收藏  举报
(评论功能已被禁用)
相关博文:
阅读排行:
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 零经验选手,Compose 一天开发一款小游戏!
· 通过 API 将Deepseek响应流式内容输出到前端
· 因为Apifox不支持离线,我果断选择了Apipost!
点击右上角即可分享
微信分享提示