【Java】爬取澳门区划信息
官网地址:
https://macaostreets.iam.gov.mo/zh_mo/freguesiaindex.html
大区部分是在页面展示的
点击发现并没有请求网络,所以数据是js中存在的
找到了展示街道方法,这一段:
该方法使用大区id来匹配上述变量:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | function showStreets(freguesia){ var freguesiaStreets; switch (freguesia){ case "fatima" : freguesiaStreets = fatimaStreets; break ; case "lourenco" : freguesiaStreets = lourencoStreets; break ; case "lazaro" : freguesiaStreets = lazaroStreets; break ; case "carno" : freguesiaStreets = carnoStreets; break ; case "se" : freguesiaStreets = seStreets; break ; case "antonio" : freguesiaStreets = antionioStreets; break ; case "xavier" : freguesiaStreets = xavierStreets; break ; } |
可以发现,这一段是js代码实现的,我可以通过Jsoup解析文档获取js的源代码部分
但是要怎么在Java读取JS变量呢?
意外发现JDK8自带了JS解析引擎(Nashorn),可以通过 javax.script 提供的API调用
但是有使用限制:只支持ES5的语法,并且不支持任何第三方库的语法内容(另外,Nashorn自JDK 15起已从JDK中移除,更高版本需要单独引入依赖)
1 2 | import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; |
网上的资料不多, 要边写边看才知道
1 2 3 4 5 6 7 8 9 10 11 12 | @SneakyThrows public static void main(String[] args) { ScriptEngineManager manager = new ScriptEngineManager(); ScriptEngine engine = manager.getEngineByName( "Nashorn" ); engine.eval( "var obj = { a: 100, b: true, c: 'hello js-engine'}" ); ScriptObjectMirror jsObject = (ScriptObjectMirror) engine.get( "obj" ); String[] ownKeys = jsObject.getOwnKeys( false ); /* false表示只需要一般属性, true表示全部属性,包括对象原型的属性 */ for (String ownKey : ownKeys) { Object o = jsObject.get(ownKey); System.out.println(o); /* 按具体类型强转即可。集合、对象类型,还是强转为ScriptObjectMirror来读取 */ } } |
db.setting数据源配置文件:
## db.setting文件
url = jdbc:mysql://localhost:3308/my-info?serverTimezone=Asia/Shanghai
user = root
pass = 123456
## 可选配置
# 是否在日志中显示执行的SQL
showSql = true
# 是否格式化显示的SQL
formatSql = false
# 是否显示SQL参数
showParams = true
# 打印SQL的日志等级,默认debug,可以是info、warn、error
sqlLevel = debug
#----------------------------------------------------------------------------------------------------------------
## 连接池配置项
#————————————————
#版权声明:本文为CSDN博主「soulCoke」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
#原文链接:https://blog.csdn.net/qq_36328170/article/details/105687633
## ---------------------------------------------------- Druid
# 初始化时建立物理连接的个数。初始化发生在显式调用init方法,或者第一次getConnection时
initialSize = 1
# 最大连接池数量
maxActive = 8
# 最小连接池数量
minIdle = 0
# 获取连接时最大等待时间,单位毫秒。配置了maxWait之后, 缺省启用公平锁,并发效率会有所下降, 如果需要可以通过配置useUnfairLock属性为true使用非公平锁。
maxWait = 0
# 是否缓存preparedStatement,也就是PSCache。 PSCache对支持游标的数据库性能提升巨大,比如说oracle。 在mysql5.5以下的版本中没有PSCache功能,建议关闭掉。作者在5.5版本中使用PSCache,通过监控界面发现PSCache有缓存命中率记录, 该应该是支持PSCache。
poolPreparedStatements = false
# 要启用PSCache,必须配置大于0,当大于0时, poolPreparedStatements自动触发修改为true。 在Druid中,不会存在Oracle下PSCache占用内存过多的问题, 可以把这个数值配置大一些,比如说100
maxOpenPreparedStatements = -1
# 用来检测连接是否有效的sql,要求是一个查询语句。 如果validationQuery为null,testOnBorrow、testOnReturn、 testWhileIdle都不会起作用。
validationQuery = SELECT 1
# 申请连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能。
testOnBorrow = true
# 归还连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能
testOnReturn = false
# 建议配置为true,不影响性能,并且保证安全性。 申请连接的时候检测,如果空闲时间大于 timeBetweenEvictionRunsMillis,执行validationQuery检测连接是否有效。
testWhileIdle = false
# 有两个含义: 1) Destroy线程会检测连接的间隔时间 2) testWhileIdle的判断依据,详细看testWhileIdle属性的说明
timeBetweenEvictionRunsMillis = 60000
# 物理连接初始化的时候执行的sql
connectionInitSqls = SELECT 1
# 属性类型是字符串,通过别名的方式配置扩展插件, 常用的插件有: 监控统计用的filter:stat 日志用的filter:log4j 防御sql注入的filter:wall
# filters = stat
# 类型是List<com.alibaba.druid.filter.Filter>, 如果同时配置了filters和proxyFilters, 是组合关系,并非替换关系
# proxyFilters =
工具类代码:
| package cn.cloud9.chinese.mazu; import cn.hutool.core.date.DateUtil; import cn.hutool.db.Db; import cn.hutool.db.Entity; import cn.hutool.http.HttpResponse; import cn.hutool.http.HttpUtil; import jdk.nashorn.api.scripting.ScriptObjectMirror; import lombok.Builder; import lombok.Data; import lombok.SneakyThrows; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; import java.time.LocalDateTime; import java.util.Date; import java.util.LinkedHashMap; import java.util.Map; public class MaZuRegionUtil { private static final Db db = Db.use(); private static String currentTableName = "" ; /* 首页 */ private static final String INDEX_PAGE = "https://macaostreets.iam.gov.mo/zh_mo/freguesiaindex.html" ; /* 官网根地址 */ private static final String ROOT_PATH = "https://macaostreets.iam.gov.mo" ; @Data @Builder public static final class MaZuRegionPO { private Integer id; private Integer parentId; private String name; private Integer level; private String link; private String fullPath; private String description; private byte [] image; private LocalDateTime genTime; } /* js 事件匹配数据对象 */ private static Map<String, String> REGION_MAP = new LinkedHashMap<String, String>(){{ this .put( "fatima" , "fatimaStreets" ); this .put( "lourenco" , "lourencoStreets" ); this .put( "lazaro" , "lazaroStreets" ); this .put( "carno" , "carnoStreets" ); this .put( "se" , "seStreets" ); this .put( "antonio" , "antionioStreets" ); this .put( "xavier" , "xavierStreets" ); }}; @SneakyThrows private static void writeToDB(MaZuRegionPO po) { db.insert( Entity.create(currentTableName) .set( "id" , po.getId()) .set( "parent_id" , po.getParentId()) .set( "full_path" , po.getFullPath()) .set( "name" , po.getName()) .set( "link" , po.getLink()) .set( "level" , po.getLevel()) .set( "description" , po.getDescription()) .set( "image" , po.getImage()) .set( "gen_time" , 
LocalDateTime.now()) ); } @SneakyThrows public static void initialTableSpace() { String format = "`macao-region-" + DateUtil.format( new Date(), "yyyyMMddHHmmss" ) + "`" ; final String SQL = "CREATE TABLE IF NOT EXISTS " + format + " (" + " `id` int NOT NULL AUTO_INCREMENT COMMENT '主键'," + " `parent_id` int NOT NULL COMMENT '上级id'," + " `name` varchar(64) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称'," + " `link` varchar(192) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '链接'," + " `full_path` varchar(128) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '完整路径'," + " `level` varchar(12) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '层级'," + " `description` varchar(2048) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '描述'," + " `image` MEDIUMBLOB DEFAULT NULL COMMENT '图片'," + " `gen_time` datetime DEFAULT NULL COMMENT '记录创建时间'," + " PRIMARY KEY (`id`)" + ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='澳门区域表';" ; db.execute(SQL, null ); currentTableName = format; } /** * 解析script标签中的js代码 * @param targetScript * @return */ @SneakyThrows private static ScriptEngine readJavaScriptData(String targetScript) { String targetJavaScriptCode = targetScript; // System.out.println(targetJavaScriptCode); /* 只保留前段变量代码 */ int index = targetJavaScriptCode.indexOf( "function" ); targetJavaScriptCode = targetJavaScriptCode.substring( 0 , index); ScriptEngineManager manager = new ScriptEngineManager(); ScriptEngine engine = manager.getEngineByName( "Nashorn" ); engine.eval(targetJavaScriptCode); return engine; } public static void goRead() { initialTableSpace(); String string = HttpUtil.get(INDEX_PAGE); // System.out.println(string); Document document = Jsoup.parse(string); /* 先解析目标JS代码变量 */ Elements scriptList = document.getElementsByTag( "script" ); Element targetScript = scriptList.get( 9 ); ScriptEngine scriptEngine = readJavaScriptData(targetScript.html()); /* 提取页面区域部分 */ Elements childList = document.select( "#FreguesiaSectionText > [id]" 
); int idx = 100 ; for (Element eachSector : childList) { String idKey = eachSector.id(); MaZuRegionPO sector = MaZuRegionPO.builder() .id(idx) .name(eachSector.ownText()) .fullPath(eachSector.ownText()) .level( 1 ) .link(INDEX_PAGE) .parentId( 0 ) .build(); // System.out.println(sector); writeToDB(sector); String jsDataKey = REGION_MAP.get(idKey); ScriptObjectMirror streetsList = (ScriptObjectMirror) scriptEngine.get(jsDataKey); String[] ownKeys = streetsList.getOwnKeys( false ); int c1 = 1 ; for (String key : ownKeys) { ScriptObjectMirror jsObject = (ScriptObjectMirror)streetsList.get(key); String name = jsObject.get( "name" ).toString(); String link = jsObject.get( "link" ).toString(); // https://macaostreets.iam.gov.mo/ + link link = ROOT_PATH + link; String detailPage = HttpUtil.get(link); Document detailDoc = Jsoup.parse(detailPage); /* 读取简介 */ Element descElement = detailDoc.select( ".SpotInfo > .SpotInfoText" ).get( 0 ); String text = descElement.html(); /* 读取目标图片 */ Element linkA = detailDoc.select( "a[data-caption='" + name + "']" ).get( 0 ); String href = linkA.attr( "href" ); href = ROOT_PATH + href; HttpResponse execute = HttpUtil.createGet(href).execute(); byte [] bodyBytes = execute.bodyBytes(); MaZuRegionPO street = MaZuRegionPO.builder() .id(idx + c1) .parentId(sector.getId()) .name(name) .fullPath(sector.getFullPath() + " -> " + name) .level( 2 ) .description(text) .image(bodyBytes) .link(link) .build(); writeToDB(street); c1 += 1 ; } idx += 100 ; } } @SneakyThrows public static void main(String[] args) { goRead(); } } |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 零经验选手,Compose 一天开发一款小游戏!
· 通过 API 将Deepseek响应流式内容输出到前端
· 因为Apifox不支持离线,我果断选择了Apipost!