excel 到dremio 查询处理
参考解决方案
原理说明
用户通过rest api 请求上传excel 文件,可以存储到s3中,然后我们可以通过监听s3的webhooks 或者上传的时候进行格式转换同时存储s3
格式可以支持的包含jsonlines,parquet。。。同时基于dremio直接查询s3的能力,方便的进行业务分析
环境准备
version: "3"
services:
dremio:
image: dremio/dremio-oss
ports:
- "9047:9047"
s3:
image: minio/minio
environment:
- "MINIO_ACCESS_KEY=minio"
- "MINIO_SECRET_KEY=minio123"
command: server /data --console-address ":9001"
ports:
- "9000:9000"
- "9001:9001"
参考代码
- pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>excel-reader</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.20</version>
<scope>provided</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.2.8</version>
</dependency>
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.3.0</version>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.12.5</version>
</dependency>
</dependencies>
</project>
- java 代码
代码使用了easyexcel 进行文件读取,同时基于jackson 生成jsonlines 文件,之后存储s3
package com.dalong;
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.context.AnalysisContext;
import com.alibaba.excel.enums.CellExtraTypeEnum;
import com.alibaba.excel.event.AnalysisEventListener;
import com.alibaba.excel.metadata.CellExtra;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SequenceWriter;
import io.minio.MinioClient;
import io.minio.PutObjectArgs;
import io.minio.errors.*;
import lombok.SneakyThrows;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
public class Application {
private static final ObjectMapper objectMapper = new ObjectMapper();
private static MinioClient minioClient =
MinioClient.builder()
.endpoint("http://localhost:9000")
.credentials("minio", "minio123")
.build();
public static void main(String[] args) {
readExcelFile();
}
public static void writeExcelContent2S3JSONLines(List<Object> input) throws ServerException, InsufficientDataException, ErrorResponseException, IOException, NoSuchAlgorithmException, InvalidKeyException, InvalidResponseException, XmlParserException, InternalException {
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
try (SequenceWriter seq = objectMapper.writer()
.withRootValueSeparator("\n").writeValues(byteArrayOutputStream)) {
input.forEach(new Consumer<Object>() {
@SneakyThrows
@Override
public void accept(Object s) {
seq.write(s);
}
});
try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray())) {
minioClient.putObject(
PutObjectArgs.builder().bucket("demo").object("demoapp.ldjson").stream(
byteArrayInputStream, -1, 10485760)
.build());
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
private static void readExcelFile() {
InputStream stream = Application.class.getClassLoader().getResourceAsStream("excel-jsonlines.xlsx");
EasyExcel.read(stream, new AnalysisEventListener() {
private List<Object> results = new ArrayList<>();
@SneakyThrows
@Override
public void invoke(Object o, AnalysisContext analysisContext) {
results.add(o);
System.out.println(objectMapper.writeValueAsString(o));
}
@SneakyThrows
@Override
public void extra(CellExtra extra, AnalysisContext context) {
System.out.println(extra.getType());
System.out.println(objectMapper.writeValueAsString(extra));
}
@SneakyThrows
@Override
public void doAfterAllAnalysed(AnalysisContext analysisContext) {
writeExcelContent2S3JSONLines(results);
System.out.println("after");
}
}).extraRead(CellExtraTypeEnum.MERGE).sheet().doRead();
}
}
- dremio 查询效果
说明
实际上dremio 是可以读取excel的,但是很多时候可能因为一些限制或者原因,造成使用问题(比如dremio 对于excel 文件大小限制,以及数据使用场景的问题)
基于上边的方式都是一个不错的选择
参考资料
https://jsonlines.org/examples/
https://www.yuque.com/easyexcel/doc/easyexcel
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
2020-09-01 一些不错的linux perf tools
2019-09-01 haproxy 2.0 dataplaneapi 类似的工具haproxyadmin
2019-09-01 haproxy 2.0 dataplaneapi rest api 试用
2018-09-01 ory Oathkeeper Ecosystem
2018-09-01 ory Oathkeeper docker-compose 安装运行
2018-09-01 benthos stream nats 集成试用
2018-09-01 benthos 几个方便的帮助命令