excel 到dremio 查询处理
参考解决方案
原理说明
用户通过rest api 请求上传excel 文件,可以存储到s3中,然后我们可以通过监听s3的webhooks 或者上传的时候进行格式转换同时存储s3
格式可以支持的包含jsonlines,parquet。。。同时基于dremio直接查询s3的能力,方便的进行业务分析
环境准备
version: "3"
services:
dremio:
image: dremio/dremio-oss
ports:
- "9047:9047"
s3:
image: minio/minio
environment:
- "MINIO_ACCESS_KEY=minio"
- "MINIO_SECRET_KEY=minio123"
command: server /data --console-address ":9001"
ports:
- "9000:9000"
- "9001:9001"
参考代码
- pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>excel-reader</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.20</version>
<scope>provided</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.2.8</version>
</dependency>
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.3.0</version>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.12.5</version>
</dependency>
</dependencies>
</project>
- java 代码
代码使用了easyexcel 进行文件读取,同时基于jackson 生成jsonlines 文件,之后存储s3
package com.dalong;
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.context.AnalysisContext;
import com.alibaba.excel.enums.CellExtraTypeEnum;
import com.alibaba.excel.event.AnalysisEventListener;
import com.alibaba.excel.metadata.CellExtra;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SequenceWriter;
import io.minio.MinioClient;
import io.minio.PutObjectArgs;
import io.minio.errors.*;
import lombok.SneakyThrows;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
public class Application {
private static final ObjectMapper objectMapper = new ObjectMapper();
private static MinioClient minioClient =
MinioClient.builder()
.endpoint("http://localhost:9000")
.credentials("minio", "minio123")
.build();
public static void main(String[] args) {
readExcelFile();
}
public static void writeExcelContent2S3JSONLines(List<Object> input) throws ServerException, InsufficientDataException, ErrorResponseException, IOException, NoSuchAlgorithmException, InvalidKeyException, InvalidResponseException, XmlParserException, InternalException {
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
try (SequenceWriter seq = objectMapper.writer()
.withRootValueSeparator("\n").writeValues(byteArrayOutputStream)) {
input.forEach(new Consumer<Object>() {
@SneakyThrows
@Override
public void accept(Object s) {
seq.write(s);
}
});
try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray())) {
minioClient.putObject(
PutObjectArgs.builder().bucket("demo").object("demoapp.ldjson").stream(
byteArrayInputStream, -1, 10485760)
.build());
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
private static void readExcelFile() {
InputStream stream = Application.class.getClassLoader().getResourceAsStream("excel-jsonlines.xlsx");
EasyExcel.read(stream, new AnalysisEventListener() {
private List<Object> results = new ArrayList<>();
@SneakyThrows
@Override
public void invoke(Object o, AnalysisContext analysisContext) {
results.add(o);
System.out.println(objectMapper.writeValueAsString(o));
}
@SneakyThrows
@Override
public void extra(CellExtra extra, AnalysisContext context) {
System.out.println(extra.getType());
System.out.println(objectMapper.writeValueAsString(extra));
}
@SneakyThrows
@Override
public void doAfterAllAnalysed(AnalysisContext analysisContext) {
writeExcelContent2S3JSONLines(results);
System.out.println("after");
}
}).extraRead(CellExtraTypeEnum.MERGE).sheet().doRead();
}
}
- dremio 查询效果
说明
实际上dremio 是可以读取excel的,但是很多时候可能因为一些限制或者原因,造成使用问题(比如dremio 对于excel 文件大小限制,以及数据使用场景的问题)
基于上边的方式都是一个不错的选择
参考资料
https://jsonlines.org/examples/
https://www.yuque.com/easyexcel/doc/easyexcel