excel 到dremio 查询处理

参考解决方案

 

 

原理说明

用户通过rest api 请求上传excel 文件,可以存储到s3中,然后我们可以通过监听s3的webhooks 或者上传的时候进行格式转换同时存储s3
格式可以支持的包含jsonlines,parquet。。。同时基于dremio直接查询s3的能力,方便的进行业务分析

环境准备

 
version: "3"
services:
  dremio:
      image: dremio/dremio-oss
      ports:
        - "9047:9047"
  s3:
      image: minio/minio
      environment:
        - "MINIO_ACCESS_KEY=minio"
        - "MINIO_SECRET_KEY=minio123"
      command: server /data  --console-address ":9001"
      ports:
        - "9000:9000"
        - "9001:9001"

参考代码

  • pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
 
    <groupId>org.example</groupId>
    <artifactId>excel-reader</artifactId>
    <version>1.0-SNAPSHOT</version>
 
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>
 
    <dependencies>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.20</version>
            <scope>provided</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>easyexcel</artifactId>
            <version>2.2.8</version>
        </dependency>
        <dependency>
            <groupId>io.minio</groupId>
            <artifactId>minio</artifactId>
            <version>8.3.0</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>jackson-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>jackson-annotations</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>jackson-databind</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.11.0</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.12.5</version>
        </dependency>
    </dependencies>
</project> 
  • java 代码
    代码使用了easyexcel 进行文件读取,同时基于jackson 生成jsonlines 文件,之后存储s3
 
package com.dalong;
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.context.AnalysisContext;
import com.alibaba.excel.enums.CellExtraTypeEnum;
import com.alibaba.excel.event.AnalysisEventListener;
import com.alibaba.excel.metadata.CellExtra;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SequenceWriter;
import io.minio.MinioClient;
import io.minio.PutObjectArgs;
import io.minio.errors.*;
import lombok.SneakyThrows;
 
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
 
public class Application {
    private static final ObjectMapper objectMapper = new ObjectMapper();
    private static MinioClient minioClient =
            MinioClient.builder()
                    .endpoint("http://localhost:9000")
                    .credentials("minio", "minio123")
                    .build();
    public static void main(String[] args) {
        readExcelFile();
    }
    public static void writeExcelContent2S3JSONLines(List<Object> input) throws ServerException, InsufficientDataException, ErrorResponseException, IOException, NoSuchAlgorithmException, InvalidKeyException, InvalidResponseException, XmlParserException, InternalException {
        try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
            try (SequenceWriter seq = objectMapper.writer()
                    .withRootValueSeparator("\n").writeValues(byteArrayOutputStream)) {
                input.forEach(new Consumer<Object>() {
                    @SneakyThrows
                    @Override
                    public void accept(Object s) {
                        seq.write(s);
                    }
                });
                try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray())) {
                    minioClient.putObject(
                            PutObjectArgs.builder().bucket("demo").object("demoapp.ldjson").stream(
                                            byteArrayInputStream, -1, 10485760)
                                    .build());
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
 
 
    }
 
    private static void readExcelFile() {
        InputStream stream = Application.class.getClassLoader().getResourceAsStream("excel-jsonlines.xlsx");
        EasyExcel.read(stream, new AnalysisEventListener() {
            private List<Object> results = new ArrayList<>();
 
            @SneakyThrows
            @Override
            public void invoke(Object o, AnalysisContext analysisContext) {
                results.add(o);
                System.out.println(objectMapper.writeValueAsString(o));
            }
 
            @SneakyThrows
            @Override
            public void extra(CellExtra extra, AnalysisContext context) {
                System.out.println(extra.getType());
                System.out.println(objectMapper.writeValueAsString(extra));
            }
 
            @SneakyThrows
            @Override
            public void doAfterAllAnalysed(AnalysisContext analysisContext) {
                writeExcelContent2S3JSONLines(results);
                System.out.println("after");
            }
        }).extraRead(CellExtraTypeEnum.MERGE).sheet().doRead();
    }
}
 
  • dremio 查询效果

 

 

说明

实际上dremio 是可以读取excel的,但是很多时候可能因为一些限制或者原因,造成使用问题(比如dremio 对于excel 文件大小限制,以及数据使用场景的问题)
基于上边的方式都是一个不错的选择

参考资料

https://jsonlines.org/examples/
https://www.yuque.com/easyexcel/doc/easyexcel

posted on 2021-09-01 21:41  荣锋亮  阅读(134)  评论(0编辑  收藏  举报

导航