lakefs integration with presto/trinodb

The presto/trino integration with lakefs relies mainly on the hive connector: table metadata is kept in a hive metastore, while the data itself is read and written through the lakefs S3 gateway.

Prerequisites

A hive metastore is required.

  • docker-compose
version: "3"
services:
  lakefs:
    image: "treeverse/lakefs:${VERSION:-latest}"
    ports:
      - "8000:8000"
    depends_on:
      - "postgres"
    environment:
      - LAKEFS_AUTH_ENCRYPT_SECRET_KEY=${LAKEFS_AUTH_ENCRYPT_SECRET_KEY:-some random secret string}
      - LAKEFS_DATABASE_CONNECTION_STRING=${LAKEFS_DATABASE_CONNECTION_STRING:-postgres://lakefs:lakefs@postgres/postgres?sslmode=disable}
      - LAKEFS_BLOCKSTORE_TYPE=${LAKEFS_BLOCKSTORE_TYPE:-s3}
      - LAKEFS_BLOCKSTORE_LOCAL_PATH=${LAKEFS_BLOCKSTORE_LOCAL_PATH:-/home/lakefs}
      - LAKEFS_GATEWAYS_S3_DOMAIN_NAME=${LAKEFS_GATEWAYS_S3_DOMAIN_NAME:-s3.local.lakefs.io:8000}
      - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-minio}
      - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_SECRET_KEY=${AWS_SECRET_ACCESS_KEY:-minio123}
      - LAKEFS_LOGGING_LEVEL=${LAKEFS_LOGGING_LEVEL:-INFO}
      - LAKEFS_STATS_ENABLED
      - LAKEFS_BLOCKSTORE_S3_ENDPOINT=http://s3:9000
      - LAKEFS_BLOCKSTORE_S3_FORCE_PATH_STYLE=true
      - LAKEFS_COMMITTED_LOCAL_CACHE_DIR=${LAKEFS_COMMITTED_LOCAL_CACHE_DIR:-/home/lakefs/.local_tier}
    entrypoint:
      [
        "/app/wait-for",
        "postgres:5432",
        "--",
        "/app/lakefs",
        "run"
      ]
  postgres:
    image: postgres:11
    container_name: postgres
    environment:
      POSTGRES_USER: lakefs
      POSTGRES_PASSWORD: lakefs
 
  mariadb:
    image: mariadb:10
    container_name: mariadb
    ports:
      - "3306:3306"
    environment:
      MYSQL_ROOT_PASSWORD: admin
      MYSQL_USER: admin
      MYSQL_PASSWORD: admin
      MYSQL_DATABASE: metastore_db
  s3:
    image: minio/minio
    environment:
      - "MINIO_ACCESS_KEY=minio"
      - "MINIO_SECRET_KEY=minio123"
    command: server /data --console-address ":9001"
    ports:
      - "9000:9000"
      - "9001:9001"
  hive-metastore:
    image: dalongrong/hive-metastore:3.2.0
    container_name: hive
    depends_on:
      - mariadb
    ports:
      - 9083:9083
    environment:
      - DB_URI=mariadb:3306
    volumes:
      - ./etc/hive-site.xml:/opt/apache-hive-bin/conf/hive-site.xml
 
  trino:
    image: trinodb/trino:370
    container_name: trino
    environment:
      - AWS_ACCOUNT_ID
      - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID
      - LAKEFS_BLOCKSTORE_S3_CREDENTIALS_SECRET_ACCESS_KEY
      - LAKECTL_METASTORE_GLUE_CREDENTIALS_ACCESS_KEY_ID
      - LAKECTL_METASTORE_GLUE_CREDENTIALS_ACCESS_SECRET_KEY
    volumes:
      # - ./etc/s3.properties:/etc/trino/catalog/s3.properties
      - ./etc/lakefs.properties:/etc/trino/catalog/lakefs.properties
    ports:
      - "8080:8080"
 
  trino-client:
    image: trinodb/trino:358
    profiles: [ "client" ]
    entrypoint:
      [
        "trino",
        "--server",
        "trino:8080",
        "--catalog",
        "s3",
        "--schema",
        "default"
      ]
 
  create-dbt-schema-main:
    image: trinodb/trino:358
    profiles: [ "client" ]
    entrypoint:
      [
        "trino",
        "--server",
        "trino:8080",
        "--catalog",
        "lakefs",
        "--execute",
        "drop schema if exists dbt_main; create schema dbt_main with (location = 's3://example/main/dbt' )"
      ]
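
For readability, the create-dbt-schema-main one-shot service above simply runs the following statements through the trino CLI:

drop schema if exists dbt_main;
create schema dbt_main with (location = 's3://example/main/dbt');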
 
  • hive metastore configuration
    Adjust it to match your environment. Note that fs.s3a.endpoint points at the lakefs S3 gateway, so the fs.s3a access/secret keys below are lakefs credentials, not AWS ones.
 
<configuration>
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hive:9083</value>
        <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
    </property>
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://mariadb:3306/metastore_db</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>admin</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>admin</value>
    </property>
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>
    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property>
    <property>
        <name>fs.s3.impl</name>
        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
    </property>
    <property>
        <name>fs.s3a.impl</name>
        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
    </property>
    <property>
        <name>fs.s3a.endpoint</name>
        <value>http://lakefs:8000</value>
    </property>
    <property>
        <name>fs.s3a.access.key</name>
        <value>AKIAJXJ3AX5EWAAYO3GQ</value>
    </property>
    <property>
        <name>fs.s3a.secret.key</name>
        <value>ayPIL/XuHlPCNUp1x+0dQfyvXCZFp8g+BewEy7bX</value>
    </property>
</configuration>
  • trinodb configuration
    lakefs.properties
 
connector.name=hive-hadoop2
hive.metastore.uri=thrift://hive-metastore:9083
hive.non-managed-table-writes-enabled=true
hive.s3select-pushdown.enabled=true
hive.s3.aws-access-key=AKIAJXJ3AX5EWAAYO3GQ
hive.s3.aws-secret-key=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
hive.s3.endpoint=http://lakefs:8000
hive.s3.path-style-access=true
hive.s3.ssl.enabled=false
 
hive.allow-drop-table=true
hive.allow-add-column=true
hive.allow-drop-column=true
hive.allow-rename-table=true
hive.allow-rename-column=true
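
With the catalog file in place, a quick sanity check is to list what trino sees (the catalog name lakefs is derived from the lakefs.properties file name):

SHOW CATALOGS;
SHOW SCHEMAS FROM lakefs;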

Connecting to trinodb

You can use dbeaver; a standard trino JDBC connection against localhost:8080 works.
  • Create a lakefs schema and table (the location s3a://dalong/main maps to repository dalong, branch main in lakefs terms)
 
CREATE SCHEMA lakefs.main
WITH (location = 's3a://dalong/main')
 
 
CREATE TABLE lakefs.main.page_views2 (
  user_id bigint,
  page_url varchar,
  country varchar
)
WITH (
  format = 'ORC',
  bucketed_by = ARRAY['user_id'],
  bucket_count = 50
)
 
insert into lakefs.main.page_views2(user_id,page_url,country) values (333,'https://www.baidu.com','dalong')
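
To verify the write, read the row back (a minimal check against the insert above):

select * from lakefs.main.page_views2 where user_id = 333;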
  • Results

Data: the inserted row is visible when querying the table (screenshot).

lakefs: the written objects appear under the repository's main branch in the lakefs UI (screenshot).
Notes

The above is a simple integration. The table format used here is ORC, but parquet works just as well, which would also open the door to integrating dremio (a parquet variant of the table is sketched below). dremio does offer quite a few ways to integrate other formats, but since dremio is not fully open source (key features are gated by the commercial edition), that path is not ideal. Two possible extensions: expose additional storage through the hive metastore service and handle it with a custom storage plugin (performance may not be great), or go through a jdbc driver.
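
As a sketch of the parquet route, only the format property needs to change; page_views_parquet below is an illustrative name, not part of the original setup:

-- same columns as page_views2, stored as Parquet instead of ORC
CREATE TABLE lakefs.main.page_views_parquet (
  user_id bigint,
  page_url varchar,
  country varchar
)
WITH (
  format = 'PARQUET'
);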

 

 

References

https://docs.lakefs.io/integrations/presto_trino.html
https://github.com/rongfengliang/lakefs-hive-trino
