lakeFS integration with Presto/Trino
The Presto/Trino integration with lakeFS relies mainly on the Hive connector.
Prerequisites
A Hive metastore is required.
- docker-compose
version: "3"
services:
lakefs:
image: "treeverse/lakefs:${VERSION:-latest}"
ports:
- "8000:8000"
depends_on:
- "postgres"
environment:
- LAKEFS_AUTH_ENCRYPT_SECRET_KEY=${LAKEFS_AUTH_ENCRYPT_SECRET_KEY:-some random secret string}
- LAKEFS_DATABASE_CONNECTION_STRING=${LAKEFS_DATABASE_CONNECTION_STRING:-postgres://lakefs:lakefs@postgres/postgres?sslmode=disable}
- LAKEFS_BLOCKSTORE_TYPE=${LAKEFS_BLOCKSTORE_TYPE:-s3}
- LAKEFS_BLOCKSTORE_LOCAL_PATH=${LAKEFS_BLOCKSTORE_LOCAL_PATH:-/home/lakefs}
- LAKEFS_GATEWAYS_S3_DOMAIN_NAME=${LAKEFS_GATEWAYS_S3_DOMAIN_NAME:-s3.local.lakefs.io:8000}
- LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-minio}
- LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_SECRET_KEY=${AWS_SECRET_ACCESS_KEY:-minio123}
- LAKEFS_LOGGING_LEVEL=${LAKEFS_LOGGING_LEVEL:-INFO}
- LAKEFS_STATS_ENABLED
- LAKEFS_BLOCKSTORE_S3_ENDPOINT=http://s3:9000
- LAKEFS_BLOCKSTORE_S3_FORCE_PATH_STYLE=true
- LAKEFS_COMMITTED_LOCAL_CACHE_DIR=${LAKEFS_COMMITTED_LOCAL_CACHE_DIR:-/home/lakefs/.local_tier}
entrypoint:
[
"/app/wait-for",
"postgres:5432",
"--",
"/app/lakefs",
"run"
]
postgres:
image: postgres:11
container_name: postgres
environment:
POSTGRES_USER: lakefs
POSTGRES_PASSWORD: lakefs
mariadb:
image: mariadb:10
container_name: mariadb
ports:
- "3306:3306"
environment:
MYSQL_ROOT_PASSWORD: admin
MYSQL_USER: admin
MYSQL_PASSWORD: admin
MYSQL_DATABASE: metastore_db
s3:
image: minio/minio
environment:
- "MINIO_ACCESS_KEY=minio"
- "MINIO_SECRET_KEY=minio123"
command: server /data --console-address ":9001"
ports:
- "9000:9000"
- "9001:9001"
hive-metastore:
image: dalongrong/hive-metastore:3.2.0
container_name: hive
depends_on:
- mariadb
ports:
- 9083:9083
environment:
- DB_URI=mariadb:3306
volumes:
- ./etc/hive-site.xml:/opt/apache-hive-bin/conf/hive-site.xml
trino:
image: trinodb/trino:370
container_name: trino
environment:
- AWS_ACCOUNT_ID
- LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID
- LAKEFS_BLOCKSTORE_S3_CREDENTIALS_SECRET_ACCESS_KEY
- LAKECTL_METASTORE_GLUE_CREDENTIALS_ACCESS_KEY_ID
- LAKECTL_METASTORE_GLUE_CREDENTIALS_ACCESS_SECRET_KEY
volumes:
# - ./etc/s3.properties:/etc/trino/catalog/s3.properties
- ./etc/lakefs.properties:/etc/trino/catalog/lakefs.properties
ports:
- "8080:8080"
trino-client:
image: trinodb/trino:358
profiles: [ "client" ]
entrypoint:
[
"trino",
"--server",
"trino:8080",
"--catalog",
"s3",
"--schema",
"default"
]
create-dbt-schema-main:
image: trinodb/trino:358
profiles: [ "client" ]
entrypoint:
[
"trino",
"--server",
"trino:8080",
"--catalog",
"lakefs",
"--execute",
"drop schema if exists dbt_main; create schema dbt_main with (location = 's3://example/main/dbt' )"
]
- Hive metastore configuration
Adjust the following hive-site.xml to match your environment:
<configuration>
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hive:9083</value>
        <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
    </property>
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://mariadb:3306/metastore_db</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>admin</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>admin</value>
    </property>
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>
    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property>
    <property>
        <name>fs.s3.impl</name>
        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
    </property>
    <property>
        <name>fs.s3a.impl</name>
        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
    </property>
    <property>
        <name>fs.s3a.endpoint</name>
        <value>http://lakefs:8000</value>
    </property>
    <property>
        <name>fs.s3a.access.key</name>
        <value>AKIAJXJ3AX5EWAAYO3GQ</value>
    </property>
    <property>
        <name>fs.s3a.secret.key</name>
        <value>ayPIL/XuHlPCNUp1x+0dQfyvXCZFp8g+BewEy7bX</value>
    </property>
</configuration>
- Trino configuration
lakefs.properties (mounted into the Trino container as a catalog; the file name becomes the catalog name, lakefs):
connector.name=hive-hadoop2
hive.metastore.uri=thrift://hive-metastore:9083
hive.non-managed-table-writes-enabled=true
hive.s3select-pushdown.enabled=true
hive.s3.aws-access-key=AKIAJXJ3AX5EWAAYO3GQ
hive.s3.aws-secret-key=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
hive.s3.endpoint=http://lakefs:8000
hive.s3.path-style-access=true
hive.s3.ssl.enabled=false
hive.allow-drop-table=true
hive.allow-add-column=true
hive.allow-drop-column=true
hive.allow-rename-table=true
hive.allow-rename-column=true
- Initialize lakeFS
After bringing the stack up (docker-compose up), open http://localhost:8000/setup to create the initial admin user. The access key and secret key generated there are the lakeFS credentials referenced in hive-site.xml and lakefs.properties above, and the examples below assume a repository named dalong with a main branch.
- Connect to Trino
You can use DBeaver (Trino driver, with a JDBC URL such as jdbc:trino://localhost:8080; any user name works since no authentication is configured) or the trino-client service from the compose file. A quick sanity check is shown below.
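Once connected, a minimal sanity check (assuming the lakefs.properties catalog above is in place) is to confirm that Trino sees the catalog and can reach the metastore:
SHOW CATALOGS;
SHOW SCHEMAS FROM lakefs;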
- Create a lakeFS schema and table
CREATE SCHEMA lakefs.main
WITH (location = 's3a://dalong/main');

CREATE TABLE lakefs.main.page_views2 (
    user_id bigint,
    page_url varchar,
    country varchar
)
WITH (
    format = 'ORC',
    bucketed_by = ARRAY['user_id'],
    bucket_count = 50
);

INSERT INTO lakefs.main.page_views2 (user_id, page_url, country)
VALUES (333, 'https://www.baidu.com', 'dalong');
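Reading the row back is a quick way to confirm the write went through lakeFS:
SELECT * FROM lakefs.main.page_views2;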
- Result
Screenshots: the inserted data queried back through Trino, and the corresponding objects visible under the repository in the lakeFS UI.
Notes
The above is a simple integration, with ORC as the table format. We could just as well use Parquet, which would also make it possible to plug Dremio in. Dremio does offer quite a few ways to integrate other formats, but since it is not fully open source (the feature set is largely shaped by commercial interests) it is not a great fit right now. Possible extensions: expose additional storage through the Hive metastore service and handle it with a custom storage plugin (performance may not be great), or go through a JDBC driver.
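As a sketch of the Parquet variant mentioned above (the table name page_views_parquet is hypothetical; everything else mirrors the ORC example):
CREATE TABLE lakefs.main.page_views_parquet (
    user_id bigint,
    page_url varchar,
    country varchar
)
WITH (
    format = 'PARQUET'
);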
References
https://docs.lakefs.io/integrations/presto_trino.html
https://github.com/rongfengliang/lakefs-hive-trino