Waterdrop 同步 MySQL 数据到 Hive
一、Shell 类型任务,提交到 YARN 集群
#!/bin/bash
# Sync one MySQL table (task_info) into a Hive table using Waterdrop 2.0.1
# (Spark engine, YARN cluster mode). The script renders a Waterdrop config
# from the variables below, prints it for auditing, then submits the job.
#
# Fix: original shebang was `#!bin/bash` (missing leading slash), which makes
# direct execution fail with "bad interpreter".
set -euo pipefail

#========================= Source config (read-only MySQL account) =========================
jdbc_url="数据库ip:3306"
database="数据名"
username="账号"
password="密码"

#========================= Target Hive table config =========================
target_table="hive库.hive表"
# Partition value written into the `dt` column (yyyyMMdd).
target_partition_dt="20220322"

#========================= Render the Waterdrop config =========================
config_file="temp_clickhouse.config"

# Unquoted heredoc: ${...} variables are expanded, everything else (including
# the double quotes) is literal — no backslash-escaping needed, and the output
# is byte-identical to the previous echo-based version.
# NOTE(review): CHAR(10)/CHAR(13) (newline/CR) are replaced with CHAR(3) in all
# free-text columns so embedded line breaks cannot corrupt Hive text rows.
cat > "${config_file}" <<EOF
env {
spark.app.name="sync_mysql_to_hive_test"
spark.executor.instances=4
spark.executor.cores=1
spark.executor.memory="1024m"
spark.executor.memoryOverhead="154m"
spark.executor.extraJavaOptions="-XX:MaxDirectMemorySize=1G"
spark.sql.catalogImplementation="hive"
hive.exec.dynamic.partition="true"
hive.exec.dynamic.partition.mode="nonstrict"
}
source {
mysql {
table="task_info"
url="jdbc:mysql://${jdbc_url}/${database}?zeroDateTimeBehavior=convertToNull&useServerPrepStmts=false&rewriteBatchedStatements=true&useUnicode=true&characterEncoding=utf8&tinyInt1isBit=false&serverTimezone=Asia/Shanghai"
user="${username}"
password="${password}"
result_table_name="input_table"
}
}
transform {
sql {
sql="select id as id,menu_id as menu_id,REPLACE(REPLACE(task_name,CHAR(10),CHAR(3)),CHAR(13), CHAR(3)) as task_name,REPLACE(REPLACE(task_desc,CHAR(10),CHAR(3)),CHAR(13), CHAR(3)) as task_desc,ds_type as ds_type,ds_id as ds_id,REPLACE(REPLACE(ds_table_name,CHAR(10),CHAR(3)),CHAR(13), CHAR(3)) as ds_table_name,pipeline_type as pipeline_type,init_status as init_status,df_type as df_type,df_id as df_id,REPLACE(REPLACE(df_table_name,CHAR(10),CHAR(3)),CHAR(13), CHAR(3)) as df_table_name,first_exec_time as first_exec_time,end_exec_time as end_exec_time,schedule_cycle as schedule_cycle,schedule_type as schedule_type,REPLACE(REPLACE(creator,CHAR(10),CHAR(3)),CHAR(13), CHAR(3)) as creator,REPLACE(REPLACE(creator_no,CHAR(10),CHAR(3)),CHAR(13), CHAR(3)) as creator_no,audit_status as audit_status,task_status as task_status,is_del as is_del,nezha_task_id as nezha_task_id,nezha_init_id as nezha_init_id,task_version as task_version,audit_pass_time as audit_pass_time,release_status as release_status,project_id as project_id,task_init_step_time as task_init_step_time,task_init_step as task_init_step,create_time as create_time,update_time as update_time,${target_partition_dt} as dt from input_table"
}
}
sink {
hive {
table="${target_table}"
save_mode="overwrite"
}
}
EOF

echo "配置文件内容:"
cat "${config_file}"

#========================= Submit the job =========================
# Under `set -e`, a non-zero exit from the launcher aborts the script,
# so "执行完成" is only printed on successful submission.
sh /apps/scripts/waterdrop-2.0.1/bin/start-waterdrop-spark.sh \
  --master yarn \
  --deploy-mode cluster \
  --config "${config_file}"

#========================= Done ====================
echo "执行完成"
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· CSnakes vs Python.NET:高效嵌入与灵活互通的跨语言方案对比
· 【.NET】调用本地 Deepseek 模型
· Plotly.NET 一个为 .NET 打造的强大开源交互式图表库
· 上周热点回顾(2.17-2.23)
· 如何使用 Uni-app 实现视频聊天(源码,支持安卓、iOS)