Reading and Writing Hive with the Flink Table API
Below is a simple reference example used to verify that cleaning Hive data in batch mode through Flink SQL is feasible. It covers:
(1) Verifying that a Hive table backed by org.openx.data.jsonserde.JsonSerDe can be read directly (a sketch of such a table is shown after this list)
(2) Reading the Hive table through the Table API
(3) Converting the table to a stream and cleaning the data in the stream
(4) Converting the cleaned stream back to a table and inserting it into Hive with SQL
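For context, here is a minimal sketch of what the source table might look like. The table name ods_test2 and the columns id, name, age and loc come from the query in the code below, and the column types match the RowData accessors used there (getLong, getString, getInt); declaring it EXTERNAL and storing it as TEXTFILE are assumptions.

CREATE EXTERNAL TABLE dt_ods.ods_test2 (
    id   BIGINT,
    name STRING,
    age  INT,
    loc  STRING
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;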
package com.king;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Collector;
import static org.apache.flink.table.api.Expressions.$;
/**
 * @Author: KingWang
 * @Date: 2022/12/21
 * @Desc:
 **/
public class TestHiveJson {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Note: batch mode is required, otherwise "insert overwrite" cannot overwrite the Hive table.
        env.setRuntimeMode(RuntimeExecutionMode.BATCH);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // Checkpointing and other settings
        Configuration conf = tableEnv.getConfig().getConfiguration();
        // conf.setBoolean("table.exec.hive.infer-source-parallelism", true);
        // conf.setInteger("table.exec.hive.infer-source-parallelism.max", 2);
        // conf.setString("table.exec.mini-batch.enabled", "true");
        // conf.setString("table.exec.mini-batch.allow-latency", "5 s");
        // conf.setString("table.exec.mini-batch.size", "5000");
        conf.setString("table.exec.hive.fallback-mapred-reader", "true");
        tableEnv.getConfig().addConfiguration(conf);
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Register the Hive catalog and switch to it
        String name = "hive";
        String defaultDatabase = "dt_ods";
        // /home/hadoop/bigdata/hive/
        String hiveConfDir = "hdfs://ns1/user/hive/conf";
        HiveCatalog hiveCatalog = new HiveCatalog(name, defaultDatabase, hiveConfDir, "3.1.2");
        tableEnv.registerCatalog(name, hiveCatalog);
        tableEnv.useCatalog(name);
        tableEnv.useDatabase("dt_ods");

        // Read the Hive table through the Table API
        String tableName = "ods_test2";
        Table transTable1 = tableEnv.sqlQuery("select id,name,age,loc from " + tableName + " ");

        // Convert the table to a stream and clean the data inside the stream
        DataStream<RowData> stream1 = tableEnv.toAppendStream(transTable1, RowData.class);
        SingleOutputStreamOperator<String> returns = stream1.process(new ProcessFunction<RowData, String>() {

            @Override
            public void open(Configuration parameters) {
            }

            @Override
            public void close() {
            }

            @Override
            public void processElement(RowData rowData, Context context, Collector<String> collector) throws Exception {
                JSONObject obj = new JSONObject();
                obj.put("id", rowData.getLong(0));
                // Note: RowData.getString returns StringData, which must be converted with toString(),
                // otherwise a serialization error is thrown!
                obj.put("name", rowData.getString(1).toString());
                obj.put("age", rowData.getInt(2));
                obj.put("loc", rowData.getString(3).toString());
                collector.collect(obj.toJSONString());
            }
        }).setParallelism(1).returns(Types.STRING);
        // returns.print(">>>");

        // Convert the stream back to a table and pick the field to write
        Table mytable = tableEnv.fromDataStream(returns).select($("f0").as("a"));
        // Register a temporary view on top of the table
        tableEnv.createTemporaryView("mytable", mytable);
        mytable.printSchema();

        // tableEnv.from(mytable).executeInsert(tableEnv.from(""),true);
        // Insert into the Hive table directly with SQL
        tableEnv.executeSql("insert overwrite dt_dwd.dwd_test select a from mytable");
        // On merging partitions: Flink SQL has no repartition hint such as
        //   insert overwrite dt_dwd.dwd_test select /*+REPARTITION(1)*/ a from mytable
        // so the only option is to set parallelism 1 on the converted stream.

        env.execute();
    }
}
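The job writes a single column a into dt_dwd.dwd_test, so the target table only needs one string column. A minimal sketch, assuming an unpartitioned TEXTFILE table (the actual storage format and partitioning are not shown in the original):

CREATE TABLE dt_dwd.dwd_test (
    a STRING
)
STORED AS TEXTFILE;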
How to submit the job:
export HADOOP_CLASSPATH=`hadoop classpath`
flink run-application -t yarn-application \
-c com.king.TestHiveJson \
-Djobmanager.memory.process.size=1G \
-Dtaskmanager.memory.process.size=1G \
-Dtaskmanager.numberOfTaskSlots=1 \
-Dparallelism.default=1 \
-Dyarn.application.name="testJson" \
-Dyarn.application.queue="default" \
-Dyarn.provided.lib.dirs="hdfs://ns1/jars/flink-1.12/lib;hdfs://ns1/jars/flink-1.12/plugins" \
-Dyarn.ship-files="/home/hadoop/king/jars/" \
/home/hadoop/king/flinkDemo.jar
This article is from 博客园 (cnblogs). Author: 硅谷工具人. Please credit the original link when reposting: https://www.cnblogs.com/30go/p/16996885.html