Flink: Reading from and Writing to Hive with the Table API

Below is a simple reference example that verifies it is feasible to clean Hive data as a batch job driven by Flink SQL.

(1) Verifies that a Hive table stored with org.openx.data.jsonserde.JsonSerDe can be read directly (a hedged DDL sketch of the assumed tables follows this list).
(2) Reads the Hive table through the Table API.
(3) Converts the table to a DataStream and cleans the data in the stream.
(4) Converts the cleaned stream back to a table and inserts it into Hive with SQL.
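
For reference, here is a minimal DDL sketch that would match the code below. The table names, columns, and storage settings are assumptions inferred from the example rather than taken from the original post; the statements can be run from the Hive CLI, or through Flink's Hive dialect using the tableEnv created in the main listing (the JsonSerDe jar must already be on the Hive/Flink classpath).

// Assumed source table: JSON text files parsed by the OpenX JsonSerDe
tableEnv.executeSql("CREATE TABLE IF NOT EXISTS dt_ods.ods_test2 ("
        + " id BIGINT, name STRING, age INT, loc STRING"
        + ") ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'"
        + " STORED AS TEXTFILE");

// Assumed sink table: a single STRING column holding the cleaned JSON
tableEnv.executeSql("CREATE TABLE IF NOT EXISTS dt_dwd.dwd_test ("
        + " a STRING) STORED AS TEXTFILE");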

package com.king;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Collector;

import static org.apache.flink.table.api.Expressions.$;

/**
 * @Author: KingWang
 * @Date: 2022/12/21
 * @Desc:
 **/
public class TestHiveJson {

    public static void main(String[] args) throws Exception {


        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.BATCH); // Note: batch mode is required, otherwise INSERT OVERWRITE into the Hive table will not work.
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // Checkpointing and other execution settings
        Configuration conf = tableEnv.getConfig().getConfiguration();
//        conf.setBoolean("table.exec.hive.infer-source-parallelism", true);
//        conf.setInteger("table.exec.hive.infer-source-parallelism.max", 2);
//        conf.setString("table.exec.mini-batch.enabled", "true");
//        conf.setString("table.exec.mini-batch.allow-latency", "5 s");
//        conf.setString("table.exec.mini-batch.size", "5000");


        conf.setString("table.exec.hive.fallback-mapred-reader", "true");
        tableEnv.getConfig().addConfiguration(conf);

        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);


        String name = "hive";
        String defaultDatabase = "dt_ods";
        // local conf dir alternative: /home/hadoop/bigdata/hive/
        String hiveConfDir = "hdfs://ns1/user/hive/conf";
        HiveCatalog hiveCatalog = new HiveCatalog(name, defaultDatabase, hiveConfDir,"3.1.2");

        tableEnv.registerCatalog(name, hiveCatalog);
        tableEnv.useCatalog(name);
        tableEnv.useDatabase("dt_ods");

        String tableName = "ods_test2";
        Table transTable1 = tableEnv.sqlQuery("select id,name,age,loc from "+ tableName +" ");


        DataStream<RowData> stream1 = tableEnv.toAppendStream(transTable1, RowData.class);

        SingleOutputStreamOperator<String> returns = stream1.process(new ProcessFunction<RowData, String>() {

            @Override
            public void open(Configuration parameters) {
            }
            @Override
            public void close() {
            }

            @Override
            public void processElement(RowData rowData, Context context, Collector<String> collector) throws Exception {

                JSONObject obj = new JSONObject();
                obj.put("id", rowData.getLong(0));

                // Note: RowData.getString returns a StringData; call toString() on it, otherwise serialization errors occur!
                obj.put("name", rowData.getString(1).toString());
                obj.put("age", rowData.getInt(2));
                obj.put("loc", rowData.getString(3).toString());
                collector.collect(obj.toJSONString());
            }
        }).setParallelism(1).returns(Types.STRING);

//        returns.print(">>>");
        // Convert the stream back to a table and select/rename the field
        Table mytable = tableEnv.fromDataStream(returns).select($("f0").as("a"));

        // Register the table as a temporary view
        tableEnv.createTemporaryView("mytable", mytable);
        mytable.printSchema();

//        tableEnv.from(mytable).executeInsert(tableEnv.from(""),true);
        // Insert directly into the Hive table with SQL
        tableEnv.executeSql("insert overwrite dt_dwd.dwd_test select a from mytable");
		
        // On merging output files: Flink SQL has no REPARTITION-style hint (the statement below is not supported),
        // so the only option here is to set parallelism 1 on the converted stream.
        // insert overwrite dt_dwd.dwd_test select /*+REPARTITION(1)*/ a from mytable
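        // A possible alternative, not verified in the original post: lowering the planner-wide
        // default parallelism should also reduce the number of files written by the Hive sink, e.g.
        // conf.setString("table.exec.resource.default-parallelism", "1");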

        env.execute();

    }
}

Job submission:

export HADOOP_CLASSPATH=`hadoop classpath`


flink run-application -t yarn-application \
-c com.king.TestHiveJson \
-Djobmanager.memory.process.size=1G \
-Dtaskmanager.memory.process.size=1G \
-Dtaskmanager.numberOfTaskSlots=1 \
-Dparallelism.default=1 \
-Dyarn.application.name="testJson" \
-Dyarn.application.queue="default" \
-Dyarn.provided.lib.dirs="hdfs://ns1/jars/flink-1.12/lib;hdfs://ns1/jars/flink-1.12/plugins" \
-Dyarn.ship-files="/home/hadoop/king/jars/" \
/home/hadoop/king/flinkDemo.jar
