Reading and Writing Hive with the Flink Table API
Below is a simple reference example used to verify that cleaning Hive data in batch mode through Flink SQL is feasible. It covers:
(1) Verifying that a Hive table backed by org.openx.data.jsonserde.JsonSerDe can be read directly (a sketch of such a table is shown after this list)
(2) Reading the Hive table through the Table API
(3) Converting the table to a stream and cleaning the data in the stream
(4) Converting the cleaned stream back to a table and inserting it into Hive with SQL
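For context, here is a minimal sketch of what the source table might look like. The table name ods_test2 and the columns id, name, age and loc come from the query in the code below, and the column types match the RowData accessors used there (getLong, getString, getInt); declaring it EXTERNAL and storing it as TEXTFILE are assumptions.

CREATE EXTERNAL TABLE dt_ods.ods_test2 (
    id   BIGINT,
    name STRING,
    age  INT,
    loc  STRING
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;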
package com.king;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Collector;
import static org.apache.flink.table.api.Expressions.$;
/**
 * @Author: KingWang
 * @Date: 2022/12/21
 * @Desc:
 **/
public class TestHiveJson {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Note: batch mode is required, otherwise "insert overwrite" cannot overwrite the Hive table.
        env.setRuntimeMode(RuntimeExecutionMode.BATCH);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // Checkpointing and other settings
        Configuration conf = tableEnv.getConfig().getConfiguration();
        // conf.setBoolean("table.exec.hive.infer-source-parallelism", true);
        // conf.setInteger("table.exec.hive.infer-source-parallelism.max", 2);
        // conf.setString("table.exec.mini-batch.enabled", "true");
        // conf.setString("table.exec.mini-batch.allow-latency", "5 s");
        // conf.setString("table.exec.mini-batch.size", "5000");
        conf.setString("table.exec.hive.fallback-mapred-reader", "true");
        tableEnv.getConfig().addConfiguration(conf);
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Register the Hive catalog and switch to it
        String name = "hive";
        String defaultDatabase = "dt_ods";
        // /home/hadoop/bigdata/hive/
        String hiveConfDir = "hdfs://ns1/user/hive/conf";
        HiveCatalog hiveCatalog = new HiveCatalog(name, defaultDatabase, hiveConfDir, "3.1.2");
        tableEnv.registerCatalog(name, hiveCatalog);
        tableEnv.useCatalog(name);
        tableEnv.useDatabase("dt_ods");

        // Read the Hive table through the Table API
        String tableName = "ods_test2";
        Table transTable1 = tableEnv.sqlQuery("select id,name,age,loc from " + tableName + " ");

        // Convert the table to a stream and clean the data inside the stream
        DataStream<RowData> stream1 = tableEnv.toAppendStream(transTable1, RowData.class);
        SingleOutputStreamOperator<String> returns = stream1.process(new ProcessFunction<RowData, String>() {

            @Override
            public void open(Configuration parameters) {
            }

            @Override
            public void close() {
            }

            @Override
            public void processElement(RowData rowData, Context context, Collector<String> collector) throws Exception {
                JSONObject obj = new JSONObject();
                obj.put("id", rowData.getLong(0));
                // Note: RowData.getString returns StringData, which must be converted with toString(),
                // otherwise a serialization error is thrown!
                obj.put("name", rowData.getString(1).toString());
                obj.put("age", rowData.getInt(2));
                obj.put("loc", rowData.getString(3).toString());
                collector.collect(obj.toJSONString());
            }
        }).setParallelism(1).returns(Types.STRING);
        // returns.print(">>>");

        // Convert the stream back to a table and pick the field to write
        Table mytable = tableEnv.fromDataStream(returns).select($("f0").as("a"));
        // Register a temporary view on top of the table
        tableEnv.createTemporaryView("mytable", mytable);
        mytable.printSchema();

        // tableEnv.from(mytable).executeInsert(tableEnv.from(""),true);
        // Insert into the Hive table directly with SQL
        tableEnv.executeSql("insert overwrite dt_dwd.dwd_test select a from mytable");
        // On merging partitions: Flink SQL has no repartition hint such as
        //   insert overwrite dt_dwd.dwd_test select /*+REPARTITION(1)*/ a from mytable
        // so the only option is to set parallelism 1 on the converted stream.

        env.execute();
    }
}
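The job writes a single column a into dt_dwd.dwd_test, so the target table only needs one string column. A minimal sketch, assuming an unpartitioned TEXTFILE table (the actual storage format and partitioning are not shown in the original):

CREATE TABLE dt_dwd.dwd_test (
    a STRING
)
STORED AS TEXTFILE;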
How to submit the job:
export HADOOP_CLASSPATH=`hadoop classpath`
flink run-application -t yarn-application \
-c com.king.TestHiveJson \
-Djobmanager.memory.process.size=1G \
-Dtaskmanager.memory.process.size=1G \
-Dtaskmanager.numberOfTaskSlots=1 \
-Dparallelism.default=1 \
-Dyarn.application.name="testJson" \
-Dyarn.application.queue="default" \
-Dyarn.provided.lib.dirs="hdfs://ns1/jars/flink-1.12/lib;hdfs://ns1/jars/flink-1.12/plugins" \
-Dyarn.ship-files="/home/hadoop/king/jars/" \
/home/hadoop/king/flinkDemo.jar
This article is from 博客园 (cnblogs). Author: 硅谷工具人. Please credit the original link when reposting: https://www.cnblogs.com/30go/p/16996885.html