Connecting Flink SQL to Hive
-
I have recently been looking into connecting Flink SQL to Hive, and hit quite a few pitfalls the first time through, so here is a record of the setup.
-
First, to run from IDEA on Windows you need a local Hadoop environment installed, with HADOOP_HOME and the related environment variables configured; otherwise you will get an error saying $HADOOP_HOME cannot be found.
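If the variable still is not picked up, a hedged workaround is to set the hadoop.home.dir JVM property, which Hadoop's Shell utility reads as a fallback to HADOOP_HOME. The path below is only an example (it must point at a Windows Hadoop directory containing bin\winutils.exe), and the two statements can equally be dropped at the top of the job's main method:
public class HadoopHomeCheck {
    public static void main(String[] args) {
        // Fallback for the environment variable; the path here is hypothetical
        System.setProperty("hadoop.home.dir", "D:\\hadoop-3.2.2");
        // Quick sanity check that the variable is visible to the JVM started by IDEA
        System.out.println("HADOOP_HOME = " + System.getenv("HADOOP_HOME"));
    }
}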
-
Once that is configured, go to the existing Hadoop environment on the Linux server and copy its core-site.xml and hdfs-site.xml into the project's src/main/resources directory in IDEA.
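To confirm that the two XML files on the classpath are actually being read, a small hedged check like the following can help (it assumes the NameNode address in core-site.xml is reachable from the development machine):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class HdfsSmokeTest {
    public static void main(String[] args) throws Exception {
        // Configuration loads core-site.xml / hdfs-site.xml from the classpath (src/main/resources)
        Configuration conf = new Configuration();
        try (FileSystem fs = FileSystem.get(conf)) {
            // Should print the hdfs:// URI from core-site.xml, not the local file system
            System.out.println("Default FS: " + fs.getUri());
        }
    }
}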
-
At this point you can try running the MapReduce WordCount example to verify that the environment is set up correctly.
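For reference, a minimal WordCount along the lines of the classic Hadoop example can serve as that smoke test; this is only a sketch, and the input/output HDFS paths are assumed to be passed as the two program arguments:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    // Emits (word, 1) for every whitespace-separated token in a line
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    // Sums the counts for each word
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path, e.g. an HDFS directory
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path, must not already exist
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}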
-
Then go to the existing Hive environment on the Linux server and copy its hive-site.xml into the same resources directory. Since Flink's HiveCatalog talks to the Hive metastore service, hive-site.xml typically needs a hive.metastore.uris entry pointing at a running metastore that is reachable from the development machine.
-
Once the preparation is done, you can start on the code itself.
-
The dependencies that need to be added are as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>flink_hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <flink.version>1.14.5</flink.version>
        <hive.version>3.1.2</hive.version>
        <flink.scala.version>2.12</flink.scala.version>
        <hadoop.version>3.2.2</hadoop.version>
        <scala.binary.version>2.12</scala.binary.version>
    </properties>

    <dependencies>
        <!-- Hadoop dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- MapReduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Flink dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table</artifactId>
            <version>${flink.version}</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Hive dependency -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Java compiler -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- The maven-shade plugin creates a fat jar that contains all necessary dependencies.
                 Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.1</version>
                <executions>
                    <!-- Run the shade goal in the package phase -->
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <!-- Merge the META-INF/services files of multiple connectors -->
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                                <!-- The service transformer is needed to merge META-INF/services files -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
                                    <projectName>Apache Flink</projectName>
                                    <encoding>UTF-8</encoding>
                                </transformer>
                            </transformers>
                            <!-- Automatically exclude unused classes to shrink the jar -->
                            <!-- <minimizeJar>true</minimizeJar> -->
                            <artifactSet>
                                <excludes>
                                    <exclude>org.apache.flink:force-shading</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>org.apache.logging.log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <!-- Do not copy the signatures in the META-INF folder.
                                         Otherwise, this might cause SecurityExceptions when using the JAR. -->
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>module-info.class</exclude>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
-
The code is as follows:
import org.apache.flink.table.api.*;
import org.apache.flink.table.catalog.hive.HiveCatalog;
//import org.apache.flink.types.Row;                 // needed if the commented-out collect() code is used
//import org.apache.flink.util.CloseableIterator;    // needed if the commented-out collect() code is used

public class HiveJdbcMain {
    public static void main(String[] args) throws Exception {
        // Run as the hadoop user so the job has permission to write to HDFS
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        System.setProperty("HADOOP_USER_PASSWORD", "hadoop");

        // Build the environment settings (in Flink 1.14 the Blink planner is the default)
        EnvironmentSettings settings = EnvironmentSettings.newInstance()/*.inBatchMode()*/.build();
        // EnvironmentSettings settings = EnvironmentSettings.newInstance()
        //         .useBlinkPlanner()
        //         .inStreamingMode() // streaming mode; use inBatchMode() for batch, either works
        //         .build();

        // Create the table environment
        TableEnvironment tableEnv = TableEnvironment.create(settings);

        // Set the SQL dialect: Hive SQL syntax differs from the default Flink dialect
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Build the Hive catalog
        // Catalog name: a unique identifier for this catalog
        String NAME = "myhive";
        // Default Hive database name
        String DEFAULTDATABASE = "default";
        // Directory containing hive-site.xml
        // String HIVECONFDIRPATH = "/opt/module/hive-3.1.2/conf/"; // location on the server when running on Linux
        String HIVECONFDIRPATH = "src/main/resources"; // local location when running from IDEA
        // Hive version
        String VERSION = "3.1.2";
        HiveCatalog myHive = new HiveCatalog(NAME, DEFAULTDATABASE, HIVECONFDIRPATH, VERSION);

        // Register the catalog under the given name
        tableEnv.registerCatalog("myhive", myHive);
        // Use the catalog registered above
        tableEnv.useCatalog("myhive");

        // Query logic; the Hive database and table must already exist
        String sql = "select * from default.ems_data";
        Table tableResult1 = tableEnv.sqlQuery(sql);
        tableResult1.execute().print();

        // Alternatively, get an iterator over the result and loop through it
        /*CloseableIterator<Row> collect = tableResult1.execute().collect();
        System.out.println(collect.next());*/

        // Or call executeSql to insert into / update a table
        /*String executeSql = "insert into table xxxx select * from default.ems_data";
        TableResult tableResult6 = tableEnv.executeSql(executeSql);*/
    }
}
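Since the query assumes the Hive table already exists, here is a hedged sketch of creating it from Flink itself, placed before the query in main: with the dialect set to SqlDialect.HIVE, the DDL is parsed as Hive DDL. The column list and storage format for ems_data below are made up purely for illustration; replace them with the real schema.
// Hypothetical schema; only to illustrate Hive-dialect DDL from Flink
tableEnv.executeSql(
        "CREATE TABLE IF NOT EXISTS default.ems_data (" +
        "  device_id STRING," +
        "  ts BIGINT," +
        "  value DOUBLE" +
        ") STORED AS ORC");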
- The code and dependencies above went through multiple rounds of debugging and adjustment before they finally ran successfully.