Connecting Flink SQL to Hive

  • I have recently been looking into connecting Flink SQL to Hive and hit quite a few pitfalls on first use, so I am writing them down here.

  • First, running from IDEA requires a Hadoop environment installed on Windows with the environment variables configured; otherwise you get an error that $HADOOP_HOME cannot be found.

  • Once that is configured, go to the existing Hadoop environment on the Linux server and copy its core-site.xml and hdfs-site.xml into the resources directory of the IDEA project; the sketch below is a quick way to confirm they are picked up from the classpath.
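
    A minimal sketch of such a connectivity check (the class name HdfsSmokeTest is only for illustration, and "/" is just the HDFS root; any directory you can read works):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class HdfsSmokeTest {
        public static void main(String[] args) throws Exception {
            // core-site.xml and hdfs-site.xml under src/main/resources are on the classpath,
            // so new Configuration() loads them and fs.defaultFS points at the remote cluster
            Configuration conf = new Configuration();
            try (FileSystem fs = FileSystem.get(conf)) {
                for (FileStatus status : fs.listStatus(new Path("/"))) {
                    System.out.println(status.getPath());
                }
            }
        }
    }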

  • At this point you can try running an MR WordCount job to verify that the environment is set up correctly, as in the sketch below.
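
    For reference, a minimal sketch of the standard Hadoop WordCount that can serve as that check. The HDFS paths /tmp/wordcount/input and /tmp/wordcount/output are placeholders: the input directory must contain some text files, and the output directory must not exist yet.

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCount {

        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
            private static final IntWritable ONE = new IntWritable(1);
            private final Text word = new Text();

            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                // Split each input line into tokens and emit (word, 1)
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, ONE);
                }
            }
        }

        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            private final IntWritable result = new IntWritable();

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                // Sum the counts for each word
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        public static void main(String[] args) throws Exception {
            // core-site.xml / hdfs-site.xml from src/main/resources are picked up automatically
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "word count");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path("/tmp/wordcount/input"));
            FileOutputFormat.setOutputPath(job, new Path("/tmp/wordcount/output"));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }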

  • Then go to the existing Hive environment on the Linux server and copy its hive-site.xml into the resources directory of the IDEA project as well; the small check below confirms it can be read.
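
    A quick way to confirm hive-site.xml is visible is to read a setting from it, for example the metastore address (the class name HiveConfCheck is only for illustration; HiveConf comes from the hive-exec dependency in the pom below):

    import org.apache.hadoop.hive.conf.HiveConf;

    public class HiveConfCheck {
        public static void main(String[] args) {
            // HiveConf loads hive-site.xml from the classpath (src/main/resources)
            HiveConf hiveConf = new HiveConf();
            System.out.println(hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS));
        }
    }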

  • With the preparation done, you can start developing the code.

  • The dependencies that need to be added are as follows:

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    
      <modelVersion>4.0.0</modelVersion>
      <groupId>org.example</groupId>
      <artifactId>flink_hive</artifactId>
      <version>1.0-SNAPSHOT</version>
    
      <properties>
          <maven.compiler.source>8</maven.compiler.source>
          <maven.compiler.target>8</maven.compiler.target>
          <flink.version>1.14.5</flink.version>
          <hive.version>3.1.2</hive.version>
          <flink.scala.version>2.12</flink.scala.version>
          <hadoop.version>3.2.2</hadoop.version>
          <scala.binary.version>2.12</scala.binary.version>
      </properties>
    
      <dependencies>
    
          <!--Hadoop依赖包-->
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-common</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-hdfs</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
          <dependency>
              <groupId>log4j</groupId>
              <artifactId>log4j</artifactId>
              <version>1.2.17</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-client</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
    
          <!--mapreduce-->
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-mapreduce-client-core</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-mapreduce-client-common</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
    
    
          <!-- Flink Dependency -->
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
              <version>${flink.version}</version>
          </dependency>
    
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
              <version>${flink.version}</version>
          </dependency>
    
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-table-api-java</artifactId>
              <version>${flink.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
              <version>${flink.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-clients_${scala.binary.version}</artifactId>
              <version>${flink.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-table</artifactId>
              <version>${flink.version}</version>
              <type>pom</type>
          </dependency>
          <dependency>
              <groupId>org.apache.flink</groupId>
              <artifactId>flink-table-common</artifactId>
              <version>${flink.version}</version>
          </dependency>
    
          <!-- Hive Dependency -->
          <dependency>
              <groupId>org.apache.hive</groupId>
              <artifactId>hive-exec</artifactId>
              <version>${hive.version}</version>
          </dependency>
    
      </dependencies>
    
      <build>
          <plugins>
              <!-- Java Compiler -->
              <plugin>
                  <groupId>org.apache.maven.plugins</groupId>
                  <artifactId>maven-compiler-plugin</artifactId>
                  <version>3.1</version>
                  <configuration>
                      <source>1.8</source>
                      <target>1.8</target>
                  </configuration>
              </plugin>
              <!-- We use the maven-shade plugin to create a fat jar that contains all necessary dependencies. -->
              <!-- Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
              <plugin>
                  <groupId>org.apache.maven.plugins</groupId>
                  <artifactId>maven-shade-plugin</artifactId>
                  <version>3.1.1</version>
                  <executions>
                      <!-- Run shade goal on package phase -->
                      <execution>
                          <phase>package</phase>
                          <goals>
                              <goal>shade</goal>
                          </goals>
                          <configuration>
                                <!-- Merge the META-INF/services files of multiple connectors -->
                              <transformers>
                                  <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                      <resource>reference.conf</resource>
                                  </transformer>
                                  <!-- The service transformer is needed to merge META-INF/services files -->
                                  <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                                  <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
                                      <projectName>Apache Flink</projectName>
                                      <encoding>UTF-8</encoding>
                                  </transformer>
                              </transformers>
                                <!-- Optionally exclude unused classes to shrink the jar -->
                                <!-- <minimizeJar>true</minimizeJar> -->
                              <artifactSet>
                                  <excludes>
                                      <exclude>org.apache.flink:force-shading</exclude>
                                      <exclude>org.slf4j:*</exclude>
                                      <exclude>org.apache.logging.log4j:*</exclude>
                                  </excludes>
                              </artifactSet>
                              <filters>
                                  <filter>
                                      <!-- Do not copy the signatures in the META-INF folder.
                                      Otherwise, this might cause SecurityExceptions when using the JAR. -->
                                      <artifact>*:*</artifact>
                                      <excludes>
                                          <exclude>module-info.class</exclude>
                                          <exclude>META-INF/*.SF</exclude>
                                          <exclude>META-INF/*.DSA</exclude>
                                          <exclude>META-INF/*.RSA</exclude>
                                      </excludes>
                                  </filter>
                              </filters>
                          </configuration>
                      </execution>
                  </executions>
              </plugin>
          </plugins>
      </build>
    </project>
    
    
  • The code is as follows:

import org.apache.flink.table.api.*;
import org.apache.flink.table.catalog.hive.HiveCatalog;
public class HiveJdbcMain {
    public static void main(String[] args) throws Exception {
        // Run as the "hadoop" user, which has permission to write to HDFS
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        System.setProperty("HADOOP_USER_PASSWORD", "hadoop");
        // Build the environment settings (the Blink planner is the default in Flink 1.14)
        EnvironmentSettings settings = EnvironmentSettings.newInstance()/*.inBatchMode()*/.build();
//        EnvironmentSettings settings = EnvironmentSettings.newInstance()
//                .useBlinkPlanner()
//                .inStreamingMode() // choose inStreamingMode() or inBatchMode() as needed
//                .build();
        // Create the table environment
        TableEnvironment tableEnv = TableEnvironment.create(settings);
        // Set the SQL dialect; Hive's SQL syntax differs from Flink's default dialect
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Construct the Hive catalog
        // Catalog name: a unique identifier for this catalog
        String catalogName = "myhive";
        // Default Hive database name
        String defaultDatabase = "default";
        // Directory containing hive-site.xml
//        String hiveConfDir = "/opt/module/hive-3.1.2/conf/"; // location on the server when running on the cluster
        String hiveConfDir = "src/main/resources"; // local location when running from IDEA
        // Hive version
        String hiveVersion = "3.1.2";

        HiveCatalog myHive = new HiveCatalog(catalogName, defaultDatabase, hiveConfDir, hiveVersion);

        // Register the catalog under the given name
        tableEnv.registerCatalog("myhive",myHive);
        // Use the catalog registered above
        tableEnv.useCatalog("myhive");

        // Query logic: the Hive database and table must already exist
        String sql="select * from default.ems_data";
        Table tableResult1 = tableEnv.sqlQuery(sql);
        tableResult1.execute().print();
        // Alternatively, obtain an iterator over the result and loop through it
        /*CloseableIterator<Row> collect = tableResult1.execute().collect();
        System.out.println(collect.next());*/

        // Use executeSql to insert into or update a table; see the sketch after this class
        /*String executeSql = "insert into table xxxx select * from default.ems_data";
        TableResult tableResult6 = tableEnv.executeSql(executeSql);*/
    }
}
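
  • For the write path that the commented-out executeSql block only hints at, here is a minimal self-contained sketch. The target table default.ems_data_copy is hypothetical and, as noted above, must be created in Hive beforehand with a schema matching ems_data.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.catalog.hive.HiveCatalog;

public class HiveInsertMain {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        TableEnvironment tableEnv = TableEnvironment.create(EnvironmentSettings.newInstance().build());
        tableEnv.registerCatalog("myhive", new HiveCatalog("myhive", "default", "src/main/resources", "3.1.2"));
        tableEnv.useCatalog("myhive");
        // The Hive dialect accepts Hive-style DML such as "insert into table ..."
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Submit the insert job; the target table is hypothetical and must already exist
        TableResult result = tableEnv.executeSql(
                "insert into table default.ems_data_copy select * from default.ems_data");
        // executeSql() submits the job asynchronously; block until it finishes so the JVM
        // does not exit before the write completes
        result.await();
    }
}
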
  • The code and dependencies above went through several rounds of debugging and adjustment, and now run successfully.

posted on 2022-07-21 10:40  张少凯