Hive+HBase+Phoenix整合

效果介绍

数据插入Hive表可以在Phoenix对应表中查询到，

数据插入Phoenix表可以在Hive对应表中查询到。

安装软件

zookeeper-3.4.5-cdh5.16.2

hadoop-2.6.0-cdh5.16.2

hbase-1.2.0-cdh5.16.2

hive-1.1.0-cdh5.16.2

phoenix-4.14.0-cdh5.14.2

文件配置

（1）HBase

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
     /**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-->
<configuration>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://basecoalmine:9000/HBase</value>
  </property>
  <property>
    <name>hbase.master.info.bindAddress</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>hbase.regionserver.info.bindAddress</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>hbase.master</name>
    <value>16000</value>
  </property>
  <property>
    <name>hbase.master.info.port</name>
    <value>16010</value>
  </property>
  <property>
    <name>hbase.regionserver.port</name>
    <value>16020</value>
  </property>
  <property>
    <name>hbase.regionserver.info.port</name>
    <value>16030</value>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.tmp.dir</name>
    <value>/opt/data/hbase/tmp</value>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>basecoalmine:2181</value>
  </property>
  <property>
    <name>hbase.master.maxclockskew</name>
    <value>120000</value>
  </property>
  
  <property>
    <name>data.tx.snapshot.dir</name>
    <value>/opt/data/phoenix/tephra/snapshots</value>
  </property>
  <property>
    <name>data.tx.timeout</name>
    <value>120</value>
  </property>
  <property>
    <name>hbase.unsafe.stream.capability.enforce</name>
    <value>false</value>
  </property>
  <property>
    <name>hbase.regionserver.wal.codec</name>
    <value>org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec</value>
  </property>

  <property>
    <name>hbase.regionserver.executor.openregion.threads</name>
    <value>200</value>
  </property>
  <property>
    <name>zookeeper.session.timeout</name>
    <value>6600000</value>
  </property>
  <property>
    <name>hbase.zookeeper.property.tickTime</name>
    <value>6000</value>
  </property>
  <property>
    <name>hbase.regionserver.optionalcacheflushinterval</name>
    <value>3600000</value>
  </property>
  <property>
    <name>hbase.client.scanner.timeout.period</name>
    <value>1200000</value>
  </property>
  
  <property>
    <name>hbase.rpc.timeout</name>
    <value>1200000</value>
  </property>
  

  <property>
    <!--该设置决定了处理RPC的线程数量，默认值是10，通常可以调大，比如：150 -->
    <!--如果该值设置过大则会占用过多的内存，导致频繁的GC，或者出现OutOfMemory，因此该值不是越大越好。 -->
    <name>hbase.regionserver.handler.count</name>
    <value>40</value>
  </property>
  <property>
    <!--HStore的storeFile数量>= compactionThreshold配置的值，则可能会进行compact，默认值为3，可以调大，比如设置为6，在定期的major compact中进行剩下文件的合并。 -->
    <name>hbase.hstore.compactionThreshold</name>
    <value>6</value>
  </property>
  <property>
    <!--避免memstore不及时flush，当写入量大时，触发memstore的block，从而阻塞写操作。 -->
    <name>hbase.hstore.blockingStoreFiles</name>
    <value>100</value>
  </property>
  
  <property>
    <name>hbase.master.namespace.init.timeout</name>
    <value>1200000</value>
  </property>
  <property>
    <name>index.writer.threads.max</name>
    <value>200</value>
  </property>
  <property>
    <name>index.builder.threads.max</name>
    <value>200</value>
  </property>
  <property>
    <name>hbase.regionserver.hlog.blocksize</name>
    <value>536870912</value>
  </property>
  <property>
    <name>hbase.wal.provider</name>
    <value>multiwal</value>
  </property>
  <property>
    <name>hbase.wal.regiongrouping.numgroups</name>
    <value>16</value>
  </property>
  <property>
    <name>hbase.regionserver.thread.compaction.small</name>
    <value>1</value>
  </property>    
  <property>
    <name>hbase.regionserver.thread.compaction.large</name>
    <value>1</value>
  </property>    
  <property>
    <name>hbase.hstore.compaction.max.size</name>
    <value>4294967296</value>
  </property>    
  <property>
    <name>hbase.hstore.flusher.count</name>
    <value>8</value>
  </property>    
  <property>
    <name>hbase.hlog.asyncer.number</name>
    <value>16</value>
  </property>  
  <property>
    <name>hbase.hstore.blockingWaitTime</name>
    <value>30000</value>
  </property>  
  <property>
    <name>hbase.server.thread.wakefrequency</name>
    <value>2000</value>
  </property>
  <property>
    <name>hbase.rest.threads.max</name>
    <value>400</value>
  </property>
  <property>
    <name>hbase.regionserver.hlog.splitlog.writer.threads</name>
    <value>10</value>
  </property>
  <property>
    <name>hbase.client.write.buffer</name>
    <value>5242880</value>
  </property>
  <property>
    <name>hbase.regionserver.optionallogflushinterval</name>
    <value>10000</value>
  </property>
  <property>
    <name>hbase.ipc.server.callqueue.handler.factor</name>
    <value>0.2</value>
  </property>
  <property>
    <name>hbase.ipc.server.callqueue.read.ratio</name>
    <value>0.4</value>
  </property>
  <property>
    <name>hbase.ipc.server.callqueue.scan.ratio</name>
    <value>0.6</value>
  </property>

</configuration>

hbase-site.xml

#!/usr/bin/env bash
#
#/**
# * Licensed to the Apache Software Foundation (ASF) under one
# * or more contributor license agreements. See the NOTICE file
# * distributed with this work for additional information
# * regarding copyright ownership. The ASF licenses this file
# * to you under the Apache License, Version 2.0 (the
# * "License"); you may not use this file except in compliance
# * with the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */

# Set environment variables here.

export JAVA_HOME=/usr/java/jdk1.8.0_221
export HBASE_CLASSPATH=/opt/app/hadoop/etc/hadoop
export HBASE_MANAGES_ZK=false

# This script sets variables multiple times over the course of starting an hbase process,
# so try to keep things idempotent unless you want to take an even deeper look
# into the startup scripts (bin/hbase, etc.)

# The java implementation to use. Java 1.8+ required.
# export JAVA_HOME=/usr/java/jdk1.8.0/

# Extra Java CLASSPATH elements. Optional.
# export HBASE_CLASSPATH=

# The maximum amount of heap to use. Default is left to JVM default.
# export HBASE_HEAPSIZE=1G

# Uncomment below if you intend to use off heap cache. For example, to allocate 8G of
# offheap, set the value to "8G".
# export HBASE_OFFHEAPSIZE=1G

# Extra Java runtime options.
# Below are what we set by default. May only work with SUN JVM.
# For more on why as well as other possible settings,
# see http://hbase.apache.org/book.html#performance
export HBASE_OPTS="$HBASE_OPTS -XX:+UseConcMarkSweepGC"

# Uncomment one of the below three options to enable java garbage collection logging for the server-side processes.

# This enables basic gc logging to the .out file.
# export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps"

# This enables basic gc logging to its own file.
# If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
# export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:<FILE-PATH>"

# This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+.
# If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
# export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:<FILE-PATH> -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M"

# Uncomment one of the below three options to enable java garbage collection logging for the client processes.

# This enables basic gc logging to the .out file.
# export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps"

# This enables basic gc logging to its own file.
# If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
# export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:<FILE-PATH>"

# This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+.
# If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
# export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:<FILE-PATH> -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M"

# See the package documentation for org.apache.hadoop.hbase.io.hfile for other configurations
# needed setting up off-heap block caching.

# Uncomment and adjust to enable JMX exporting
# See jmxremote.password and jmxremote.access in $JRE_HOME/lib/management to configure remote password access.
# More details at: http://java.sun.com/javase/6/docs/technotes/guides/management/agent.html
# NOTE: HBase provides an alternative JMX implementation to fix the random ports issue, please see JMX
# section in HBase Reference Guide for instructions.

# export HBASE_JMX_BASE="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false"
# export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10101"
# export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10102"
# export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10103"
# export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10104"
# export HBASE_REST_OPTS="$HBASE_REST_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10105"

# File naming hosts on which HRegionServers will run. $HBASE_HOME/conf/regionservers by default.
# export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers

# Uncomment and adjust to keep all the Region Server pages mapped to be memory resident
#HBASE_REGIONSERVER_MLOCK=true
#HBASE_REGIONSERVER_UID="hbase"

# File naming hosts on which backup HMaster will run. $HBASE_HOME/conf/backup-masters by default.
# export HBASE_BACKUP_MASTERS=${HBASE_HOME}/conf/backup-masters

# Extra ssh options. Empty by default.
# export HBASE_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HBASE_CONF_DIR"

# Where log files are stored. $HBASE_HOME/logs by default.
# export HBASE_LOG_DIR=${HBASE_HOME}/logs

# Enable remote JDWP debugging of major HBase processes. Meant for Core Developers
# export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8070"
# export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8071"
# export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8072"
# export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8073"

# A string representing this instance of hbase. $USER by default.
# export HBASE_IDENT_STRING=$USER

# The scheduling priority for daemon processes. See 'man nice'.
# export HBASE_NICENESS=10

# The directory where pid files are stored. /tmp by default.
# export HBASE_PID_DIR=/var/hadoop/pids
export HBASE_PID_DIR=/data/hbase/pids

# Seconds to sleep between slave commands. Unset by default. This
# can be useful in large clusters, where, e.g., slave rsyncs can
# otherwise arrive faster than the master can service them.
# export HBASE_SLAVE_SLEEP=0.1

# Tell HBase whether it should manage it's own instance of ZooKeeper or not.
# export HBASE_MANAGES_ZK=true

# The default log rolling policy is RFA, where the log file is rolled as per the size defined for the
# RFA appender. Please refer to the log4j.properties file to see more details on this appender.
# In case one needs to do log rolling on a date change, one should set the environment property
# HBASE_ROOT_LOGGER to "<DESIRED_LOG LEVEL>,DRFA".
# For example:
# HBASE_ROOT_LOGGER=INFO,DRFA
# The reason for changing default to RFA is to avoid the boundary case of filling out disk space as
# DRFA doesn't put any cap on the log size. Please refer to HBase-5655 for more context.

stop-hbase.sh

将phoenix中的jar包移动到hbase的lib目录下

phoenix-4.14.0-cdh5.14.2-server.jar

phoenix-core-4.14.0-cdh5.14.2.jar

（2）Hive

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
  <!-- 连接数据库 -->
  <property>
     <name>javax.jdo.option.ConnectionURL</name>
     <value>jdbc:mysql://localhost:3306/hive_metadata?createDatabaseIfNotExist=true</value>
  </property>
  <!-- 数据库驱动 -->
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <!-- 数据库用户 -->
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <!-- 数据库密码 -->
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>kingc123</value>
  </property>
  <!-- 显示表的列名 -->
  <property>
    <name>hive.cli.print.header</name>
    <value>true</value>
  </property>
  <!-- 显示当前数据库名 -->
  <property>
    <name>hive.cli.print.current.db</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
  </property>
  <!-- 启用本地模式 -->
  <property>
    <name>hive.exec.mode.local.auto</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://basecoalmine:9083</value>
  </property>
  <property>
    <name>datanucleus.schema.autoCreateAll</name>
    <value>true</value>
  </property>
  <property>  
    <name>hive.insert.into.multilevel.dirs</name>  
    <value>true</value>  
  </property>
    <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>

  <property>
    <name>hive.aux.jars.path</name>
    <value>/opt/app/hive/lib</value>
  </property>
  <!-- <property>
    <name>hbase.zookeeper.quorum</name>
    <value>basecoalmine:2181</value>
  </property> -->
</configuration>

hive-site.xml

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hive and Hadoop environment variables here. These variables can be used
# to control the execution of Hive. It should be used by admins to configure
# the Hive installation (so that users do not have to set environment variables
# or set command line parameters to get correct behavior).
#
# The hive service being invoked (CLI/HWI etc.) is available via the environment
# variable SERVICE


# Hive Client memory usage can be an issue if a large number of clients
# are running at the same time. The flags below have been useful in 
# reducing memory usage:
#
# if [ "$SERVICE" = "cli" ]; then
#   if [ -z "$DEBUG" ]; then
#     export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit"
#   else
#     export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit"
#   fi
# fi

# The heap size of the jvm stared by hive shell script can be controlled via:
#
# export HADOOP_HEAPSIZE=1024
#
# Larger heap size may be required when running queries over large number of files or partitions. 
# By default hive shell scripts use a heap size of 256 (MB).  Larger heap size would also be 
# appropriate for hive server (hwi etc).


# Set HADOOP_HOME to point to a specific hadoop install directory
export HADOOP_HOME=/opt/app/hadoop
export HIVE_HOME=/opt/app/hive
export HIVE_AUX_JARS_PATH=/opt/app/hive/lib
# Hive Configuration Directory can be controlled by:

# export HIVE_CONF_DIR=

# Folder containing extra ibraries required for hive compilation/execution can be controlled by:
# export HIVE_AUX_JARS_PATH=

hive-env.sh

将phoenix中的jar包移动到hive的lib目录下

phoenix-4.14.0-cdh5.14.2-hive.jar

phoenix-4.14.0-cdh5.14.2-client.jar

hbase-server-1.2.0-cdh5.14.2.jar

（3）Phoenix

将hbase中的hbase-site.xml文件移动到phoenix的bin目录下。

操作测试

（1）创建Phoenix表

CREATE TABLE
IF
    NOT EXISTS TEST_PHOENIX ( ID INTEGER NOT NULL PRIMARY KEY, NAME VARCHAR, age INTEGER, score DECIMAL, create_time TIMESTAMP, STATUS TINYINT, date_time time ) column_encoded_bytes = 0;

（2）创建Hive对应外部表

create external table TEST_PHOENIX (
  id int,
  name string,
  age int,
  score decimal
)
STORED BY 'org.apache.phoenix.hive.PhoenixStorageHandler'
TBLPROPERTIES (
  "phoenix.table.name" = "TEST_PHOENIX",
  "phoenix.zookeeper.quorum" = "basecoalmine",
  "phoenix.zookeeper.znode.parent" = "/hbase",
  "phoenix.zookeeper.client.port" = "2181",
  "phoenix.rowkeys" = "id",
  "phoenix.column.mapping" = "id:ID, name:NAME, age:AGE, score:SCORE"
);

（3）在Phoenix表插入数据可以在Hive表中查询到

-- 在Phoenix表中插入数据
UPSERT INTO TEST_PHOENIX(id,name,age,score,create_time,status,date_time) VALUES(1,'foo',123,999.8,timestamp '2019-01-30 18:15:00',1,time '2019-01-30 18:30:00');
-- 在Hive表中查询数据
select * from TEST_PHOENIX;

（4）在Hive表插入数据可以在Phoenix表中查询到

-- 在Hive表中插入数据
insert into TEST_PHOENIX(id,name,age,score) values(333,'mary',25,224);
-- 在Phoenix表中查询数据
SELECT * FROM TEST_PHOENIX

（5）因为hive是外部表，所以删除phoenix表之后，hive表会跟着消失；删除hive表，phoenix表会保留。

posted on 2022-02-08 15:26 吹静静阅读(354) 评论(0) 编辑收藏举报

吹静静

欢迎QQ交流：592590682

效果介绍

安装软件

文件配置

操作测试