hive
下载好hive后解压到应用程序目录中
root@s100:/tools# tar xf apache-hive-2.1.1-bin.tar.gz -C /soft/ root@s100:/soft# ln -s /soft/apache-hive-2.1.1-bin /soft/hive
设置hive环境变量
root@s100:/soft# vim /etc/environment JAVA_HOME=/soft/jdk HADOOP_HOME=/soft/hadoop HIVE_HOME=/soft/hive PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/soft/jdk/bin:/soft/hadoop/bin:/soft/hadoop/sbin:/soft/hive/bin" //使hive环境变量生效 root@s100:/soft# source /etc/environment //验证hive安装是否成功 root@s100:/soft# hive --version Hive 2.1.1
由于hive需要hadoop的支持,所以在部署hive之前要先部署hadoop,然后配置hive关于hadoop的家目录。还要有hive-site.xml的配置文件
root@s100:/soft/hive/conf# cp hive-env.sh.template hive-env.sh root@s100:/soft/hive/conf# vim hive-env.sh HADOOP_HOME=/soft/hadoop root@s100:/soft/hive/conf# cp hive-default.xml.template hive-site.xml
然后启动hive后报错,解决问题的方式为:1.初始化数据库;2.修改hive-site.xml配置文件
//初始化数据库,指定数据库为derby
root@s100:~# schematool -initSchema -dbType derby
//修改hive-site.xml配置文件,将${system:java.io.tmpdir}全部修改成/root/hive;将${system:user.name}全部修改成ubuntu0
//修改完hive-site.xml配置文件后,创建配置文件中定义的目录
root@s100:~# mkdir -p hive/ubuntu0
//最后启动hive后就可以进入hive shell环境了,就可以像操作mysql数据库一样操作hive了
root@s100:~# hive
hive>
让hive连接外部的mysql服务器,让hive的metastore存储在mysql数据库中
1.修改hive的hive-site.xml配置文件,修改hive的数据库连接信息
root@s100:/soft/hive/conf# vim hive-site.xml
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.182.1:3306/myhive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>Username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123</value>
<description>password to use against metastore database</description>
</property>
2.导入mysql数据库的驱动到hive的classpath中
root@s100:/soft/hive# cd lib/ #切换到hive的lib库目录中,上传mysql-connector-java-5.0.8-bin.jar到lib目录中即可
3.删除以前derby数据库连接信息
root@s100:~# ls derby.log hadoop hello.txt hive metastore_db root@s100:~# rm -fr metastore_db root@s100:~# hdfs dfs -rm -r /user/hive root@s100:~# hdfs dfs -rm -r /tmp
4.初始化hive连接mysql数据库
root@s100:~# schematool -initSchema -dbType mysql Metastore connection URL: jdbc:mysql://192.168.182.1:3306/myhive Metastore Connection Driver : com.mysql.jdbc.Driver Metastore connection User: root Starting metastore schema initialization to 2.1.0 Initialization script hive-schema-2.1.0.mysql.sql Initialization script completed schemaTool completed
连接外部的mysql数据库时候可能会出如下的错误:
org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version. Underlying cause: java.sql.SQLException : null, message from server: "Host '192.168.1.100' is not allowed to connect to this MySQL server" SQL Error code: 1130
这个错误表示连接mysql的权限有问题,解决办法为修改mysql的数据库访问权限
mysql> grant all on myhive.* to 'root'@'%' identified by '123' with grant option;
5.测试在hive中创建库、表后在mysql中是否能查询得到
hive> create database hive1; hive> create table users(id int,name string,age int); #在mysql中查询hive创建的库和表还有字段 #查询数据库 mysql> select * from dbs; +-------+-----------------------+-----------------------------------------------+---------+------------+------------+ | DB_ID | DESC | DB_LOCATION_URI | NAME | OWNER_NAME | OWNER_TYPE | +-------+-----------------------+-----------------------------------------------+---------+------------+------------+ | 1 | Default Hive database | hdfs://s100:8020/user/hive/warehouse | default | public | ROLE | | 2 | NULL | hdfs://s100:8020/user/hive/warehouse/hive1.db | hive1 | root | USER | +-------+-----------------------+-----------------------------------------------+---------+------------+------------+ #查询表 mysql> select * from tbls; +--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+ | TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT | +--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+ | 1 | 1523896224 | 2 | 0 | root | 0 | 1 | users | MANAGED_TABLE | NULL | NULL | +--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+ #查询字段 mysql> select * from columns_v2; +-------+---------+-------------+-----------+-------------+ | CD_ID | COMMENT | COLUMN_NAME | TYPE_NAME | INTEGER_IDX | +-------+---------+-------------+-----------+-------------+ | 1 | NULL | age | int | 2 | | 1 | NULL | id | int | 0 | | 1 | NULL | name | string | 1 | +-------+---------+-------------+-----------+-------------+
通过hive创建表test,然后向这个表中添加数据
#创建表test结构如下:
hive> create table test(id int,name string)
> row format delimited
> fields terminated by ',' ;
#创建一个测试文件,作为数据插入到test表中
root@s100:~# vim info
1,zhangsank
2,lisi
3,wangwu
4,wangerma
5,test
#向test表中插入数据info文件
hive> load data local inpath '/root/info' into table test;
#查看表test中的数据
hive> select * from test;
OK
1 zhangsank
2 lisi
3 wangwu
4 wangerma
5 test
我们知道hive存储的数据是放在hdfs分布式文件系统上的,hive的元数据存储在外部关系型数据库mysql中,hive存储在hdfs上的默认根位置为:/user/hive/warehouse/,创建的库、表在hdfs上的表现形式为文件夹,如上面创建的库hive1和表test在hdfs上的表现形式如下:

创建带有分区的分区表
hive> create table data_partiton(id int,name string)
> partitioned by(country string)
> row format delimited
> fields terminated by ',';
#导入数据
hive> load data local inpath '/root/info' into table data_partiton partition(country='china');
hive> load data local inpath '/root/info2' into table data_partiton partition(country='amerecal');
#查看表data_partiton数据
hive> select * from data_partiton;
OK
1 hadeng amerecal
2 baoluo amerecal
3 alizha amerecal
4 gelin amerecal
1 zhangsank china
2 lisi china
3 wangwu china
4 wangerma china
5 test china
在hdfs上存储的分区数据方式为:在country=amerecal目录中存储info2文件,在country=china目录中存储info文件

创建外部表,向外部表中插入的数据存储在指定的目录中,当删除这个外部表后,外部表中的数据不会被删除,还保留在指定的目录中,这点和普通表不同。
hive> create external table fz_external_tb(id int,name string)
> row format delimited
> fields terminated by ','
> location '/user/external_tb';
hive> load data local inpath '/root/info2' into table fz_external_tb;
#外部表的数据不存储在hive默认的warehouse目录下,而是存放在location指定的目录中,表本身确实存在,通过show tables可以查看
hive> show tables;
OK
data_partiton
fz_external_tb
#下面测试通过删除外部表来验证hdfs上表的数据是否还保留
hive> drop table fz_external_tb;

创建分桶表
#以id字段分桶和排序来创建分桶表test_buck,分桶数为4
hive> create table test_buck(id int,name string)
> clustered by(id)
> sorted by(id DESC)
> into 4 buckets
> row format delimited fields terminated by ',';
#开始向创建的分桶表中插入数据(插入的数据需要是已分桶,且排序的)
hive> insert into table test_buck
> select id,name from test cluster by(id);

浙公网安备 33010602011771号