Hive学习总结
环境配置
-
本人用的是mysql存储hive元数据,需要提前下载mysql
-
hive-env.sh
# Set HADOOP_HOME to point to a specific hadoop install directory export HADOOP_HOME=/export/servers/hadoop-2.7.5 # Hive Configuration Directory can be controlled by: export HIVE_CONF_DIR=/export/servers/apache-hive-2.1.1-bin/conf
-
hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <property> <name>javax.jdo.option.ConnectionUserName</name> <value>root</value> </property> <property> <name>javax.jdo.option.ConnectionPassword</name> <value>root</value> </property> <property> <name>javax.jdo.option.ConnectionURL</name> <value>jdbc:mysql://node03:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false</value> </property> <property> <name>javax.jdo.option.ConnectionDriverName</name> <value>com.mysql.jdbc.Driver</value> </property> <property> <name>hive.metastore.schema.verification</name> <value>false</value> </property> <property> <name>datanucleus.schema.autoCreateAll</name> <value>true</value> </property> <property> <name>hive.server2.thrift.bind.host</name> <value>node03</value> </property> </configuration>
Hive常用命令
-
进入Hive命令模式
bin/hive #无需进入hive模式执行hive命令 bin/hive -e 'show databases'
-
创建(外部)表并指定字段分隔符
create [external] table student( sid string, name string ) [partitioned by (year string,month string,day string)] row format delimited fields terminated by '\t';
-
加载数据
load data [local] inpath '/hivedatas/student.csv' [overwrite] into table student [partition (key='')]; #分桶表加载数据 #创建分桶表 create table course (c_id string,c_name string,t_id string) clustered by(c_id) into 3 buckets row format delimited fields terminated by '\t'; #创建临时表(字段名要对应) create table course_common (c_id string,c_name string,t_id string) row format delimited fields terminated by '\t'; #临时表加载数据 load data local inpath '/export/servers/hivedatas/course.csv' into table course_common; #分桶表加载数据 insert overwrite table course select * from course_common cluster by(c_id);
-
查询语句
#语法基本上与SQL一样,增加了分区和分桶的概念,底层通过多次的MapReduce实现 select [distinct] sid from student [where sid='123'] [group by sid [having name like '李%']] [cluster by sid | [distribute by sid] [sort by | order by sid] ] [limit 1,10]; #group by命令会按照指定的关键字进行分组计算,计算效率较高,但如果有的关键字数量比较少、有的比较多,可能会造成数据倾斜,拖慢整体的效率。 #distribute by相当于MapReduce中的分区,sort by是Map阶段完成后进行局部排序;order by是全局排序;当distribute 和 sort的字段相同时可以用cluster by替换,但此时排序顺序只能是升序(正序),无法指定desc。
-
查看数据库、表信息
desc database 数据库名; desc [formatted] student;
-
查询结果写入本地文件
insert overwrite local directory '/data/' select * from student;
-
设置reduce个数
set mapreduce.job.reduces=3;
-
开启分桶
set hive.enforce.bucketing=true;