hive 学习笔记
hive
创建库
create database test;
删除库
drop database test cascade;
查看库
desc database test;
使用库
use test;
创建表
create table student(id int,name String);
插入数据
insert into student values (1,'luncy');
查询数据
select * from student;
创建表的基本语句
-- Basic partitioned CREATE TABLE (managed table, tab-delimited text).
-- Fixed: partition columns are written "name type", not "type name",
-- and must not duplicate columns declared in the table body.
create table student(
id int,
name String
)
comment '我是注释'
partitioned by (year String, month int, day int)
row format delimited fields terminated by '\t'
stored as textfile
location '/inputdata' ;
从txt文件中加载数据到表里
load data local inpath '/home/aaa' into table student;
从hdfs上获取文件然后加载到表中
load data inpath '/home/aaa' into table student;
删除表
drop table student;
------------------------------------------------------
上面创建的都是内部表,下面我们创建外部表 external
-- External table: dropping it removes only metadata, not the HDFS data.
-- Fixed: partition column list must be "name type", not "type name".
create external table student(
id int,
name String
)
comment '我是注释'
partitioned by (year String, month int, day int)
row format delimited fields terminated by '\t'
stored as textfile
location '/inputdata' ;
load data inpath '/home/a2' overwrite into table student;
hive是读时模式
mysql是写时模式
内部表与外部表的区别
alter table xxx
-------------------------------------------------------------------------------
1.尽量避免join
2.避免 in not in这样的子查询
3.小表驱动大表
4.使用map端的join 比 reduce端的join 速度快
left join , right join ,内连接
----------------------------------------------------------------------------------
sort by 局部排序
order by 全部排序
设置reduce的个数 : set mapreduce.job.reduces=3;
select e.empno,e.deptno from emp e sort by e.empno desc;
distribute by 是说 根据这个字段放到多个reducer中去
union 和 union all 的区别
union 是 去重
union all 是 不去重不排序,只是将两部分数据合并。
---------------------------------------------------------------------------------------------
分区
partitioned by
一个表或者一个分区可以由多个分区组成。表现形式是不同的目录中
分区用于加速查询
分区查看: show partitions 什么表;
添加分区: alter table 表名 add partition(year='2019') partition(year='2020'); 这个是添加了两个分区
修改元数据: alter table 表名 partition(year='2019') rename to partition(year='2030'); 把2019的分区名改成了2030
指定分区对应到已有的数据:alter table 表名 partition(year='2019') set location 'XXXX路径';
添加分区时指定数据:alter table 表名 add partition(year='2019') location 'XXXX路径';
删除分区:alter table 表名 drop partition(year='2019') ;
-- Single-level (static) partitioning; each LOAD targets one partition dir.
-- Fixed: "partitioned by (String year)" -> "(year String)".
create external table student(
id int,
name String
)
comment '我是注释'
partitioned by (year String)
row format delimited fields terminated by '\t'
;
load data local inpath '/home/a2' overwrite into table student partition(year='2015');
load data local inpath '/home/a2' overwrite into table student partition(year='2016');
load data local inpath '/home/a2' overwrite into table student partition(year='2017');
创建二级分区
-- Two-level partitioning (year/month); on HDFS this nests directories
-- like .../year=2015/month=1/.
-- Fixed: partition columns must be "name type", not "type name".
create external table student(
id int,
name String
)
comment '我是注释'
partitioned by (year String, month String)
row format delimited fields terminated by '\t'
;
load data local inpath '/home/a2' overwrite into table student partition(year='2015',month='1');
load data local inpath '/home/a2' overwrite into table student partition(year='2016',month='2');
load data local inpath '/home/a2' overwrite into table student partition(year='2017',month='3');
分区的分类:
静态分区:对于分区已经知道,可以用load加载数据
动态分区:分区值事先未知,不可以用load加载数据,需要用下面的insert加载数据
组合分区:
-- Fully dynamic: partition values come from the last SELECT columns.
-- Fixed typo: "insert intot" -> "insert into".
insert into table comm3 partition(year,month)
select id,comment,year,month from comm2;
-- Mixed (combined) partitioning: year is static, month is dynamic.
insert into table comm3 partition(year='2019',month)
select id,comment,month from comm2;
查询模式: 严格/非严格
set hive.mapred.mode=strict/nonstrict;
是否允许动态分区:
set hive.exec.dynamic.partition=true;
动态分区的模式: 严格模式/非严格模式
set hive.exec.dynamic.partition.mode=strict/nonstrict;
最大分区数量
set hive.exec.max.dynamic.partitions=1000;
单个节点的允许最大的分区数量100
set hive.exec.max.dynamic.partitions.pernode=100;
--------------------------------------------------------------------------
hive的数据类型
tinyint 1 -128~127
smallint 2
int 4
bigint 8
String 可变
boolean 1
double 8
float 4
timestamp 2017-10-10 12:0:0
binary 字节数组
java中有的数据类型在hive中没有:
long byte char short
复杂数据类型:
array:字段名 Array<double>
map:字段名 map<String,double>
struct:相当于类
-- Array-typed column; elements within the array are comma-separated.
create table if not exists arr1(
name String,
score Array<double>
)
row format delimited fields terminated by '\t'
collection items terminated by ','  -- array elements split on ','
;
-- Fixed: query referenced alias "a" without declaring it on arr1.
select a.name,a.score[0],a.score[2] from arr1 a where a.score[1] > 81;
-- Map-typed column: entries separated by ',', key and value split on ':'.
create table map1(
    name String,
    score map<String,double>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
;
-- Look up individual map values by key.
select m.name,
       m.score["Chinese"],
       m.score["Math"]
from map1 m
where m.score["Math"] > 60;
-- Struct-typed column: fields accessed with dot notation (score.math).
create table struct1(
name String,
score struct<chinese:double,math:double,english:double>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
;
-- Fixed typo: "a.acore.math" -> "a.score.math".
select a.score.math from struct1 a where a.score.math > 80;
------------------------------------------------------------------------
hive的内部函数
select rand();获得一个0到1的随机数
select split(rand()*100,'\\.')[0];获得一个0到99的整数(取小数点前的部分)
select substring(rand()*100,0,2); 取前两位字符(数值小于10时会带上小数点,并不严格是两位整数)
select (cast(1 as double));
select regexp_replace('a.jpg','jpg','png');
select concat("1","2","3");
select concat_ws("|","1","2","3");
select length("abcdf");
select size(array(1,2,3));
目前有很多个班级的成绩,取得每个班的前三名
-- Top 3 scores per class: number rows within each class (highest first),
-- then keep ranks 1-3.
-- Fixed: the derived table was missing its FROM keyword.
select tmp.sclass,tmp.score
from
(
select r.sclass,r.score,
row_number() over (distribute by r.sclass sort by r.score desc) rr
from rn r
) tmp where tmp.rr<4;
-- rank() leaves gaps after ties, dense_rank() does not, row_number() is
-- always unique; aliased the last column (rc) so it is addressable.
select r.sclass,r.score,
rank() over (partition by r.sclass order by r.score desc) ra,
dense_rank() over (partition by r.sclass order by r.score desc) rb,
row_number() over (distribute by r.sclass sort by r.score desc) rc
from rn r;
-------------------------------------------------------------------------
udf函数指的是用户自定义函数
继承UDF,重写evaluate()方法即可
然后就是一些linux的操作了
----------------------------------------------------------------------
hive文件的存储格式
textfile 默认的存储格式,普通的文本文件,数据不压缩,磁盘开销大,分析开销大
sequencefile:hive提供的一个2进制的存储格式,可以切割,天生压缩
rcfile:hive提供的一种行列混合存储模式,
orc:是rcfile的一种优化存储
parquet:自定义输入输出格式
------------------------------------------------------------------------------
serde 记录格式
序列化和反序列化
常见的serde: csv,tsv,json serde ,regex,parquet
创建记录为csv格式的表
-- Table whose rows are parsed/written by the bundled OpenCSV SerDe;
-- storage is still plain text files.
create table csv1(
    name String,
    score String
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile;
--------------------------------------------------------------------------------
索引
show index on 表名;
-- Compact index on a single column; "with deferred rebuild" means the
-- index is only populated when ALTER INDEX ... REBUILD is run.
create index idx_log1_phonenumber
    on table log1 (phonenumber)
    as 'compact'
    with deferred rebuild;
创建联合索引
-- Composite (multi-column) compact index.
create index idx_log1_phonenumberAndId
on table log1(phonenumber,id)
as 'compact'
with deferred rebuild;
-- Populate the deferred index.
-- Fixed: duplicated "idx" token before the index name.
alter index idx_log1_phonenumberAndId on log1 rebuild;
索引的类型: compact / bitmap
drop index 。。。 on 。。。;