hive 学习笔记

hive
创建库
create database test;
删除库
drop database test cascade;
查看库
desc database test;
使用库
use test;
创建表
create table student(id int,name String);
插入数据
insert into student values (1,'luncy');
查询数据
select * from student;
创建表的基本语句
create table student(
id int,
name String
)
comment '我是注释'
partitioned by (String year,int month,int day)
row format delimited fields terminated by '\t'
stored as textfile
location '/inputdata' ;
从txt文件中加载数据到表里
load data local inpath '/home/aaa' into table student;
从hdfs上获取文件然后加载到表中
load data inpath '/home/aaa' into table student;
删除表
drop table student;

------------------------------------------------------
上面创建的都是内部表,下面我们创建外部表 external
create external table student(
id int,
name String
)
comment '我是注释'
partitioned by (String year,int month,int day)
row format delimited fields terminated by '\t'
stored as textfile
location '/inputdata' ;

load data inpath '/home/a2' overwrite into table student;

hive是读时模式
mysql时写时模式

内部表与外部表的区别

alter table xxx


-------------------------------------------------------------------------------

1.尽量避免join
2.避免 in not in这样的子查询
3.小表驱动大表
4.使用map端的join 比 reduce段的join 速度快

left join , right join ,内连接

----------------------------------------------------------------------------------

sort by 局部排序
order by 全部排序

设置reduce的个数 : set mapreduce.job.reduces=3;
select e.empno,e.deptno from emp e sort by e.empno desc;

distribute by 是说 根据这个字段放到多个reducer中去


union 和 union all 的区别
union 是 去重
union all 是 不去重不排序,只是将两部分数据合并。


---------------------------------------------------------------------------------------------
分区
partitioned by
一个表或者一个分区可以由多个分区组成。表现形式是不同的目录中
分区用于加速查询

分区查看: show partitions 什么表;
添加分区: alter table 表名 add partition(year='2019') partition(year='2020'); 这个是添加了两个分区
修改元数据: alter table 表名 partition(year='2019') rename to partition(year='2030'); 把2019的分区名改成了2030
指定分区对应到已有的数据:alter table 表名 partition(year='2019') set location 'XXXX路径';
添加分区时指定数据:alter table 表名 add partition(year='2019') set location 'XXXX路径';
删除分区:alter table 表名 droppartition(year='2019') ;


create external table student(
id int,
name String
)
comment '我是注释'
partitioned by (String year)
row format delimited fields terminated by '\t'
;

load data local inpath '/home/a2' overwrite into table student partition(year='2015');
load data local inpath '/home/a2' overwrite into table student partition(year='2016');
load data local inpath '/home/a2' overwrite into table student partition(year='2017');


创建二级分区

create external table student(
id int,
name String
)
comment '我是注释'
partitioned by (String year,String month)
row format delimited fields terminated by '\t'
;

load data local inpath '/home/a2' overwrite into table student partition(year='2015',month='1');
load data local inpath '/home/a2' overwrite into table student partition(year='2016',month='2');
load data local inpath '/home/a2' overwrite into table student partition(year='2017',month='3');

 

分区的分类:
静态分区:对于分区已经知道,可以用load加载数据
动态分区:对于分区已经未知,不可以用load加载数据,需要用下面的insert加载数据
组合分区:

insert intot table comm3 partition(year,month)
select id,comment,year,month from comm2;


insert intot table comm3 partition(year='2019',month)
select id,comment,month from comm2;

查询模式: 严格/非严格
set hive.mapred.mode=strict/nostrict;
是否允许动态分区:
set hive.exec.dynamic.partition=true;
动态分区的模式: 严格模式/非严格模式
set hive.exec.dynamic.partition.mode=strict/nostrict;
最大分区数量
set hive.exec.max.dynamic.partitions=1000;
单个节点的允许最大的分区数量100
set hive.exec.max.dynamic.partitions.pernode=100;

--------------------------------------------------------------------------
hive的数据类型

tinyint 1 -128~127
smallint 2
int 4
bigint 8
String 可变
boolean 1
double 8
float 4
timestamp 2017-10-10 12:0:0
binary 字节数组

java中有的数据类型在hive中没有:
long byte char short

复杂数据类型:
array:字段名 Array<double>
map:字段名 map<String,double>
struct:相当于类

create table if not exists arr1(
name String,
score Array<double>
)
row format delimited fields terminated by '\t'
collection items terminated by ',' 这句话的意思时数组以,分割
;

select a.name,a.score[0],a.score[2] from arr1 where a.score[1] > 81;

create table map1(
name String,
score map<String,double>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
;

select m.name,m.score["Chinese"],m.score["Math"]
from map1 m where m.score["Math"] > 60;

create table struct1(
name String,
score struct<chinese:double,math:double,english:double>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
;

select a.acore.math from struct1 a where a.score.math > 80;


------------------------------------------------------------------------

hive的内部函数

select rand();获得一个0到1的随机数

select split(rand()*100,'\\.')[0];获得一个1到100的整数

select substring(rand()*100,0,2); 获得一个1到100的整数

select (cast(1 as double));

select regexp_replace('a.jpg','jpg','png');

select concat("1","2","3");

select concat_ws("|","1","2","3");

select length("abcdf");

select size(array(1,2,3));

目前有很多个班级的成绩,取得每个班的前三名

select tmp.sclass,tmp.score
(
select r.sclass,r.score,
row_number() over (distribute by r.sclass sort by r.score desc) rr
from rn r
) as tmp where tmp.rr<4;

select r.sclass,r.score,
rank() over (partition by r.sclass order by r.score desc) ra,
dense_rank() over (partition by r.sclass order by r.score desc) rb,
row_number() over (distribute by r.sclass sort by r.score desc)
from rn r;

-------------------------------------------------------------------------

udf函数指的是用户自定义函数

继承UDF,重写evaluate()方法即可
然后就是一些linux的操作了

----------------------------------------------------------------------
hive文件的存储格式

textfile 默认的存储格式,普通的文本文件,数据不压缩,磁盘开销大,分析开销大
sequencefile:hive提供的一个2进制的存储格式,可以切割,天生压缩
rcfile:hive提供的一种行列混合存储模式,
orc:时rcfile的一种优化存储
parquet:自定义输入输出格式

 

------------------------------------------------------------------------------
serde 记录格式
序列化和反序列化
常见的serde: csv,tsv,json serde ,regex,parqut


创建记录为csv格式的表
create table csv1(
name String,
score String
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile;

--------------------------------------------------------------------------------

索引

show index on 表名;

create index idx_log1_phonenumber
on table log1(phonenumber)
as 'compact'
with deferred rebuild;

创建联合索引
create index idx_log1_phonenumberAndId
on table log1(phonenumber,id)
as 'compact'
with deferred rebuild;

alter index idx idx_log1_phonenumberAndId on log1 rebuild;

索引的类型: compact / bitmap

drop index 。。。 on 。。。;

 

posted @ 2019-06-23 14:24  纯丿乱  阅读(208)  评论(0编辑  收藏  举报