hive-学习-1

hvie

建表语句

create table test2(
	name string,friends array<string>,
	children map<string,int>,
	address struct<street:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n';

非基本数据类型访问方式
1. 数组访问方式
```
select friends[0] from test2;
```
1. map访问方式 select children["xiao song"] from test2;
2. struct访问方式 select address.street from test2;
数据类型转换
```
select cast('1' as int)+2; ```
```

DDL

创建库

create database [if not exists] database_name
[comment database_commmon]
[location hdfs_path] 路径要与库名一致后缀加上.db
[with dbproperties (name=value...)]; 额外信息可以写作者 创建时间等信息
示例:
create database test location "/hive3";

查询数据库

show databases like '';
desc database hive; 查询库描述信息
desc database extends hive;

修改数据库

alter database db_hive
set dbproperties('create_time'='123123');

删除数据库
```
drop databases name;
```

建表语句

create [external] table [if not exists] table_name 
[(col_name data_type [comment col_comment], ...)]
[comment table_comment]
[partitioned by (col_name data_type [comment col_comment], ...)]
[clustered by (col_name, col_name, ...)]
[sorted by (col_name [asc|desc], ...)] into num_buckets buckets]
[row format row_format]
[stored as file_format]
[location hdfs_path]
[tblproperties(property_name=property_value, ...)]
[as select_statement]

external内部表
partitioned 分区表
clusterd 分桶表
[sorted by (col_name [asc|desc], ...)] into num_buckets buckets] 分桶表
[row format row_format] 行格式的限制
[stored as file_format] 指定文件格式默认text
[location hdfs_path] 指定表的存储位置
[tblproperties(property_name=property_value, ...)] 表额外属性
[as select_statement] 通过select语句复制表

alter table test2 set tblproperties('EXTERNAL'='TRUE')把表修改为外部表 区分大小写
desc formatted test2 查询表的类型

修改表

alter table test1 rename to test2; 修改表名
alter table table_name change [column] col_old_name col_new_name column_type [comment col_comment] [first|after column_name] 修改列
alter table table_name replace (column_name column_type,....)覆盖全表

hadoop操作

hadoop fs -put stu.txt /usr/hive/warehouse/test2 需要满足表格式
hadoop fs -mkdir /usr/hive/warehouse/stu4 在指定目录创建一个文件夹
dfs -get 'hdfs_path'  'file path' hadoop命令导出到本地
hadoop fs -rm -r -skipTrash /path_to_file/file_name 删除某个文件

DML

数据导入

向表中装载数据

load data [local] inpath '数据path' [overwrite] into table 表名 [partition (partcoll=vall, ...)];
local: 从本地加载数据到hive 否则从hdfs姐在数据到hive表
partition: 上传到指定分区

查询倒入数据

insert into|overwrite table table_name select * from student_name;

insert into|overwrite table table_name 
select * from student1
insert into|overwrite table table_name
select * from student2;

查询语句中创建表并加载数据

create table table_name as select id,name from table_name;

指定location位置信息建表建议使用外部表

create table table_name(id string) row format...
location 'hdfs filepath'

数据导出

import数据到指定hive表中

import table table_name from 'file path'; 
路径必须是export导出的hdfs路径
表不存在或者是空表才能导入

导出

hive -e 'select * from test;' >> student.txt 有表头信息 不常用
dfs -get 'hdfs_path'  'file path' hadoop命令导出到本地
export table default.test to 'path' 会导出元数据信息

清空表

truncate table table_name; 只能清空内部表

查询

select [all | distinct] select_expr...
from table_name
[where condition]
[group by col...]
[having]
[order by col...]
[cluster by col...  | [distribute by col...][sort by col...] ]
[limit number]

a <=> b ab都为null的时候返回true
a<>b a != b 不等于
between 左闭右闭
like '_a%' 第二个字母为a的字符

group by

排序

order by 效率太低禁止使用
sort by 分区内有排序 单独使用随机划分分区的
distribute by ‘field’ sort by ‘field’分区排序
cluster by 当distribute和sort field相同使用 只能asc顺序

insert overwrite local directory 'path' select_statement 根据查询语句导出
set mapreduce.job.reduces=4; 设置reduce数量

分区表

创建分区表
 create table dept_par(deptno int,dname string,loc string)
 partitioned by(day string)
 row format delimited
 fields terminated by '\t';
导入数据
 load data local inpath '/usr/local/rudyfile/dept_20200401.log' into table 	dept_par partition(day = '2020-10-25');

添加分区 以空格分隔
 alter table dept_par add partition(day='20200121') partition(day='20200122');
删除分区 以逗号分隔
 alter table dept_par drop partition(day='20200121'),partition('20200122'); 
查看表的分区
 show partitions dept_par;

hadoop put操作不会更新元数据
数据上传到分区目录上,让分区表和数据产生关联的三种方式
1. msck repair table dept_par; 更新元数据
2. alter table 'table_name' add partition(); 手动添加分区
3. load data local inpath 'file_path' into table 'table_name' partition(day=''); load命令会更新元数据

动态分区

insert into table 'table_name' partition(deptno)
select dname,deptno from dept; 
最后一个字段作为动态分区的字段 需要在非严格格式下

hive.exec.max.dynamic.partitions.pernode=100
每个mr节点上 最大可以创建多少个动态分区默认一百根据业务设定
hive.exec.dynamic.partition.mode=nostrict 关闭严格模式	```

分桶表

create table table_name(id int,name string)
clustered by(id) into 4 buckets
row format delimited
fields terminited by '/t';

函数

desc function '';
CASE WHEN THEN ELSE END
IF(,,)
NVL(,)

行转列
concat
concat_ws(separator,str1....) 只能是string 和数组

select con_blood,concat_ws('|',collect_set(name)) 
from 
(select concat_ws(',',constellation,blood_type) con_blood,name 
from person_info) t1 
group by con_blood;

列转行
collect_set() 一列数据转换成数组
collect_list()	
explode(col) 将一列中的array 或者map 拆分成多行
lateral

select movie,category_name
from movie_info
lateral view explode(split(category,',')) move_info_tmp as category_name;
炸裂之后的字段与原先的字段有关联

窗口函数

current row 当前行
n preceding 往前n行
n following 往后n行
unbounded 起点
	unbounded preceding  从前面的起点
	unbounded following   到后面的终点
lag(col,n,default_val) 往前n行数据
lead(col,n,default_val) 往后n行数据
ntile(n) 把有序窗口的行进行分组 每个组有一个编号 从1 开始

over() 5条相同的数据被分为5个组,每个组里的数据没有指定的话就都是这5条数据
over(partition by field_name) 根据字段分组统计

根据月份统计用户的流水总额
select name,
orderdate,
cost,
sum(cost) over(partition by name,month(orderdate)) 
from business;


select 
name,
orderdate,
cost,sum(cost) over(partition by name order by orderdate rows between 
unbounded preceding and current row) 
from business;

select 
name,
orderdate,
cost,
lag(orderdate,1) over(partition by name order by orderdate) 
from business;

rank() 排位 相同值会重复 但 总数不变
select name,subject,score,rank() over(order by score) from score;

取每科目前三名
select name,subject,score from 
(select *,rank() over(partition by subject order by score desc) rk from score) t1 	where rk<=3;


dense_rank() 密集排位 相同值会重复,总数会变

row_number() 不重复排位	相同值不会重复 总数不变

自定义函数

add jar /usr/local/apache-hive-3.1.2-bin/lib/hiveTest.jar
create temporary function my_explo as "com.rudy.MyUDTF";

grouping set

 select deptid,gender,count(*) 
 from staff group by deptid,gender 
 grouping sets((deptid,gender),(deptid),(gender),());

压缩储存

```
 set hive.exec.compress.intermediate=true 开启传输数据压缩
 set mapreduce.map.output.compress=true map 输出压缩
 set mapreduce.map.output.compress.codec= 设置map输出的压缩方式
 
 使用orc格式存储 zlib压缩
 create table log_orc_zlib(track_time string,url string,session_id string,
 referer string,ip string,end_user_id string,city_id string) 
 row format delimited 
 fields terminated by '\t' 
 stored as orc 
 tblproperties("orc.compress"="ZLIB");

```

调优

explain 查看执行计划
fetch抓取 默认more 
local本地模式计算 免去了与yarn之间的网络通信

空值过滤 
在非inner join的时候 不需要字段为null的查询时 先过滤再join
field is not null

空值转换
避免原本join不上的数据能join上
nvl(field,round()); 

分桶表
分桶表的思想是根据key的hash分到不同的桶中,join操作的id相同肯定桶位相同

大量相同key导致的数据倾斜
set hive.map.aggr=true
在map端进行集合操作的条目数量
set hive.groupby.mapagr.checkinterval=100000
有数据倾斜的时候进行负载均衡
set hive.groupby.skewindata=true

count(distinct)去重
set mapreduce.job.reduces=5 设置mapreduce的数量
在执行的时候只会使用一个reduce
select count(*) from (select id from emp group by id) t1; group by子查询会起多个reducer

行过滤
谓词下推
关联字段与where条件一致才行

手动谓词下推
select b.id from 
bigtable b 
join
(select id from bigtable whre id <= 10) o 
on b.id = o.id;

调整map数量

调整reducer数量
reducer数量 = 数据量/reduce处理的数据量(256m) 根据数据量手动调整

hive.exec.reducers.bytes.per.reducer; 每个reduce默认处理的数据量
hive.exec.reducers.max; 每个任务最大的reduce数

并行执行
set hive.exec.parallel=true; 开启任务并行执行
set hive.exec.parallel.thread.number=16; 最大并行量默认为8
当任务不依赖的时候可以并行执行,系统有空闲资源的时候使用

严格模式
使用order by 没有limit
hive.strict.checks.orderby.no.limit

迪卡尔基
set hive.strict.checks.cartesian.product=true;

jvm重用 小文件多的时候用

posted @ 2021-12-30 20:18 rudynan 阅读(61) 评论(0) 收藏举报

刷新页面返回顶部

你得支棱起来啊

hive-学习-1

hvie

公告