hive基本操作

一、创建分区分桶表

//clustered by (pnl_id) into 40 buckets：分成40个桶。动态分区时如果分区列的取值太多，会产生过多的文件夹，可能引起系统崩溃等问题，因此可以改为对该列分桶，按列值的hash值对桶数取模来分桶。
桶的数量即reducer的数量，每个桶会生成一个文件。
-- Demo table stored as Parquet: partitioned by factory, and bucketed on
-- pnl_id into 40 buckets.
-- NOTE(review): the delimiter/null-format properties below look like
-- text-format settings and are presumably ignored by the Parquet serde;
-- confirm before relying on them.
create table test (
    pnl_id        string,
    event_timekey string
)
partitioned by (factory string)
clustered by (pnl_id) into 40 buckets
row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
with serdeproperties (
    'field.delim' = '\t',
    'escape.delim' = '\n',
    'serialization.null.format' = 'NULL',
    'serialization.encoding' = 'UTF-8'
)
stored as parquet;

//select 源表时，分区列是根据位置匹配的，而不是根据名称；分桶表不能 insert into 自己 select 自己。
往分桶表插入之前，建议先 distribute by (pnl_id) sort by (pnl_id) 或者 cluster by (pnl_id)，这样 sort merge join 时可以免于再次排序。distribute by 根据 key 把数据分配给各个 reducer。
-- Dynamic-partition load: the last select column feeds the factory
-- partition by position, so the alias fac does not have to match its name.
insert into test partition (factory)
select
    pnl_id,
    event_timekey,
    factory as fac
from dwr_pnl_hist
where shift_timekey = '20190610 180000';

-- Session settings for dynamic-partition inserts.
-- Comments use "--" on their own line and every statement ends with ";",
-- so consecutive commands are not merged into one another.

-- Enable dynamic partitioning.
set hive.exec.dynamic.partition=true;
-- Non-strict partition mode; must be set before a dynamic-partition insert can run.
set hive.exec.dynamic.partition.mode=nonstrict;
-- Maximum number of dynamic partitions each mapper or reducer may create.
set hive.exec.max.dynamic.partitions.pernode=100;
-- Maximum number of dynamic partitions a single statement may create.
set hive.exec.max.dynamic.partitions=1000;
-- Maximum number of files that may be created overall.
set hive.exec.max.created.files=100000;

-- Session settings for bucketed inserts and CLI output.

-- Enforce bucketing on insert. If this is not enabled, set the reducer count
-- to the bucket count and cluster by the bucket column before inserting so
-- that each reducer's output is sorted.
set hive.enforce.bucketing=true;
-- Reducer count, matching the 40 buckets declared on the test table.
set mapreduce.job.reduces=40;
-- Show column names in command-line output.
set hive.cli.print.header=true;
-- NOTE(review): presumably prints rows vertically in the CLI; confirm the
-- property is supported by the Hive version in use.
set hive.cli.print.row.to.vertical=true;
-- Enable the limit optimization, which samples the data.
set hive.limit.optimize.enable=true;

二、常用查询命令

-- sort by orders rows inside each reducer; order by sends the data through
-- a single reducer to produce a global ordering.
select *
from test
sort by event_timekey desc
limit 10;
-- Show the execution plan; "explain extended" prints a more detailed plan.
explain
select pnl_id
from mdw.test
where pnl_id = '1234';

三、索引

-- Compact index on test.pnl_id, kept in index_test_table.
-- Created with deferred rebuild, so it must be rebuilt before it holds data.
create index index_test
on table test (pnl_id)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
with deferred rebuild
in table index_test_table;
-- The index is created lazily (with deferred rebuild), so rebuild it to
-- load the index data.
alter index index_test
on test
rebuild;

-- Remove the index; "if exists" keeps the statement from failing when the
-- index has already been dropped.
drop index if exists index_test on test;
//索引表内容
index_test_table.pnl_id    index_test_table._bucketname    index_test_table._offsets    index_test_table.factory
5AMK950001A7BB    hdfs://nameservice1/user/hive/warehouse/mdw.db/test/factory=B9/part-00000    [409800,706900,706920,1070940,1300720,1300740,1984060,2336580]    B9
-- Enable index-based optimizations.
-- Both values must be the literal "true".
set hive.optimize.index.filter=true;
set hive.optimize.index.groupby=true;

posted on 2019-06-14 17:28 malloc+ 阅读(274) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

@malloc

导航

公告

hive基本操作