Common Hive Statements
I. Modifying Table Structure
1. Adding and dropping partitions
Add a partition: alter table app.app_tabe1 add if not exists partition(dt = '20210621', hour = '17');
Drop a partition: alter table app.app_tab1 drop partition(dt=20210915, hour=11);
Batch-drop partitions: ALTER TABLE dw.table1 DROP IF EXISTS PARTITION(dt >= '20220525', dt <= '20220605');
2. Creating tables -- managed (internal) partitioned table
create table dw.test_wang(
event_id string comment 'event id',
biz_type bigint comment 'business line',
client_type bigint comment 'client type',
md5_log string comment 'unique tracking-event key',
event_type bigint comment 'event type: 1 browse, 2 click, 3 play, 4 impression, 5 long press, 6 page response, 7 time consumed',
sdk_version string comment 'SDK version',
distinct_id string comment 'unique id',
app_channel string comment 'source')
partitioned by (dt string comment 'date partition');
--plain text (CSV) table
CREATE TABLE `dws`.`dws_online_sku_test`(
`sale_area_id` bigint COMMENT 'sale area id',
`owner_id` bigint COMMENT 'supplier id (pop id): 1 = self-operated, >1 = pop merchant',
`sale_type` bigint COMMENT 'merchant ownership for the channel',
`customer_type` bigint COMMENT '1 = non-household individual, 2 = household individual',
`source_tag` bigint COMMENT '1 = self-operated, 2 = pop; self-operated + pop: in (1,2)',
`sku_id` bigint COMMENT 'sku_id',
`sku_day_flag` bigint COMMENT 'whether the SKU is online today',
`sku_wac_flag` bigint COMMENT 'whether the SKU is online this week',
`sku_mac_flag` bigint COMMENT 'whether the SKU is online this month',
`sku_j7_flag` bigint COMMENT 'whether the SKU was online in the last 7 days',
`sku_j30_flag` bigint COMMENT 'whether the SKU was online in the last 30 days',
`cmd_pop` bigint COMMENT 'county-level pop tag 7'
) PARTITIONED BY (`dt` int) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",", "quoteChar" = "\"") STORED AS TEXTFILE;
--ORC compressed table
CREATE TABLE `dws`.`dws_online_sku`(
`sale_area_id` bigint COMMENT 'sale area id',
`owner_id` bigint COMMENT 'supplier id (pop id): 1 = self-operated, >1 = pop merchant',
`sale_type` bigint COMMENT 'merchant ownership for the channel',
`customer_type` bigint COMMENT '1 = non-household individual, 2 = household individual',
`source_tag` bigint COMMENT '1 = self-operated, 2 = pop; self-operated + pop: in (1,2)',
`sku_id` bigint COMMENT 'sku_id',
`sku_day_flag` bigint COMMENT 'whether the SKU is online today',
`sku_wac_flag` bigint COMMENT 'whether the SKU is online this week',
`sku_mac_flag` bigint COMMENT 'whether the SKU is online this month',
`sku_j7_flag` bigint COMMENT 'whether the SKU was online in the last 7 days',
`sku_j30_flag` bigint COMMENT 'whether the SKU was online in the last 30 days',
`cmd_pop` bigint COMMENT 'county-level pop tag 7'
) COMMENT 'unique key: dt, sale_area_id'
PARTITIONED BY (`dt` int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
WITH SERDEPROPERTIES ('serialization.format' = '1')
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
TBLPROPERTIES (
'transient_lastDdlTime' = '1596110407',
'last_modified_time' = '1596110407',
'last_modified_by' = 'test'
);
Note on paths: when creating a managed table you may specify a LOCATION; if the path does not exist, it is created automatically at table-creation time. Without a LOCATION, the table is stored under the default warehouse path.
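For illustration, a minimal sketch of a managed table with an explicit LOCATION (the table name and path here are hypothetical):
create table dw.test_wang_loc(
id bigint comment 'id')
partitioned by (dt string comment 'date partition')
location '/user/hive/warehouse/dw.db/test_wang_loc'; -- directory is created automatically if it does not exist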
3. External partitioned table
CREATE EXTERNAL TABLE IF NOT EXISTS `tbl_usr_bind_card_count`(
`usr_id` string,
`trans_flow` string,
`trans_time` string,
`phone` string,
`cert_id` string,
`card_no` string)
PARTITIONED BY (
`hp_settle_dt` string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\;',
'serialization.format'='\;')
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count';
Note: the keyword EXTERNAL tells Hive the table is external.
When creating an external table you may specify a LOCATION; the path is not created automatically at table-creation time, so create the path first and then create the table. Without a LOCATION, the default path is used. Because the table is external, Hive does not treat itself as the sole owner of the data: dropping the table deletes only the table's metadata, not the data itself.
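A hedged walkthrough of that ownership rule, reusing the example above (schema simplified to one column for brevity):
DROP TABLE tbl_usr_bind_card_count; -- removes only the metastore entry; the files under LOCATION remain
CREATE EXTERNAL TABLE tbl_usr_bind_card_count_bak (`usr_id` string)
PARTITIONED BY (`hp_settle_dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ';'
LOCATION '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count';
-- partition metadata must then be re-added (ADD PARTITION / MSCK) before the old data is queryable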
Company external table:
CREATE external TABLE one_dw.ods_sn_source_leads_data_tmp1 (
`changed_type` int,
`id` bigint COMMENT 'unique id',
`account_id` string COMMENT 'account id')
COMMENT 'leads main table'
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'ofs://****/user/one_dw/.Trash/Current/user/hive/warehouse/one_dw.db/ods_table';
insert overwrite table one_dw.ods_tabl1 select * from source;
4. Copying a table schema
CREATE EXTERNAL TABLE IF NOT EXISTS tbl_usr_bind_card_count1
LIKE tbl_usr_bind_card_count
LOCATION '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count1';
5. Dropping a table
DROP TABLE tbl_usr_bind_card_count;
6. Renaming a Hive table
ALTER TABLE tbl_usr_bind_card_count RENAME TO tbl_usr_bind_card_count2;
7. Adding a partition
--add a partition
ALTER TABLE tbl_usr_bind_card_count ADD IF NOT EXISTS PARTITION(hp_settle_dt = 20220301);
--add a partition and specify its location
ALTER TABLE tbl_usr_bind_card_count ADD IF NOT EXISTS PARTITION(hp_settle_dt = 20220301)
LOCATION '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count/hp_settle_dt=20220301';
8. Modifying a partition
ALTER TABLE tbl_usr_bind_card_count PARTITION(hp_settle_dt=20220301)
SET LOCATION '/user/bdhysfas/data/tbl_usr_bind_card_count/hp_settle_dt=20220301';
9. Dropping a partition
ALTER TABLE tbl_usr_bind_card_count DROP PARTITION(hp_settle_dt=20220301);
10. Adding table columns
Method 1: old partitions can still be queried, with the new columns returning NULL; however, data in old partitions cannot be inserted or updated, while new partitions support both inserts and queries.
ALTER TABLE tbl_oper_in_flow add columns(
less_recovery double,
remark1 string,
remark2 string,
remark3 string);
Drawback: old partitions cannot be inserted into or updated.
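A sketch of that drawback, assuming tbl_oper_in_flow is partitioned by dt and partition dt=20240101 was written before the ALTER (the date is hypothetical):
select less_recovery from tbl_oper_in_flow where dt = 20240101; -- returns NULL for partitions written before the column was added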
Method 2: also applies the change to old partitions' historical data, so data in all partitions can be queried and inserted normally, but it may take a long time to process.
ALTER TABLE tbl_oper_in_flow add columns(
less_recovery double,
remark1 string,
remark2 string,
remark3 string) CASCADE;
Drawback: may take a long time.
Method 3: add a column after a specific column
alter table test123 add columns (test_col string comment 'newly added column');
alter table test123 change test_col test_col string after area_code;
11. Inserting data into a partition
INSERT INTO dw.test123 partition(dt=20240508) VALUES(111,222,333);
12. Modifying a column (rename / change type)
alter table dw.test111 change source_col target_col decimal(30,6);
13. Dropping columns from a managed Hive table
ALTER TABLE tiangong_test.test2 REPLACE COLUMNS (id integer, job_id integer, rule_name string);
II. Common SQL
1. Copying data
(Observed resource usage for this copy job: 2,663,424 MB of memory, 9.8% of the queue, 6.9% of the cluster.)
insert overwrite table test.dwd_c_temp_wang partition (dt, hour)
select * from test.dwd_c where dt = '20220713';
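Since both dt and hour are dynamic here, dynamic partitioning must be enabled before the insert:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict; -- required because no static partition value is given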
2. Computing the 95th percentile
In Hive:
select dt,
percentile_approx(cast(get_json_object(params,'$.elapsed') as float), 0.95) as p_95
from dw.dwd_bhv_performance
where dt = 20220523
group by get_json_object(params,'$.origin_url'), dt;
The same query in Presto/Trino syntax (approx_percentile and json_extract_scalar are Presto/Trino functions, not Hive), here returning several percentiles at once:
select dt,
approx_percentile(cast(json_extract_scalar(params,'$.elapsed') as double), array[0.25, 0.5, 0.75, 0.95]) as pcts
from dw.dwd_bhv_performance
where dt = '20220523'
group by json_extract_scalar(params,'$.origin_url'), dt;
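Hive's percentile_approx also accepts an array of percentiles, so the multi-percentile version can stay in Hive; a sketch against the same table:
select dt,
percentile_approx(cast(get_json_object(params,'$.elapsed') as double), array(0.25, 0.5, 0.75, 0.95)) as pcts
from dw.dwd_bhv_performance
where dt = 20220523
group by get_json_object(params,'$.origin_url'), dt;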
3. Top-N per dimension
select a.*
from
(
select t.*, row_number() over(partition by biz_name order by client_type) as num
from dw.dwd_bhv_clickbrowse_c_test_mb_hi t
where dt = 20241230
) a
where num <= 3;
4. Rows to a column (many rows into one column)
Description: group rows by a key and aggregate, concatenating one column's values into a single column.
Approach 1:
select id, concat_ws('_', collect_set(name)) as names
from (
select 1 as id, 'test1' as name
union all
select 1 as id, 'test2' as name
union all
select 1 as id, 'test3' as name
union all
select 1 as id, 'test4' as name
) t
group by id;
---------------------- Result ----------------------
id  names
1   test3_test1_test2_test4
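Note that collect_set deduplicates and does not guarantee element order (hence test3_test1_test2_test4 above); collect_list keeps duplicates, e.g.:
select id, concat_ws('_', collect_list(name)) as names
from (
select 1 as id, 'test1' as name
union all
select 1 as id, 'test1' as name
) t
group by id; -- returns 1  test1_test1 (duplicates preserved)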
5. Column to rows (one row into many)
Approach 1: split one row's value on a delimiter into multiple rows
Description: a column's value is split and displayed as multiple rows.
Here exl_tab is the name of the virtual table; explode does the column-to-rows work: it takes an array argument and, doing the opposite of collect_set, expands the array into multiple rows.
select id, name, exl_col from (
select 1 as id, '1,11' as name
union all
select 1 as id, '2,22' as name
union all
select 1 as id, '3,33' as name
) t
lateral view explode(split(name, ',')) exl_tab as exl_col;
---------------------------- Result ----------------------------
id name exl_col
1 1,11 1
1 1,11 11
1 2,22 2
1 2,22 22
1 3,33 3
1 3,33 33
Approach 2: an array column into multiple rows
select * from (
SELECT 1 AS id, array('1', '11') AS items
UNION ALL
SELECT 2 AS id, array('2', '22') AS items
UNION ALL
SELECT 3 AS id, array('3', '33') AS items
) t
LATERAL VIEW explode(items) exploded_table AS exploded_item;
--------------------- Result ---------------------
id  items       exploded_item
1   ["1","11"]  1
1   ["1","11"]  11
2   ["2","22"]  2
2   ["2","22"]  22
3   ["3","33"]  3
3   ["3","33"]  33
6. Multiple columns into multiple rows
Description: pack several columns into a map, then display them as key/value rows.
LATERAL VIEW: first applies the UDTF to each row of the base table, then joins the resulting output rows to the input rows to form a virtual table with the supplied table alias.
EXPLODE(tmp_column) turns the map column into rows; to carry the dimension columns along, combine it with LATERAL VIEW so the virtual table is joined back, producing multiple rows.
SELECT id, columnAlias, columnVal
FROM(
SELECT
id
,MAP('key1', sell_sku_count, 'key2', sell_sku_count_wac) AS tmp_column
FROM (
select 1 as id, 11 as sell_sku_count, 111 as sell_sku_count_wac
union all
select 2 as id, 22 as sell_sku_count, 222 as sell_sku_count_wac
union all
select 3 as id, 33 as sell_sku_count, 333 as sell_sku_count_wac
) t
) x LATERAL VIEW EXPLODE(tmp_column) tableAlias AS columnAlias, columnVal;
-------------------------------- Result --------------------------------
id columnAlias columnVal
1 key1 11
1 key2 111
2 key1 22
2 key2 222
3 key1 33
3 key2 333
III. Operations
1. Unlocking a locked table
Connect to the MySQL database backing the Hive metastore:
select * from HIVE_LOCKS limit 10;
delete from HIVE_LOCKS where HL_TABLE='ods_crm_domain_ocrm_private_sea_databus' and HL_USER='hadoop';
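Hive can also inspect and release locks through SQL, which avoids editing the metastore tables directly; UNLOCK TABLE applies with the ZooKeeper lock manager (not under the ACID transaction manager):
SHOW LOCKS ods_crm_domain_ocrm_private_sea_databus;
UNLOCK TABLE ods_crm_domain_ocrm_private_sea_databus;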
2. Parameter tuning
1) Automatically convert to a map join when one table is small:
set hive.auto.convert.join = true; -- default is false in old releases (true since Hive 0.11)
When this is true, Hive automatically sizes the left-side table and, if it qualifies as small, loads it into memory, i.e. uses a map join for the small table.
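With automatic conversion off, a map join can still be requested explicitly via a hint (the hint may be ignored on newer versions unless hive.ignore.mapjoin.hint is set to false; big_table/small_table are hypothetical):
SELECT /*+ MAPJOIN(s) */ b.id, s.name
FROM big_table b
JOIN small_table s ON b.id = s.id;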
2) Threshold between large and small tables:
set hive.mapjoin.smalltable.filesize;
hive.mapjoin.smalltable.filesize=25000000
Default: 25000000 bytes (25 MB)
3) How much memory a map join followed by GROUP BY may use to hold data; if the data is too large, it is not kept in memory.
set hive.mapjoin.followby.gby.localtask.max.memory.usage;
Default: 0.55
4) Fraction of memory the local task may use:
set hive.mapjoin.localtask.max.memory.usage;
Default: 0.9
3. Other parameter tuning
SET mapred.reduce.tasks=50;
SET mapreduce.reduce.memory.mb=6000;
SET mapreduce.reduce.shuffle.memory.limit.percent=0.06;
4. Loading a local CSV file into a Hive table
Note: delete the CSV header row first, otherwise it is loaded as a row of NULL values.
load data local inpath '/home/test/test.csv' into table dws.test partition(dt=20201111);
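An alternative that leaves the file untouched: mark the table so the first line of each file is skipped at read time (applies to text-format tables; dws.test reused from above):
ALTER TABLE dws.test SET TBLPROPERTIES ('skip.header.line.count' = '1');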
5. Loading data from another table
insert overwrite table `dws`.`dws_online_sku` partition(dt = 20201111)
select sale_area_id, owner_id, sale_type, customer_type, source_tag, sku_id, sku_day_flag, sku_wac_flag, sku_mac_flag, sku_j7_flag, sku_j30_flag, cmd_pop
from `dws`.`dws_online_sku_test` where dt = 20201111;
6. Manually adding a partition
alter table bdg_inf.binlog add partition (inst='test-prod', tn='test.test_tab', dt='20211229', hour='07') location 'ofs://f.chdfs.ap-beijing.myqcloud.com/user/hive/warehouse/test.db/binlog/test_tab/test.tab1/dt=20211229/hour=07';
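When many partition directories already exist under the table's location in the key=value layout, MSCK can register them all at once instead of issuing one ALTER per partition:
MSCK REPAIR TABLE bdg_inf.binlog;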