Common Hive Statements

I. Altering Table Structure

1. Adding and dropping partitions

Add a partition: alter table app.app_tabe1 add if not exists partition(dt = '20210621', hour = '17');

Drop a partition: alter table app.app_tab1 drop partition(dt=20210915, hour=11);

Drop partitions in bulk: ALTER TABLE dw.table1 DROP IF EXISTS PARTITION(dt >= '20220525', dt <= '20220605');

2. Creating tables -- managed (internal) partitioned table

create table dw.test_wang(
  event_id string comment 'event id',
  biz_type bigint comment 'business line',
  client_type bigint comment 'client type',
  md5_log string comment 'unique tracking-event key',
  event_type bigint comment 'event type 1: view 2: click 3: play 4: impression 5: long press 6: page response 7: time consumed',
  sdk_version string comment 'sdk version',
  distinct_id string comment 'unique id',
  app_channel string comment 'source channel')
partitioned by (dt string comment 'date partition');
-- plain text (CSV) format table
CREATE TABLE `dws`.`dws_online_sku_test`(
  `sale_area_id` bigint COMMENT 'sales area id',
  `owner_id` bigint COMMENT 'supplier id (pop id); 1 = first-party, >1 = pop merchant',
  `sale_type` bigint COMMENT 'merchant ownership for the channel',
  `customer_type` bigint COMMENT '1 = non-household individual, 2 = household individual',
  `source_tag` bigint COMMENT '1 = first-party, 2 = pop; first-party + pop: in (1,2)',
  `sku_id` bigint COMMENT 'sku_id',
  `sku_day_flag` bigint COMMENT 'whether the sku is online today',
  `sku_wac_flag` bigint COMMENT 'whether the sku is online this week',
  `sku_mac_flag` bigint COMMENT 'whether the sku is online this month',
  `sku_j7_flag` bigint COMMENT 'whether the sku was online in the last 7 days',
  `sku_j30_flag` bigint COMMENT 'whether the sku was online in the last 30 days',
  `cmd_pop` bigint COMMENT 'county-level pop tag 7'
) PARTITIONED BY (`dt` int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",", "quoteChar" = "\"")
STORED AS TEXTFILE;
-- ORC compressed table
CREATE TABLE `dws`.`dws_online_sku`(
  `sale_area_id` bigint COMMENT 'sales area id',
  `owner_id` bigint COMMENT 'supplier id (pop id); 1 = first-party, >1 = pop merchant',
  `sale_type` bigint COMMENT 'merchant ownership for the channel',
  `customer_type` bigint COMMENT '1 = non-household individual, 2 = household individual',
  `source_tag` bigint COMMENT '1 = first-party, 2 = pop; first-party + pop: in (1,2)',
  `sku_id` bigint COMMENT 'sku_id',
  `sku_day_flag` bigint COMMENT 'whether the sku is online today',
  `sku_wac_flag` bigint COMMENT 'whether the sku is online this week',
  `sku_mac_flag` bigint COMMENT 'whether the sku is online this month',
  `sku_j7_flag` bigint COMMENT 'whether the sku was online in the last 7 days',
  `sku_j30_flag` bigint COMMENT 'whether the sku was online in the last 30 days',
  `cmd_pop` bigint COMMENT 'county-level pop tag 7'
) COMMENT 'unique key: dt, sale_area_id'
PARTITIONED BY (`dt` int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
WITH SERDEPROPERTIES ('serialization.format' = '1')
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
TBLPROPERTIES (
  'transient_lastDdlTime' = '1596110407',
  'last_modified_time' = '1596110407',
  'last_modified_by' = 'test'
);
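Note that the DDL above does not pin a compression codec; ORC compresses with ZLIB by default. A minimal, illustrative way to choose the codec explicitly (SNAPPY here is an example value, not something from the original table):

ALTER TABLE `dws`.`dws_online_sku` SET TBLPROPERTIES ('orc.compress' = 'SNAPPY'); -- affects newly written data; valid values include NONE, ZLIB, SNAPPY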

Note on location: when creating a managed table you may specify a location; if the path does not exist, Hive creates it automatically at table-creation time. If no location is given, the table is stored under the default warehouse path.
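A minimal sketch of that note (the table name and path here are hypothetical):

create table dw.test_location_demo(id bigint)
partitioned by (dt string)
location '/user/hive/warehouse/demo/test_location_demo'; -- Hive creates this path if it does not exist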

3. External partitioned table

CREATE EXTERNAL TABLE IF NOT EXISTS `tbl_usr_bind_card_count`(
  `usr_id` string, 
  `trans_flow` string, 
  `trans_time` string, 
  `phone` string, 
  `cert_id` string, 
  `card_no` string)
PARTITIONED BY ( 
  `hp_settle_dt` string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
WITH SERDEPROPERTIES ( 
  'field.delim'='\;', 
  'serialization.format'='\;') 
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count';

Note: the EXTERNAL keyword tells Hive that this table is external.

When creating an external table you may specify a location; Hive does not create the path automatically at table-creation time, so create the path first and then create the table. If no location is given, the default path is used. Because the table is external, Hive does not assume full ownership of the data: dropping the table does not delete the data, only the table's metadata.
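Runnable from the Hive CLI, a one-line way to create the directory before executing the DDL above (dfs shells out to the underlying filesystem):

dfs -mkdir -p /user/bdhysfas/data/tmp/tbl_usr_bind_card_count;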

An external table from production:

CREATE EXTERNAL TABLE one_dw.ods_sn_source_leads_data_tmp1 (
  `changed_type` int,
  `id` bigint COMMENT 'unique id',
  `account_id` string COMMENT 'account id')
COMMENT 'main leads table'
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'ofs://****/user/one_dw/.Trash/Current/user/hive/warehouse/one_dw.db/ods_table';

insert overwrite table one_dw.ods_tabl1 select * from source;

4. Copying a table schema

CREATE EXTERNAL TABLE IF NOT EXISTS tbl_usr_bind_card_count1
LIKE tbl_usr_bind_card_count
LOCATION '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count1';

5. Dropping a table

DROP TABLE tbl_usr_bind_card_count;

6. Renaming a Hive table

ALTER TABLE tbl_usr_bind_card_count RENAME TO tbl_usr_bind_card_count2;

7. Adding partitions

-- add a partition
ALTER TABLE tbl_usr_bind_card_count ADD IF NOT EXISTS PARTITION(hp_settle_dt = 20220301);
-- add a partition and specify its location
ALTER TABLE tbl_usr_bind_card_count ADD IF NOT EXISTS PARTITION(hp_settle_dt = 20220301) 
LOCATION  '/user/bdhysfas/data/tmp/tbl_usr_bind_card_count/hp_settle_dt=20220301';

8. Modifying a partition's location

ALTER TABLE tbl_usr_bind_card_count PARTITION(hp_settle_dt=20220301)
SET LOCATION '/user/bdhysfas/data/tbl_usr_bind_card_count/hp_settle_dt=20220301';

9. Dropping a partition

ALTER TABLE tbl_usr_bind_card_count  DROP PARTITION(hp_settle_dt=20220301);

10. Adding table columns

Method 1: old partitions remain queryable (the new columns read as NULL there), but the new columns in old partitions cannot be written by insert or update; new partitions support insert and query normally.
ALTER TABLE tbl_oper_in_flow add columns(
    less_recovery double,
    remark1 string,
    remark2 string,
    remark3 string);
Drawback: old partitions cannot be inserted into or updated.

Method 2: CASCADE also applies the change to the metadata of every existing partition, so historical data in all partitions can be queried and inserted normally, though the statement may take a long time to run.
ALTER TABLE tbl_oper_in_flow add columns(
    less_recovery double,
    remark1 string,
    remark2 string,
    remark3 string) CASCADE;
Drawback: may take a long time to run.

Method 3: add a column, then move it after a specific column
alter table test123 add columns (test_col string comment 'newly added column');
alter table test123 change test_col test_col string after area_code;

11. Inserting data into a partition

INSERT INTO dw.test123 partition(dt=20240508) VALUES(111,222,333);

12. Changing a column's name and type

alter table dw.test111 change source_col target_col decimal(30,6);

13. Dropping columns from a managed Hive table (REPLACE COLUMNS keeps only the listed columns; all others are removed from the schema)

ALTER TABLE tiangong_test.test2 REPLACE COLUMNS (id integer, job_id integer, rule_name string);

II. Common SQL

1. Copying data

(Observed job footprint: 2663424 MB of memory, 9.8% of the queue, 6.9% of the cluster.)

insert overwrite table test.dwd_c_temp_wang partition (dt, hour)
SELECT * from test.dwd_c where dt='20220713';
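The statement above fills the (dt, hour) partitions dynamically from the select output. A hedged sketch of the session settings such inserts usually need (the values are illustrative, not taken from the original post):

set hive.exec.dynamic.partition=true;           -- allow dynamic partition inserts
set hive.exec.dynamic.partition.mode=nonstrict; -- no leading static partition column required
set hive.exec.max.dynamic.partitions=1000;      -- raise if the insert creates many partitions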

2. Computing the 95th percentile

-- Hive: percentile_approx
select dt,
       percentile_approx(cast(get_json_object(params, '$.elapsed') as float), 0.95) AS p_95
from dw.dwd_bhv_performance
where dt = 20220523
GROUP BY get_json_object(params, '$.origin_url'), dt;

-- Presto/Trino equivalent: json_extract_scalar and approx_percentile are Presto functions, not Hive built-ins
select dt,
       approx_percentile(cast(json_extract_scalar(params, '$.elapsed') as double), array[0.25, 0.5, 0.75, 0.95]) as p_95
from dw.dwd_bhv_performance
where dt = '20220523'
GROUP BY json_extract_scalar(params, '$.origin_url'), dt;

3. Top-N rows per dimension

select a.*
from
(
    select t.*,
           row_number() over(partition by biz_name order by client_type) as num
    from dw.dwd_bhv_clickbrowse_c_test_mb_hi t
    where dt = 20241230
) a
where num <= 3;

4. Rows to columns (collapse multiple rows into one value)

Description: group rows by a key, then aggregate by concatenating one column's values into a single string.

Approach 1:

select id, concat_ws('_', collect_set(name)) as names
from (
    select 1 as id, 'test1' as name
    union all
    select 1 as id, 'test2' as name
    union all
    select 1 as id, 'test3' as name
    union all
    select 1 as id, 'test4' as name
) t
group by id;
---------------------- result
id names
1  test3_test1_test2_test4
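collect_set deduplicates and guarantees no particular order, which is why the result above is unsorted. A sketch of the variant that keeps duplicates and sorts deterministically, on the same kind of toy data:

select id, concat_ws('_', sort_array(collect_list(name))) as names
from (
    select 1 as id, 'test2' as name
    union all
    select 1 as id, 'test1' as name
) t
group by id; -- result: 1  test1_test2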

5. Columns to rows (expand one row into multiple rows)

Approach 1: split a value on a delimiter and expand it into multiple rows

Description: a column value is split and displayed as multiple rows.

Here exl_tab is the alias of the virtual table produced by the lateral view. explode does the column-to-rows work: it takes an array argument and is the inverse of collect_set, emitting one output row per array element.

select id, name, exl_col from (
    select 1 as id, '1,11' as name
    union all
    select 1 as id, '2,22' as name
    union all
    select 1 as id, '3,33' as name
) t
lateral view explode(split(name, ',')) exl_tab as exl_col;
---------------------------- result ----------------------------
id	name	exl_col
1	1,11	1
1	1,11	11
1	2,22	2
1	2,22	22
1	3,33	3
1	3,33	33

Approach 2: explode an array column into multiple rows

select * from (
    SELECT 1 AS id, array('1', '11') AS items
    UNION ALL
    SELECT 2 AS id, array('2', '22') AS items
    UNION ALL
    SELECT 3 AS id, array('3', '33') AS items
) t
LATERAL VIEW explode(items) exploded_table AS exploded_item;
--------------------- result ---------------------
id	items	exploded_item
1	["1","11"]	1
1	["1","11"]	11
2	["2","22"]	2
2	["2","22"]	22
3	["3","33"]	3
3	["3","33"]	33
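When the element's position also matters, posexplode emits a 0-based index column alongside the value; a short sketch on one of the rows above:

select id, pos, item
from (
    SELECT 1 AS id, array('1', '11') AS items
) t
LATERAL VIEW posexplode(items) pe AS pos, item; -- rows: (1, 0, '1') and (1, 1, '11')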

6. Multiple columns to multiple rows

Description: pack several columns into a map, then present them as key/value rows.

LATERAL VIEW: a lateral view first applies the UDTF to each row of the base table, then joins the resulting output rows to the input row, forming a virtual table with the supplied table alias.

EXPLODE(tmp_column) turns the map column into rows; to carry the dimension columns along, combine it with LATERAL VIEW so the virtual table is joined back to each input row, yielding multiple output rows.

SELECT id, columnAlias, columnVal
FROM (
    SELECT
        id
        ,MAP('key1', sell_sku_count, 'key2', sell_sku_count_wac) AS tmp_column
    FROM (
        select 1 as id, 11 as sell_sku_count, 111 as sell_sku_count_wac
        union all
        select 2 as id, 22 as sell_sku_count, 222 as sell_sku_count_wac
        union all
        select 3 as id, 33 as sell_sku_count, 333 as sell_sku_count_wac
    ) t
) x LATERAL VIEW EXPLODE(tmp_column) tableAlias AS columnAlias, columnVal;
-------------------------------- result --------------------------------
id	columnAlias	columnVal
1	key1	11
1	key2	111
2	key1	22
2	key2	222
3	key1	33
3	key2	333

 

III. Operations

1. Unlocking a locked table

Connect to the MySQL database backing the Hive metastore:

select * from HIVE_LOCKS limit 10;

delete from HIVE_LOCKS where HL_TABLE='ods_crm_domain_ocrm_private_sea_databus' and HL_USER='hadoop';
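Before editing the metastore tables directly, it may be worth trying this from Hive itself; whether UNLOCK TABLE works depends on the configured lock manager, so treat it as a hedged alternative:

SHOW LOCKS ods_crm_domain_ocrm_private_sea_databus;
UNLOCK TABLE ods_crm_domain_ocrm_private_sea_databus; -- not supported by every lock manager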

2. Parameter tuning

1) Automatically convert to a map join when one side is small:
set hive.auto.convert.join = true; -- default was false in older Hive releases (true since 0.11)
When this is true, Hive checks the size of the smaller input table and, if it is below the threshold, loads it into memory, i.e. uses a map join for the small table.

2) Threshold separating a small table from a big one:
set hive.mapjoin.smalltable.filesize;
hive.mapjoin.smalltable.filesize=25000000
The default is 25000000 bytes (25 MB).

3) How much memory a map join followed by a GROUP BY may use in the local task to hold its data; if the data is too large, it is not kept in memory:
set hive.mapjoin.followby.gby.localtask.max.memory.usage;
Default: 0.55

4) Fraction of memory the local task may use (default: 0.90); a combined sketch of all four settings follows below:
set hive.mapjoin.localtask.max.memory.usage;
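Putting the four knobs together as one session preamble (the values are illustrative, not recommendations from the original post):

set hive.auto.convert.join = true;                               -- enable automatic map join
set hive.mapjoin.smalltable.filesize = 25000000;                 -- small-table threshold, bytes
set hive.mapjoin.followby.gby.localtask.max.memory.usage = 0.55; -- map join + group by local memory cap
set hive.mapjoin.localtask.max.memory.usage = 0.90;              -- local task memory cap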

3. Other parameter tuning

SET mapred.reduce.tasks=50;                              -- number of reduce tasks
SET mapreduce.reduce.memory.mb=6000;                     -- memory per reduce container, MB
SET mapreduce.reduce.shuffle.memory.limit.percent=0.06;  -- max share of the shuffle buffer a single map output may take

4. Loading a local CSV file into a Hive table

Note: delete the CSV header row first, otherwise it is loaded as a junk row; an alternative that keeps the file intact is shown after the statement below.

load data local inpath '/home/test/test.csv' into table dws.test partition(dt=20201111);
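A hedged alternative to hand-deleting the header: for text-format tables, Hive can skip it at read time via a table property (it applies to the whole table, not to a single load):

ALTER TABLE dws.test SET TBLPROPERTIES ('skip.header.line.count' = '1');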

5. Loading data from another table

insert overwrite table `dws`.`dws_online_sku` partition(dt = 20201111) 
select sale_area_id, owner_id, sale_type, customer_type, source_tag, sku_id, sku_day_flag, sku_wac_flag, sku_mac_flag, sku_j7_flag, sku_j30_flag, cmd_pop 
from `dws`.`dws_online_sku_test` where dt = 20201111;

6. Manually adding a partition

alter table bdg_inf.binlog add partition (inst='test-prod', tn='test.test_tab', dt='20211229', hour='07')
location 'ofs://f.chdfs.ap-beijing.myqcloud.com/user/hive/warehouse/test.db/binlog/test_tab/test.tab1/dt=20211229/hour=07';
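When many partition directories already exist on storage and follow Hive's default key=value naming convention, MSCK can register them in one pass instead of adding each by hand (it will not discover partitions stored at custom locations like the one above):

MSCK REPAIR TABLE bdg_inf.binlog; -- scans the table location and adds missing partitions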

