Hive常用的DDL,DQL和分析函数
- DDL
-- Create a database; the default location is /user/hive/warehouse/<db>.db
CREATE DATABASE IF NOT EXISTS myhive;

-- Create a database at an explicit HDFS location
CREATE DATABASE set_location_hive LOCATION '/custom';

-- Drop a database (fails if it still contains tables)
DROP DATABASE myhive;

-- CASCADE drops the database together with all of its tables
DROP DATABASE myhive CASCADE;
-- Managed (internal) table: the data lives under the table's location
-- and is deleted together with the table on DROP TABLE.
CREATE TABLE IF NOT EXISTS student (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/custom/stu';
-- External table: DROP TABLE removes only the metadata;
-- the underlying HDFS files are kept.
CREATE EXTERNAL TABLE stu (
    sid  INT,
    name STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

-- Load from the local filesystem (the file is copied into the table directory)
LOAD DATA LOCAL INPATH '/hivedatas/stu.csv' INTO TABLE stu;

-- Load from HDFS (the file is moved), replacing any existing data
LOAD DATA INPATH '/data/stu.txt' OVERWRITE INTO TABLE student;
-- Partitioned table: each partition value maps to its own HDFS directory,
-- so filters on partition columns prune data instead of scanning it.
CREATE TABLE score (
    sid    INT,
    sscore INT
)
PARTITIONED BY (p_dy STRING, p_dym STRING, p_dymd STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

-- Loading into a partitioned table requires concrete partition values
LOAD DATA LOCAL INPATH '/hivedatas/stu.csv'
INTO TABLE score PARTITION (p_dy='2021', p_dym='202106', p_dymd='20210617');

-- Add a partition (several PARTITION clauses may be chained in one ALTER)
ALTER TABLE score ADD PARTITION (p_dy='2021', p_dym='202106', p_dymd='20210617');

-- Drop a partition
ALTER TABLE score DROP PARTITION (p_dy='2021', p_dym='202106', p_dymd='20210617');

-- Add columns to an existing table
ALTER TABLE score ADD COLUMNS (col1 STRING, col2 INT);

-- Drop the table entirely / remove all rows but keep the table definition
DROP TABLE score;
TRUNCATE TABLE score;

-- Insert query results into a specific partition
INSERT INTO TABLE score PARTITION (p_dy='2021', p_dym='202106', p_dymd='20210617')
SELECT ...;  -- query must supply (sid, sscore)
- DQL和常用的分析函数
- 四个排序函数:
order by:会对输入做全局排序,只有一个reduce,会导致当输入规模较大时,需要较长的计算时间;
sort by:局部排序,在数据进入reduce之前完成排序;当mapred.reduce.tasks>1时,只保证每个reduce输出有序,不保证全局有序(为1时等价于order by);
distribute by:根据指定字段将数据分到不同的reducer,且分发算法是hash散列的;
cluster by:具有distribute by的功能,同时对该字段进行排序,当分桶和sort是同一个字段时:cluster by = distribute by + sort by;
- 分桶表:
提高join查询效率和抽样效率,分桶表数据不能通过load data和hdfs dfs -put的方式写进去,需要准备临时表作为跳板使用
-- Enable bucketing enforcement (required before Hive 2.0 so that
-- inserts produce one output file per bucket)
set hive.enforce.bucketing=true;
-- Match the reducer count to the bucket count
-- (the correct property name is mapreduce.job.reduces, not .reduce)
set mapreduce.job.reduces=3;

-- Create a bucketed table
create table course (
    c_id   string,
    c_name string
)
clustered by (c_id) into 3 buckets;

-- Populate via a staging table; CLUSTER BY the bucket column (last in the
-- query) so rows are hash-distributed into the right buckets
insert overwrite table course
select * from course_common cluster by (c_id);

- 开窗函数:
-- Aggregate functions (sum, avg, max, min) combined with OVER.
-- With ORDER BY in the window, the default frame is a running
-- accumulation from the partition start up to the current row.
SELECT cookie_id,
       sum(pv) over(partition by cookie_id order by create_time) as cum_pv
FROM t_test;

-- Explicit frame: the previous 2 rows plus the current row
SELECT cookie_id,
       sum(pv) over(partition by cookie_id order by create_time
                    rows between 2 preceding and current row) as cum_pv
FROM t_test;

-- Without ORDER BY the window spans the whole partition,
-- so every row receives the partition total
SELECT cookie_id, sum(pv) over(partition by cookie_id) FROM t5;

-- Ranking functions: row_number, rank, dense_rank, ntile
SELECT cookie_id, pv,
       row_number() over(partition by cookie_id order by pv desc) rn,   -- unique sequence, no gaps
       rank() over(partition by cookie_id order by pv desc) rn1,        -- ties share a rank, gaps follow
       dense_rank() over(partition by cookie_id order by pv desc) rn2   -- ties share a rank, no gaps
FROM t5;

-- ntile(n): assigns each row in the partition to one of n buckets,
-- similar in spirit to bucketing with cluster by
SELECT cookie_id, pv, ntile(3) over(partition by cookie_id order by create_time) as rn
FROM t5;

-- Offset functions: lag, lead, first_value, last_value
SELECT cookie_id, create_time, pv,
       ROW_NUMBER() OVER(PARTITION BY cookie_id ORDER BY create_time) AS rn,
       LAG(create_time,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookie_id ORDER BY create_time) AS last_1_time,
       LAG(create_time,2) OVER(PARTITION BY cookie_id ORDER BY create_time) AS last_2_time
FROM t5;

- 行列转换:
-- Rows to column: collapse names sharing the same
-- (location, blood_type) key into one delimited string
WITH tmp AS (
    SELECT name, concat_ws('-', location, blood_type) as base
    FROM t_acc
)
-- collect_set also deduplicates; use collect_list to keep duplicates
SELECT base, concat_ws('|', collect_set(name)) as cat
FROM tmp
GROUP BY base;
-- Column to rows (explode)
-- Sample data (tab-separated):
-- zhangsan child1,child2,child3,child4 k1:v1,k2:v2
-- lisi child5,child6,child7,child8 k3:v3,k4:v4
CREATE TABLE t3 (
    name     string,
    children array<string>,          -- Hive requires the element type
    address  Map<string, string>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n';

-- explode an array: one output row per element
SELECT explode(children) as mychild FROM t3;
-- explode a map: one output row per key/value pair
SELECT explode(address) as (mapKey, mapValue) FROM t3;

-- LATERAL VIEW joins the exploded rows back to their source row,
-- so ordinary columns (name) can be selected alongside them
SELECT name, t_add_key, t_add_value
FROM t3 LATERAL VIEW explode(address) tmp as t_add_key, t_add_value;

- 一些SQL技巧
-- Delete rows without ACID support: rewrite the table keeping everything else
-- (example: drop the row with id = 666)
insert overwrite table t_acc
select * from t_acc where id <> 666;

-- Update rows without ACID support: rewrite the table with the new value
-- (example: set age = 21 where id = 666)
insert overwrite table t_acc
select id, if(id = 666, 21, age) as age from t_acc;

-- Rows to column: one delimited string per user
with tmp as (
    select userId, concat(visitDate, ":", visit_count) as value
    from t_acc
)
select userId, concat_ws(",", collect_list(value)) as label
from tmp
group by userId;

-- Column to rows: split the delimited string back into rows
select userId,
       split(info, ":")[0] as visit_date,
       split(info, ":")[1] as visit_cnt
from t_explode lateral view explode(split(label, ',')) t as info;

-- Offset/boundary window functions: lag, lead, first_value, last_value
select userId, visitDate, visit_count,
       lag(visit_count, 1, 0)  over(partition by userId order by visitDate) as lag_value,
       lead(visit_count, 1, 0) over(partition by userId order by visitDate) as lead_value,
       first_value(visit_count) over(partition by userId order by visitDate) as head_value
from t_acc;

-- Multi-dimensional aggregation: GROUPING SETS lists the exact dimension
-- combinations to aggregate. Hive's virtual column is GROUPING__ID
-- (double underscore), not GROUPING_ID.
select city_code, biz_date, sku_type, sum(amount) as total_amount, GROUPING__ID
from t_order
group by city_code, biz_date, sku_type
grouping sets ((city_code, biz_date, sku_type),
               (city_code, biz_date),
               (city_code, sku_type),
               (city_code),
               (sku_type));

-- WITH CUBE aggregates every combination of the dimensions (including none)
select city_code, biz_date, sku_type, sum(amount), GROUPING__ID
from t_order
group by city_code, biz_date, sku_type with cube;

-- Mitigating GROUP BY data skew: salt the key with a random prefix,
-- pre-aggregate on (salt, key), then aggregate again without the salt
select label, sum(cnt) as total_cnt   -- 'all' is a reserved word; avoid it as an alias
from (
    select rd, label, sum(1) as cnt
    from (
        select id, round(rand(), 2) as rd, value from tmp1
    ) as tmp
    group by rd, label
) as tmp
group by label;

-- Top fraction of rows (here the bottom 30% by visit_count)
-- via row_number plus a whole-table window count
with tmp as (
    select userId, visitDate, visit_count,
           row_number() over(order by visit_count) as rn,
           count(1) over() as total_cnt
    from t_acc
)
select * from tmp where rn / total_cnt < 0.3;

-- Or compute quantiles directly; the single-row result can be joined back
-- with a cross join (Hive has no scalar WITH binding like ClickHouse)
select percentile(visit_count, array(0.3, 0.7)) as arr from t_acc;

-- Or use NTILE: bucket rows by rank and keep the first bucket
-- (bucket 1 of 5 = bottom 20%)
select *
from (
    select userId, visitDate, visit_count,
           ntile(5) over(order by visit_count) as bulk_num
    from t_acc
) t
where t.bulk_num = 1;

浙公网安备 33010602011771号