Hive常用的DDL,DQL和分析函数

  • DDL
-- Create a database; the default warehouse path is /user/hive/warehouse/<db>.db
CREATE DATABASE IF NOT EXISTS myhive;
-- Create a database at a custom HDFS location
-- (fix: statement was missing its terminating semicolon)
CREATE DATABASE set_location_hive LOCATION '/custom';

-- Drop a database
DROP DATABASE myhive;          -- fails if the database still contains tables
DROP DATABASE myhive CASCADE;  -- CASCADE drops contained tables as well

-- Managed (internal) table: Hive owns both metadata and data files,
-- so DROP TABLE deletes the data too.
-- (fix: removed the invalid duplicate `CREATE TABLE IF NOT EXISTS sudent;`
--  — typo'd name and no column list — and added the missing semicolon)
CREATE TABLE IF NOT EXISTS student( id INT, name STRING )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS textfile
LOCATION '/custom/stu';

-- External table: Hive manages only the metadata; dropping the table
-- leaves the underlying HDFS files in place.
CREATE EXTERNAL TABLE stu( sid INT, name STRING )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- Load data into a table.
-- LOCAL reads from the client filesystem (copies the file);
-- without LOCAL the path is on HDFS (the file is moved);
-- OVERWRITE replaces the table's existing data instead of appending.
LOAD DATA LOCAL INPATH '/hivedatas/stu.csv' INTO TABLE stu;
LOAD DATA INPATH '/data/stu.txt' OVERWRITE INTO TABLE student;

-- Partitioned table: each partition column becomes a directory level
-- under the table path (year / year-month / year-month-day here).
CREATE TABLE score( sid INT, sscore INT )
PARTITIONED BY (p_dy STRING, p_dym STRING, p_dymd STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- A static-partition load must name a value for every partition column
LOAD DATA LOCAL INPATH '/hivedatas/stu.csv' INTO TABLE score PARTITION(p_dy='',p_dym='',p_dymd='') ;

-- Add a partition (several PARTITION clauses may follow one ADD)
ALTER TABLE score ADD PARTITION(p_dy='2021',p_dym='202106',p_dymd='20210617');
-- Drop a partition
ALTER TABLE score DROP PARTITION(p_dy='2021',p_dym='202106',p_dymd='20210617');
-- Add columns to the table schema
ALTER TABLE score ADD COLUMNS (col1 STRING, col2 INT);
-- Drop the table / empty the table but keep its schema
DROP TABLE score;
TRUNCATE TABLE score;
-- Insert query results into one specific partition
INSERT INTO TABLE score PARTITION(p_dy='2021',p_dym='202106',p_dymd='20210617')
SELECT ...

 

  • DQL和常用的分析函数
  1. 四个排序函数:

    order by:会对输入做全局排序,只有一个reduce,会导致当输入规模较大时,需要较长的计算时间;

    sort by:局部排序,在每个reducer内部对数据排序;当mapred.reduce.tasks大于1时,只保证每个reduce输出有序,不保证全局有序(为1时等价于全局排序);

    distribute by:根据指定字段将数据分到不同的reducer,且分发算法是hash散列的;

    cluster by:具有distribute by的功能,同时对该字段进行排序,当分桶和sort是同一个字段时:cluster by = distribute by + sort by;

  2. 分桶表:
    提高join查询效率和抽样效率,分桶表数据不能通过load data和hdfs dfs -put的方式写进去,需要准备临时表作为跳板使用
    -- Enable bucketing enforcement (required on Hive < 2.x; always on since 2.x)
    set hive.enforce.bucketing=true;
    -- Match the reducer count to the bucket count
    -- (fix: property name is mapreduce.job.reduces, not mapreduce.job.reduce)
    set mapreduce.job.reduces=3;
    -- Create a bucketed table: rows are hashed on c_id into 3 buckets
    create table course( c_id string, c_name string )
    clustered by(c_id) into 3 buckets;

    -- Populate via INSERT ... SELECT from a staging table;
    -- LOAD DATA / hdfs put cannot produce correctly bucketed files
    insert overwrite table course
    select * from course_common cluster by(c_id); -- distribute + sort by the bucket column

     

  3. 开窗函数:
    -- Aggregate functions combined with OVER: sum, avg, max, min
    -- Running total from the partition start to the current row.
    -- NOTE: with ORDER BY and no explicit frame, the default frame is
    -- RANGE UNBOUNDED PRECEDING .. CURRENT ROW, so tied create_time
    -- values share the same running sum.
    SELECT cookie_id,
      sum(pv) over(partition by cookie_id order by create_time) as cum_pv
    FROM t_test

    -- Explicit frame: the current row plus the 2 rows before it
    SELECT cookie_id,
      sum(pv) over(partition by cookie_id order by create_time rows between 2 preceding and current row )
    FROM t_test

    -- Without ORDER BY the whole partition is aggregated:
    -- every row gets the group total
    SELECT cookie_id, sum(pv) over(partition by cookie_id) FROM t5

    -- Ranking functions: row_number, rank, dense_rank, ntile
    -- row_number: unique 1..n; rank: ties share a rank and leave gaps;
    -- dense_rank: ties share a rank, no gaps
    SELECT cookie_id, pv,
      row_number() over(partition by cookie_id order by pv desc) rn,
      rank() over(partition by cookie_id order by pv desc) rn1,
      dense_rank() over(partition by cookie_id order by pv desc) rn2
    FROM t5;

    -- ntile(n): deals the ordered partition into n buckets and
    -- labels each row with its bucket number (similar idea to bucketing)
    SELECT cookie_id, pv, ntile(3) over(partition by cookie_id order by create_time) as rn
    FROM t5;


    -- Other analytic functions: lag, lead, first_value, last_value
    -- LAG(col, offset, default): value `offset` rows before the current row;
    -- the 3rd argument is returned when no such row exists (else NULL)
    SELECT cookie_id, create_time, pv,
      ROW_NUMBER() OVER(PARTITION BY cookie_id ORDER BY create_time) AS rn,
      LAG(create_time,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookie_id ORDER BY create_time) AS last_1_time,
      LAG(create_time,2) OVER(PARTITION BY cookie_id ORDER BY create_time) AS last_2_time
    FROM t5;
  4. 行列转换:
    -- Rows to columns: collapse names sharing the same (location, blood_type)
    WITH tmp AS(
      SELECT name, concat_ws('-', location, blood_type) as base
      FROM t_acc
    )
    SELECT base, concat_ws('|', collect_set(name)) as cat
    FROM tmp
    GROUP BY base

    -- Columns to rows
    -- sample data (tab-separated):
    -- zhangsan child1,child2,child3,child4 k1:v1,k2:v2
    -- lisi child5,child6,child7,child8 k3:v3,k4:v4
    -- (fix: Hive arrays require an element type — `array` alone is invalid;
    --  also added the missing statement terminator)
    CREATE TABLE t3( name string, children array<string>, address Map<string, string> )
    row format delimited fields terminated by '\t'
    collection items terminated by ','
    map keys terminated by ':'
    lines terminated by '\n';

    -- explode an array into one row per element
    SELECT explode(children) as mychild FROM t3;
    -- explode a map into (key, value) rows
    SELECT explode(address) as (mapKey, mapValue) FROM t3;

    -- LATERAL VIEW keeps the other columns alongside the exploded rows
    SELECT name, t_add_key,t_add_value
    FROM T3 LATERAL VIEW explode(address) tmp as t_add_key, t_add_value

     

  5. 一些SQL技巧
    -- "Delete" in Hive: plain tables are not updatable, so rewrite the
    -- table keeping every row except the one to drop (id = 666)
    insert overwrite table t_acc
    select *
    from t_acc
    where id <> 666;

    -- "Update" in Hive: rewrite the table, substituting age = 21
    -- on the row with id = 666 and passing other rows through unchanged
    insert overwrite table t_acc
    select
        id,
        case when id = 666 then 21 else age end as age
    from t_acc;
    
    -- Rows to columns: pack each user's visits into one
    -- comma-separated "date:count" string
    with per_visit as (
        select userId, concat(visitDate, ':', visit_count) as kv
        from t_acc
    )
    select
        userId,
        concat_ws(',', collect_list(kv)) as label
    from per_visit
    group by userId;

    -- Columns to rows: split the packed string back into
    -- one (date, count) row per visit
    select
        userId,
        split(kv, ':')[0] as visit_date,
        split(kv, ':')[1] as visit_cnt
    from t_explode
    lateral view explode(split(label, ',')) ex as kv
    
    -- Analytic functions: lag, lead, first_value, last_value
    -- lag/lead(col, offset, default) read backwards/forwards within the
    -- partition; first_value returns the first value of the ordered partition
    SELECT
        userId, visitDate, visit_count,
        LAG(visit_count, 1, 0)  OVER (PARTITION BY userId ORDER BY visitDate) AS lag_value,
        LEAD(visit_count, 1, 0) OVER (PARTITION BY userId ORDER BY visitDate) AS lead_value,
        FIRST_VALUE(visit_count) OVER (PARTITION BY userId ORDER BY visitDate) AS head_value
    FROM t_acc;
    
    -- Multi-dimensional aggregation: GROUPING SETS / WITH CUBE
    -- (fix: Hive's virtual column is GROUPING__ID with TWO underscores;
    --  also removed a stray duplicate semicolon)
    select city_code, biz_date, sku_type, sum(amount) as total_amount, GROUPING__ID
    from t_order
    group by city_code, biz_date, sku_type
    grouping sets ((city_code, biz_date, sku_type), (city_code, biz_date), (city_code, sku_type), (city_code), (sku_type));

    -- WITH CUBE expands to every combination of the grouped dimensions,
    -- including the all-NULL grand total
    select
        city_code,
        biz_date,
        sku_type,
        sum(amount),
        GROUPING__ID
    from t_order
    group by city_code,biz_date,sku_type with cube;
    
    -- Mitigating GROUP BY data skew: two-stage aggregation.
    -- Stage 1 salts each row with a random key (rd) so a hot label is
    -- spread over many reducers; stage 2 merges the partial counts.
    -- (fixes: the innermost query never selected `label`, which the outer
    --  GROUP BY references; `all` is a reserved word, renamed to total_cnt)
    select label, sum(cnt) as total_cnt from
    (
        select rd, label, sum(1) as cnt from
        (
            select id, label, round(rand(),2) as rd, value from tmp1
        ) as salted
        group by rd, label
    ) as partial
    group by label;
    
    -- Take the first 20% of rows ordered by visit_count, via rank / total.
    -- (fix: the filter used 0.3, inconsistent with the stated 20%;
    --  use ORDER BY visit_count DESC to take the highest 20% instead)
    with tmp as (
        select
            userId, visitDate, visit_count,
            row_number() over(order by visit_count) as rn,
            count(1) over() as total_cnt
        from t_acc
    )
    select *
    from tmp
    where rn / total_cnt <= 0.2;
    
    -- Quantile approach: compute the 0.3 / 0.7 quantiles of visit_count;
    -- to filter the base table on the result you would CROSS JOIN this
    -- one-row output back, since Hive has no scalar WITH like ClickHouse.
    -- NOTE(review): percentile() expects integer input — for doubles use
    -- percentile_approx; confirm visit_count's type.
    select percentile(visit_count, array(0.3, 0.7)) as arr
    from t_acc

    -- NTILE approach: deal rows ordered by visit_count into 5 equal buckets
    -- and keep bucket 1 — the lowest 20% under this ascending order
    -- (ORDER BY visit_count DESC would make bucket 1 the top 20%)
    select *
    from (
        select
            userId,
            visitDate,
            visit_count,
            ntile(5) over(order by visit_count) as bulk_num
        from t_acc
    ) t
    where t.bulk_num = 1;

     

   

    

   

    

    

posted @ 2021-11-22 11:11  Shydow  阅读(81)  评论(0编辑  收藏  举报