hive-分析函数

bitmap
--------------
    位图/位映射。
    5
    
hive
--------------
    分区表
    udtf函数
    wordcount
    lateral view            //和udtf配合使用。
    order by                //全局排序,仅单个reduce,数据量大时易产生数据倾斜
    sort by                    //reduce内排序
    distribute by            //分区
    cluster by                //sort by + distribute by

hive
--------------
    --显式表头
    hive>set hive.cli.print.header=true ;

hive分析函数
--------------
    1.准备数据
        -- Demo table for the analytic-function examples below.
        -- One row per employee, loaded from a comma-delimited text file.
        CREATE TABLE emp (
            id     INT,        -- surrogate row id
            empno  STRING,     -- employee name/code (e.g. 'tom1')
            age    INT,
            salary FLOAT,
            deptno INT         -- department id, used as the window partition key
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        STORED AS TEXTFILE;

    2.加载数据
        1,tom1,28,3000,1
        2,tom2,29,5000,2
        3,tom3,30,5400,1
        4,tom4,28,6600,3
        5,tom5,27,8000,2
        6,tom6,35,10000,3

    3.加载数据
        load data local inpath '/home/centos/emp.txt' into table emp ;

hive分析函数应用
-----------------
    0.简介
        hive分析函数可以连接每条记录形成数据集,字段甚至可以不是分组字段,使用一次mr
        完成聚合查询。
        常规的sql下的分组聚合有诸多限制,其中select字段必须是分组字段或聚合函数,有时需要多次mr.
        select deptno , max(salary) from emp group by deptno ;
    1.分析函数
        -- over , 只是分区
        SELECT id, empno, salary ,deptno ,max(salary) OVER (PARTITION BY deptno) AS max from emp  ;


        -- OVER + ORDER BY , 分区并在分区内排序
        SELECT empno, deptno, salary ,SUM(salary) OVER(PARTITION BY deptno ORDER BY salary) AS t1 from emp;

        -- OVER ... rows unbounded preceding 基于前导所有行的汇总操作。
        SELECT empno, deptno, salary , SUM(salary) OVER(ORDER BY deptno, empno rows unbounded preceding) AS t3 from emp ;

        -- RANK, 排名操作 ,计算每个部门内按照工资的降序进行排名(有缝,并列情况存在缝隙)绝对排名。
        SELECT empno, deptno, salary,  RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
        
        -- dense_rank()密度排名,无缝。绝对排名。
        SELECT empno, deptno, salary,  Dense_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;

        -- percent_rank()百分比排名,相对排名.
        SELECT empno, deptno, salary,  percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;

        --NTILE(n) ,分桶操作,将分区内数据尽量均匀地分散到n个桶中。
        SELECT empno, deptno, salary , NTILE(4) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;

        -- lead()前导操作,查询从当前开始,后续第几行的操作。
        SELECT empno, deptno, salary, LEAD(salary, 2) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;

        --lag,从当前行计数,访问之前的第几行salary,如果超过窗口范围返回null。
        SELECT empno, deptno, salary, lag(salary, 1) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;

        --first_value()
        SELECT empno, deptno, salary, first_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;

        --last_value()
        SELECT empno, deptno, salary, last_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
        
        --使用range开窗函数 RANGE BETWEEN ... AND ...,在分区内在划分记录范围。
        SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;

        SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
        
        --RANGE : 对值的+/-.
        SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
        SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;

        --range : 计算的是值
        SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN 2000 PRECEDING AND 2000 FOLLOWING) AS t1 from emp ;
        --rows : 计算的是行偏移
        SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS t1 from emp ;

        --窗口重用
        SELECT empno, deptno, salary , MAX(salary) OVER w1 AS mx,MIN(salary) OVER w1 AS mn,AVG(salary) OVER w1 AS ag from emp WINDOW w1 AS (PARTITION BY deptno ORDER BY salary desc ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) ;

        [开窗函数]
        range|rows between ... and ;
        range是值偏移,rows是行偏移。

    2.统计各分组维度下的最高工资(多次group by用union all拼接,等价于一次grouping sets)
        select max(salary) from emp group by deptno union all select max(salary) from emp group by age union all select max(salary) from emp group by deptno,age ;


hive高级聚合
---------------
    1.grouping set
        作用等同于多次分组后 UNION ALL(各分组结果直接拼接,不去重).
        select deptno , age , count(1) from emp group by deptno,age grouping sets(deptno ,age ,(deptno,age) ) ;
    3.rollup
        select ... from ... GROUP BY a,b,c WITH ROLLUP ;
        select ... from ... GROUP BY a,b,c GROUPING SETS ((a,b,c),(a,b),(a),())

        select deptno , age , count(1) from emp group by deptno,age with rollup ;
    4.cube
        select ... from ... GROUP BY a,b,c WITH cube ;
        select ... from ... GROUP BY a,b,c GROUPING SETS ((a),(a,b),(a,c),(a,b,c),(b),(b,c),(c),())

        select deptno , age , count(1) from emp group by deptno,age with cube ;

hive优化
-------------------
    1.创建索引
        本质上就是表,对于检索单条记录是有优势的。(注:索引功能仅适用于 Hive 2.x 及以前,Hive 3.0 起已移除,可改用物化视图或 ORC 自带索引。)
        排序的。
        --创建索引
        CREATE INDEX idx_emp ON TABLE emp (empno) AS 'COMPACT' WITH DEFERRED REBUILD;

        --生成索引
        ALTER INDEX idx_emp ON emp REBUILD;

        --查询是通过索引列查询
        select * from emp where empno = 'tom1' ;

    2.文件格式
        [列存储格式]
        parquet , orcfile , orc    //
        投影查询时,发挥磁盘的线性读写。
        select id,empno from emp ;        --emp表中无name列,投影列为id与empno
        //创建表,存储成parquet格式
        create table pq1(id int , empno string, age int ,salary float ,  deptno int ) stored as parquet ;
        
        //查看文件格式
        insert into pq1 select * from emp ;
        [行存储]
        txt 

    3.压缩
        减少数据传输量,降低网络IO的负载。
        --在多级job中,job之间的结果是否需要压缩。
        SET hive.exec.compress.intermediate=true ;
        SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec ;
        SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec ;
        SET hive.intermediate.compression.type=record|block|none ;

        --控制job的最终输出是否压缩.
        SET hive.exec.compress.output=true;
        SET mapred.output.compression.codec= org.apache.hadoop.io.compress.GzipCodec;
    
    4.大量小文件
        导致map过多。
        1.归档
            hadoop Archive and HAR进行归档。

        2.转换成序列文件
            将小文件合并成SequenceFile.

        3.org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
            --修改表的文件格式(Hive语法为SET FILEFORMAT,且INPUTFORMAT需实现旧版org.apache.hadoop.mapred.InputFormat接口)
            alter table emp set fileformat inputformat 'org.apache.hadoop.mapred.lib.CombineTextInputFormat' outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ;
            --修改默认输入格式(应使用Hive自带的合并输入格式,新版本中其已是默认值)
            set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat ;

            --建新表
-- emp2: same schema as emp, but with a combining input format so that many
-- small files are merged into fewer map splits.
-- NOTE(review): a Hive table-level INPUTFORMAT must implement the old-API
-- org.apache.hadoop.mapred.InputFormat interface, so the old-API class
-- org.apache.hadoop.mapred.lib.CombineTextInputFormat is used here instead of
-- the new-API org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
-- (which Hive cannot instantiate for a table) -- confirm against the Hadoop
-- version in use.
CREATE TABLE emp2
(
    id     int ,
    empno  string ,
    age    int ,
    salary float ,
    deptno int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.lib.CombineTextInputFormat'
-- Plain-text output to match the comma-delimited text input; the original
-- AvroContainerOutputFormat would write Avro container files that the text
-- input format above could not read back.
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ;

 

posted on 2018-06-25 17:30  飞机耳朵  阅读(347)  评论(0编辑  收藏  举报

导航