hive-分析函数
bitmap -------------- 位图/位映射。 5 hive -------------- 分区表 udtf函数 wordcount lateral view //和udtf配合使用。 order by //数据倾斜 sort by //reduce内排序 distribute by //分区 cluster by //sort by + distribute by hive -------------- --显式表头 hive>set hive.cli.print.header=true ; hive分析函数 -------------- 1.准备数据 create table emp ( id int , empno string , age int , salary float , deptno int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE; 2.加载数据 1,tom1,28,3000,1 2,tom2,29,5000,2 3,tom3,30,5400,1 4,tom4,28,6600,3 5,tom5,27,8000,2 6,tom6,35,10000,3 3.加载数据 load data local inpath '/home/centos/emp.txt' into table emp ; hive分区函数应用 ----------------- 0.简介 hive分析函数可以连接每条记录形成数据集,字段甚至可以不是分组字段,使用一次mr 完成聚合查询。 常规的sql下的分组聚合很诸多限制,其中select字段必须是分组字段,有时需要多次mr. select deptno , max(salary) from emp group by deptno ; 1.分析函数 -- over , 只是分区 SELECT id, empno, salary ,deptno ,max(salary) OVER (PARTITION BY deptno) AS max from emp ; -- OVER + ORDER BY , 分区并在分区内排序 SELECT empno, deptno, salary ,SUM(salary) OVER(PARTITION BY deptno ORDER BY salary) AS t1 from emp; -- OVER ... rows unbounded preceding 基于前导所有行的汇总操作。 SELECT empno, deptno, salary , SUM(salary) OVER(ORDER BY deptno, empno rows unbounded preceding) AS t3 from emp ; -- RANK, 排名操作 ,计算每个部门内按照工资的降序进行排名(有缝,并列情况存在缝隙)绝对排名。 SELECT empno, deptno, salary, RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ; -- dense_rank()密度排名,无缝。绝对排名。 SELECT empno, deptno, salary, Dense_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ; -- percent_rank()百分比排名,相对排名. SELECT empno, deptno, salary, percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ; --NTILE(n) ,分桶操纵,将数据均匀分散到各个桶中。 SELECT empno, deptno, salary , NTILE(4) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; -- lead()前导操作,查询从当前开始,后续第几行的操作。 SELECT empno, deptno, salary, LEAD(salary, 2) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --lag,从当前行计数,访问之前的第几行salary,如果超过窗口范围返回null。 SELECT empno, deptno, salary, lag(salary, 1) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --first_value() SELECT empno, deptno, salary, first_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --last_value() SELECT empno, deptno, salary, last_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --使用range开窗函数 RANGE BETWEEN ... AND ...,在分区内在划分记录范围。 SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ; SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ; --RANGE : 对值的+/-. SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ; SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ; --range : 计算的是值 SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN 2000 PRECEDING AND 2000 FOLLOWING) AS t1 from emp ; //rows计算的行 SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS t1 from emp ; --窗口重用 SELECT empno, deptno, salary , MAX(salary) OVER w1 AS mx,MIN(salary) OVER w1 AS mn,AVG(salary) OVER w1 AS ag from emp WINDOW w1 AS (PARTITION BY deptno ORDER BY salary desc ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) ; [开窗函数] range|rows between ... and ; range是值偏移,rows是行偏移。 2.统计员工数量 select max(salary) from emp group by deptno union select max(salary) from emp group by age union select max(salary) from emp group by deptno,age ; hive高级聚合 --------------- 1.grouping set 作用等同于union. select deptno , age , count(1) from emp group by deptno,age grouping sets(deptno ,age ,(deptno,age) ) ; 3.rollup select ... from ... GROUP BY a,b,c WITH ROLLUP ; select ... from ... GROUP BY a,b,c GROUPING SETS ((a,b,c),(a,b),(a),()) select deptno , age , count(1) from emp group by deptno,age with rollup ; 4.cube select ... from ... GROUP BY a,b,c WITH cube ; select ... from ... GROUP BY a,b,c GROUPING SETS ((a),(a,b),(a,c),(a,b,c) ,(b),(b,c),(c),())) select deptno , age , count(1) from emp group by deptno,age with cube ; hive优化 ------------------- 1.创建索引 本质上就是表,对于检索单条记录是有优势的。 排序的。 --创建索引 CREATE INDEX idx_emp ON TABLE emp (empno) AS 'COMPACT' WITH DEFERRED REBUILD; --生成索引 ALTER INDEX idx_emp ON emp REBUILD; --查询是通过索引列查询 select * from emp where empno = 'tom1' 2.文件格式 [列存储格式] parquet , orcfile , orc // 投影查询时,发挥磁盘的线性读写。 select id,name from emp ; //创建表,存储成parquet格式 create table pq1(id int , empno string, age int ,salary float , deptno int ) stored as parquet ; //查看文件格式 insert into pq1 select * from emp ; [行存储] txt 3.压缩 减少数据传输量,降低网络IO的负载。 --在多级job中,job之间的结果是否需要压缩。 SET hive.exec.compress.intermediate=true ; SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec ; SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec ; SET hive.intermediate.compression.type=record|block|none ; --控制job的最终输出是否压缩. SET hive.exec.compress.output=true; SET mapred.output.compression.codec= org.apache.hadoop.io.compress.GzipCodec; 4.大量小文件 导致map过多。 1.归档 hadoop Archive and HAR进行归档。 2.转换成序列文件 将小文件合成SeqenceFile. 3.org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat alter table emp set inputformat=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat --修改默认输入格式 set hive.input.format=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat ; --建新表 create table emp2 ( id int , empno string , age int , salary float , deptno int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS INPUTFORMAT 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'