Hive higher-order functions, sampling, and optimization
select a.id, a.month from user_b a union all select b.id, b.day from user_b b;

Bucket tables: similar to hash partitioning
=============================================
bucket      //clustered by (id) into 2 buckets
Buckets avoid the flood of directories that too many partitions would create; each bucket is stored as a file segment.
Inserting into a partitioned + bucketed table:
    insert into xx partition(<col1>='<val1>', <col2>='<val2>') select 1,'tom',20;
    000000      //first bucket
    000001      //second bucket

Collection and constructor functions:
    select size(arr)                                        //size of an array
    select array(1,2,3)                                     //create an array
    select map(1,'tom',2,'tomas')                           //create a map
    select struct(1,'tom','male')                           //create an anonymous struct: ['col1':1,'col2':'tom' ...]
    select named_struct('id',1,'name','tom','sex','male')   //create a named struct

Date functions:
    select current_date();                                  //current date
    select current_timestamp();                             //current timestamp (millisecond precision)
    select date_format(current_date(),'yyyyMMdd')           //format a date
    select date_format(current_timestamp(),'yyyyMMdd')      //format a timestamp
    select from_unixtime(bigint,'yyyyMMdd')                 //convert a unix timestamp to a formatted time

Hive file formats: stored as parquet;
===========================================
To load such a table, create a text-format table first, then: insert into xx select * from <text_table>;
Row-oriented storage:
    text
    seqFile     //1M
Column-oriented storage (projection queries skip the columns that are not needed):
    rcFile      //4M blocks
    orcFile     //supports larger blocks, 256M
    parquet     //supported by more components of the Hadoop ecosystem

SerDe:
=========================================
serialize && deserialize: maps file fields onto the columns of a Hive table
Used after the InputFormat stage.
textFile: LazySimpleSerDe
OpenCSVSerde
JsonSerDe       //the table's columns must match the keys in the JSON
Specifying a SerDe when creating a table:
    row format serde 'org.xxx.xxx.JsonSerde';
Columns whose names start with a special character must be backquoted:
    create table xx(`_location` string);

Analytic (window) functions:
=================================
sum() over(sort by age rows between unbounded preceding and unbounded following);      //sort and bound the window by rows
sum() over(sort by age range between unbounded preceding and unbounded following);     //bound the window by value range
sum() over(partition by province order by age range between unbounded preceding and unbounded following);  //partition + sort + window bounds
current row     //the current row (another window boundary)

Hive ranking functions:
row_number()    assigns ranks with no duplicates (even equal records get distinct numbers); handy for paging
dense_rank()    consecutive ranking: two rows tied in 2nd place are followed by 3rd place
rank()          gapped ranking: two rows tied in 2nd place are followed by 4th place
percent_rank()  percentile (relative) rank:
    SELECT empno, deptno, salary, percent_rank() OVER (PARTITION BY deptno ORDER BY salary desc) from emp;

rank()          //ties, then a gap      1 1 3
dense_rank()    //no gap                1 1 2
row_number()    //sequential            1 2 3
ntile(n)        //splits rows into n ranked groups
first_value()   //first value in the window
lead()          //looks ahead to following rows in the window

//users active in two consecutive months
select id, m, n
from (select id,
             (month+1) as m,
             lead(month) over(partition by id order by month) as n
      from user_c) a
where a.m = a.n;

//users active in three consecutive months
select distinct id
from (select id,
             (month+2) as first,
             (lead(month) over(partition by id order by month) + 1) as second,
             lead(month,2) over(partition by id order by month) as third
      from user_c) a
where a.first = a.second and a.second = a.third;

lag()           //looks back at preceding rows in the window

pv: page views      //total number of page views
uv: user views      //number of distinct users
Computing uv from the pv records:
    select month, day, count(distinct id) as uv from user_b group by month, day;

Advanced aggregation functions: **********
grouping__id    //grouping-set number
grouping sets   //group sets

//monthly active and daily active users in a single query
select month, day, count(distinct id) as uv, grouping__id
from user_b
group by month, day grouping sets(month, day)
order by grouping__id;

+----------+-------------+-----+---------------+--+
|  month   |     day     | uv  | grouping__id  |
+----------+-------------+-----+---------------+--+
| 2015-04  | NULL        | 6   | 1             |
| 2015-03  | NULL        | 5   | 1             |
| 2015-02  | NULL        | 2   | 1             |
| NULL     | 2015-04-16  | 2   | 2             |
| NULL     | 2015-04-15  | 2   | 2             |
| NULL     | 2015-04-13  | 3   | 2             |
| NULL     | 2015-04-12  | 2   | 2             |
| NULL     | 2015-03-12  | 1   | 2             |
| NULL     | 2015-03-10  | 4   | 2             |
| NULL     | 2015-02-16  | 2   | 2             |
+----------+-------------+-----+---------------+--+
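GROUPING SETS is shorthand for a UNION ALL of separate GROUP BY queries over the same table. A minimal equivalent sketch against the same user_b table, assuming month and day are string columns as the sample output suggests (the gid literals simply mirror the grouping__id values shown above):

    select month, cast(null as string) as day, count(distinct id) as uv, 1 as gid
    from user_b group by month
    union all
    select cast(null as string) as month, day, count(distinct id) as uv, 2 as gid
    from user_b group by day;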
rollup      //hierarchical subtotals
select month, day, count(distinct id) as uv, grouping__id
from user_b
group by month, day with rollup
order by grouping__id;

+----------+-------------+-----+---------------+--+
|  month   |     day     | uv  | grouping__id  |
+----------+-------------+-----+---------------+--+
| NULL     | NULL        | 7   | 0             |
| 2015-04  | NULL        | 6   | 1             |
| 2015-03  | NULL        | 5   | 1             |
| 2015-02  | NULL        | 2   | 1             |
| 2015-04  | 2015-04-16  | 2   | 3             |
| 2015-04  | 2015-04-15  | 2   | 3             |
| 2015-04  | 2015-04-13  | 3   | 3             |
| 2015-04  | 2015-04-12  | 2   | 3             |
| 2015-03  | 2015-03-12  | 1   | 3             |
| 2015-03  | 2015-03-10  | 4   | 3             |
| 2015-02  | 2015-02-16  | 2   | 3             |
+----------+-------------+-----+---------------+--+

select day, month, count(distinct id) as uv, grouping__id
from user_b
group by day, month with rollup
order by grouping__id;

+-------------+----------+-----+---------------+--+
|     day     |  month   | uv  | grouping__id  |
+-------------+----------+-----+---------------+--+
| NULL        | NULL     | 7   | 0             |
| 2015-04-16  | NULL     | 2   | 2             |
| 2015-04-15  | NULL     | 2   | 2             |
| 2015-04-13  | NULL     | 3   | 2             |
| 2015-04-12  | NULL     | 2   | 2             |
| 2015-03-12  | NULL     | 1   | 2             |
| 2015-03-10  | NULL     | 4   | 2             |
| 2015-02-16  | NULL     | 2   | 2             |
| 2015-04-16  | 2015-04  | 2   | 3             |
| 2015-04-13  | 2015-04  | 3   | 3             |
| 2015-04-12  | 2015-04  | 2   | 3             |
| 2015-03-12  | 2015-03  | 1   | 3             |
| 2015-03-10  | 2015-03  | 4   | 3             |
| 2015-02-16  | 2015-02  | 2   | 3             |
| 2015-04-15  | 2015-04  | 2   | 3             |
+-------------+----------+-----+---------------+--+

cube        //every combination of the grouping columns
select month, day, count(distinct id) as uv, grouping__id
from user_b
group by month, day with cube
order by grouping__id;

Grouping sets generated by cube:
    2 columns -> (1), (2), (1,2) plus the grand total () : 4 sets
    3 columns -> (1), (2), (3), (1,2), (1,3), (2,3), (1,2,3) plus () : 8 sets

+----------+-------------+-----+---------------+--+
|  month   |     day     | uv  | grouping__id  |
+----------+-------------+-----+---------------+--+
| NULL     | NULL        | 7   | 0             |
| 2015-02  | NULL        | 2   | 1             |
| 2015-04  | NULL        | 6   | 1             |
| 2015-03  | NULL        | 5   | 1             |
| NULL     | 2015-02-16  | 2   | 2             |
| NULL     | 2015-04-16  | 2   | 2             |
| NULL     | 2015-04-15  | 2   | 2             |
| NULL     | 2015-04-13  | 3   | 2             |
| NULL     | 2015-04-12  | 2   | 2             |
| NULL     | 2015-03-12  | 1   | 2             |
| NULL     | 2015-03-10  | 4   | 2             |
| 2015-04  | 2015-04-12  | 2   | 3             |
| 2015-03  | 2015-03-12  | 1   | 3             |
| 2015-03  | 2015-03-10  | 4   | 3             |
| 2015-04  | 2015-04-16  | 2   | 3             |
| 2015-02  | 2015-02-16  | 2   | 3             |
| 2015-04  | 2015-04-15  | 2   | 3             |
| 2015-04  | 2015-04-13  | 3   | 3             |
+----------+-------------+-----+---------------+--+

Sample data (cookie id, value):
cookie1 1
cookie2 5
cookie2 4
cookie1 3
cookie2 7
cookie1 4
cookie2 2
cookie3 2
cookie2 3
cookie3 5
cookie1 6
cookie3 10
cookie2 8
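A minimal window-function sketch over the cookie rows above, assuming they sit in a table cookie_pv(cookieid string, pv int) -- both the table name and the column names are assumptions for illustration:

    select cookieid,
           pv,
           sum(pv) over(partition by cookieid order by pv rows between unbounded preceding and current row) as running_sum,
           row_number() over(partition by cookieid order by pv desc) as rn
    from cookie_pv;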
Hive transactions:
===================================
1. Configuration:
    SET hive.support.concurrency=true;
    SET hive.enforce.bucketing=true;
    SET hive.exec.dynamic.partition.mode=nonstrict;
    SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
    SET hive.compactor.initiator.on=true;
    SET hive.compactor.worker.threads=1;
2. The table must be bucketed
3. Transactions must be enabled on the table
4. The table must be stored as ORC:
    create table user3(id int, name string, age int, province string, city string)
    clustered by(province) into 2 buckets
    row format delimited
    fields terminated by '\t'
    stored as orc
    tblproperties('transactional'='true');
5. insert into user3 select * from user1;
6. update user3 set age=200 where id=5;

Sampling: extracting a subset of a large data set
============================================
Random sampling:
    select * from user2 distribute by rand() sort by rand() limit 10;
    //distribute rows to reducers randomly, then sort randomly within each reducer

Bucket-table sampling (the optimized form for bucketed tables):
    select name from user1 TABLESAMPLE(bucket 1 out of 4 on rand());
    //bucket 1 = which bucket to pick, 4 = total number of buckets; returns the rows of one randomly chosen bucket

    create table user1_bucket(id int, name string, age int, province string, city string)
    clustered by(province) into 4 buckets
    row format delimited
    fields terminated by '\t';

    insert into user1_bucket select * from user1;

    Sample data:
    1   tom     30  anhui   anhui
    2   tomas   30  hefei
    1   tom     30  heilongjiang
    2   tomas   30  jilin
    1   tom     30  liaoning
    2   tomas   30  neimenggu
    1   tom     30  shandong
    2   tomas   30  shanxi
    1   tom     30  qinghai
    2   tomas   30  jiangsu
    1   tom     30  gansu
    2   tomas   30  ningxia
    1   tom     30  hubei
    2   tomas   30  sichuan
    1   tom     30  chongqing
    2   tomas   30  taiwan
    1   tom     30  xianggang
    2   tomas   30  guangdong

Block sampling: sample by percentage, data size, or row count:
    select * from users TABLESAMPLE(5 percent);
    select * from users TABLESAMPLE(30M);
    select * from users TABLESAMPLE(2 rows);

User-defined functions
=====================
1. UDF      //one row in, one row out:    add(id,name) ===> idname   (processes one row, returns one row; entry point: the UDF class)
2. UDTF     //one row in, many rows out:  explode(arr) ===> arr[1] arr[2] arr[3]
3. UDAF     //many rows in, one row out:  sum(age) ===> 500

Writing a custom Hive function:
==========================================
1. pom dependency:
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.1.1</version>
    </dependency>
2. Write the code
   Note: Hive UDFs do not support varargs, but they do support List parameters.

    package com.oldboy.hive;

    import org.apache.hadoop.hive.ql.exec.Description;
    import org.apache.hadoop.hive.ql.exec.UDF;
    import java.util.List;

    @Description(
            name = "add",
            value = "this is an add function",
            extended = "select add(1,2,3) ====> 6 ; select add('hello','world') ====> helloworld"
    )
    public class UDFAdd extends UDF {
        /**
         * add two ints
         */
        public Integer evaluate(int i, int j) {
            return i + j;
        }

        /**
         * concatenate two strings
         */
        public String evaluate(String i, String j) {
            return i + j;
        }

        /**
         * sum every element of the list
         */
        public int evaluate(List<Integer> i) {
            int j = 0;
            for (Integer integer : i) {
                j += integer;
            }
            return j;
        }
    }

3. Load and use the jar
    Method 1: build the jar and copy it to Hive's lib directory     //Hive's default classpath
    Method 2: add jar /soft/hive/lib/myhive-1.0-SNAPSHOT.jar;       //point Hive at the jar manually
    Method 3: edit hive-site.xml                                    //hive.aux.jars.path=/x/x/x.jar
4. Register the class as a Hive function
    Temporary function:
        beeline> create temporary function add as 'com.oldboy.hive.UDFAdd';
    Permanent function:
        beeline> create function add as 'com.oldboy.hive.UDFAdd' using jar 'hdfs:///path/to/x.jar';
        //this form matters: it is required on a fully distributed cluster, otherwise the function class cannot be found
5. Drop the function:
    drop temporary function add;
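A minimal end-to-end usage sketch of the UDF above (the jar path is the one named in method 2; the expected results follow the @Description examples):

    add jar /soft/hive/lib/myhive-1.0-SNAPSHOT.jar;
    create temporary function add as 'com.oldboy.hive.UDFAdd';
    select add(1, 2);               -- 3
    select add('hello', 'world');   -- helloworld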
Exercise: parse each merchant's tags from its review JSON with a UDF and return the tags
    json ==== udf + fastJson ===> {'味道好','服务好'}
    How do we use fastJson inside Hive? Copy the fastJson jar into Hive's lib directory and restart Hive.
    1) Write the program and package it
    2) Copy the UDF jar and the fastJson jar to Hive's lib directory
    3) Register a temporary function: create temporary function parseJson as 'com.oldboy.hive.TempTagUDF';
    4) Create the temptags table with columns id and json, delimited by '\t'
    5) Load TempTag.txt into the temptags table
    6) Apply the UDF to the merchant reviews
    7) Count each merchant's distinct tags:
        select id, count(distinct tag)
        from temptags lateral view explode(parseJson(json)) xx as tag
        group by id;
    8) Count the occurrences of each tag per merchant:
        select id, tag, count(tag)
        from temptags lateral view explode(parseJson(json)) xx as tag
        group by id, tag;

!!!! Problem: class-not-found exception
    Caused by: java.lang.ClassNotFoundException: com.oldboy.hive.TempTagUDF
Cause: when Hive runs the query as an MR job, the other nodes cannot find the jar that contains TempTagUDF.
Fix: copy the UDF jar and the fastjson jar to ${HADOOP_HOME}/share/hadoop/common/lib and sync them to every node.
     Hadoop does not need to be restarted.

Set the Maven compiler to JDK 1.8 (the default is 1.5)      //place this under the <project> tag
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

Virtual columns:
======================
> select name, explode(work_place) from default.employee;
//a UDTF cannot be mixed with other expressions in the SELECT list; Hive raises:
//UDTF's are not supported outside the SELECT clause, nor nested in expressions (state=42000,code=10081)

lateral view        //the complement to table-generating functions
select id, tag
from temptags lateral view explode(parseJson(json)) xx as tag;      //virtual column
//in "lateral view explode(parseJson(json)) xx as tag", xx is a meaningless placeholder alias

UDTF:
===================================
Key classes: StructObjectInspector, ObjectInspector[]
create temporary function wc as 'com.oldboy.hive.TestUDTF';

The lazy format of LazySimpleSerDe: fields default to string and are converted only when they are used or a type is declared.

Hive optimization:
=====================================
Performance tools:
    EXPLAIN     //shows the overall flow of the MR jobs behind a query
                //explain select count(1) from wc;
    ANALYZE     //collects statistics so the next run can use the CBO (cost-based optimizer)
                //analyze table wc compute statistics;
                //desc formatted wc  =====> shows the row-count statistics and file sizes

Design optimization:
    Partitioned tables:     //partition by date+time, location, or a business key; speeds up WHERE-clause filtering
        //create table xx() partitioned by (province string, city string);
    Bucketed tables:        //enable sampling and speed up joins (when the bucket column matches the join column,
                            //the join reads only the matching bucket file segments instead of scanning whole files)
        //create table xx() clustered by(province) into 2 buckets;
    Indexes:
        //CREATE INDEX idx_id_employee_id ON TABLE employee_id (id) AS 'COMPACT' WITH DEFERRED REBUILD;    //compact index
        //CREATE INDEX idx_id_employee_id ON TABLE employee_id (id) AS 'BITMAP' WITH DEFERRED REBUILD;     //bitmap index

Execution engine:
    //MR is no longer recommended as the execution engine in Hive 2
    //prefer Spark or Tez

Data file format and compression:
    Formats: text, seqFile, RCFile, ORCFile, Parquet
    Compression codecs: default, gzip, lzo, bzip2, lz4, snappy
    Intermediate data compression:      //reduces shuffle network traffic and disk pressure
        //set hive.exec.compress.intermediate=true
        //set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
    Output compression:                 //reduces disk usage
        //set hive.exec.compress.output=true
        //set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec

Storage optimization:
    1. Use HAR files
    2. Use the SequenceFile format
    3. Use CombineFileInputFormat       //Hive's automatic optimization: combine many small files into a single split
    4. Use HDFS federation              //store namenode metadata on additional nodes to scale capacity horizontally (rarely practical)
    5. Configuration:
        set hive.merge.mapfiles=true                //merge small output files of map-only jobs
        set hive.merge.mapredfiles=true             //merge small output files of MR jobs
        set hive.merge.size.per.task=256000000      //target size of the merged files
        set hive.merge.smallfiles.avgsize=16000000  //average file size below which a merge is triggered

Job and query optimization:
    Local mode:
        beeline local mode (no hiveserver2 needed): beeline -u jdbc:hive2:// -n centos
        Conditions for Hive to run a query in local mode:
            SET hive.exec.mode.local.auto=true;                     //enter local mode automatically
            SET hive.exec.mode.local.auto.inputbytes.max=50000000;  //input-size threshold; above it local mode is abandoned
            SET hive.exec.mode.local.auto.input.files.max=5;        //input-file-count threshold; above it local mode is abandoned
    JVM reuse: only applies to MRv1, not to YARN
        MR tasks share JVMs instead of each starting its own:
        set mapred.job.reuse.jvm.num.tasks=5
    Parallel vs concurrent:
        parallel computation: a distributed job processed on different hosts at the same time
        concurrent execution: usually refers to threads
        The stages of a Hive job do not always depend on one another; independent stages can be run in parallel:
        set hive.exec.parallel=true;
        set hive.exec.parallel.thread.number=16;
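An illustration of independent stages, reusing the user_b table from the earlier examples: the two branches of this UNION ALL do not depend on each other, so with hive.exec.parallel=true Hive can run them at the same time.

    select month as period, count(distinct id) as uv from user_b group by month
    union all
    select day as period, count(distinct id) as uv from user_b group by day;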
Join optimization: **********
    Common join     //reduce-side join
                    //hint the big table: /*+ STREAMTABLE(bigtable) */
    Map join        //map-side join
                    //hint the small table: /*+ MAPJOIN(smalltable) */
    SET hive.auto.convert.join=true;                    --default false
    SET hive.mapjoin.smalltable.filesize=600000000;     --default 25M; tables larger than this fall back to a reduce-side join
    SET hive.auto.convert.join.noconditionaltask=true;  --default false; when true, no hint is needed
    SET hive.auto.convert.join.noconditionaltask.size=10000000;  --matches the table size to the available memory

Bucketed-table join optimization:
    //SET hive.auto.convert.join=true;          --default false
    //SET hive.optimize.bucketmapjoin=true;     --default false

!!!! Handling data skew in joins:
    //SET hive.optimize.skewjoin=true;          //balance the load
    SET hive.skewjoin.key=100000;   //if one reducer receives more rows than this for a key, the excess is sent to idle reducers

group by optimization
!!!! Handling data skew in group by:
    SET hive.groupby.skewindata=true;

1. Rewrite the following statement the Hive way:
    SELECT a.key, a.value FROM a WHERE a.key not in (SELECT b.key FROM b)
   Answer (rewrite the NOT IN subquery as a correlated NOT EXISTS):
    select a.key, a.value from a where not exists (select 1 from b where b.key = a.key);

2. Multi-group-by is a very useful Hive feature; give an example.
   A multi-insert reads the source once and feeds several group-bys:
    from A
    insert overwrite table B
        select A.a, count(distinct A.b) group by A.a
    insert overwrite table C
        select A.c, count(distinct A.b) group by A.c

3. Write the statement that loads text.txt into the '2016-10-10' partition of the Hive table test, whose partition column is l_date.
    LOAD DATA LOCAL INPATH '/your/path/text.txt'
    OVERWRITE INTO TABLE test
    PARTITION (l_date='2016-10-10');

References:
    https://blog.csdn.net/haohaixingyun/article/details/52819588       //reference article
    https://blog.csdn.net/ukakasu/article/details/47860647             //worked interview questions on processing large tables; links to the original questions
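Appendix: a minimal sketch of the map-join hint from the join-optimization notes above, written against the user1 and user_b tables used earlier; treating user1 as the small table and joining on id are assumptions for illustration.

    select /*+ MAPJOIN(u) */ b.id, b.month, u.name
    from user_b b join user1 u on b.id = u.id;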