hive操作实例
word count
select word,count(*) from( select explode(split(sentence,' ')) as word from article )t group by word;
split:数据切分为数组
explode:行转列
统计每个用户购买过多少个商品
建表
create table orders( order_id string, user_id string, eval_set string, order_number string, order_dow string, order_hour_of_day string, days_since_prior_order string) row format delimited fields terminated by ','; create table order_products_prior( order_id string, product_id string, add_to_cart_order string, reordered string) row format delimited fields terminated by ',';
--1.统计每个订单有多少个商品:
select order_id,count(1) as prod_cnt from order_products_prior group by order_id order by prod_cnt desc limit 10;
--2.对每个用户下过的订单对应的商品数目求和:
select user_id,sum(prod_cnt) as prod_sum from orders od join( select order_id,count(1) as prod_cnt from order_products_prior group by order_id)pro on (od.order_id=pro.order_id) group by od.user_id desc order by prod_sum limit 10;
每个用户最喜爱购买的三个product是什么,最终表结构可以是3个列,或者是一个字符串
(数据表和上个例子相同)
--1.统计每个用户和每种商品产生的行为数量
select user_id,product_id,count(1) as prod_buy_cnt from orders t1 join order_products_prior t2 on t1.order_id=t2.order_id group by user_id,product_id limit 20;
--2.对一个用户不同商品,按照购买次数进行降序(desc)排列
select user_id,product_id,prod_buy_cnt, row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num from( select user_id,product_id,count(1) as prod_buy_cnt from orders t1 join order_products_prior t2 on t1.order_id=t2.order_id group by user_id,product_id )t12 limit 20;
得到的结果类似于:
user1 product1 1
user1 product2 2
user1 product3 3
user2 product1 1
user2 product2 2
user2 product3 3
--3.取每个用户购买次数最多的3个商品列转行
select user_id,collect_list(concat_ws('_',product_id,cast(row_num as string))) as top_3_prods from ( select user_id,product_id,prod_buy_cnt, row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num from( select user_id,product_id,count(1) as prod_buy_cnt from orders t1 join order_products_prior t2 on t1.order_id=t2.order_id group by user_id,product_id )t12 )t where row_num<4 group by user_id limit 20;
每个用户最喜爱购买的前10%个product是什么
select user_id,collect_list(concat_ws('_',product_id,cast(rk as string),cast(prod_cate_cnt as string))) as top_10_prod from ( select user_id,product_id,usr_prod_cnt, row_number() over(distribute by user_id sort by usr_prod_cnt desc) as rk, ceil(cast(count(1) over(partition by user_id) as double)*0.1) as prod_cate_cnt --ceil(cast(sum(usr_prod_cnt) over(partition by user_id) as double)*0.1) as total_prod_cnt from ( select user_id,product_id,count(1) as usr_prod_cnt from orders join order_products_prior pri on orders.order_id=pri.order_id group by user_id,product_id )t1 )t where rk<=prod_cate_cnt group by user_id limit 100;
建分区表,orders表按照order_dow建立分区表orders_part,然后从hive查询orders动态插入orders_part表中
1.建立分区表
create table order_part( order_id string, user_id string, eval_set string, order_number string, order_hour_of_day string, days_since_prior_order string )partitioned by(order_dow string) row format delimited fields terminated by '\t';
2.动态插入分区表
set hive.exec.dynamic.partition=true; --使用动态分区 set hive.exec.dynamic.partition.mode=nonstrict; --使用无限制模式 insert overwrite table order_part partition(order_dow) --(dt='20190512') select order_id,user_id,eval_set,order_number,order_hour_of_day,days_since_prior_order,order_dow from orders --where order_dow='2'