hive操作实例

word count

select word,count(*)
from(
select
explode(split(sentence,' '))
as word
from article
)t
group by word;

split:数据切分为数组

explode:行转列

 

统计每个用户购买过多少个商品

建表

create table orders(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string)
row format delimited fields terminated by ',';

create table order_products_prior(
order_id string,
product_id string,
add_to_cart_order string,
reordered string)
row format delimited fields terminated by ',';

--1.统计每个订单有多少个商品:

select order_id,count(1) as prod_cnt
from order_products_prior
group by order_id
order by prod_cnt desc
limit 10;

--2.对每个用户下过的订单对应的商品数目求和:

select user_id,sum(prod_cnt) as prod_sum
from orders od
join(
select order_id,count(1) as prod_cnt
from order_products_prior
group by order_id)pro
on (od.order_id=pro.order_id)
group by od.user_id desc
order by prod_sum
limit 10;

 

每个用户最喜爱购买的三个product是什么,最终表结构可以是3个列,或者是一个字符串

(数据表和上个例子相同)

--1.统计每个用户和每种商品产生的行为数量

select user_id,product_id,count(1) as prod_buy_cnt
from orders t1
join order_products_prior t2 
on t1.order_id=t2.order_id
group by user_id,product_id
limit 20;

--2.对一个用户不同商品,按照购买次数进行降序(desc)排列

select user_id,product_id,prod_buy_cnt,
row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num
from(
    select user_id,product_id,count(1) as prod_buy_cnt
    from orders t1
    join order_products_prior t2 
    on t1.order_id=t2.order_id
    group by user_id,product_id
)t12
limit 20;

得到的结果类似于:

user1  product1  1

user1  product2  2

user1  product3  3

user2  product1  1

user2  product2  2

user2  product3  3

--3.取每个用户购买次数最多的3个商品列转行

select user_id,collect_list(concat_ws('_',product_id,cast(row_num as string))) as top_3_prods
from
(
select user_id,product_id,prod_buy_cnt,
row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num
from(
select user_id,product_id,count(1) as prod_buy_cnt
from orders t1
join order_products_prior t2 
on t1.order_id=t2.order_id
group by user_id,product_id
)t12
)t
where row_num<4
group by user_id
limit 20;

 

每个用户最喜爱购买的前10%个product是什么

select user_id,collect_list(concat_ws('_',product_id,cast(rk as string),cast(prod_cate_cnt as string))) as top_10_prod
from
(
  select
  user_id,product_id,usr_prod_cnt,
  row_number() over(distribute by user_id sort by usr_prod_cnt desc) as rk,
  ceil(cast(count(1) over(partition by user_id) as double)*0.1) as prod_cate_cnt
  --ceil(cast(sum(usr_prod_cnt) over(partition by user_id) as double)*0.1) as total_prod_cnt
  from
  (
    select user_id,product_id,count(1) as usr_prod_cnt
    from orders 
    join 
    order_products_prior pri
    on orders.order_id=pri.order_id
    group by user_id,product_id
  )t1
)t
where rk<=prod_cate_cnt
group by user_id
limit 100;

 

建分区表,orders表按照order_dow建立分区表orders_part,然后从hive查询orders动态插入orders_part表中

1.建立分区表

create table order_part(
order_id string,
user_id string,
eval_set string,
order_number string,
order_hour_of_day string,
days_since_prior_order string
)partitioned by(order_dow string)
row format delimited fields terminated by '\t';

2.动态插入分区表

set hive.exec.dynamic.partition=true; --使用动态分区
set hive.exec.dynamic.partition.mode=nonstrict; --使用无限制模式

insert overwrite table order_part partition(order_dow) --(dt='20190512')
select order_id,user_id,eval_set,order_number,order_hour_of_day,days_since_prior_order,order_dow 
from orders --where order_dow='2'

 

posted @ 2020-04-02 18:03  xd_xumaomao  阅读(240)  评论(0编辑  收藏  举报