电影表

movies表
电影id,电影名称,电影类型

ratings表
用户id,电影id,评分,评论时间(毫秒值)

users表
用户id,性别,年龄段,工作类型,电话

age表
年龄段,年龄范围

job表
工作类型,工作岗位名称


1.清洗数据 mr

2.hive计算
表结构准备 外部表

create external table age(
aid string,
agescore string
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile
location '/project/age';

create external table job(
jid string,
jobname string
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile
location '/project/job';

create external table movies(
mid string,
mname string,
type string
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/project/movies';

create external table ratings(
uid string,
mid string,
score int,
mytime string
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/project/ratings';


create external table users(
uid string,
sex string,
aid string,
jid string,
tel string
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/project/users';


要求:
1.算出所有电影的平均评分。
每个电影的平均分
select m.mname,round(t.avg,2)
from
movies m,
(select avg(score) avg,mid from ratings group by mid)t
where m.mid = t.mid




中间过程  含有有效 和 脏  score is null
create table result1(
mid string,
mname string,
score double
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

//3883  3882 score  4000 null
insert into result1
select mov.mid,mov.mname,t1.score
from movies mov left join(
   select mid,round(avg(score),1) score
   from ratings
   group by mid) t1 on mov.mid=t1.mid;

//hive和mysql 不同 update 并且 最不值钱的就是 表
create table result2(
mid string,
mname string,
score double
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

insert into table result2
select mid,mname,score
from result1
where score is not null;

insert into table result2
select mid,mname,0
from result10
where score is null;



2.算出所有用户对那些类型的电影最感兴趣,对每个用户推送该类型前三名评分最高的电影。
每个用户最喜欢哪个类型的电影
row_number()
ratings表中 一个电影出现几次?
一个电影有几个类型?
类型在movies   评分或者观看次数ratings

movie join ratings

select t5.uid,t5.lx,m.mname
from
(select t3.uid,t3.lx,t4.mid
from
(select t2.uid,t2.lx
from
(select row_number() over (partition by t1.uid order by t1.cnt desc) rn,t1.uid,t1.lx
from
(select count(*) cnt,r.uid,t.lx
from
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx)t
,ratings r
where r.mid=t.mid
group by r.uid,t.lx)t1)t2
where t2.rn=1)t3
,
(select t2.lx,t2.mid 
from
(select row_number() over (partition by t1.lx order by t1.cnt desc) rn,t1.lx,t1.mid
from
(select count(*) cnt,t.lx,r.mid
from
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx)t
,
ratings r
where r.mid = t.mid
group by t.lx,r.mid)t1)t2
where t2.rn<4)t4
where t3.lx =t4.lx)t5
,movies m
where t5.mid=m.mid 


每个类型最受欢迎的前三个电影

类型== 类型










select t3.uid,t3.lx,t4.mid
from
(select t2.lx,t2.mid,t2.cnt
from
(select row_number() over (partition by t1.lx order by t1.cnt desc)rn,t1.lx,t1.mid,t1.cnt
from
(select count(*) cnt,t.lx,t.mid
from ratings r,
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t
where r.mid = t.mid
group by t.lx,t.mid)t1)t2
where t2.rn<4)t4
join
(select t2.lx,t2.uid,t2.cnt
from
(select row_number() over (partition by t1.uid order by t1.cnt desc)rn,t1.lx,t1.uid,t1.cnt
from
(select count(*) cnt,t.lx,r.uid
from ratings r,
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t
where r.mid = t.mid
group by t.lx,r.uid)t1)t2
where t2.rn=1)t3
where t3.lx = t4.lx

//hive 最不值钱的是表 一半 人 和 类型
create table result3(
uid string,
type string
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

insert into result3
select users.uid,t4.type
from(
select t3.uid uid,t3.type type
   from(
      select t2.uid uid,t2.type type,t2.num num,row_number() over(partition by t2.uid order by num desc) rk
      from(
         select t1.uid uid,t1.type type,count(*) num
         from(
            select rat.uid uid,mov.type type
            from ratings rat,movies mov
            where rat.mid=mov.mid) t1
         group by t1.uid,t1.type) t2) t3
   where t3.rk=1) t4,users
where t4.uid=users.uid;

create table result4(
type string,
mname string
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

insert into result4
select t2.type,t2.mname
from(
select t1.type type,t1.mname mname,row_number() over(partition by t1.type order by t1.score desc) rk
from(
   select mov.type type,mov.mname mname,avg(rat.score) score
   from ratings rat,movies mov
   where rat.mid=mov.mid
   group by mov.type,mov.mname) t1 ) t2
where t2.rk<4;   

select res3.uid,res4.mname
from result3 res3,result4 res4
where res3.type=res4.type;

3.每个年龄层次,前3种最受欢迎的电影类型。
select t3.agescore,t3.lx,t3.cnt
from
(select row_number() over (partition by t2.agescore order by t2.cnt desc) rn,t2.agescore,t2.lx,t2.cnt
from
(select t.lx,r.agescore,count(*) cnt
from
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t
,ratings r
where t.mid = r.mid)t1,users u,age a
where u.uid=t1.uid and u.aid=a.aid
group by a.agescore,t1.lx)t2)t3
where t3.rn<=3






movies ratings users age


ratings users movie
uid mid    aid,type

基础 方式 两两关联 没有优化 job数量是最大的
select age.agescore,t4.type
from(
select t3.aid aid,t3.type type
from(
   select t2.aid aid,t2.type type,t2.num num,row_number() over(partition by t2.aid order by num desc) rk
   from(
      select t1.aid aid,t1.type type,count(*) num
      from(
         select users.aid aid,mov.type type
         from ratings rat,users,movies mov
         where rat.uid=users.uid and rat.mid=mov.mid) t1
      group by t1.aid,t1.type) t2 ) t3
where t3.rk<4) t4,age
where t4.aid=age.aid;

最终目标 1sql 做所有
sql编程 sql语句由复杂的sql嵌套组成  能在一次sql当中 尽量的做做些事情 hive-》job 分布式-》提交 计算资源(数据在哪 需要哪些资源)->run

create table result5(
agescore string,
type string
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

insert into result5
select age.agescore,t2.type
from(
   select t1.aid aid,t1.type type,t1.num num,row_number() over(partition by t1.aid order by t1.num desc) rk
   from(
      select users.aid aid,mov.type type,count(*) num
      from ratings rat,users,movies mov
      where rat.uid=users.uid and rat.mid=mov.mid
      group by users.aid,mov.type) t1 ) t2,age
where t2.rk<4 and t2.aid=age.aid;



4.算出所有电影都有那些工作岗位的人评论过,相对应的人数是多少?

select count(*),j.jobname,r.mid
from
ratings r,users u,job j
where r.uid = u.uid and j.jid=u.jid
group by j.jobname,r.mid limit 10;











rat->user->job

select t1.mname,t1.jobname,count(*) num
from(
   select job.jobname jobname,mov.mname
   from ratings rat,users,job,movies mov
   where rat.uid=users.uid and users.jid=job.jid and rat.mid=mov.mid) t1
group by t1.jobname,t1.mname
order by t1.mname,num desc;

from
where
group by
having
select
order by
limit

5.根据电影上映时间,得出当年最受欢迎和最不受欢迎的 10个名单
select t5.year,m1.mname,t5.cnt1,m2.mname,t5.cnt2,t5.rn
from
(select t3.year,t3.mid mid1,t3.cnt cnt1,t4.mid mid2,t4.cnt cnt2,t3.rn
from
(select t2.year,t2.mid,t2.cnt,t2.rn
from
(select row_number() over (partition by t1.year order by t1.cnt desc) rn, t1.year,t1.mid,t1.cnt
from
(select getDate(m.mname) year,r.mid,count(*) cnt
from movies m,ratings r
where m.mid = r.mid
group by getDate(m.mname),r.mid)t1)t2
where t2.rn<11)t3
,
(select t2.year,t2.mid,t2.cnt,t2.rn
from
(select row_number() over (partition by t1.year order by t1.cnt )rn, t1.year,t1.mid,t1.cnt
from
(select getDate(m.mname) year,r.mid,count(*) cnt
from movies m,ratings r
where m.mid = r.mid
group by getDate(m.mname),r.mid)t1)t2
where t2.rn<11)t4
where t4.year=t3.year and t3.rn=t4.rn) t5,movies m1,movies m2
where m1.mid = t5.mid1 and t5.mid2=m2.mid




1.hive udf

add jar /home/data/myTime.jar;
create temporary function myTime as 'com.beiwang.project.MyTime';


2.每年最受欢迎和最不受欢迎 top10
最受欢迎top10
create table result6(
year string,
mname string,
rk string,
flag string
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

insert into result6
select t3.year,t3.mname,t3.rk,'good'
from(
   select t2.year year,t2.mname mname,row_number() over(partition by year order by num desc) rk
   from(
      select t1.year year,t1.mname mname,count(*) num
      from(
         select myTime(mov.mname) year,mov.mname mname
         from ratings rat,movies mov
         where rat.mid=mov.mid) t1
      group by t1.year,t1.mname ) t2 ) t3
where t3.rk<=10;

最不受欢迎top10

insert into result6
select t3.year,t3.mname,t3.rk,'bad'
from(
   select t2.year year,t2.mname mname,row_number() over(partition by year order by num) rk
   from(
      select t1.year year,t1.mname mname,count(*) num
      from(
         select myTime(mov.mname) year,mov.mname mname
         from ratings rat,movies mov
         where rat.mid=mov.mid) t1
      group by t1.year,t1.mname ) t2 ) t3
where t3.rk<=10;

sqoop hive --》 mysql


6.得出每个岗位 最喜欢的电影类型 前3名


7.每年 每个类型 最受欢迎的电影 前3名

(
group by year type
)
where rk<3

age.txt

1    "Under 18"
18    "18-24"
25    "25-34"
35    "35-44"
45    "45-49"
50    "50-55"
56    "56+"

job.txt

0    "other" or not specified
1    "academic/educator"
2    "artist"
3    "clerical/admin"
4    "college/grad student"
5    "customer service"
6    "doctor/health care"
7    "executive/managerial"
8    "farmer"
9    "homemaker"
10    "K-12 student"
11    "lawyer"
12    "programmer"
13    "retired"
14    "sales/marketing"
15    "scientist"
16    "self-employed"
17    "technician/engineer"
18    "tradesman/craftsman"
19    "unemployed"
20    "writer"

 

posted @ 2019-05-13 15:32  lilixia  阅读(729)  评论(0编辑  收藏  举报