hive lab competition
create table user(
userid STRING,
sex STRING,
age INT,
career INT,
code STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';
LOAD DATA LOCAL INPATH '/home/hadoop03/rating/users.dat' overwrite into table user;
create table rating(
userid STRING,
movieid STRING,
rate INT,
tmpe TIMESTAMP
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';
LOAD DATA LOCAL INPATH '/home/hadoop03/rating/ratings.dat' overwrite into table rating;
create table movie(
movieid STRING,
moviename STRING,
movietype ARRAY<STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '|';
LOAD DATA LOCAL INPATH '/home/hadoop03/rating/movies.dat' overwrite into table movie;
// every age the highest movie
select aa.age,aa.mymovietype,aa.c from ( select jun.age,jun.mymovietype,jun.c,row_number() over (distribute by age sort by c desc) rownum from ( select xiong.age,mymovietype,count(mymovietype) as c from( select bear.age,mymovietype from ( select usr.age,mov.movietype from rating rat left join user usr on rat.userid = usr.userid left join movie mov on rat.movieid = mov.movieid where rat.rate=5) bear LATERAL VIEW explode(bear.movietype) movietype AS mymovietype) xiong group by xiong.age,xiong.mymovietype) jun ) aa where aa.rownum =1;
pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2001-01-01'),7)
select pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2014-09-21'),7) from rating limit 3;
select weekofyear(from_unixtime(tmpe,'yyyy-MM-dd')), day(from_unixtime(tmpe,'yyyy-MM-dd')) from rating limit 3;
// every day of week every movie type highest movie
select case xx.day when 0 then 7 else xx.day end, xx.mymovietype, xx.moviename from (select jun.day, jun.mymovietype, jun.moviename,jun.cou, row_number() over (distribute by jun.day, jun.mymovietype sort by jun.cou desc) rownum from (select bear.day,bear.mymovietype,bear.moviename,count(bear.tmpe) as cou from ( select * from (select pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2014-09-21'),7) as day,rat.tmpe,mov.movietype,mov.moviename from rating rat join movie mov on rat.movieid = mov.movieid where rat.rate=5 ) xiong lateral view explode (xiong.movietype) movietype as mymovietype) bear group by bear.day,bear.mymovietype,bear.moviename ) jun ) xx where xx.rownum = 1;
select movieid,moviename,mymovietype from movie lateral view explode (movietype) movietype as mymovietype limit 9;
select distinct(mymovietype) from (select day(from_unixtime(tmpe,'yyyy-MM-dd')) as day,rat.tmpe,mov.movietype,mov.moviename from rating rat join movie mov on rat.movieid = mov.movieid where rat.rate=5 ) xiong lateral view explode (xiong.movietype) movietype as mymovietype
===============================================================================================================================================================================
create table wifi(
phone STRING,
year STRING,
month STRING,
day STRING,
hour STRING,
minute STRING,
second STRING,
timezone STRING,
host STRING,
facility STRING,
service STRING,
mac STRING,
protocol STRING,
message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
load data local inpath '/home/hadoop03/rating/wifi_data.txt' overwrite into table wifi;
create table wifi2(
tt BIGINT,
mac STRING,
message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
insert overwrite table wifi2 select unix_timestamp(concat(year,'-',month,'-',day,' ',hour,':',minute,':',second)),mac,message from wifi;
1 select count(distinct(mac)) from wifi;
2 FROM ( SELECT A.mac, A.tt, MIN(B.tt - A.tt) AS tt FROM wifi2 A, wifi2 B WHERE A.mac = B.mac AND A.tt <= B.tt AND A.message LIKE '%association OK%' AND B.message LIKE '%deauthenticated%' GROUP BY A.mac, A.tt ) e select count(e.tt);
3 FROM ( SELECT A.mac, A.tt, MIN(B.tt - A.tt) AS tt FROM wifi2 A, wifi2 B WHERE A.mac = B.mac AND A.tt <= B.tt AND A.message LIKE '%association OK%' AND B.message LIKE '%deauthenticated%' GROUP BY A.mac, A.tt ) e select avg(e.tt);
4 Not sure.
The competition includes 2 parts, you have to work with you group members to finish 2 tests.
Hive 1:
We have a dvd shop providing the dvd rent service. Based on different age group or flavor, the dvd shop need do some recommendations to the customer according to below movie rating.
Here is the rate file contain 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 users.
For the format file please check the readme included in the zip package.
Here is the demand:
1. 根据不同年龄段统计出rate最高的电影类型
2. 统计出每周每天推荐度最高的不同电影类型的电影名称
Hive 2:
Background:
The retail customer wanted us to do some research, driven by an idea: It must be possible to bring the concepts of tracking users in the online world to retail stores. one of the most important key performance indicators is revenue per square metre. We thought about bringing in some new metrics. From a wider perspective, data is produced by various sensors. With a real store in mind we figured out possible sensors stores could use – customer frequency counters at the doors, the cashier system, free WiFi access points, video capturing, temperature, background music, smells and many more. While for many of those sensors additional hardware and software is needed, for a few sensors solutions are around, e.g. video capturing with face or even eye recognition. We talked about our ideas with executives and consultants from the retail industry and they confirmed our idea is interesting to persue.
Solution:
We thought the most interesting sensor data (that doesn’t require additional hardware/software) could be the WiFi access points. Especially given that many visitors will have WiFi enabled mobile phones. With it’s log files we should be able to answer at least the following questions for a particular store:
-
How many people visited the store (unique visits)?
-
How many visits did we have in total?
-
What is the average visit duration?
-
How many people are new vs. returning?
Here is the raw data sample:
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-AUTHENTICATE.indication(98:0c:82:dc:8b:15, OPEN_SYSTEM)
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-DELETEKEYS.request(98:0c:82:dc:8b:15)
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,authenticated
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,association OK (aid 2)
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,associated (aid 2)
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-ASSOCIATE.indication(98:0c:82:dc:8b:15)
1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-DELETEKEYS.request(98:0c:82:dc:8b:15)
1358757010,2013,1,21,9,30,10,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,deauthenticated
The description of the column is as follows:
iso_8601 year month day hour minute second timezone host facility_level service_level mac_address protocol message
We are interested in “authentication OK‘ and “deauthenticated” messages only. The messages from the router are not standardized (as are protocols such as TCP). We found that those two status messages are the closest ones to our understanding of a “login”/”logout” on the router.
Now we have the data we need to answer the following questions:
-
How many people visited the store (unique visitors)?
Note: Unlike the traditional customer frequency counter at the doors we have mac addresses at the log files that are unique for mobile phones. Supposed people do not change their mobile phones we can recognize unique visitors and not just visits. -
How many visits did we have?
-
What is the average visit duration?
-
How many people are new vs. returning?
Conclusion: