hive笔记

hive笔记

#hive基本语法
-----------------------------------------------------
#http://dblab.xmu.edu.cn/blog/2440-2/
-- 不区分大小写
--建表  if not exists 
CREATE TABLE if not exists page_view(viewTime INT, userid BIGINT,
                page_url STRING, referrer_url STRING,
                friends ARRAY<BIGINT>, properties MAP<STRING, STRING>,
                ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(dt STRING, country STRING)	--分区
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS	--分桶
ROW FORMAT DELIMITED	--行默认分隔符
        FIELDS TERMINATED BY '1'	--字段分隔符
        COLLECTION ITEMS TERMINATED BY '2'
        MAP KEYS TERMINATED BY '3'
STORED AS SEQUENCEFILE;	--以二进制格式进行存储数据在hdfs上


SHOW TABLES;
SHOW TABLES 'page.*';
SHOW PARTITIONS page_view;  --列出表的分区,无分区则抛出错误。

DESCRIBE page_view;  --列出表的列和列的类型
DESCRIBE EXTENDED page_view;  --列出表的列和表的其他属性。这会打印很多信息,且输出的风格不是很友好,通常用于调试。
DESCRIBE EXTENDED page_view PARTITION (ds='2016-08-08');   --列出列和分区的所有属性。这也会打印出许多信息,通常也是用于调试。

ALTER TABLE old_table_name RENAME TO new_table_name;  --修改表名
ALTER TABLE old_table_name REPLACE COLUMNS (col1 TYPE, ...);  --修改列名(列类型需一致),不修改的列需附上,否则丢失
ALTER TABLE tab1 ADD COLUMNS (c1 INT COMMENT 'a new int column', c2 STRING DEFAULT 'def val');  --增加列

DROP TABLE if exists pv_users;  --删表
ALTER TABLE pv_users DROP PARTITION (ds='2016-08-08')  --删除分区

--加载数据到Hive表 -> 外部表
CREATE EXTERNAL TABLE page_view_stg(viewTime INT, userid BIGINT,
                page_url STRING, referrer_url STRING,
                ip STRING COMMENT 'IP Address of the User',
                country STRING COMMENT 'country of origination')
COMMENT 'This is the staging page view table'
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '44' 	--‘44’是逗号,‘12’是换页符
LINES TERMINATED BY '12'
STORED AS TEXTFILE
LOCATION '/user/data/staging/page_view';

hadoop dfs -put /tmp/pv_2016-06-08.txt /user/data/staging/page_view

FROM page_view_stg pvs
INSERT OVERWRITE TABLE page_view PARTITION(dt='2016-06-08', country='US')
SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip
WHERE pvs.country = 'US';


--加载数据到Hive表 -> 直接(格式一致),路径参数可以是一个目录(不含子目录),一个文件,或一个通配符
LOAD DATA LOCAL INPATH /tmp/pv_2016-06-08_us.txt INTO TABLE page_view PARTITION(date='2016-06-08', country='US')


--select
INSERT OVERWRITE TABLE user_active
SELECT user.*
FROM user
WHERE user.active = 1;


INSERT OVERWRITE TABLE xyz_com_page_views
SELECT page_views.*
FROM page_views
WHERE page_views.date >= '2008-03-01' AND page_views.date <= '2008-03-31' AND
      page_views.referrer_url like '%xyz.com';

--join
INSERT OVERWRITE TABLE pv_users
SELECT pv.*, u.gender, u.age
FROM user u JOIN page_view pv 
ON (pv.userid = u.id)
WHERE pv.date = '2008-03-03';


INSERT OVERWRITE TABLE pv_users
SELECT pv.*, u.gender, u.age
FROM user u FULL OUTER JOIN page_view pv 
ON (pv.userid = u.id)
WHERE pv.date = '2008-03-03';


INSERT OVERWRITE TABLE pv_friends
SELECT pv.*, u.gender, u.age, f.friends
FROM page_view pv JOIN user u ON (pv.userid = u.id) JOIN friend_list f ON (u.id = f.uid)
WHERE pv.date = '2008-03-03';


--union all
INSERT OVERWRITE TABLE actions_users
SELECT u.id, actions.date
FROM (
    SELECT av.uid AS uid
    FROM action_video av
    WHERE av.date = '2008-06-03'

    UNION ALL

    SELECT ac.uid AS uid
    FROM action_comment ac
    WHERE ac.date = '2008-06-03'
) actions JOIN users u 
ON(u.id = actions.uid);


--聚合(不可distinct作用于不同的列)
INSERT OVERWRITE TABLE pv_gender_agg
SELECT pv_users.gender, count(DISTINCT pv_users.userid), count(*), sum(DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;


--多表/文件插入
FROM pv_users
INSERT OVERWRITE TABLE pv_gender_sum  --插入hive
    SELECT pv_users.gender, count_distinct(pv_users.userid)
    GROUP BY pv_users.gender

INSERT OVERWRITE DIRECTORY '/user/data/tmp/pv_age_sum'  --插入HDFS文件系统
    SELECT pv_users.age, count_distinct(pv_users.userid)
    GROUP BY pv_users.age;


--动态分区插入
FROM page_view_stg pvs  --不便于维护
INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='US')
       SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip WHERE pvs.country = 'US'
INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='CA')
       SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip WHERE pvs.country = 'CA'
INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='UK')
       SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip WHERE pvs.country = 'UK';

FROM page_view_stg pvs  --多分区插入,目前只允许最后一列放置动态分区列,层级次序
INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country)
       SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip, pvs.country


--导出到本地文件
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/pv_gender_sum'
SELECT pv_gender_sum.*
FROM pv_gender_sum;


--抽样
--TABLESAMPLE(BUCKET 3 OUT OF 16) 表示第3个桶和第19个桶
--TABLESAMPLE(BUCKET 3 OUT OF 64 ON userid) 表示第3个桶的一半
INSERT OVERWRITE TABLE pv_gender_sum_sample
SELECT pv_gender_sum.*
FROM pv_gender_sum TABLESAMPLE(BUCKET 3 OUT OF 32);  --32个桶中选择第三个,必须是CLUSTERED BY修饰的列


--表的array、map列操作
CREATE TABLE array_table (int_array_column ARRAY<INT>);
SELECT pv.userid, pv.friends[2], size(pv.friends)
FROM page_view pv;

INSERT OVERWRITE page_views_map
SELECT pv.userid, pv.properties['page type'], size(pv.properties)
FROM page_views pv;

  

参考资料:

Hive 10、Hive的UDF、UDAF、UDTF

posted on 2020-12-09 16:05  iUpoint  阅读(98)  评论(0编辑  收藏  举报

导航