hadoop之hive基本操作

-- 清空表中的数据，保留表结构
truncate table tmp_userid;
insert into tmp_userid values('123456');

-- 搜索库或表支持正则表达式
show tables 'sa*';
show tables in basename;

-- 创建数据库时，默认位置是'/user/hive/warehouse/basename.db',可以创建表时指定物理位置
CREATE DATABASE  BASENAME
LOCATION '/path/to/hdfs/';
-- 查看数据库信息，含hdfs信息
describe database ycappdata;

-- 查看表结构和存储信息
show create table tablename;
describe extended ycappdata.tmp_userid_activity;
-- 给数据库增加附属信息
create database basename 
with dbproperties ('creator'='wangbin','date'='2017-12-01');
-- 可以通过~查看
DESCRIBE DATABASE EXTENDED BASENAME;


-- 递归删除数据库和数据库中的表
DROP DATABASE IF EXISTS BASENAME CASCADE;


-- 查看表存储位置，并将数据put进hdfs
hdfs://data01.ycapp.yiche.com:8020/user/hive/warehouse/ycappdata.db/tmp_userid
hadoop fs -put /home/sa_cluster/wangbin/uid20180105.txt /user/hive/warehouse/ycappdata.db/tmp_userid/

-- 创建外部表，删除表并不会删除hdfs上的数据
create external table if not exists stocks(
*)
row format delimited fields terminated by ','
location '/path/to/hdfs/'

-- 使用已有表创建外部表,管理表也可以这样复制
create external table if not exists stocks2
like stocks
location '';

-- 规定查询分区表必须指定分区以及相反的情况
set hive.mapred.mode=strict;
set hive.mapred.mode=nostrict;

-- 查看表的分区,以及查看特定分区
show partitions tablename;
show partitions tablename partition(dt='2017-12-01');

-- 给表增加一个分区
alter table log_messages add partition(year=2017,month=12,day=2);
location '/path/to/hdfs';

-- 改变表的分区地址
alter table log_messages  partition(year=2017,month=12,day=2);
set location '/newpath/to/hdfs';

-- 查看分区表的地址
describe extended ycappdata.sa_daydau_detail partition (ctl_dt='2017-12-01');

-- 表重命名
alter table log_messages rename to log_msgs;
-- 增加、修改、删除表分区
alter table log_messages drop if exists partition(year=2017,month=12,day=2);
-- 修改列信息
alter table log_messages change column hms1 hms2 int ; 
alter table log_messages add column hms3 int;
-- 还可以修改表属性和列属性

-- 从一个表查询数据并插入到分区表中
insert overwrite table employees
partition(country='US',state='OR')
select * from  tablename;

-- 动态分区插入数据,hive 根据select 语句的最后两列来确定分区字段的值
insert overwrite table employees
partition(country,state)
select ...,se.cnty,se.st 
from tablename se;

set hive.exec.dynamic.partition=true;表示开启动态分区功能。还有一些其他的属性可以配置

-- 从表中导出数据
insert overwrite local directory '/dir/'
select * from ;

-- 从表中查询集合数据类型,array[0],map['key'],struct.key
select subordinates[0],deductions['key'],address.city from employees;


-- 使用表生成函数
select explode(subordinates) as sub  from employees;

-- 扫描一次全表，执行多次操作
from history 
insert overwrite table1 select *  where action='p1'
insert overwrite table2 select *  where action='p2'
insert overwrite table3 select *  where action='p3';

-- 创建视图
create view if not exists viewname(col1,col2)
as select * from tablename;

-- 创建索引,仅对country建索引，一张表的索引数据存储在另外一张表中
create index employees_index
on table employees(country)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
with deferred rebuild
in table employees_index_table;

-- 显示索引
show formated index on employees;

-- 删除索引
drop index employees_index on table employees;

-- hive 数据分桶
create table weblog (user_id int,url string,source_ip string)
partition by (dt string)
cluster by (user_id) into 96 buckets;

-- 设置hive为表分桶的默认reduce数，如果为false就需要手动指定buckets数,分桶时必须加cluster by
set hive.enforce.bucketing=true;
->
set mapred.reduce.tasks=96;

-- 开启中间压缩，shuffle数据会减少
set hive.exec.compress.intermediate=true;
-- 开启输出结果压缩
set hive.exec.compress.output=true;

-- 设置输出压缩格式为Gzip
set  mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;

-- 旧文件访问频率很低，可以考虑进行归档，减少namenode的压力，缺点是查询效率会降低，也不会减少磁盘空间
-- 设置表为归档表,并将指定分区归档->.har，之后的语句可以进行反向操作，将数据从har文件提取出来重新放在hdfs
set hive.archive.enabled=true;
alter table hive_text archive partition(folder='docs');
alter table hive_text unarchive partition(folder='docs');


-- 使用表生成函数

select name,sub from employees
lateral view explode(subordinates) subView as  sub ;

#开窗函数，同时求分量和总量
select amount_percent, cat_name,brand_num,
sum(amount_percent) over(partition by cat_name  ) as amount_percent_num
  from
(SELECT * FROM hive_temp_bad.dlyang_1234 order by cat_name) t

#求数据及数据在总量的占比
select
    usertype
    ,round(volume/10000,2) as volume
    ,round(volume/sum(volume) over()*100,2) as rate
from (
    select usertype
        ,sum(volume) as volume
    from sale_dws.dws_hangran_volume_ans_df
    group by usertype
   
)t

posted @ 2018-01-09 16:57 Mars.wang 阅读(598) 评论(0) 编辑收藏举报

刷新页面返回顶部