hadoop之hive基本操作
-- 清空表中的数据,保留表结构 truncate table tmp_userid; insert into tmp_userid values('123456'); -- 搜索库或表支持正则表达式 show tables 'sa*'; show tables in basename; -- 创建数据库时,默认位置是'/user/hive/warehouse/basename.db',可以创建表时指定物理位置 CREATE DATABASE BASENAME LOCATION '/path/to/hdfs/'; -- 查看数据库信息,含hdfs信息 describe database ycappdata; -- 查看表结构和存储信息 show create table tablename; describe extended ycappdata.tmp_userid_activity; -- 给数据库增加附属信息 create database basename with dbproperties ('creator'='wangbin','date'='2017-12-01'); -- 可以通过~查看 DESCRIBE DATABASE EXTENDED BASENAME; -- 递归删除数据库和数据库中的表 DROP DATABASE IF EXISTS BASENAME CASCADE; -- 查看表存储位置,并将数据put进hdfs hdfs://data01.ycapp.yiche.com:8020/user/hive/warehouse/ycappdata.db/tmp_userid hadoop fs -put /home/sa_cluster/wangbin/uid20180105.txt /user/hive/warehouse/ycappdata.db/tmp_userid/ -- 创建外部表,删除表并不会删除hdfs上的数据 create external table if not exists stocks( *) row format delimited fields terminated by ',' location '/path/to/hdfs/' -- 使用已有表创建外部表,管理表也可以这样复制 create external table if not exists stocks2 like stocks location ''; -- 规定查询分区表必须指定分区以及相反的情况 set hive.mapred.mode=strict; set hive.mapred.mode=nostrict; -- 查看表的分区,以及查看特定分区 show partitions tablename; show partitions tablename partition(dt='2017-12-01'); -- 给表增加一个分区 alter table log_messages add partition(year=2017,month=12,day=2); location '/path/to/hdfs'; -- 改变表的分区地址 alter table log_messages partition(year=2017,month=12,day=2); set location '/newpath/to/hdfs'; -- 查看分区表的地址 describe extended ycappdata.sa_daydau_detail partition (ctl_dt='2017-12-01'); -- 表重命名 alter table log_messages rename to log_msgs; -- 增加、修改、删除表分区 alter table log_messages drop if exists partition(year=2017,month=12,day=2); -- 修改列信息 alter table log_messages change column hms1 hms2 int ; alter table log_messages add column hms3 int; -- 还可以修改表属性和列属性 -- 从一个表查询数据并插入到分区表中 insert overwrite table employees partition(country='US',state='OR') select * from tablename; -- 动态分区插入数据,hive 根据select 语句的最后两列来确定分区字段的值 insert overwrite table employees partition(country,state) select ...,se.cnty,se.st from tablename se; set hive.exec.dynamic.partition=true;表示开启动态分区功能。还有一些其他的属性可以配置 -- 从表中导出数据 insert overwrite local directory '/dir/' select * from ; -- 从表中查询集合数据类型,array[0],map['key'],struct.key select subordinates[0],deductions['key'],address.city from employees; -- 使用表生成函数 select explode(subordinates) as sub from employees;
-- 扫描一次全表,执行多次操作 from history insert overwrite table1 select * where action='p1' insert overwrite table2 select * where action='p2' insert overwrite table3 select * where action='p3'; -- 创建视图 create view if not exists viewname(col1,col2) as select * from tablename; -- 创建索引,仅对country建索引,一张表的索引数据存储在另外一张表中 create index employees_index on table employees(country) as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with deferred rebuild in table employees_index_table; -- 显示索引 show formated index on employees; -- 删除索引 drop index employees_index on table employees; -- hive 数据分桶 create table weblog (user_id int,url string,source_ip string) partition by (dt string) cluster by (user_id) into 96 buckets; -- 设置hive为表分桶的默认reduce数,如果为false就需要手动指定buckets数,分桶时必须加cluster by set hive.enforce.bucketing=true; -> set mapred.reduce.tasks=96; -- 开启中间压缩,shuffle数据会减少 set hive.exec.compress.intermediate=true; -- 开启输出结果压缩 set hive.exec.compress.output=true; -- 设置输出压缩格式为Gzip set mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec; -- 旧文件访问频率很低,可以考虑进行归档,减少namenode的压力,缺点是查询效率会降低,也不会减少磁盘空间 -- 设置表为归档表,并将指定分区归档->.har,之后的语句可以进行反向操作,将数据从har文件提取出来重新放在hdfs set hive.archive.enabled=true; alter table hive_text archive partition(folder='docs'); alter table hive_text unarchive partition(folder='docs'); -- 使用表生成函数 select name,sub from employees lateral view explode(subordinates) subView as sub ;
#开窗函数,同时求分量和总量 select amount_percent, cat_name,brand_num, sum(amount_percent) over(partition by cat_name ) as amount_percent_num from (SELECT * FROM hive_temp_bad.dlyang_1234 order by cat_name) t #求数据及数据在总量的占比 select usertype ,round(volume/10000,2) as volume ,round(volume/sum(volume) over()*100,2) as rate from ( select usertype ,sum(volume) as volume from sale_dws.dws_hangran_volume_ans_df group by usertype )t