Hive基本使用

一、Hive基本使用

(一) 基本操作

1. 数据库相关

(1) 创建数据库

create database tm;
create database if not exists tm;

(2) 切换数据库

use tm;

(3) 查看数据库信息

-- 查看数据库信息
desc database tm;

-- 显示数据库详细信息（可以查看数据库属性信息）
desc database extended tm;

(4) 修改数据库信息

-- 修改数据库（属性信息）
alter database tm set dbproperties('create_time'='20210506');

(5) 删除数据库

-- 删除空数据库
drop database tm;
drop database if exists tm;

-- 删除非空数据库
drop database tm cascade;

2. 数据表相关

前提：解决“中文注释”乱码


-- ########### 解决 “建表时” 中文注释是乱码 ################

-- 在mysql中 hive元数据库中执行 以下 SQL，
-- 只有新建表的注释才会是中文，之前表的注释无法变为中文

-- 修改表字段注解和表注解
alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;

-- 修改分区字段注解
alter table PARTITION_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8 ;
alter table PARTITION_KEYS modify column PKEY_COMMENT varchar(4000) character set utf8;

-- 修改索引注解
alter table INDEX_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;

(1) 创建内部表

-- 创建“内部表” （管理表）
    -- 指定 字段分隔符
    -- 指定 行分隔符
    -- 指定 表属性信息
    
create table if not exists tm_data(
    id bigint comment '自增ID',
    ann_num int comment '公告期号',
    ann_date date comment '公告日期',
    ann_id int comment '公告类型id',
    ann_type_code string comment '公告类型代码',
    ann_type string comment '公告类型',
    ncl_code smallint comment '国际分类号',
    reg_number string comment '注册号',
    reg_name string comment '申请人',
    tm_name string comment '商标名称',
    pic_url string comment '图片链接',
    page_no int comment '页码',
    down_path string comment '下载地址',
    data_index string comment '数据库位置索引',
    add_time date comment '新增时间',
    mod_time date comment '修改时间',
    info_id string comment '公告id'
)
comment '公告数据列表'
row format delimited fields terminated by '\t'
lines terminated by '\n';
TBLPROPERTIES ('creator'='yzy', 'crate_time'='2021-05-08');

(2) 创建外部表

-- 创建“外部表”

create external table if not exists tm_data(
    id bigint comment '自增ID',
    ann_num int comment '公告期号',
    ann_date date comment '公告日期',
    ann_id int comment '公告类型id',
    ann_type_code string comment '公告类型代码',
    ann_type string comment '公告类型',
    ncl_code smallint comment '国际分类号',
    reg_number string comment '注册号',
    reg_name string comment '申请人',
    tm_name string comment '商标名称',
    pic_url string comment '图片链接',
    page_no int comment '页码',
    down_path string comment '下载地址',
    data_index string comment '数据库位置索引',
    add_time date comment '新增时间',
    mod_time date comment '修改时间',
    info_id string comment '公告id'
)
comment '公告数据列表'
row format delimited fields terminated by '\t'
lines terminated by '\n';
TBLPROPERTIES ('creator'='yzy', 'crate_time'='2021-05-08');

(3) 更改内【外】部表

-- 将 内部表 改为外部表
alter table tm_data set tblproperties('EXTERNAL'='TURE');

-- 将 外部表 改为内部表
alter table tm_data set tblproperties('EXTERNAL'='FALSE');

(4) 创建分区表

-- 创建 “分区表”

-- 注意：分区字段不能存在于表字段中

create table if not exists tm_data_partition(
    id bigint comment '自增ID',
    -- ann_num int comment '公告期号',
    ann_date date comment '公告日期',
    ann_id int comment '公告类型id',
    ann_type_code string comment '公告类型代码',
    ann_type string comment '公告类型',
    ncl_code smallint comment '国际分类号',
    reg_number string comment '注册号',
    reg_name string comment '申请人',
    tm_name string comment '商标名称',
    pic_url string comment '图片链接',
    page_no int comment '页码',
    down_path string comment '下载地址',
    data_index string comment '数据库位置索引',
    add_time date comment '新增时间',
    mod_time date comment '修改时间',
    info_id string comment '公告id'
)
comment '公告数据列表'
partitioned by (ann_num int comment '公告期号')
row format delimited fields terminated by '\t'
lines terminated by '\n';
TBLPROPERTIES ('creator'='yzy', 'crate_time'='2021-05-07');

(5) 操作分区

-- 分区字段可以作为表的普通字段进行查询
select * from tm_data where ann_num=1752;

-- 查看分区信息
show partitions tm_data;

-- 增加分区
alter table tm_data add if not exist partition(ann_num=1753);
alter table tm_data add if not exist partition(ann_num=1753),partition(ann_num=1754);

-- 删除分区
alter table tm_data drop if exist partition(ann_num=1753);
alter table tm_data drop if exist partition(ann_num=1753),partition(ann_num=1754);

(6) 创建分桶表

-- 创建分桶表时，指定桶的个数和分桶的依据字段
/**
	分桶的实质是将数据分成不同的文件。Hive中的分桶和Hadoop中的Reduce个数相同。
	分桶表的作用: 1.提高查询效率 2.提高采样效率 3.提高join的效率
	分桶原理：是把hive单个数据文件，拆分成均匀大小的数据
*/

-- hive 连接中开启分桶功能
set hive.enforce.bucketing=true;   -- 默认是flase
set mapreduce.job.reduces=4;  -- 默认是-1

-- 创建分桶表：
-- 注：分桶字段 必须是必须在表字段中
create table tm_data_buck(
    id BIGINT,
    ann_num SMALLINT,
    reg_number STRING,
)
clustered by(id)
sorted by(id DESC)
into 4 buckets
row format delimited fields terminated by '\t'
lines terminated by '\n';

-- 通过 select into 的方式向 分桶表中插入数据
insert overwrite table tm_data_buck select * from tm_data cluster by(id);

(7) 查看表信息

-- 查看表字段信息
desc tm_data;

-- 查看详细的表字段信息;
desc formatted tm_data;

-- 查看表建表语句;
show create table tm_data;

(8) 根据已有数据表创建数据表

-- 复制表结构
create table if not exists tm_data2 like tm_data;

-- 复制表结构及数据[取部分字段]

-- 1. 未指定分隔符
create table if not exists tm_data3 as select id, ann_num, ann_date from tm_data;

-- 2. 指定分隔符
create table if not exists tm_data4 row format delimited fields terminated by '\t' lines terminated by '\n' as select id, ann_num, ann_date from tm_data;

-- 3. 指定表备注信息，不能指定要创建表的字段及备注信息
create table if not exists tm_data5 comment '表备注' row format delimited fields terminated by '\t' lines terminated by '\n' as select id, ann_num, ann_date from tm_data;

(9) 创建复合类型字段的数据表

-- 数据准备，两条数据
-- 新建test.txt文件，内容如下：导入hive表中
songsong,bingbing_lili,xiao song:18_xiaoxiao song:19,hui long guan_beijing
yangyang,caicai_susu,xiao yang:18_xiaoxiao yang:19,chao yang_beijing

-- 导入数据
load data local inpath '/root/test.txt' into table student;


-- 指定 array，map，struct 等复合类型的 分隔符
create table if not exists student(
	name string,
	friend array<string>,
	children map<string,int>,
	address struct<street:string,city:string>
)
row format delimited fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n';

-- 查询语句
select name,friend[1],children["xiaoxiao"],address.street,address.city from student;

(10) 插入数据的几种方式

-- 1. 文本文件导入

/** hive> load data [local] inpath '/opt/module/datas/student.txt' overwrite | into table student [partition (partcol1=val1,…)];
（1）load data:表示加载数据
（2）local:表示从本地加载数据到hive表；否则从HDFS加载数据到hive表
（3）inpath:表示加载数据的路径
（4）overwrite:表示覆盖表中已有数据，否则表示追加
（5）into table:表示加载到哪张表
（6）student:表示具体的表
（7）partition:表示上传到指定分区
*/
load data local inpath '/root/test.txt' overwrite into table student;

-- 2. 通过select查询语句加载数据
create table if not exists student2 like student;

insert into table student2 select id, ann_num, ann_date from tm_data;

-- 3. 创建表时通过select查询语句加载数据 
create table if not exists student3 as select * from student;

-- 4. 创建表的时候通过 'location' 指定加载数据路径
-- 	创建表
create table if not exists student(
	name string,
	friend array<string>,
	children map<string,int>,
	address struct<street:string,city:string>
)
row format delimited fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n'
location '/user/hive/warehouse/student_location';

-- 	上传数据至HDFS
dfs -put '/root/test.txt' /user/hive/warehouse/student_location

-- 查看结果
select name,friend[1],children["xiaoxiao"],address.street,address.city from student_location;

(11) 数据导出的几种方式

-- 1．将查询的结果导出到本地
insert overwrite local directory '/root/export/student' select * from student;

-- 2．将查询的结果格式化导出到本地
insert overwrite local directory '/root/export/student1' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' select * from student;

-- 3．将查询的结果导出到HDFS上(没有local)
insert overwrite directory '/user/student2' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' select * from student;

-- 4. Hadoop命令导出到本地
dfs -get /user/hive/warehouse/student/month=201709/000000_0 /root/student3.txt;

-- 5. Hive Shell 命令导出
-- 基本语法：（hive -f/-e 执行语句或者脚本 > file）
hive -e 'select * from default.student;' > /opt/module/datas/export/student4.txt;

-- 6. Export导出到HDFS上
export table default.student to '/user/export/student';

(12) 修改表字段

-- 1. 修改表名称
ALTER TABLE table_name RENAME TO new_table_name

-- 2. 修改列信息,修改某字段名称，类型并将字段调到 某字段之后
ALTER TABLE table_name CHANGE COLUMN old_column_name new_column_name new_column_type COMMENT 'comment' AFTER columnOne;

-- 3. 增加列字段 “ADD 是代表新增一字段，字段位置在所有列后面(partition 列前)”
ALTER TABLE table_name ADD COLUMNS(column string);

-- 4. 替换列 “REPLACE 则是 表示替换表中所有字段（修改不修改都需要写上字段的类型）”
ALTER TABLE table_name REPLACE COLUMNS(column1 string, column2 string, column3 string);

(二) 详解Hive的分区

1. 分区简介

Hive的分区方式：由于Hive实际是存储在HDFS上的抽象，Hive的一个分区名对应一个目录名，子分区名就是子目录名，并不是一个实际字段。

一）hive中支持两种类型的分区：

静态分区SP（static partition）
动态分区DP（dynamic partition）

2. 静态分区和动态分区的区别

两者主要的差别在于：

加载数据的时候，静态分区需要手动设定不同的分区，按分区分别导入数据；

动态分区不需要指而定分区 key 的值，会根据 key 对应列的值自动分区写入，如果该列值对应的分区目录还没有创建，会自动创建并写入数据。

3. 静态和动态分区的简单使用

-- 1. 新建分区表
CREATE TABLE partition_tm_data_01 
(
    id BIGINT,
    reg_number STRING,
)
PARTITIONED BY (ann_num SMALLINT, ann_type_code STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
lines terminated by '\n';

-- 2.使用静态分区时，必须指定分区的值
-- 建表
create table if not exists partition_tm_data_02 like partition_tm_data_01;

-- 导入数据【只能导入 ann_num=1743 and ann_type_code='TMZCSQ' 的数据】
insert overwrite table partition_tm_data_02 partition (ann_num=1743, ann_type_code='TMZCSQ')
select id, reg_number from partition_table001 where ann_num=1743 and ann_type_code='TMZCSQ';

-- 3. 使用动态分区
-- 注意：用 select 选择数据导入动态分区时，要把关键字放在最后面，因为动态分区默认以最后一个关键字作为分区关键字

-- 开启hive动态分区
set hive.exec.dynamic.partition=true;	-- (可通过这个语句查看：set hive.exec.dynamic.partition;)
set hive.exec.dynamic.partition.mode=nonstrict;	-- （strict 要求至少有一个静态分区， nonstrict 可以都是动态分区）
set hive.exec.max.dynamic.partitions=100000;	-- (如果自动分区数大于这个参数，将会报错)
set hive.exec.max.dynamic.partitions.pernode=100000;

-- 建表
create table if not exists partition_tm_data_03 like partition_tm_data_01;
-- 导入数据【可以将 ann_num=1743 下所有的数据】
insert overwrite table partition_tm_data_03 partition (ann_num, ann_type_code) 
select * from partition_table001 where ann_num=1743;

-- 4. 动态分区和静态分区混合使用

-- 建表
create table if not exists partition_tm_data_04 like partition_tm_data_01;

-- 导入数据
INSERT OVERWRITE TABLE partition_tm_data_04 PARTITION (ann_num=1743, ann_type_code)
SELECT id, reg_number, /*ann_num,*/ ann_type_code FROM partition_tm_data_01 WHERE ann_num is not null and ann_type_code is not null;

-- 错误用法
INSERT OVERWRITE TABLE partition_tm_data_04 PARTITION (ann_num, ann_type_code='TMZCSQ')
SELECT id, reg_number, ann_num/*, ann_type_code*/ FROM partition_tm_data_01 WHERE ann_num is not null and ann_type_code='TMZCSQ';

-- 5. 对于CREATE-AS语句，目标表的schema无法完全的从select语句传递过去。这时需要在create语句中指定partition列

CREATE TABLE partition_tm_data_05 (
    id BIGINT,
    reg_number STRING,
)
PARTITIONED BY (ann_num SMALLINT, ann_type_code STRING) 
AS
SELECT id, reg_number, ann_num, ann_type_code FROM partition_tm_data_01
WHERE ann_num is not null and ann_type_code is not null;

-- 主分区自定义常量

CREATE TABLE partition_tm_data_06 (
    id BIGINT,
    reg_number STRING,
)
PARTITIONED BY (ann_num SMALLINT, ann_type_code STRING) 
AS
SELECT id, reg_number, 1756, ann_type_code FROM partition_tm_data_01
WHERE ann_num is not null and ann_type_code is not null;

上一篇dolphinscheduler文档

下一篇Hive数据压缩与存储格式

本文作者：柒小韩

本文链接：https://www.cnblogs.com/yzyang/p/15123279.html

posted @ 2021-08-10 13:14 柒小韩阅读(321) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

心随你动