Hive学习总结
环境配置
-
本人用的是mysql存储hive元数据,需要提前下载mysql
-
hive-env.sh
# Set HADOOP_HOME to point to a specific hadoop install directory export HADOOP_HOME=/export/servers/hadoop-2.7.5 # Hive Configuration Directory can be controlled by: export HIVE_CONF_DIR=/export/servers/apache-hive-2.1.1-bin/conf
-
hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <property> <name>javax.jdo.option.ConnectionUserName</name> <value>root</value> </property> <property> <name>javax.jdo.option.ConnectionPassword</name> <value>root</value> </property> <property> <name>javax.jdo.option.ConnectionURL</name> <value>jdbc:mysql://node03:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false</value> </property> <property> <name>javax.jdo.option.ConnectionDriverName</name> <value>com.mysql.jdbc.Driver</value> </property> <property> <name>hive.metastore.schema.verification</name> <value>false</value> </property> <property> <name>datanucleus.schema.autoCreateAll</name> <value>true</value> </property> <property> <name>hive.server2.thrift.bind.host</name> <value>node03</value> </property> </configuration>
Hive常用命令
-
进入Hive命令模式
bin/hive #无需进入hive模式执行hive命令 bin/hive -e 'show databases'
-
创建(外部)表并指定字段分隔符
create [external] table student( sid string, name string ) [partitioned by (year string,month string,day string)] row format delimited fields terminated by '\t';
-
加载数据
load data [local] inpath '/hivedatas/student.csv' [overwrite] into table student [partition (key='')]; #分桶表加载数据 #创建分桶表 create table course (c_id string,c_name string,t_id string) clustered by(c_id) into 3 buckets row format delimited fields terminated by '\t'; #创建临时表(字段名要对应) create table course_common (c_id string,c_name string,t_id string) row format delimited fields terminated by '\t'; #临时表加载数据 load data local inpath '/export/servers/hivedatas/course.csv' into table course_common; #分桶表加载数据 insert overwrite table course select * from course_common cluster by(c_id);
-
查询语句
#语法基本上与SQL一样,增加了分区和分桶的概念,底层通过多次的MapReduce实现 select [distinct] sid from student [where sid='123'] [group by sid [having name like '李%']] [cluster by sid | [distribute by sid] [sort by | order by sid] ] [limit 1,10]; #group by命令会按照指定的关键字进行分组计算,计算效率较高,但如果有的关键字数量比较少、有的比较多,可能会造成数据倾斜,拖慢整体的效率。 #distribute by相当于MapReduce中的分区,sort by是Map阶段完成后进行局部排序;order by是全局排序;当distribute 和 sort的字段相同时可以用cluster by替换,但此时排序顺序只能是升序(正序),无法指定desc。
-
查看数据库、表信息
desc database 数据库名; desc [formatted] student;
-
查询结果写入本地文件
insert overwrite local directory '/data/' select * from student;
-
设置reduce个数
set mapreduce.job.reduces=3;
-
开启分桶
set hive.enforce.bucketing=true;