Hive Notes
1. Download Hive. The installation tarball can be downloaded from Apache: http://mirror.bit.edu.cn/apache/hive/
2. Extract it in the hadoop01 container:
   cd /home/hadoop
   tar -zxf apache-hive-3.1.2-bin.tar.gz
3. Configure environment variables: vi ~/.bashrc
   export JAVA_HOME=/home/hadoop/javajdk18/jdk/
   export HADOOP_HOME=/home/hadoop/hadoop/
   export HIVE_HOME=/home/hadoop/hive
   export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin
4. Edit the Hive configuration file conf/hive-env.sh:
   export JAVA_HOME=/home/hadoop/javajdk18/jdk
   export HADOOP_HOME=/home/hadoop/hadoop
   export HIVE_HOME=/home/hadoop/hive
5. Add conf/hive-site.xml:
   <configuration>
     <property>
       <name>javax.jdo.option.ConnectionURL</name>
       <value>jdbc:mysql://192.168.15.45:3307/hivedb?createDatabaseIfNotExist=true</value>
     </property>
     <property>
       <name>javax.jdo.option.ConnectionDriverName</name>
       <value>com.mysql.jdbc.Driver</value>
     </property>
     <property>
       <name>javax.jdo.option.ConnectionUserName</name>
       <value>root</value>
     </property>
     <property>
       <name>javax.jdo.option.ConnectionPassword</name>
       <value>root</value>
     </property>
   </configuration>
6. Copy the default hive-default.xml:
   cp conf/hive-default.xml.template conf/hive-default.xml
7. Copy the MySQL JDBC driver (mysql-connector-java-5.1.39-bin.jar) into hive/lib. Download: https://dev.mysql.com/downloads/connector/j/
8. Initialize the metastore schema:
   schematool -initSchema -dbType mysql
9. Start Hive:
   hive
10. Create a regular table:
    create table mytable(id int, name string) row format delimited fields terminated by '\t';
    Load data: load data local inpath '/tmp/test/mytable.txt' overwrite into table mytable;
11. List tables:
    show tables;
12. Create an external table (point LOCATION at a subdirectory under the warehouse, not at the warehouse directory itself, or other tables' data will be wiped out):
    create external table if not exists pageview(
      pageid int,
      page_url string comment 'The page URL'
    )
    row format delimited fields terminated by ','
    location 'hdfs://ns1/user/hive/warehouse/pageview/';
13. Create a partitioned table:
    create table if not exists invites(
      id int,
      name string
    )
    partitioned by (ds string)
    row format delimited fields terminated by ','
    lines terminated by '\n'
    stored as textfile;
    Load data: load data local inpath '/tmp/test/invites.txt' overwrite into table invites partition (ds='201911');
    Another partitioned table, this time with complex column types:
    create table invites2(
      id int,
      name string,
      hobby array<string>,
      add map<string,string>)
    partitioned by (pt_d string)
    row format delimited fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':';
    Partition column names must not duplicate the table's own column names. (A sketch of querying a single partition appears at the end of these notes.)
    Reference on partition operations: https://blog.csdn.net/qq_36743482/article/details/78418343
14. Create a bucketed table:
    create table student(
      id int,
      age int,
      name string
    )
    partitioned by(stat_data string)
    clustered by(id) sorted by(age) into 2 buckets
    row format delimited fields terminated by ',';
    Bucket management: https://blog.csdn.net/freefish_yzx/article/details/77150714
    The insert below comes from that reference and assumes a bucketed table bck_student plus a student table that also has sex and department columns:
    insert into table bck_student select id,name,sex,age,department from student distribute by sex;
15. Add partitions:
    alter table invites add partition(ds='a') partition(ds='b');
16. Drop a partition:
    alter table invites drop partition(ds='a');
17. Rename a table:
    alter table tmp_student rename to bck_student2;
18. Add a new column:
    desc student;
    alter table student add columns (name1 string);
19. Replace (update) the column list:
    alter table student replace columns (id int, age int, name string);
20. Show commands:
    show tables;
    show databases;
    show partitions table_name;
    show functions;
    desc extended t_name;
    desc formatted table_name;
21. Load data into a partition:
    load data local inpath '/tmp/test/invites.txt' overwrite into table invites partition (ds='201911');
22. Export data to HDFS (a local-directory variant is sketched at the end of these notes):
    insert overwrite directory 'hdfs://ns1/user/hive/warehouse/mystudent' select * from bck_student;
23. Implementing Transform with a Python script
add FILE /tmp/my.py;
select transform(*) using 'python my.py' as (sno,sname,sex,sage,sdept) from jstudent;
=====================my.py
#!/usr/bin/python
import sys

# Read tab-separated rows from Hive on stdin; for each row, join the
# first three fields and the last two fields with ' -- ' and emit the
# two resulting strings as one tab-separated output line.
for line in sys.stdin:
    line = line.strip()
    a, b, c, d, e = line.split('\t')
    print('\t'.join([a + ' -- ' + b + ' -- ' + c, d + ' -- ' + e]))
=====================
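Note that my.py emits only two tab-separated fields per input row, so with the five-column AS clause above the trailing columns come back as NULL. As a hedged sketch, a variant whose output list matches what the script actually writes could look like this (the column names combined1 and combined2 are placeholders, not from the original notes):

add FILE /tmp/my.py;
select transform(sno, sname, sex, sage, sdept)
  using 'python my.py'
  as (combined1, combined2)
from jstudent;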
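The notes load data into partitions (steps 13 and 21) but never read it back. As a minimal sketch, assuming the invites table and its ds='201911' partition exist, a partition listing and a partition-pruned query look like this:

show partitions invites;
select id, name from invites where ds='201911';

Filtering on the partition column lets Hive scan only that partition's directory instead of the whole table.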
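Step 22 writes query results to an HDFS directory. As a sketch of the related local-directory form (the /tmp/export_student path is made up for illustration), the same data can also be exported to the local filesystem with an explicit field delimiter:

insert overwrite local directory '/tmp/export_student'
row format delimited fields terminated by ','
select * from bck_student;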