Hive Custom Functions
1. Hive Custom Functions: UDF
package my_len;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/* User-defined functions fall into three categories:
(1) UDF (User-Defined Function)
    one row in, one row out
(2) UDAF (User-Defined Aggregation Function)
    aggregate functions: many rows in, one row out,
    e.g. count/max/min
(3) UDTF (User-Defined Table-Generating Function)
    one row in, many rows out,
    e.g. lateral view explode()
Implementation steps:
(1) Extend the class Hive provides:
    org.apache.hadoop.hive.ql.udf.generic.GenericUDF
    org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
(2) Implement the abstract methods of that class
(3) Create the function in the Hive CLI

This UDF computes the length of a given string, e.g.:
hive (default)> select my_len("abcd");
4
*/
public class my_len extends GenericUDF {

    /**
     * @param arguments ObjectInspectors describing the types of the input arguments
     * @return an ObjectInspector describing the return type
     * @throws UDFArgumentException if the number or type of the arguments is wrong
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Check the number of arguments (the null check must come before any dereference)
        if (arguments == null || arguments.length != 1) {
            throw new UDFArgumentLengthException("my_len takes exactly one argument");
        }
        // Check the argument type: only primitive types are accepted
        if (!arguments[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
            throw new UDFArgumentTypeException(0, "my_len only accepts a primitive argument");
        }
        // Return type: the ObjectInspector for Hive's wrapping of the Java int type
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    /**
     * Core processing method.
     *
     * @param arguments the function input, described by the ObjectInspectors obtained in initialize
     * @return the length of the input string
     * @throws HiveException
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Get the function argument
        Object o = arguments[0].get();
        // Treat a NULL input as length 0
        if (o == null) {
            return 0;
        }
        // Business logic: return the string length
        return o.toString().length();
    }

    /**
     * @param children string representations of the function's arguments
     * @return the text shown for this function call in EXPLAIN output
     */
    @Override
    public String getDisplayString(String[] children) {
        return "my_len(" + String.join(", ", children) + ")";
    }
}
2. Hive Custom Functions: UDTF
package my_split;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/* This UDTF splits a string on an arbitrary delimiter into individual words, e.g.:
hive (default)> select my_split("hello,world,hadoop,hive", ",");
hello
world
hadoop
hive
*/
public class my_split extends GenericUDTF {

    // Reused output row: one column per forward() call
    private final ArrayList<String> outlist = new ArrayList<String>();

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Get the list of input arguments
        List<? extends StructField> list = argOIs.getAllStructFieldRefs();
        // Check the number of arguments
        if (list == null || list.size() != 2) {
            throw new UDFArgumentLengthException("my_split takes exactly two arguments");
        }
        // Check the argument types: only primitive types are accepted
        for (int i = 0; i < list.size(); i++) {
            StructField field = list.get(i);
            if (!field.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
                throw new UDFArgumentTypeException(i, "my_split only accepts primitive arguments");
            }
        }
        // Output column names
        ArrayList<String> fieldNames = new ArrayList<String>();
        fieldNames.add("word");
        // Output column types
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Core processing method.
     *
     * @param args args[0] is the input string, args[1] is the delimiter
     * @throws HiveException
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Skip NULL input
        if (args == null) {
            return;
        }
        String string = args[0].toString();
        String delimiter = args[1].toString();
        // Note: String.split treats the delimiter as a regular expression
        String[] split = string.split(delimiter);
        // Emit one output row per word
        for (int i = 0; i < split.length; i++) {
            outlist.clear();
            outlist.add(split[i]);
            forward(outlist);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
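Once the jar containing this class has been added to Hive's classpath (the steps are shown in section 3 below), the UDTF can be registered and tried directly. A minimal sketch, assuming the class is packaged as my_split.my_split in the same myudf.jar used later:
hive (default)> create temporary function my_split as "my_split.my_split";
hive (default)> select my_split("hello,world,hadoop,hive", ",");
The variant below, my_split2, extends this idea by splitting each token again on an inner delimiter and emitting two columns.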
package my_split;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/* This UDTF splits a string on an outer delimiter, then splits each token on an inner
delimiter into a (word, num) pair, e.g.:
hive (default)> select my_split2("hello-2,world-2,hadoop-3,hive-6", ",", "-");
hello   2
world   2
hadoop  3
hive    6
*/
public class my_split2 extends GenericUDTF {

    // Reused output row: two columns per forward() call
    private final ArrayList<String> outlist = new ArrayList<String>();

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Get the list of input arguments
        List<? extends StructField> list = argOIs.getAllStructFieldRefs();
        // Check the number of arguments
        if (list == null || list.size() != 3) {
            throw new UDFArgumentLengthException("my_split2 takes exactly three arguments");
        }
        // Check the argument types: only primitive types are accepted
        for (int i = 0; i < list.size(); i++) {
            StructField field = list.get(i);
            if (!field.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
                throw new UDFArgumentTypeException(i, "my_split2 only accepts primitive arguments");
            }
        }
        // Output column names
        ArrayList<String> fieldNames = new ArrayList<String>();
        fieldNames.add("word");
        fieldNames.add("num");
        // Output column types
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Core processing method.
     *
     * @param args args[0] is the input string, args[1] is the outer delimiter,
     *             args[2] is the inner delimiter
     * @throws HiveException
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Skip NULL input
        if (args == null) {
            return;
        }
        String string = args[0].toString();
        String outerDelimiter = args[1].toString();
        String innerDelimiter = args[2].toString();
        // Note: String.split treats the delimiters as regular expressions
        String[] split = string.split(outerDelimiter);
        // Emit one (word, num) row per token, e.g. "world-2" -> ("world", "2");
        // this assumes every token contains the inner delimiter
        for (int i = 0; i < split.length; i++) {
            String[] pair = split[i].split(innerDelimiter);
            outlist.clear();
            outlist.add(pair[0]);
            outlist.add(pair[1]);
            forward(outlist);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
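Because a UDTF emits multiple rows (here with two columns), it is usually combined with lateral view, as mentioned in the overview above, so that its output can be joined back to the source rows. A minimal sketch, assuming the jar has been added as in section 3 below and assuming a hypothetical table t with a string column line holding values like "hello-2,world-2":
hive (default)> create temporary function my_split2 as "my_split.my_split2";
hive (default)> select line, word, num from t lateral view my_split2(line, ",", "-") tbl as word, num;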
3. Using the Custom Functions
# Create a datas directory under the Hive installation directory, build the project into a jar,
# and upload it to the server as /opt/module/hive/datas/myudf.jar
# Add the jar to Hive's classpath
hive (default)> add jar /opt/module/hive/datas/myudf.jar;
# Create a temporary function associated with the compiled Java class
hive (default)> create temporary function my_len as "my_len.my_len";
# The custom function can now be used in HQL
hive (default)> select ename, my_len(ename) ename_len from emp;
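A temporary function only exists for the current session and has to be re-created after reconnecting. Hive also supports permanent functions whose jar is loaded from a distributed path; a minimal sketch, assuming the jar has been copied to a hypothetical HDFS location:
hive (default)> create function my_len as "my_len.my_len" using jar 'hdfs://namenode:8020/user/hive/jars/myudf.jar';
The function is then recorded in the metastore under the current database, so add jar is no longer needed in every session.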
Never slack off.