Hive Custom Functions

1. Hive Custom Function: UDF

package my_len;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/*
User-defined functions fall into three categories:
(1) UDF (User-Defined Function)
	One row in, one row out.
(2) UDAF (User-Defined Aggregation Function)
	Aggregate function: many rows in, one row out.
	Examples: count/max/min.
(3) UDTF (User-Defined Table-Generating Function)
	One row in, many rows out.
	Example: lateral view explode().
Implementation steps:
(1) Extend one of the classes Hive provides:
 		org.apache.hadoop.hive.ql.udf.generic.GenericUDF
		org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
(2) Implement the abstract methods of that class.
(3) Create the function in the Hive CLI (see section 3).

Here we write a UDF that computes the length of a given string, e.g.:
hive (default)> select my_len("abcd");
4

*/
public class my_len extends GenericUDF {
    /**
     * @param arguments ObjectInspectors describing the types of the input arguments
     * @return an ObjectInspector describing the function's return type
     * @throws UDFArgumentException if the arguments are invalid
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Check the argument count (the null check must come before dereferencing).
        if (arguments == null || arguments.length != 1) {
            throw new UDFArgumentLengthException("my_len takes exactly one argument");
        }
        // Check the argument type.
        if (!arguments[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
            throw new UDFArgumentTypeException(0, "my_len only takes a primitive argument");
        }

        // The return-type ObjectInspector: Hive's wrapper around the Java int type.
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    /**
     * Core processing method, called once per row.
     *
     * @param arguments the argument values for this row, matching the inspectors from initialize()
     * @return the function result for this row
     * @throws HiveException on processing errors
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Get the function argument for this row.
        Object o = arguments[0].get();
        // A null input yields a length of 0.
        if (o == null) {
            return 0;
        }
        // Compute and return the length of the string form of the input.
        return o.toString().length();
    }

    /**
     * @param children string representations of the function's arguments
     * @return the text shown for this call in EXPLAIN output
     */
    @Override
    public String getDisplayString(String[] children) {
        return "my_len(" + String.join(", ", children) + ")";
    }
}
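
To sanity-check the UDF without deploying it, you can call initialize() and evaluate() directly, the way Hive does once per query and once per row. This is a minimal sketch, not part of the original post; it assumes hive-exec is on the classpath, and the class name my_len_test is invented for illustration.

package my_len;

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class my_len_test {
    public static void main(String[] args) throws Exception {
        my_len udf = new my_len();
        // Hive calls initialize() once, with one ObjectInspector per argument.
        udf.initialize(new ObjectInspector[]{
                PrimitiveObjectInspectorFactory.javaStringObjectInspector});
        // Hive then calls evaluate() once per row, wrapping each value in a DeferredObject.
        Object result = udf.evaluate(new DeferredJavaObject[]{
                new DeferredJavaObject("abcd")});
        System.out.println(result); // prints 4
    }
}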

2. Hive Custom Function: UDTF

package my_split;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/*
Here we write a UDTF that splits a string on an arbitrary delimiter into individual
words, one per output row, e.g.:
hive (default)> select my_split("hello,world,hadoop,hive", ",");

hello
world
hadoop
hive
*/
public class my_split extends GenericUDTF {
    // Reused output buffer: holds one output row's column values per forward() call.
    private final ArrayList<String> outlist = new ArrayList<String>();

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Get the list of input arguments.
        List<? extends StructField> list = argOIs.getAllStructFieldRefs();
        // Check the argument count.
        if (list == null || list.size() != 2) {
            throw new UDFArgumentLengthException("my_split takes exactly two arguments");
        }
        // Check the argument types.
        for (int i = 0; i < list.size(); i++) {
            StructField field = list.get(i);
            if (!field.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
                throw new UDFArgumentTypeException(i, "my_split only takes primitive arguments");
            }
        }
        // Output column names.
        ArrayList<String> fieldNames = new ArrayList<String>();
        fieldNames.add("word");
        // Output column ObjectInspectors, one per column.
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Core processing method, called once per input row.
     *
     * @param args the argument values for this row
     * @throws HiveException on processing errors
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Skip null input.
        if (args == null) {
            return;
        }
        String input = args[0].toString();
        String delimiter = args[1].toString();
        // Note: String.split() treats the delimiter as a regular expression.
        String[] words = input.split(delimiter);
        for (int i = 0; i < words.length; i++) {
            // Reuse the output buffer: clear it, add this word, emit one row.
            outlist.clear();
            outlist.add(words[i]);
            forward(outlist);
        }
    }

    @Override
    public void close() throws HiveException {
        // Nothing to clean up: no buffered rows or external resources.
    }

}
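
To sanity-check the UDTF locally, you can drive it the way Hive would: pass a struct ObjectInspector to initialize(), register a Collector to receive the rows that forward() emits, then call process() once per input row. This is a minimal sketch, not part of the original post; it assumes hive-exec is on the classpath, and the class name my_split_test is invented for illustration.

package my_split;

import java.util.Arrays;

import org.apache.hadoop.hive.ql.udf.generic.Collector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class my_split_test {
    public static void main(String[] args) throws Exception {
        my_split udtf = new my_split();
        // Hive describes the input row as a struct with one field per argument.
        udtf.initialize(ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("str", "sep"),
                Arrays.asList(
                        (ObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector)));
        // forward() hands each emitted row to the registered Collector.
        udtf.setCollector(new Collector() {
            @Override
            public void collect(Object row) {
                System.out.println(row); // e.g. [hello]
            }
        });
        udtf.process(new Object[]{"hello,world,hadoop,hive", ","});
        udtf.close();
    }
}

The second UDTF below, my_split2, extends this idea: it splits on an outer delimiter and then splits each token on an inner delimiter, emitting two columns per output row.
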
package my_split;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/*
A second UDTF, my_split2, splits on an outer delimiter and then splits each token on
an inner delimiter, emitting two columns per token, e.g.:
hive (default)> select my_split2("hello-2,world-2,hadoop-3,hive-6", ",", "-");

hello 2
world 2
hadoop 3
hive 6
*/
public class my_split2 extends GenericUDTF {
    // Reused output buffer: holds one output row's column values per forward() call.
    private final ArrayList<String> outlist = new ArrayList<String>();

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Get the list of input arguments.
        List<? extends StructField> list = argOIs.getAllStructFieldRefs();
        // Check the argument count.
        if (list == null || list.size() != 3) {
            throw new UDFArgumentLengthException("my_split2 takes exactly three arguments");
        }
        // Check the argument types.
        for (int i = 0; i < list.size(); i++) {
            StructField field = list.get(i);
            if (!field.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
                throw new UDFArgumentTypeException(i, "my_split2 only takes primitive arguments");
            }
        }
        // Output column names.
        ArrayList<String> fieldNames = new ArrayList<String>();
        fieldNames.add("word");
        fieldNames.add("num");
        // Output column ObjectInspectors, one per column.
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Core processing method, called once per input row.
     *
     * @param args the argument values for this row
     * @throws HiveException on processing errors
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Skip null input.
        if (args == null) {
            return;
        }
        String input = args[0].toString();
        String outerDelimiter = args[1].toString();
        String innerDelimiter = args[2].toString();
        // Note: String.split() treats the delimiters as regular expressions.
        String[] tokens = input.split(outerDelimiter);
        for (int i = 0; i < tokens.length; i++) {
            // Each token looks like "world-2": split it into (word, num).
            String[] parts = tokens[i].split(innerDelimiter);
            outlist.clear();
            outlist.add(parts[0]);
            outlist.add(parts[1]); // assumes every token contains the inner delimiter
            forward(outlist);
        }
    }

    @Override
    public void close() throws HiveException {
        // Nothing to clean up: no buffered rows or external resources.
    }

}
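
Beyond calling it directly as in the header comment, a UDTF is typically applied to every row of a table with lateral view (registration is covered in section 3 below). A sketch, assuming a hypothetical table t with columns id and line:

hive (default)> select id, word, num
              > from t lateral view my_split2(line, ",", "-") tmp as word, num;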

3. Using the Custom Functions

# Create a datas directory under the Hive home directory, build the project into a jar,
# and upload it to the server, e.g. /opt/module/hive/datas/myudf.jar
# Add the jar to Hive's classpath:
hive (default)> add jar /opt/module/hive/datas/myudf.jar;
# Create a temporary function bound to the compiled Java class:
hive (default)> create temporary function my_len as "my_len.my_len";
# The custom function can now be used in HQL:
hive (default)> select ename, my_len(ename) ename_len from emp;
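
The UDTFs are registered the same way. A sketch, assuming both classes were packaged into the same jar that was added above:

hive (default)> create temporary function my_split as "my_split.my_split";
hive (default)> create temporary function my_split2 as "my_split.my_split2";
hive (default)> select my_split("hello,world,hadoop,hive", ",");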

 
