Chapter 8: Functions (UDF - User-Defined Functions)

1. Types of User-Defined Functions

Name                                            Characteristics                         Examples

UDF  (User-Defined Function)                    one row in, one row out                 like, rlike, if, upper

UDAF (User-Defined Aggregation Function)        aggregate: many rows in, one row out    max, min, count, avg

UDTF (User-Defined Table-Generating Function)   one row in, many rows out               lateral view explode()
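
  A quick sketch of each type using built-in functions (the table emp(name string, salary double, hobbies array<string>) is hypothetical):

    -- UDF : one output row per input row
    select upper(name) from emp;

    -- UDAF : many input rows aggregated into one output row
    select count(*), max(salary) from emp;

    -- UDTF : one input row expanded into many output rows
    select name, hobby
    from emp
    lateral view explode(hobbies) t as hobby;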

2. Steps to Write a UDF

  1. Create a Maven project and add the jar dependency

<dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.2</version>
        </dependency>
</dependencies>

  2. Create the UDF class

    1. Extend the appropriate base class

       UDF extends  : org.apache.hadoop.hive.ql.udf.generic.GenericUDF

       UDAF extends : org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver

       UDTF extends : org.apache.hadoop.hive.ql.udf.generic.GenericUDTF

      2. Implement the abstract methods (a minimal skeleton follows this list)

        1. initialize (initialization method) :

          1. Validates the number and data types of the function's arguments

          2. Specifies the function's return type

        2. evaluate (evaluation method) : holds the processing logic

        3. getDisplayString : returns the function description shown in explain output
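
      A minimal GenericUDF skeleton, for orientation only (the class and package names are placeholders; complete working examples appear in section 3):

package com.example.udf;  // placeholder package

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class MyUdfSkeleton extends GenericUDF {
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // 1. Validate argument count and types here
        if (arguments.length != 1) {
            throw new UDFArgumentException("expected exactly 1 argument");
        }
        // 2. Declare the return type (a Java String in this sketch)
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Processing logic goes here; get() unwraps the lazily evaluated argument
        Object value = arguments[0].get();
        return value == null ? null : value.toString();
    }

    @Override
    public String getDisplayString(String[] children) {
        return "my_udf(str)";  // shown in explain output
    }
}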

  3. Package the project

    Maven Projects -> package

  4. Upload the jar

    scp hive-udf-1.0-SNAPSHOT.jar root@gaocun:/root

  5. Load the jar

    add jar /root/hive-udf-1.0-SNAPSHOT.jar;

  6. Create a temporary function (session scope)

    create temporary function concat1 as "com.dxm.udf.ConcatDemo";

  7. Create a permanent function

       Syntax :

create function [db_name.]function_name as class_name
  [using jar|file|archive 'file_uri' [, jar|file|archive 'file_uri'] ];

  Example :

      create function mylen as "com.dxm.udf.getLength" using jar 'hdfs://gaocun:8020/hiveudf_lib/hive-udf-1.0-SNAPSHOT.jar';

      Note : for the HDFS protocol port, check fs.defaultFS in core-site.xml. A sketch of staging the jar in HDFS follows.
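
    A sketch of preparing that HDFS path (paths and locations are assumptions):

    # check the HDFS endpoint (fs.defaultFS) referenced by the jar URI
    grep -A1 'fs.defaultFS' $HADOOP_HOME/etc/hadoop/core-site.xml

    # upload the jar to the location referenced by 'using jar'
    hdfs dfs -mkdir -p /hiveudf_lib
    hdfs dfs -put /root/hive-udf-1.0-SNAPSHOT.jar /hiveudf_lib/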
 8. Drop a function

-- drop a (permanent) function
drop function [if exists] function_name;

-- drop a temporary function
drop temporary function [if exists] function_name;
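
  To verify what is registered, Hive's standard statements can be used (the function name matches the example in step 7):

    -- list functions matching a pattern
    show functions like 'mylen*';

    -- show usage info (taken from the @Description annotation, if present)
    describe function extended mylen;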

3. UDF Functions

  1. GenericUDF source

package org.apache.hadoop.hive.ql.udf.generic;

/**
 * A Generic User-defined function (GenericUDF) for the use with Hive.
 *
 * New user-defined GenericUDF classes need to extend this class.
 *
 * Advantages of GenericUDF over a plain UDF:
 *    1. It can accept arguments of complex types (array, map, struct) and also return complex types
 *    2. It can accept variable-length arguments
 *    3. It can accept an unlimited number of function signatures
 */

public abstract class GenericUDF implements Closeable {

  /**
   * A Defered Object allows us to do lazy-evaluation and short-circuiting.
   * GenericUDF use DeferedObject to pass arguments.
   */
  @InterfaceAudience.Public
  @InterfaceStability.Stable
  public static interface DeferredObject {
    void prepare(int version) throws HiveException;
    Object get() throws HiveException;
  };

  /**
   * A basic dummy implementation of DeferredObject which just stores a Java
   * Object reference.
   */
  public static class DeferredJavaObject implements DeferredObject {
    private final Object value;

    public DeferredJavaObject(Object value) {
      this.value = value;
    }

    @Override
    public void prepare(int version) throws HiveException {
    }

    @Override
    public Object get() throws HiveException {
      return value;
    }
  }

  /**
   * The constructor.
   */
  public GenericUDF() {
  }

  /**
   * Initialize this GenericUDF. This will be called once and only once per
   * GenericUDF instance.
   *
   * What initialize is used for:
   *     1. Check the number and data types of the UDF's arguments
   *     2. Specify the data type of the UDF's return value
   *
   * @param arguments
   *          The ObjectInspector for the arguments
   * @throws UDFArgumentException
   *           Thrown when arguments have wrong types, wrong length, etc.
   * @return The ObjectInspector for the return value
   */
  public abstract ObjectInspector initialize(ObjectInspector[] arguments)
      throws UDFArgumentException;

  /**
   * Additionally setup GenericUDF with MapredContext before initializing.
   * This is only called in runtime of MapRedTask.
   *
   * @param context context
   */
  public void configure(MapredContext context) {
  }

  

  /**
   * Evaluate the GenericUDF with the arguments.
   *
   * @param arguments
   *          The arguments as DeferedObject, use DeferedObject.get() to get the
   *          actual argument Object. The Objects can be inspected by the
   *          ObjectInspectors passed in the initialize call.
   *          1. The arguments arrive wrapped as deferred objects; call get() to obtain the actual value
   *          2. These values are described by the ObjectInspectors that were validated in initialize
   * @return The
   */
  public abstract Object evaluate(DeferredObject[] arguments)
      throws HiveException;

  /**
   * Get the String to be displayed in explain.
   * 
   */
  public abstract String getDisplayString(String[] children);

  /**
   * Close GenericUDF.
   * This is only called in runtime of MapRedTask.
   */
  @Override
  public void close() throws IOException {
  }

  
}

  2. Example 1

    Requirement : return the length of a given string (one argument)

package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * @author gaocun
 * @create 2022-01-30 12:28 AM
 * Function : compute the length of a given string
 * Steps
 * 1. Extend GenericUDF
 */
public class getLength extends GenericUDF {
    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        //1. Validate the number of arguments
        if (objectInspectors.length != 1) {
            throw new UDFArgumentException("wrong number of arguments: expected exactly 1");
        }

        //2. Declare the return type as a Java int
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        // Treat a null argument as length 0
        if (deferredObjects[0].get() == null) {
            return 0;
        }
        return deferredObjects[0].get().toString().length();
    }

    @Override
    public String getDisplayString(String[] strings) {
        return "getLength(str) - returns the length of str";
    }
}
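
    A usage sketch, assuming the jar is loaded and the class is registered as in section 2 (the temporary-function name len1 is arbitrary):

    add jar /root/hive-udf-1.0-SNAPSHOT.jar;
    create temporary function len1 as "com.dxm.udf.getLength";

    select len1('hello');                 -- 5
    select len1(cast(null as string));    -- 0 (nulls count as length 0)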

  3. Example 2

    Requirement : concatenate the given strings with the given separator (multiple arguments)

package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.Arrays;

/**
 * @author gaocun
 * @create 2022-02-02 11:05 AM
 * Function :
 * 1. Concatenate the given strings with the given separator
 * Example :
 * select ConcatDemo('ab','12','-') = ab-12
 */
@Description(name = "ConcatDemo",
            value = "_FUNC_(str1, str2, ... separator) - returns str1, str2, ... joined by separator",
            extended = "Returns NULL if any argument is NULL.\n"
                        + "Example:\n"
                        + "  > SELECT _FUNC_('abc', 'def','-') FROM src LIMIT 1;\n"
                        + "  'abc-def'")
public class ConcatDemo extends GenericUDF {
    public ConcatDemo() {
        System.out.println("ConcatDemo constructed");
    }

    // Initialization method
    // Note : called only once per UDF instance, at initialization time
    // Purpose :
    //      1. Check the argument types and the argument count
    //      2. Constrain the UDF's return type (i.e. declare it)

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        System.out.println("1-initialize called");
        System.out.println("initialize: argument count = " + arguments.length);
        System.out.println("initialize: argument types = " + Arrays.toString(arguments));
        // Needs at least one string plus the trailing separator
        if (arguments.length < 2) {
            throw new UDFArgumentException("ConcatDemo expects at least 2 arguments: str1 [, str2, ...], separator");
        }
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    // Evaluation method: holds the processing logic
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        System.out.println("2-evaluate called");
        // 1. Each argument is a DeferredObject; call get() to obtain the actual value.
        //    The last argument is the separator.
        String delimiter = arguments[arguments.length - 1].get().toString();
        StringBuilder str = new StringBuilder();
        for (int i = 0; i <= arguments.length - 2; i++) {
            str.append(arguments[i].get().toString());
            if (i < arguments.length - 2) {
                str.append(delimiter);
            }
        }
        return str.toString();
    }

    // Function info displayed in explain output
    @Override
    public String getDisplayString(String[] children) {
        System.out.println("3-getDisplayString called");
        return "ConcatDemo(str1, str2, ..., separator) - a UDF that accepts multiple arguments";
    }
}
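
    A usage sketch, assuming registration as the temporary function concat1 from section 2 step 6:

    create temporary function concat1 as "com.dxm.udf.ConcatDemo";

    select concat1('ab', '12', '-');      -- 'ab-12'
    select concat1('a', 'b', 'c', '/');   -- 'a/b/c'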

 

4. UDTF Functions

  1. GenericUDTF source

package org.apache.hadoop.hive.ql.udf.generic;


/**
 * A Generic User-defined Table Generating Function (UDTF)
 *
 * Generates a variable number of output rows for a single input row. Useful for
 * explode(array)...
 *
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class GenericUDTF {
  Collector collector = null;

  /**
   * Additionally setup GenericUDTF with MapredContext before initializing.
   * This is only called in runtime of MapRedTask.
   *
   * @param context context
   */
  public void configure(MapredContext mapredContext) {
  }

  public StructObjectInspector initialize(StructObjectInspector argOIs)
      throws UDFArgumentException {
    List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
    ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()];
    for (int i = 0; i < inputFields.size(); i++) {
      udtfInputOIs[i] = inputFields.get(i).getFieldObjectInspector();
    }
    return initialize(udtfInputOIs);
  }

  /**
   * Initialize this GenericUDTF. This will be called only once per instance.
   *
   * @param argOIs
   *          An array of ObjectInspectors for the arguments
   * @return A StructObjectInspector for output. The output struct represents a
   *         row of the table where the fields of the stuct are the columns. The
   *         field names are unimportant as they will be overridden by user
   *         supplied column aliases.
   *         1. The method returns a StructObjectInspector describing the output
   *         2. The struct carries the output columns' default names (aliases) and data types
   */
  @Deprecated
  public StructObjectInspector initialize(ObjectInspector[] argOIs)
      throws UDFArgumentException {
    throw new IllegalStateException("Should not be called directly");
  }

  /**
   * Give a set of arguments for the UDTF to process.
   *
   * @param args
   *          object array of arguments
   */
  public abstract void process(Object[] args) throws HiveException;

  /**
   * Called to notify the UDTF that there are no more rows to process.
   * Clean up code or additional forward() calls can be made here.
   */
  public abstract void close() throws HiveException;

  /**
   * Associates a collector with this UDTF. Can't be specified in the
   * constructor as the UDTF may be initialized before the collector has been
   * constructed.
   *
   * @param collector
   */
  public final void setCollector(Collector collector) {
    this.collector = collector;
  }

  /**
   * Passes an output row to the collector.
   *
   * @param o
   * @throws HiveException
   */
  protected final void forward(Object o) throws HiveException {
    collector.collect(o);
  }

}

  2. Example 1

    Requirement : split a comma-separated string into multiple rows (one argument)

package com.dxm.udtf;


import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * @author gaocun
 * @create 2022-01-31 4:45 PM
 * Requirement : split a comma-separated string into multiple rows (one argument)
 */

public class forEach extends GenericUDTF {
    private ArrayList<String> output = new ArrayList<String>();


    // Initialization method
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        System.out.println("forEach initialize called");

        //1. Default output column names; can be overridden by user-supplied aliases
        List<String> fileNames = new ArrayList<String>();
        fileNames.add("OneName");
        //fileNames.add("TwoName");

        //2. Output column data types
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        //fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fileNames, fieldOIs);
    }

    // Process one input row
    public void process(Object[] args) throws HiveException {
        System.out.println("processing a row");
        //1. Get the input value
        String input = args[0].toString();

        //2. Split the string on ","
        String[] strings = input.split(",");

        //3. Emit one output row per element
        for (String string : strings) {
            //1. Clear the reused row object
            output.clear();

            //2. Add the element as the single column value
            output.add(string);

            //3. Emit the row (forward the one-column row, not the whole split array)
            forward(output);
        }
    }

    // Release resources
    public void close() throws HiveException {

    }


}
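
    A usage sketch (the temporary-function name foreach1 and the table t(id int, cs string) are hypothetical):

    create temporary function foreach1 as "com.dxm.udtf.forEach";

    select foreach1('a,b,c');
    -- a
    -- b
    -- c

    -- typical use together with lateral view:
    select t.id, v.OneName
    from t
    lateral view foreach1(t.cs) v as OneName;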

  3. Example 2

    Requirement : iterate over multiple strings, emitting each string as its own column (multiple arguments)

package com.dxm.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * @author gaocun
 * @create 2022-02-04 11:59 AM
 * Requirement :
 * 1. Iterate over multiple comma-separated strings, emitting each string as its own column (multiple arguments)
 */
public class MoreForeach extends GenericUDTF {
    private ArrayList<String> output = new ArrayList<String>();

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        System.out.println("1-MoreForeach initialize called");

        String fileName = "Name_";

        //1. Default output column names; can be overridden by user-supplied aliases
        List<String> fileNames = new ArrayList<String>();

        //2. Output column data types
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();

        // One string column per argument: Name_1, Name_2, ...
        for (int i = 1; i <= argOIs.getAllStructFieldRefs().size(); i++) {
            fileNames.add(fileName + i);
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        }

        return ObjectInspectorFactory.getStandardStructObjectInspector(fileNames, fieldOIs);
    }


    public void process(Object[] args) throws HiveException {
        System.out.println("2-process called");

        //1. Split every argument on "," and track the longest split.
        //   (The original looped only args.length times, which dropped trailing
        //    elements of longer arguments; looping to the longest split fixes that.)
        String[][] split = new String[args.length][];
        int maxLen = 0;
        for (int j = 0; j < args.length; j++) {
            System.out.println("argument: " + args[j].toString());
            split[j] = args[j].toString().split(",");
            maxLen = Math.max(maxLen, split[j].length);
        }

        //2. Emit one row per index: one column per argument, padded with "0"
        for (int i = 0; i < maxLen; i++) {
            //1. Clear the reused row object
            output.clear();

            //2. Add one column value per argument
            for (String[] parts : split) {
                if (parts.length > i) {
                    output.add(parts[i]);
                } else {
                    output.add("0");
                }
            }

            //3. Emit the row
            forward(output);
        }
    }


    public void close() throws HiveException {

    }
}
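
    A usage sketch (the temporary-function name moreforeach is hypothetical); with the loop fix above, shorter arguments are padded with "0":

    create temporary function moreforeach as "com.dxm.udtf.MoreForeach";

    select moreforeach('a,b', '1,2,3');
    -- Name_1  Name_2
    -- a       1
    -- b       2
    -- 0       3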

 

5. UDAF Functions

 

6. Official documentation : https://cwiki.apache.org/confluence/display/Hive/HivePlugins

7. Exercise :

        Description :
                Input : a start date and an end date
                Returns : the array of dates [start date ... end date]
        Example :
            -- load the UDF jar
            add jar /root/hive-udf-1.0-SNAPSHOT.jar;
            create temporary function getdays as "com.dxm.udf.getdays";

            select  getdays('2021-06-05','2021-06-08');
            Result : ["2021-06-05","2021-06-06","2021-06-07","2021-06-08"]

-- Source

package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.text.ParseException;


/**
 * @author gaocun
 * @create 2022-02-17 6:17 PM
 */
public class getdays extends GenericUDF {
    private ArrayList<Text> result = new ArrayList<Text>();
    private SimpleDateFormat simpleFormat = new SimpleDateFormat("yyyy-MM-dd");


    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        System.out.println("initialize called");
        // The function's return type is an array of strings
        return ObjectInspectorFactory
                    .getStandardListObjectInspector(PrimitiveObjectInspectorFactory
                                .writableStringObjectInspector);
    }

    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        System.out.println("evaluate called");
        //1. Get the start date
        String start = arguments[0].get().toString();

        //2. Get the end date
        String end = arguments[1].get().toString();

        //3. Clear the reused ArrayList
        result.clear();
        try {
            /* day difference between the two dates */
            Date fromDate1 = simpleFormat.parse(start);
            Date toDate1 = simpleFormat.parse(end);
            long from1 = fromDate1.getTime();
            long to1 = toDate1.getTime();
            int days = (int) ((to1 - from1) / (1000 * 60 * 60 * 24));
            Calendar cd = Calendar.getInstance();

            for (int i = 0; i <= days; i++) {
                cd.setTime(fromDate1);
                cd.add(Calendar.DATE, i); // advance i days from the start date
                result.add(new Text(simpleFormat.format(cd.getTime())));
            }
        } catch (ParseException e) {
            System.out.println("报错了。。。");
            e.printStackTrace();
        }

        return result;
    }

    public String getDisplayString(String[] children) {
        return "getdays(start_date, end_date) - returns the array of dates from start_date through end_date";
    }
}

 
