Chapter 8: Functions [User-Defined Functions (UDF)]
1. Types of user-defined functions
| Name | Characteristics | Examples |
| --- | --- | --- |
| UDF (User-Defined Function) | One row in, one row out | like, rlike, if, upper |
| UDAF (User-Defined Aggregation Function) | Aggregation: many rows in, one row out | max, min, count, avg |
| UDTF (User-Defined Table-Generating Function) | Table-generating: one row in, many rows out | lateral view explode() |
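To see the three categories side by side in HiveQL (a minimal sketch; table t and column c are assumed to exist):

-- UDF: one row in, one row out
select upper('hive');                  -- HIVE
-- UDAF: many rows in, one row out
select count(c) from t;
-- UDTF: one row in, many rows out
select explode(array('a','b','c'));    -- three rows: a, b, c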
2. Steps to write a UDF
1. Create a Maven project and add the jar dependency
<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>3.1.2</version>
    </dependency>
</dependencies>
2. Build the UDF class
1. Extend the matching base class
UDF extends : org.apache.hadoop.hive.ql.udf.generic.GenericUDF
UDAF extends : org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver
UDTF extends : org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
2. Implement the abstract methods (a minimal skeleton follows this list)
1. initialize (initialization method) :
1. Validates the number and data types of the function's arguments
2. Specifies the data type of the function's return value
2. evaluate (evaluation method) : holds the actual processing logic
3. getDisplayString : returns the function description shown in explain output
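A minimal GenericUDF skeleton covering those three methods (a sketch; the class name MyUdfSkeleton is made up for illustration):

package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

// Hypothetical skeleton, not one of the examples below
public class MyUdfSkeleton extends GenericUDF {

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // 1. Validate the argument count (and types) here
        if (arguments.length != 1) {
            throw new UDFArgumentException("expects exactly one argument");
        }
        // 2. Declare the return type (here: string)
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // The processing logic goes here
        return arguments[0].get() == null ? null : arguments[0].get().toString();
    }

    @Override
    public String getDisplayString(String[] children) {
        // Text shown in explain output
        return "myudfskeleton(str)";
    }
}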
3. Package the jar
Maven Projects -> package
4. Upload the jar
scp hive-udf-1.0-SNAPSHOT.jar root@gaocun:/root
5. Add the jar in Hive
add jar /root/hive-udf-1.0-SNAPSHOT.jar;
6. Create a temporary function (session scope)
create temporary function concat1 as "com.dxm.udf.ConcatDemo";
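A quick sanity check after registering (a sketch; concat1 is the ConcatDemo class shown in case 2 below):

show functions like "concat1";
select concat1('ab', '12', '-');   -- expected: ab-12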
7. Create a permanent function
Syntax:
create function [db_name.]function_name as class_name [using jar|file|archive 'file_uri' [, jar|file|archive 'file_uri'] ];
Example:
create function mylen as "com.dxm.udf.getLength" using jar 'hdfs://gaocun:8020/hiveudf_lib/hive-udf-1.0-SNAPSHOT.jar';
Note: for the HDFS protocol port, check fs.defaultFS in core-site.xml.
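CREATE FUNCTION expects the jar to already sit in HDFS; a sketch of uploading it there (paths match the example above):

hdfs dfs -mkdir -p /hiveudf_lib
hdfs dfs -put /root/hive-udf-1.0-SNAPSHOT.jar /hiveudf_lib/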
8. Drop a function
-- drop a permanent function
drop function [if exists] function_name;
-- drop a temporary function
drop temporary function [if exists] function_name;
3. UDF functions
1. GenericUDF source code

package org.apache.hadoop.hive.ql.udf.generic;

/**
 * A Generic User-defined function (GenericUDF) for the use with Hive.
 *
 * New user-defined GenericUDF classes need to extend this class.
 *
 * Advantages of GenericUDF over the plain UDF:
 * 1. The arguments can be complex types (array, map, struct), and the return value can be a complex type as well.
 * 2. It can accept a variable number of arguments.
 * 3. It can accept an infinite number of function signatures.
 */
public abstract class GenericUDF implements Closeable {

  /**
   * A Deferred Object allows us to do lazy-evaluation and short-circuiting.
   * GenericUDF uses DeferredObject to pass arguments.
   */
  @InterfaceAudience.Public
  @InterfaceStability.Stable
  public static interface DeferredObject {
    void prepare(int version) throws HiveException;

    Object get() throws HiveException;
  };

  /**
   * A basic dummy implementation of DeferredObject which just stores a Java
   * Object reference.
   */
  public static class DeferredJavaObject implements DeferredObject {
    private final Object value;

    public DeferredJavaObject(Object value) {
      this.value = value;
    }

    @Override
    public void prepare(int version) throws HiveException {
    }

    @Override
    public Object get() throws HiveException {
      return value;
    }
  }

  /**
   * The constructor.
   */
  public GenericUDF() {
  }

  /**
   * Initialize this GenericUDF. This will be called once and only once per
   * GenericUDF instance.
   *
   * What initialize does:
   * 1. Checks the number and data types of the UDF's arguments.
   * 2. Specifies the data type of the UDF's return value.
   *
   * @param arguments
   *          The ObjectInspector for the arguments
   * @throws UDFArgumentException
   *           Thrown when arguments have wrong types, wrong length, etc.
   * @return The ObjectInspector for the return value
   */
  public abstract ObjectInspector initialize(ObjectInspector[] arguments)
      throws UDFArgumentException;

  /**
   * Additionally setup GenericUDF with MapredContext before initializing.
   * This is only called in runtime of MapRedTask.
   *
   * @param context context
   */
  public void configure(MapredContext context) {
  }

  /**
   * Evaluate the GenericUDF with the arguments.
   *
   * @param arguments
   *          The arguments as DeferredObject; use DeferredObject.get() to get the
   *          actual argument Object. The Objects can be inspected by the
   *          ObjectInspectors passed in the initialize call.
   *          1. Each argument is wrapped in a deferred object; get() returns the real argument object.
   *          2. These objects are checked against the ObjectInspectors passed to initialize.
   * @return The evaluation result
   */
  public abstract Object evaluate(DeferredObject[] arguments) throws HiveException;

  /**
   * Get the String to be displayed in explain.
   */
  public abstract String getDisplayString(String[] children);

  /**
   * Close GenericUDF.
   * This is only called in runtime of MapRedTask.
   */
  @Override
  public void close() throws IOException {
  }
}
2. Case 1
Requirement: return the length of a given string (one argument)

package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * @author gaocun
 * @create 2022-01-30 12:28 AM
 * Function: compute the length of a given string
 * Steps:
 * 1. Extend the GenericUDF class
 */
public class getLength extends GenericUDF {

    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        // 1. Validate the number of input arguments
        if (objectInspectors.length != 1) {
            throw new UDFArgumentException("wrong number of arguments");
        }
        // 2. The return type is an int
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        // A null input counts as length 0
        if (deferredObjects[0].get() == null) {
            return 0;
        }
        return deferredObjects[0].get().toString().length();
    }

    public String getDisplayString(String[] strings) {
        return "";
    }
}
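Registering and calling it (a sketch; the jar path matches step 4 above):

add jar /root/hive-udf-1.0-SNAPSHOT.jar;
create temporary function mylen as "com.dxm.udf.getLength";
select mylen('hello');   -- 5
select mylen(null);      -- 0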
3. Case 2
Requirement: join the given strings with a given separator (multiple arguments)

package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.Arrays;

/**
 * @author gaocun
 * @create 2022-02-02 11:05 AM
 * Function:
 * 1. Joins the given strings with the given separator.
 * Example:
 * select ConcatDemo('ab','12','-') = ab-12
 */
@Description(name = "ConcatDemo",
        value = "_FUNC_(str1, str2, ... separator) - returns str1, str2, ... joined by separator",
        extended = "Returns NULL if any argument is NULL.\n"
                + "Example:\n"
                + " > SELECT _FUNC_('abc', 'def','-') FROM src LIMIT 1;\n"
                + " 'abc-def'")
public class ConcatDemo extends GenericUDF {

    public ConcatDemo() {
        System.out.println("ConcatDemo constructed");
    }

    // Initialization method
    // Called once and only once per UDF instance.
    // Responsibilities:
    // 1. Check the argument types and argument count
    // 2. Declare the UDF's return type
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        System.out.println("1-initialize called");
        System.out.println("initialize argument count: " + arguments.length);
        System.out.println("initialize argument types: " + Arrays.toString(arguments));
        // At least one string plus the separator
        if (arguments.length < 2) {
            throw new UDFArgumentException("expects at least two arguments: str1[, str2, ...], separator");
        }
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    // Evaluation method: the actual processing logic
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        System.out.println("2-evaluate called");
        // 1. Each argument is a DeferredObject; get() returns the real value
        // Return NULL if any argument is NULL, as documented in @Description
        for (DeferredObject argument : arguments) {
            if (argument.get() == null) {
                return null;
            }
        }
        // The last argument is the separator
        String delimiter = arguments[arguments.length - 1].get().toString();
        String str = "";
        for (int i = 0; i <= arguments.length - 2; i++) {
            if (i == arguments.length - 2) {
                str += arguments[i].get().toString();
            } else {
                str += arguments[i].get().toString() + delimiter;
            }
        }
        return str;
    }

    // The function info shown in explain output
    public String getDisplayString(String[] children) {
        System.out.println("3-getDisplayString called");
        return "a udf that accepts a variable number of arguments";
    }
}
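Calling it with a varying number of arguments (a sketch; the last argument is always the separator):

create temporary function concat1 as "com.dxm.udf.ConcatDemo";
select concat1('ab', '12', '-');      -- ab-12
select concat1('a', 'b', 'c', '_');   -- a_b_c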
4. UDTF functions
1. GenericUDTF source code

package org.apache.hadoop.hive.ql.udf.generic;

/**
 * A Generic User-defined Table Generating Function (UDTF)
 *
 * Generates a variable number of output rows for a single input row. Useful for
 * explode(array)...
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class GenericUDTF {
  Collector collector = null;

  /**
   * Additionally setup GenericUDTF with MapredContext before initializing.
   * This is only called in runtime of MapRedTask.
   *
   * @param context context
   */
  public void configure(MapredContext mapredContext) {
  }

  public StructObjectInspector initialize(StructObjectInspector argOIs)
      throws UDFArgumentException {
    List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
    ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()];
    for (int i = 0; i < inputFields.size(); i++) {
      udtfInputOIs[i] = inputFields.get(i).getFieldObjectInspector();
    }
    return initialize(udtfInputOIs);
  }

  /**
   * Initialize this GenericUDTF. This will be called only once per instance.
   *
   * @param argOIs
   *          An array of ObjectInspectors for the arguments
   * @return A StructObjectInspector for output. The output struct represents a
   *         row of the table where the fields of the struct are the columns. The
   *         field names are unimportant as they will be overridden by user
   *         supplied column aliases.
   *         1. The method returns a StructObjectInspector.
   *         2. The output struct carries the output columns' names and data types.
   */
  @Deprecated
  public StructObjectInspector initialize(ObjectInspector[] argOIs)
      throws UDFArgumentException {
    throw new IllegalStateException("Should not be called directly");
  }

  /**
   * Give a set of arguments for the UDTF to process.
   *
   * @param args
   *          object array of arguments
   */
  public abstract void process(Object[] args) throws HiveException;

  /**
   * Called to notify the UDTF that there are no more rows to process.
   * Clean up code or additional forward() calls can be made here.
   */
  public abstract void close() throws HiveException;

  /**
   * Associates a collector with this UDTF. Can't be specified in the
   * constructor as the UDTF may be initialized before the collector has been
   * constructed.
   *
   * @param collector
   */
  public final void setCollector(Collector collector) {
    this.collector = collector;
  }

  /**
   * Passes an output row to the collector.
   *
   * @param o
   * @throws HiveException
   */
  protected final void forward(Object o) throws HiveException {
    collector.collect(o);
  }
}
2. Case 1
Requirement: explode a comma-separated string into multiple rows (one argument)

package com.dxm.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * @author gaocun
 * @create 2022-01-31 4:45 PM
 * Requirement: explode a comma-separated string into multiple rows (one argument)
 */
public class forEach extends GenericUDTF {

    private ArrayList<String> output = new ArrayList<String>();

    // Initialization method
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        System.out.println("forEach initialized");
        // 1. Default output column name; can be overridden by an alias
        List<String> fileNames = new ArrayList<String>();
        fileNames.add("OneName");
        // 2. Output column type
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fileNames, fieldOIs);
    }

    // Process the input row
    public void process(Object[] args) throws HiveException {
        System.out.println("start iterating over the elements");
        // 1. Get the input data
        String input = args[0].toString();
        // 2. Split the string on ","
        String[] strings = input.split(",");
        // 3. Emit one output row per element
        for (String string : strings) {
            // 1. Clear the row buffer
            output.clear();
            // 2. Add the element to the row buffer
            output.add(string);
            // 3. Emit the row
            forward(output);
        }
    }

    // Release resources
    public void close() throws HiveException {
    }
}
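A call sketch (the function name myforeach is made up; register the class like the UDFs above):

create temporary function myforeach as "com.dxm.udtf.forEach";
select myforeach('a,b,c');
-- a
-- b
-- c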
3. Case 2
Requirement: explode several comma-separated strings in parallel; each input string becomes one output column (multiple arguments)

package com.dxm.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * @author gaocun
 * @create 2022-02-04 11:59 AM
 * Requirement:
 * 1. Explode several comma-separated strings in parallel; each input string becomes one output column (multiple arguments).
 */
public class MoreForeach extends GenericUDTF {

    private ArrayList<String> output = new ArrayList<String>();

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        System.out.println("1-MoreForeach initialized");
        String fileName = "Name_";
        // 1. Default output column names (one per argument); can be overridden by aliases
        List<String> fileNames = new ArrayList<String>();
        // 2. Output column types
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        for (int i = 1; i <= argOIs.getAllStructFieldRefs().size(); i++) {
            fileNames.add(fileName + i);
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        }
        return ObjectInspectorFactory.getStandardStructObjectInspector(fileNames, fieldOIs);
    }

    public void process(Object[] args) throws HiveException {
        System.out.println("2-process called");
        if (args.length == 1) {
            // Single argument: one column, one row per element
            String[] strings = args[0].toString().split(",");
            for (String string : strings) {
                output.clear();
                output.add(string);
                forward(output);
            }
        } else {
            // 1. Split every argument on "," and record the longest element count
            String[][] splits = new String[args.length][];
            int maxLen = 0;
            for (int j = 0; j < args.length; j++) {
                splits[j] = args[j].toString().split(",");
                maxLen = Math.max(maxLen, splits[j].length);
            }
            // 2. Row i holds the i-th element of each argument; shorter arguments are padded with "0"
            for (int i = 0; i < maxLen; i++) {
                output.clear();
                for (String[] split : splits) {
                    output.add(split.length > i ? split[i] : "0");
                }
                forward(output);
            }
        }
    }

    public void close() throws HiveException {
    }
}
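A call sketch (moreforeach is a made-up name; note how the shorter argument is padded with "0"):

create temporary function moreforeach as "com.dxm.udtf.MoreForeach";
select moreforeach('a,b,c', '1,2');
-- a  1
-- b  2
-- c  0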
5. UDAF functions
6. Official documentation: https://cwiki.apache.org/confluence/display/Hive/HivePlugins
7. Requirement:
Description:
Input: a start date and an end date
Returns: an array of the dates in [start date ... end date]
Example:
-- add the udf jar
add jar /root/hive-udf-1.0-SNAPSHOT.jar;
create temporary function getdays as "com.dxm.udf.getdays";
select getdays('2021-06-05','2021-06-08');
Result: ["2021-06-05","2021-06-06","2021-06-07","2021-06-08"]
-- Source code
package com.dxm.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;

/**
 * @author gaocun
 * @create 2022-02-17 6:17 PM
 */
public class getdays extends GenericUDF {

    private ArrayList<Text> result = new ArrayList<Text>();
    private SimpleDateFormat simpleFormat = new SimpleDateFormat("yyyy-MM-dd");

    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        System.out.println("initialize called");
        // The function takes exactly two arguments: start, end
        if (arguments.length != 2) {
            throw new UDFArgumentException("expects exactly two arguments: start, end");
        }
        // The return type is an array of strings
        return ObjectInspectorFactory
                .getStandardListObjectInspector(PrimitiveObjectInspectorFactory
                        .writableStringObjectInspector);
    }

    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        System.out.println("evaluate called");
        // 1. Get the start date
        String start = arguments[0].get().toString();
        // 2. Get the end date
        String end = arguments[1].get().toString();
        // 3. Clear the result list (the instance is reused across rows)
        result.clear();
        try {
            // Number of days between the two dates
            Date fromDate1 = simpleFormat.parse(start);
            Date toDate1 = simpleFormat.parse(end);
            long from1 = fromDate1.getTime();
            long to1 = toDate1.getTime();
            int days = (int) ((to1 - from1) / (1000 * 60 * 60 * 24));
            Calendar cd = Calendar.getInstance();
            for (int i = 0; i <= days; i++) {
                cd.setTime(fromDate1);
                cd.add(Calendar.DATE, i); // advance i days from the start date
                result.add(new Text(simpleFormat.format(cd.getTime())));
            }
        } catch (ParseException e) {
            System.out.println("failed to parse the dates");
            e.printStackTrace();
        }
        return result;
    }

    public String getDisplayString(String[] children) {
        return "getdays(start, end)";
    }
}