自定义udf函数(一进一出)

1. 添加maven依赖

一、pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor for the jar that packages the custom Hive UDF classes. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>hive_function</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Pin the source encoding: the Java sources contain non-ASCII comments,
             so the build must not depend on the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Pin the language level instead of relying on the compiler plugin default.
             NOTE(review): 1.8 is assumed to match the JVM running Hive 2.3.8 - confirm. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>2.3.8</version>
            <!-- Hive supplies this jar at runtime; it is only needed to compile the UDF. -->
            <scope>provided</scope>
            <!-- Exclude one transitive jar pulled in by this dependency.
                 Presumably it cannot be resolved from the default repositories - verify. -->
            <exclusions>
                <exclusion>
                    <groupId>org.pentaho</groupId>
                    <artifactId>pentaho-aggdesigner-algorithm</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
</project>

二、自定义一个length()函数

package udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
/**
* 要求:自定义一个length()函数
* 1. 继承GenericUDF类,
* 2. 重写两个核心方法
*/
public class MyLength extends GenericUDF {
/**
* 初始化方法
* @param arguments 是输入参数的类型检查器数组
* @return 返回类型检查器
* @throws UDFArgumentException
*/
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
// 1. 长度是否合法
if (arguments.length != 1) {
throw new UDFArgumentLengthException("there can be at most one parameter");
}
// 2. 检查参数类型是否合法
ObjectInspector inspector = arguments[0];
if (!(inspector instanceof StringObjectInspector)) {
throw new UDFArgumentTypeException(0, "paramter type is not string");
}
// 3. 返回一个函数执行完成的类型检查器 int类型的检查器
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
}
/**
* 函数的执行逻辑
* @param arguments
* @return
* @throws HiveException
*/
public Object evaluate(DeferredObject[] arguments) throws HiveException {
DeferredObject argument = arguments[0];
Object value = argument.get();
String str = value.toString();
return str.length();
}
public String getDisplayString(String[] strings) {
return null;
}
}

三、打jar包


四、创建临时函数

hive (test)> create temporary function mylength as "udf.MyLength" using jar "hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar";
Added [/tmp/ce2bebf9-ddf7-443f-badd-2d4f147960bf_resources/hive_function-1.0-SNAPSHOT.jar] to class path
Added resources: [hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar]
OK
Time taken: 0.995 seconds
hive (test)> show functions;
OK
tab_name
hive (test)> desc function extended mylength;
OK
tab_name
There is no documentation for function 'mylength'
Function class:udf.MyLength
Function type:TEMPORARY
Resource:hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar
Time taken: 0.008 seconds, Fetched: 4 row(s)
hive (test)> show create table page_ad;
OK
createtab_stmt
CREATE TABLE `page_ad`(
`page` string,
`aids` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'line.delim'='\n',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://node1:9000/user/hive/warehouse/test.db/page_ad'
TBLPROPERTIES (
'transient_lastDdlTime'='1659523482')
Time taken: 0.108 seconds, Fetched: 17 row(s)
hive (test)> select mylength(aids) from page_ad;
OK
_c0
13
16
19
Time taken: 0.38 seconds, Fetched: 3 row(s)

五、创建永久函数

hive (default)> create function mylength as "udf.MyLength" using jar "hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar";
Added [/tmp/ac2a0a66-3121-4e8e-8faf-039a09419d7c_resources/hive_function-1.0-SNAPSHOT.jar] to class path
Added resources: [hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar]
OK
Time taken: 0.396 seconds

注意

  1. 临时函数:只有当前连接会话可以使用,重建一个hive的连接会话,临时函数失效
  2. 永久函数:不管连接多少次,函数都可以使用,临时函数可以使用show functions查询,永久函数无法使用该命令查询
    但是可以在hive的元数据库的FUNCS表中找到我们自己新建的永久函数
  3. 永久函数是和数据库挂钩的,如果在别的数据库中使用这个自定义函数,必须带上数据库的名字;临时函数属于当前会话,不归属于任何数据库,使用时不需要(也不能)带数据库的名字
# 在default数据库中创建的永久函数
hive (default)> create function mylength as "udf.MyLength" using jar "hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar";
hive (default)> select mylength(name) from student_partition;
OK
_c0
Time taken: 1.421 seconds
hive (default)> use test;
OK
Time taken: 0.02 seconds
hive (test)> select mylength(aids) from page_ad;
FAILED: SemanticException [Error 10011]: Invalid function mylength
# 必须带上数据库的名字
hive (test)> select default.mylength(aids) from page_ad;
OK
_c0
13
16
19
Time taken: 0.154 seconds, Fetched: 3 row(s)
posted @   jsqup  阅读(141)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
点击右上角即可分享
微信分享提示