开发一个dremio user_agent 解析函数
参考apache drill 实现一个user_agent 解析的函数
项目结构
- maven 项目结构
├── pom.xml
├── src
│ ├── main
│ │ ├── java
│ │ │ ├── com
│ │ │ │ └── dalong
│ │ │ │ └── udf
│ │ │ │ ├── MyFunc.java
│ │ │ │ ├── UAAPP.java
│ │ │ │ └── UserAgentAnalyzerProvider.java
│ │ │ └── helper
│ │ └── resources
│ │ └── sabot-module.conf
- 代码说明
sabot-module.conf 老样子配置包扫描
dremio.classpath.scanning.packages += com.dalong.udf
- pom.xml
主要是user agent 解析依赖包的添加,以及maven-shade-plugin 插件的配置
<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor for the Dremio user-agent parsing UDF jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.dalong</groupId>
    <artifactId>dremio-func</artifactId>
    <version>2.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <!-- Single place to pin the Dremio version used by every Dremio artifact below. -->
        <version.dremio>13.0.0-202101272034330307-20fb9275</version.dremio>
    </properties>
    <dependencies>
        <!-- Dremio runtime API: provided scope, so it is not bundled into the UDF jar. -->
        <dependency>
            <groupId>com.dremio.sabot</groupId>
            <artifactId>dremio-sabot-kernel</artifactId>
            <version>${version.dremio}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.dremio.sabot</groupId>
            <artifactId>dremio-sabot-kernel</artifactId>
            <version>${version.dremio}</version>
            <classifier>tests</classifier>
            <scope>test</scope>
        </dependency>
        <dependency>
            <!-- Mockito needs to be on the class path after JUnit (or Hamcrest)
                 as long as Mockito _contains_ older Hamcrest classes. -->
            <groupId>org.mockito</groupId>
            <artifactId>mockito-core</artifactId>
            <scope>test</scope>
            <version>1.10.19</version>
        </dependency>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-all</artifactId>
            <scope>test</scope>
            <version>1.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>com.dremio</groupId>
            <artifactId>dremio-common</artifactId>
            <classifier>tests</classifier>
            <version>${version.dremio}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>net.sf.uadetector</groupId>
            <artifactId>uadetector-resources</artifactId>
            <version>2014.04</version>
        </dependency>
        <!-- User-agent analyzer used by the parse_user_agent function. -->
        <dependency>
            <groupId>nl.basjes.parse.useragent</groupId>
            <artifactId>yauaa</artifactId>
            <version>5.9</version>
        </dependency>
    </dependencies>
    <repositories>
        <!-- HTTPS is required: Maven 3.8.1+ blocks plain-HTTP repositories by default,
             and HTTP downloads can be tampered with in transit. -->
        <repository>
            <id>dremio-free</id>
            <url>https://maven.dremio.com/free/</url>
        </repository>
        <repository>
            <id>dremio-public</id>
            <url>https://maven.dremio.com/public/</url>
        </repository>
    </repositories>
    <build>
        <plugins>
            <!-- Bundle only the listed artifacts into the UDF jar at package time. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <artifactSet>
                                <includes>
                                    <include>nl.basjes.parse.useragent:yauaa</include>
                                    <include>nl.basjes.collections:prefixmap</include>
                                    <include>org.apache.commons:commons-text</include>
                                    <include>org.apache.commons:commons-collections4</include>
                                </includes>
                            </artifactSet>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!-- Also produce the -sources jar, which is deployed next to the main jar. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>3.2.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <phase>package</phase>
                        <goals>
                            <goal>jar-no-fork</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
- 核心代码
UAAPP.java
package com.dalong.udf;
import com.dremio.common.expression.CompleteType;
import com.dremio.common.expression.LogicalExpression;
import com.dremio.exec.expr.SimpleFunction;
import com.dremio.exec.expr.annotations.FunctionTemplate;
import com.dremio.exec.expr.annotations.Output;
import com.dremio.exec.expr.annotations.Param;
import com.dremio.exec.expr.annotations.Workspace;
import com.dremio.exec.expr.fn.OutputDerivation;
import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter;
import org.apache.arrow.vector.holders.VarCharHolder;
import org.apache.arrow.vector.types.pojo.ArrowType;
import javax.inject.Inject;
import java.util.List;
/**
 * Container for the {@code parse_user_agent} Dremio function and the output
 * derivation that describes the struct it returns.
 */
public class UAAPP {

    /**
     * {@code parse_user_agent(varchar)}: parses a user-agent string and writes
     * one VARCHAR entry per analyzer field into a struct.
     *
     * <p>A custom {@code derivation} is required: a function with a
     * {@code ComplexWriter} output is rejected at startup when it uses the
     * default derivation.
     *
     * <p>Classes referenced inside {@link #setup()} and {@link #eval()} are
     * written with fully-qualified names on purpose; keep them that way.
     */
    @FunctionTemplate(names = {"parse_user_agent"}, isDeterministic = false, derivation = UAGenOutput.class)
    public static class UA implements SimpleFunction {

        /** The raw user-agent string to parse. */
        @Param
        VarCharHolder input;

        /** Complex (struct) output, hence a ComplexWriter instead of a scalar holder. */
        @Output
        ComplexWriter outWriter;

        /** Scratch buffer used to stage each field value before it is written out. */
        @Inject
        ArrowBuf outBuffer;

        /** Shared analyzer instance, obtained once in {@link #setup()}. */
        @Workspace
        nl.basjes.parse.useragent.UserAgentAnalyzer uaa;

        /** Names of the analyzer fields emitted into the output struct. */
        @Workspace
        List<String> allFileds;

        /** Initializes the shared workspace objects. */
        public void setup() {
            uaa = com.dalong.udf.UserAgentAnalyzerProvider.getInstance();
            allFileds = java.util.Arrays.asList(
                    "DeviceClass", "DeviceName", "DeviceBrand", "DeviceCpu",
                    "OperatingSystemClass", "OperatingSystemName", "OperatingSystemVersion", "OperatingSystemNameVersion",
                    "LayoutEngineClass", "LayoutEngineName", "LayoutEngineVersion", "LayoutEngineVersionMajor",
                    "LayoutEngineNameVersion", "LayoutEngineNameVersionMajor",
                    "AgentClass", "AgentName", "AgentVersion", "AgentVersionMajor",
                    "AgentNameVersion", "AgentNameVersionMajor");
        }

        /** Parses the current input value and writes the resulting struct. */
        public void eval() {
            org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter queryMapWriter = outWriter.rootAsStruct();
            if (input.isSet == 0) {
                // Return empty map
                queryMapWriter.start();
                queryMapWriter.end();
                return;
            }
            // Decode from input.start, not 0: the value does not necessarily begin
            // at the start of the backing buffer.
            String userAgentString = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(input.start, input.end, input.buffer);
            nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
            queryMapWriter.start();
            for (String fieldName : allFileds) {
                org.apache.arrow.vector.holders.VarCharHolder rowHolder = new org.apache.arrow.vector.holders.VarCharHolder();
                String field = agent.getValue(fieldName);
                // Encode explicitly as UTF-8 rather than the platform default charset.
                byte[] rowStringBytes = field.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                // reallocIfNeeded may hand back a different, larger buffer; keep the
                // returned reference so the write below targets the resized buffer.
                outBuffer = outBuffer.reallocIfNeeded(rowStringBytes.length);
                outBuffer.setBytes(0, rowStringBytes);
                rowHolder.start = 0;
                rowHolder.end = rowStringBytes.length;
                rowHolder.buffer = outBuffer;
                queryMapWriter.varChar(fieldName).write(rowHolder);
            }
            queryMapWriter.end();
        }
    }

    /**
     * Declares the output type of {@code parse_user_agent}: a struct with a
     * fixed set of VARCHAR fields. The field list is hard-coded for now and
     * must stay in sync with the names written by {@link UA#eval()}.
     */
    public static class UAGenOutput implements OutputDerivation {

        /**
         * @param baseReturn the base return type supplied by the caller (unused)
         * @param args       the call's argument expressions (unused)
         * @return a struct type with one VARCHAR child per user-agent field
         */
        public CompleteType getOutputType(CompleteType baseReturn, List<LogicalExpression> args) {
            return new CompleteType(
                    ArrowType.Struct.INSTANCE,
                    CompleteType.VARCHAR.toField("DeviceClass"),
                    CompleteType.VARCHAR.toField("DeviceName"),
                    CompleteType.VARCHAR.toField("DeviceBrand"),
                    CompleteType.VARCHAR.toField("DeviceCpu"),
                    CompleteType.VARCHAR.toField("OperatingSystemClass"),
                    CompleteType.VARCHAR.toField("OperatingSystemName"),
                    CompleteType.VARCHAR.toField("OperatingSystemVersion"),
                    CompleteType.VARCHAR.toField("OperatingSystemNameVersion"),
                    CompleteType.VARCHAR.toField("LayoutEngineClass"),
                    CompleteType.VARCHAR.toField("LayoutEngineName"),
                    CompleteType.VARCHAR.toField("LayoutEngineVersion"),
                    CompleteType.VARCHAR.toField("LayoutEngineVersionMajor"),
                    CompleteType.VARCHAR.toField("LayoutEngineNameVersion"),
                    CompleteType.VARCHAR.toField("LayoutEngineNameVersionMajor"),
                    CompleteType.VARCHAR.toField("AgentClass"),
                    CompleteType.VARCHAR.toField("AgentName"),
                    CompleteType.VARCHAR.toField("AgentVersion"),
                    CompleteType.VARCHAR.toField("AgentVersionMajor"),
                    CompleteType.VARCHAR.toField("AgentNameVersion"),
                    CompleteType.VARCHAR.toField("AgentNameVersionMajor"));
        }
    }
}
使用
- 编译
mvn clean package -DskipTests
- copy jars
需要同时拷贝编译后的 jar 和源码 jar(dremio 与 drill 一样,基于 java 代码生成来执行函数,依赖 janino 在运行时编译 UDF 的源码,因此源码 jar 必不可少)
- 制作一个docker 镜像
# Base image: Dremio OSS, tag 13.0.
FROM dremio/dremio-oss:13.0
# Ship both the compiled UDF jar and its sources jar into Dremio's jars directory.
COPY dremio-func-2.0-SNAPSHOT.jar /opt/dremio/jars/
COPY dremio-func-2.0-SNAPSHOT-sources.jar /opt/dremio/jars/
# Pull the Arthas tooling out of its published image.
# NOTE(review): presumably only for runtime diagnostics; the UDF itself does not appear to need it — confirm.
COPY --from=hengyunabc/arthas:latest /opt/arthas /opt/arthas
- 效果
导入一些数据
select parse_user_agent(ua),myinfo from mypg.public.ua2
几个问题
- 默认derivation
提示信息
com.google.inject.CreationException: Unable to create injector, see the following errors:
1) Error in custom provider, java.lang.AssertionError: Function [com.dalong.udf.UA] has a ComplexWriter output but it's using the Default derivation
解决方法,需要实现自己的OutputDerivation,注意字段需要完整(同时我们使用的是struct,需要指明)
- 类全名称
这个与drill 是一样的:setup/eval 方法体内引用的类必须写全限定类名(函数源码会被拷贝到生成的代码中编译,文件顶部的 import 不会一起带过去),同时对于需要共享的对象使用@Workspace 注解
参考资料
https://github.com/rongfengliang/dremio-user-agent-parse-func
https://github.com/dremio/dremio-oss/blob/master/sabot/kernel/src/main/java/com/dremio/exec/expr/fn/impl/Mappify.java