开发一个dremio user_agent 解析函数

参考apache drill 实现一个user_agent 解析的函数

项目结构

  • maven 项目结构
 
├── pom.xml
├── src
│   ├── main
│   │   ├── java
│   │   │   ├── com
│   │   │   │   └── dalong
│   │   │   │       └── udf
│   │   │   │           ├── MyFunc.java
│   │   │   │           ├── UAAPP.java
│   │   │   │           └── UserAgentAnalyzerProvider.java
│   │   │   └── helper
│   │   └── resources
│   │       └── sabot-module.conf
 
  • 代码说明
    sabot-module.conf 老样子配置包扫描
 
dremio.classpath.scanning.packages += com.dalong.udf
  • pom.xml
    主要是user agent 解析依赖包的添加,以及maven-shade-plugin 插件的配置
 
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
 
    <groupId>com.dalong</groupId>
    <artifactId>dremio-func</artifactId>
    <version>2.0-SNAPSHOT</version>
 
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <version.dremio>13.0.0-202101272034330307-20fb9275</version.dremio>
    </properties>
 
    <dependencies>
        <dependency>
            <groupId>com.dremio.sabot</groupId>
            <artifactId>dremio-sabot-kernel</artifactId>
            <version>${version.dremio}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.dremio.sabot</groupId>
            <artifactId>dremio-sabot-kernel</artifactId>
            <version>${version.dremio}</version>
            <classifier>tests</classifier>
            <scope>test</scope>
        </dependency>
        <dependency>
            <!-- Mockito needs to be on the class path after JUnit (or Hamcrest)
              as long as Mockito _contains_ older Hamcrest classes. -->
            <groupId>org.mockito</groupId>
            <artifactId>mockito-core</artifactId>
            <scope>test</scope>
            <version>1.10.19</version>
        </dependency>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-all</artifactId>
            <scope>test</scope>
            <version>1.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>com.dremio</groupId>
            <artifactId>dremio-common</artifactId>
            <classifier>tests</classifier>
            <version>${version.dremio}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>net.sf.uadetector</groupId>
            <artifactId>uadetector-resources</artifactId>
            <version>2014.04</version>
        </dependency>
        <dependency>
            <groupId>nl.basjes.parse.useragent</groupId>
            <artifactId>yauaa</artifactId>
            <version>5.9</version>
        </dependency>
    </dependencies>
 
    <!-- Dremio artifacts are not on Maven Central; they are resolved from Dremio's own repositories.
         HTTPS is used because Maven 3.8.1+ blocks plain-HTTP repositories by default. -->
    <repositories>
        <repository>
            <id>dremio-free</id>
            <url>https://maven.dremio.com/free/</url>
        </repository>
        <repository>
            <id>dremio-public</id>
            <url>https://maven.dremio.com/public/</url>
        </repository>
    </repositories>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <artifactSet>
                                <includes>
                                    <include>nl.basjes.parse.useragent:yauaa</include>
                                    <include>nl.basjes.collections:prefixmap</include>
                                    <include>org.apache.commons:commons-text</include>
                                    <include>org.apache.commons:commons-collections4</include>
                                </includes>
                            </artifactSet>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>3.2.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <phase>package</phase>
                        <goals>
                            <goal>jar-no-fork</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
  • 核心代码
    UAAPP.java
 
package com.dalong.udf;
 
import com.dremio.common.expression.CompleteType;
import com.dremio.common.expression.LogicalExpression;
import com.dremio.exec.expr.SimpleFunction;
import com.dremio.exec.expr.annotations.FunctionTemplate;
import com.dremio.exec.expr.annotations.Output;
import com.dremio.exec.expr.annotations.Param;
import com.dremio.exec.expr.annotations.Workspace;
import com.dremio.exec.expr.fn.OutputDerivation;
import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter;
import org.apache.arrow.vector.holders.VarCharHolder;
import org.apache.arrow.vector.types.pojo.ArrowType;
import javax.inject.Inject;
import java.util.List;
 
/**
 * Holder for the Dremio {@code parse_user_agent} function and the output-type
 * derivation it requires.
 */
public class UAAPP {
    /**
     * Parses a user-agent string and writes a fixed set of its fields into a struct.
     *
     * <p>The {@code derivation} attribute is essential: a function with a
     * {@link ComplexWriter} output cannot use the default derivation.
     *
     * <p>Types referenced inside {@link #setup()} and {@link #eval()} are written with
     * their fully qualified names, as in the original code.
     */
    @FunctionTemplate(names = {"parse_user_agent"}, isDeterministic = false, derivation = UAGenOutput.class)
    public static class UA implements SimpleFunction {
        /** The raw user-agent string to parse. */
        @Param
        VarCharHolder input;
        /** Complex output writer; required because the result is a struct, not a scalar. */
        @Output
        ComplexWriter outWriter;
        /** Scratch buffer used to stage each field value before it is written out. */
        @Inject
        ArrowBuf outBuffer;
        /** Analyzer obtained once in {@link #setup()} and reused for every row. */
        @Workspace
        nl.basjes.parse.useragent.UserAgentAnalyzer uaa;
        /** Names of the user-agent fields emitted, in output order. */
        @Workspace
        List<String> allFileds;

        /** Initializes the objects reused across {@link #eval()} calls. */
        public void setup() {
            // NOTE(review): presumably returns a shared analyzer instance — confirm in UserAgentAnalyzerProvider.
            uaa = com.dalong.udf.UserAgentAnalyzerProvider.getInstance();
            allFileds= java.util.Arrays.asList("DeviceClass","DeviceName","DeviceBrand","DeviceCpu","OperatingSystemClass","OperatingSystemName","OperatingSystemVersion","OperatingSystemNameVersion","LayoutEngineClass","LayoutEngineName","LayoutEngineVersion","LayoutEngineVersionMajor","LayoutEngineNameVersion","LayoutEngineNameVersionMajor","AgentClass","AgentName","AgentVersion","AgentVersionMajor","AgentNameVersion","AgentNameVersionMajor");
        }

        /**
         * Parses the current input value and writes one VARCHAR entry per name in
         * {@code allFileds} into the output struct. An unset input yields an empty struct.
         */
        public void eval() {
            org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter queryMapWriter = outWriter.rootAsStruct();
            if (input.isSet == 0) {
                // Return empty map
                queryMapWriter.start();
                queryMapWriter.end();
                return;
            }
            // Decode exactly this value's byte range: the holder's start offset is not
            // guaranteed to be 0, so reading from 0 would prepend unrelated bytes.
            String userAgentString = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(input.start, input.end, input.buffer);
            nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
            queryMapWriter.start();
            for (String fieldName : allFileds){
                org.apache.arrow.vector.holders.VarCharHolder rowHolder = new org.apache.arrow.vector.holders.VarCharHolder();
                String field = agent.getValue(fieldName);
                if (field == null) {
                    // Defensive: avoid a NullPointerException if the analyzer has no value for this field.
                    field = "Unknown";
                }
                // Encode explicitly as UTF-8 rather than relying on the platform default charset.
                byte[] rowStringBytes = field.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                // reallocIfNeeded may hand back a different, larger buffer; keep the returned
                // reference, otherwise the write below can overrun the old buffer.
                outBuffer = outBuffer.reallocIfNeeded(rowStringBytes.length);
                outBuffer.setBytes(0, rowStringBytes);
                rowHolder.start = 0;
                rowHolder.end = rowStringBytes.length;
                rowHolder.buffer = outBuffer;
                queryMapWriter.varChar(fieldName).write(rowHolder);
            }
            queryMapWriter.end();
        }
    }

    /**
     * Declares the output type of {@code parse_user_agent}: a struct with one VARCHAR
     * child per emitted field. The field list is currently fixed and must stay in sync
     * with the names written by {@link UA#eval()}; a custom data type could be defined
     * here instead.
     */
    public static class UAGenOutput implements OutputDerivation {
        /**
         * @param baseReturn the declared base return type (not used)
         * @param args the call arguments (not used)
         * @return a struct type listing every user-agent field as VARCHAR
         */
        public CompleteType getOutputType(CompleteType baseReturn, List<LogicalExpression> args) {
            return new CompleteType(
                    ArrowType.Struct.INSTANCE,
                    CompleteType.VARCHAR.toField("DeviceClass"),
                    CompleteType.VARCHAR.toField("DeviceName"),
                    CompleteType.VARCHAR.toField("DeviceBrand"),
                    CompleteType.VARCHAR.toField("DeviceCpu"),
                    CompleteType.VARCHAR.toField("OperatingSystemClass"),
                    CompleteType.VARCHAR.toField("OperatingSystemName"),
                    CompleteType.VARCHAR.toField("OperatingSystemVersion"),
                    CompleteType.VARCHAR.toField("OperatingSystemNameVersion"),
                    CompleteType.VARCHAR.toField("LayoutEngineClass"),
                    CompleteType.VARCHAR.toField("LayoutEngineName"),
                    CompleteType.VARCHAR.toField("LayoutEngineVersion"),
                    CompleteType.VARCHAR.toField("LayoutEngineVersionMajor"),
                    CompleteType.VARCHAR.toField("LayoutEngineNameVersion"),
                    CompleteType.VARCHAR.toField("LayoutEngineNameVersionMajor"),
                    CompleteType.VARCHAR.toField("AgentClass"),
                    CompleteType.VARCHAR.toField("AgentName"),
                    CompleteType.VARCHAR.toField("AgentVersion"),
                    CompleteType.VARCHAR.toField("AgentVersionMajor"),
                    CompleteType.VARCHAR.toField("AgentNameVersion"),
                    CompleteType.VARCHAR.toField("AgentNameVersionMajor"));
        }
    }
}

使用

  • 编译
mvn clean  package -DskipTests
  • copy jars
    需要同时拷贝编译后的 jar 和对应的 sources jar(dremio 与 drill 一样,基于 java 代码生成来执行函数,并依赖 janino 进行编译,因此运行时需要函数的源码)
  • 制作一个docker 镜像
 
# Start from the Dremio OSS 13.0 image.
FROM dremio/dremio-oss:13.0
# Install the UDF jar together with its sources jar into Dremio's jars directory.
COPY dremio-func-2.0-SNAPSHOT.jar /opt/dremio/jars/
COPY dremio-func-2.0-SNAPSHOT-sources.jar /opt/dremio/jars/
# Copy the Arthas tool from its published image into /opt/arthas.
COPY --from=hengyunabc/arthas:latest /opt/arthas /opt/arthas
  • 效果

导入一些数据

 

 

 
select parse_user_agent(ua),myinfo from mypg.public.ua2
 

 

 

几个问题

  • 默认derivation
    提示信息
 
com.google.inject.CreationException: Unable to create injector, see the following errors:
1) Error in custom provider, java.lang.AssertionError: Function [com.dalong.udf.UA] has a ComplexWriter output but it's using the Default derivation   

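解决方法:输出为 ComplexWriter 时不能使用默认的 derivation,需要自己实现 OutputDerivation,并在 @FunctionTemplate 的 derivation 属性中指定;返回的类型必须列出完整的字段,同时因为我们输出的是 struct,类型需要明确指定为 ArrowType.Struct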

  • 类全名称
    这一点与 drill 是一样的:setup 与 eval 方法体内引用的类需要写全限定类名(如上面代码中的 nl.basjes.parse.useragent.UserAgent);同时对于需要在 setup 与 eval 之间共享的对象,使用 @Workspace 注解

参考资料

https://github.com/rongfengliang/dremio-user-agent-parse-func
https://github.com/dremio/dremio-oss/blob/master/sabot/kernel/src/main/java/com/dremio/exec/expr/fn/impl/Mappify.java

posted on 2021-02-14 11:35  荣锋亮  阅读(294)  评论(0编辑  收藏  举报

导航