为Vertica数据库增加自定义函数to_base64和from_base64

Vertica提供了UDx的机制,用来扩展自定义函数,本文演示了在公司项目中的真实案例。

 

------------1、编写自定义函数类---------------

自定义函数的Java类,依赖VerticaSDK.jar包,位于服务器上/opt/vertica/bin/VerticaSDK.jar,下载到本地,让项目依赖这个jar包。

然后就可以实现to_base64和from_base64这两个函数了。

package tebon.vertica;

import com.vertica.sdk.*;

import java.util.TimeZone;

/**
 * 自定义vertica函数,tebon_to_base64,把指定的字符串转换为base64格式。
 *
 * @author zhanglei
 */
public class ToBase64Function extends ScalarFunctionFactory {
    public ScalarFunction createScalarFunction(ServerInterface serverInterface) {
        return new InternToBase64Function();
    }

    public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) {
        inputTypes.addVarchar();
        returnType.addVarchar();
    }
    
    @Override
    public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){
        returnType.addVarchar(argTypes.getColumnType(0).getStringLength());
    }

    public class InternToBase64Function extends ScalarFunction {
        public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation {
            TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai"));
            do {
                String inputString = argReader.getString(0);
                String returnString = null;
                if (inputString != null) {
                    try {
                        returnString = encodeToBase64(inputString.getBytes("utf-8"));
                    } catch (Exception ex) {
                        returnString = inputString;
                    }
                }
                resWriter.setString(returnString);
                resWriter.next();
            } while (argReader.next());
        }

        //Base64编码表
        private final char[] BASE64CODE
                = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
                'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',};

        private final int HEX_255 = 0x0000ff;
        private final int HEX_16515072 = 0xfc0000;
        private final int HEX_258048 = 0x3f000;
        private final int HEX_4032 = 0xfc0;
        private final int HEX_63 = 0x3f;
        private final int NUMBER_TWO = 2;
        private final int NUMBER_THREE = 3;
        private final int NUMBER_FOUR = 4;
        private final int NUMBER_SIX = 6;
        private final int NUMBER_EIGHT = 8;
        private final int NUMBER_TWELVE = 12;
        private final int NUMBER_SIXTEEN = 16;
        private final int NUMBER_EIGHTEEN = 18;

        private String encodeToBase64(byte[] b) {
            if (b == null || b.length == 0) {
                return "";
            }

            // 按实际编码后长度开辟内存,加快速度
            StringBuilder sb = new StringBuilder(((b.length - 1) / NUMBER_THREE) << NUMBER_TWO + NUMBER_FOUR);

            // 进行编码
            int code = 0;
            for (int i = 0; i < b.length; i++) {
                code |= (b[i] << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT))
                        & (HEX_255 << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT));
                if (i % NUMBER_THREE == NUMBER_TWO || i == b.length - 1) {
                    sb.append(BASE64CODE[(code & HEX_16515072) >>> NUMBER_EIGHTEEN]);
                    sb.append(BASE64CODE[(code & HEX_258048) >>> NUMBER_TWELVE]);
                    sb.append(BASE64CODE[(code & HEX_4032) >>> NUMBER_SIX]);
                    sb.append(BASE64CODE[code & HEX_63]);
                    code = 0;
                }
            }

            // 对于长度非3的整数倍的字节数组,编码前先补0,编码后结尾处编码用=代替,
            // =的个数和短缺的长度一致,以此来标识出数据实际长度
            if (b.length % NUMBER_THREE > 0) {
                sb.setCharAt(sb.length() - 1, '=');
            }
            if (b.length % NUMBER_THREE == 1) {
                sb.setCharAt(sb.length() - NUMBER_TWO, '=');
            }
            return sb.toString();
        }
    }
}
package tebon.vertica;

import com.vertica.sdk.*;

import java.util.TimeZone;

/**
 * 自定义vertica函数,tebon_from_base64,把指定的base64字符串转换为普通格式。
 *
 * @author zhanglei
 */
public class FromBase64Function extends ScalarFunctionFactory {
    public ScalarFunction createScalarFunction(ServerInterface serverInterface) {
        return new InternFromBase64Function();
    }

    public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) {
        inputTypes.addVarchar();
        returnType.addVarchar();
    }

    @Override
    public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){
        returnType.addVarchar(argTypes.getColumnType(0).getStringLength());
    }

    public class InternFromBase64Function extends ScalarFunction {
        public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation {
            TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai"));
            do {
                String inputString = argReader.getString(0);
                String returnString = null;
                if (inputString != null) {
                    try {
                        returnString = new String(decodeFromBase64(inputString), "utf-8");
                    } catch (Exception ex) {
                        returnString = inputString;
                    }
                }
                resWriter.setString(returnString);
                resWriter.next();
            } while (argReader.next());
        }

        //Base64解码表
        private final byte[] BASE64DECODE
                = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1,
                -1,
                -1, // 注意两个63,为兼容SMP,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, 63,
                -1,
                63, // “/”和“-”都翻译成63。
                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, 0, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                12, 13,
                14, // 注意两个0:
                15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1,
                -1, // “A”和“=”都翻译成0。
                -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
                -1, -1, -1, -1, -1,};

        private final int HEX_255 = 0x0000ff;
        private final int HEX_16711680 = 0xff0000;
        private final int HEX_65280 = 0x00ff00;
        private final int NUMBER_TWO = 2;
        private final int NUMBER_THREE = 3;
        private final int NUMBER_FOUR = 4;
        private final int NUMBER_SIX = 6;
        private final int NUMBER_EIGHT = 8;
        private final int NUMBER_TWELVE = 12;
        private final int NUMBER_SIXTEEN = 16;
        private final int NUMBER_EIGHTEEN = 18;

        private byte[] decodeFromBase64(String code) {
            if (code == null || code.length() <= 0) {
                return null;
            }

            code = code.replace("\r", "").replace("\n", "").replace(" ", "");

            int len = code.length();
            if (len % NUMBER_FOUR != 0) {
                throw new IllegalArgumentException("Base64 string length must be 4*n");
            }

            // 统计填充的等号个数
            int pad = 0;
            if (code.charAt(len - 1) == '=') {
                pad++;
            }
            if (code.charAt(len - NUMBER_TWO) == '=') {
                pad++;
            }

            // 根据填充等号的个数来计算实际数据长度
            int retLen = len / NUMBER_FOUR * NUMBER_THREE - pad;

            // 分配字节数组空间
            byte[] ret = new byte[retLen];

            // 查表解码
            char ch1, ch2, ch3, ch4;
            int i;
            for (i = 0; i < len; i += NUMBER_FOUR) {
                int j = i / NUMBER_FOUR * NUMBER_THREE;
                ch1 = code.charAt(i);
                ch2 = code.charAt(i + 1);
                ch3 = code.charAt(i + NUMBER_TWO);
                ch4 = code.charAt(i + NUMBER_THREE);
                int tmp = (BASE64DECODE[ch1] << NUMBER_EIGHTEEN) | (BASE64DECODE[ch2] << NUMBER_TWELVE)
                        | (BASE64DECODE[ch3] << NUMBER_SIX) | (BASE64DECODE[ch4]);
                ret[j] = (byte) ((tmp & HEX_16711680) >> NUMBER_SIXTEEN);
                if (i < len - NUMBER_FOUR) {
                    ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT);
                    ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255));
                } else {
                    if (j + 1 < retLen) {
                        ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT);
                    }
                    if (j + NUMBER_TWO < retLen) {
                        ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255));
                    }
                }
            }
            return ret;
        }
    }
}

 

------------2、把函数类上传到服务器上---------------

确保java代码在本地是编译通过的,然后上传到服务器上。

建议上传到这个位置:/opt/vertica/sdk/   ,后面以此路径为准。

 

------------3、配置java编译器环境---------------

安装Jave-devel。

通过如下命令查看已经安装的jdk版本:
rpm -qa|grep java

如果是Centos,一般情况下会列出两个,例如:
java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64
java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64

通过如下命令卸载掉:
rpm -e --nodeps java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64
rpm -e --nodeps java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64

然后安装新的java-devel:
yum -y install java-1.8.0-openjdk-devel.x86_64

测试一下:
java -version
javac -version

 

------------4、编译源代码---------------

cd /opt/vertica/sdk

javac -g -cp /opt/vertica/bin/VerticaSDK.jar /opt/vertica/sdk/BuildInfo.java -d .

javac -g -cp /opt/vertica/bin/VerticaSDK.jar FromBase64Function.java -d .

javac -g -cp /opt/vertica/bin/VerticaSDK.jar ToBase64Function.java -d .

jar cf TebonVerticaFunctions.jar ./tebon/vertica/*.class ./com/vertica/sdk/*.class

 

------------5、注册类库和函数---------------

用vsql登录vertica,执行如下SQL:

SELECT SET_CONFIG_PARAMETER('JavaBinaryForUDx','/usr/bin/java');

DROP
LIBRARY TebonVerticaFunctions CASCADE; CREATE LIBRARY TebonVerticaFunctions AS '/opt/vertica/sdk/TebonVerticaFunctions.jar' LANGUAGE 'JAVA'; CREATE FUNCTION tebon_to_base64 AS language 'java' NAME 'tebon.vertica.ToBase64Function' LIBRARY TebonVerticaFunctions; CREATE FUNCTION tebon_from_base64 AS language 'java' NAME 'tebon.vertica.FromBase64Function' LIBRARY TebonVerticaFunctions;

 

 

------------6、测试---------------

dbadmin=> select tebon_to_base64('hello123你好'), tebon_from_base64('aGVsbG8xMjPkvaDlpb0=');
tebon_to_base64 | tebon_from_base64
----------------------+-------------------
aGVsbG8xMjPkvaDlpb0= | hello123你好
(1 row)

 

到这里就结束了。

如果java代码执行有问题,可以看错误日志,位于这里(记得把/home/dbadmin/bigdata换成你自己的数据位置):

/home/dbadmin/bigdata/v_bigdata_node0001_catalog/UDxLogs/UDxFencedProcessesJava.log

 

posted @ 2021-04-08 19:48  卡卡西村长  阅读(690)  评论(0编辑  收藏  举报