为Vertica数据库增加自定义函数to_base64和from_base64
Vertica提供了UDx的机制,用来扩展自定义函数,本文演示了在公司项目中的真实案例。
------------1、编写自定义函数类---------------
自定义函数的Java类,依赖VerticaSDK.jar包,位于服务器上/opt/vertica/bin/VerticaSDK.jar,下载到本地,让项目依赖这个jar包。
然后就可以实现to_base64和from_base64这两个函数了。
package tebon.vertica; import com.vertica.sdk.*; import java.util.TimeZone; /** * 自定义vertica函数,tebon_to_base64,把指定的字符串转换为base64格式。 * * @author zhanglei */ public class ToBase64Function extends ScalarFunctionFactory { public ScalarFunction createScalarFunction(ServerInterface serverInterface) { return new InternToBase64Function(); } public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) { inputTypes.addVarchar(); returnType.addVarchar(); } @Override public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){ returnType.addVarchar(argTypes.getColumnType(0).getStringLength()); } public class InternToBase64Function extends ScalarFunction { public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation { TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai")); do { String inputString = argReader.getString(0); String returnString = null; if (inputString != null) { try { returnString = encodeToBase64(inputString.getBytes("utf-8")); } catch (Exception ex) { returnString = inputString; } } resWriter.setString(returnString); resWriter.next(); } while (argReader.next()); } //Base64编码表 private final char[] BASE64CODE = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',}; private final int HEX_255 = 0x0000ff; private final int HEX_16515072 = 0xfc0000; private final int HEX_258048 = 0x3f000; private final int HEX_4032 = 0xfc0; private final int HEX_63 = 0x3f; private final int NUMBER_TWO = 2; private final int NUMBER_THREE = 3; private final int NUMBER_FOUR = 4; private final int NUMBER_SIX = 6; private final int NUMBER_EIGHT = 8; private final int NUMBER_TWELVE = 12; private final int NUMBER_SIXTEEN = 16; private final int NUMBER_EIGHTEEN = 18; private String encodeToBase64(byte[] b) { if (b == null || b.length == 0) { return ""; } // 按实际编码后长度开辟内存,加快速度 StringBuilder sb = new StringBuilder(((b.length - 1) / NUMBER_THREE) << NUMBER_TWO + NUMBER_FOUR); // 进行编码 int code = 0; for (int i = 0; i < b.length; i++) { code |= (b[i] << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT)) & (HEX_255 << (NUMBER_SIXTEEN - i % NUMBER_THREE * NUMBER_EIGHT)); if (i % NUMBER_THREE == NUMBER_TWO || i == b.length - 1) { sb.append(BASE64CODE[(code & HEX_16515072) >>> NUMBER_EIGHTEEN]); sb.append(BASE64CODE[(code & HEX_258048) >>> NUMBER_TWELVE]); sb.append(BASE64CODE[(code & HEX_4032) >>> NUMBER_SIX]); sb.append(BASE64CODE[code & HEX_63]); code = 0; } } // 对于长度非3的整数倍的字节数组,编码前先补0,编码后结尾处编码用=代替, // =的个数和短缺的长度一致,以此来标识出数据实际长度 if (b.length % NUMBER_THREE > 0) { sb.setCharAt(sb.length() - 1, '='); } if (b.length % NUMBER_THREE == 1) { sb.setCharAt(sb.length() - NUMBER_TWO, '='); } return sb.toString(); } } }
package tebon.vertica; import com.vertica.sdk.*; import java.util.TimeZone; /** * 自定义vertica函数,tebon_from_base64,把指定的base64字符串转换为普通格式。 * * @author zhanglei */ public class FromBase64Function extends ScalarFunctionFactory { public ScalarFunction createScalarFunction(ServerInterface serverInterface) { return new InternFromBase64Function(); } public void getPrototype(ServerInterface serverInterface, ColumnTypes inputTypes, ColumnTypes returnType) { inputTypes.addVarchar(); returnType.addVarchar(); } @Override public void getReturnType(ServerInterface srvInterface, final SizedColumnTypes argTypes, SizedColumnTypes returnType){ returnType.addVarchar(argTypes.getColumnType(0).getStringLength()); } public class InternFromBase64Function extends ScalarFunction { public void processBlock(ServerInterface serverInterface, BlockReader argReader, BlockWriter resWriter) throws UdfException, DestroyInvocation { TimeZone.setDefault(TimeZone.getTimeZone("Asia/Shanghai")); do { String inputString = argReader.getString(0); String returnString = null; if (inputString != null) { try { returnString = new String(decodeFromBase64(inputString), "utf-8"); } catch (Exception ex) { returnString = inputString; } } resWriter.setString(returnString); resWriter.next(); } while (argReader.next()); } //Base64解码表 private final byte[] BASE64DECODE = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 注意两个63,为兼容SMP, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, 63, -1, 63, // “/”和“-”都翻译成63。 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, 0, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // 注意两个0: 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, // “A”和“=”都翻译成0。 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,}; private final int HEX_255 = 0x0000ff; private final int HEX_16711680 = 0xff0000; private final int HEX_65280 = 0x00ff00; private final int NUMBER_TWO = 2; private final int NUMBER_THREE = 3; private final int NUMBER_FOUR = 4; private final int NUMBER_SIX = 6; private final int NUMBER_EIGHT = 8; private final int NUMBER_TWELVE = 12; private final int NUMBER_SIXTEEN = 16; private final int NUMBER_EIGHTEEN = 18; private byte[] decodeFromBase64(String code) { if (code == null || code.length() <= 0) { return null; } code = code.replace("\r", "").replace("\n", "").replace(" ", ""); int len = code.length(); if (len % NUMBER_FOUR != 0) { throw new IllegalArgumentException("Base64 string length must be 4*n"); } // 统计填充的等号个数 int pad = 0; if (code.charAt(len - 1) == '=') { pad++; } if (code.charAt(len - NUMBER_TWO) == '=') { pad++; } // 根据填充等号的个数来计算实际数据长度 int retLen = len / NUMBER_FOUR * NUMBER_THREE - pad; // 分配字节数组空间 byte[] ret = new byte[retLen]; // 查表解码 char ch1, ch2, ch3, ch4; int i; for (i = 0; i < len; i += NUMBER_FOUR) { int j = i / NUMBER_FOUR * NUMBER_THREE; ch1 = code.charAt(i); ch2 = code.charAt(i + 1); ch3 = code.charAt(i + NUMBER_TWO); ch4 = code.charAt(i + NUMBER_THREE); int tmp = (BASE64DECODE[ch1] << NUMBER_EIGHTEEN) | (BASE64DECODE[ch2] << NUMBER_TWELVE) | (BASE64DECODE[ch3] << NUMBER_SIX) | (BASE64DECODE[ch4]); ret[j] = (byte) ((tmp & HEX_16711680) >> NUMBER_SIXTEEN); if (i < len - NUMBER_FOUR) { ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT); ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255)); } else { if (j + 1 < retLen) { ret[j + 1] = (byte) ((tmp & HEX_65280) >> NUMBER_EIGHT); } if (j + NUMBER_TWO < retLen) { ret[j + NUMBER_TWO] = (byte) ((tmp & HEX_255)); } } } return ret; } } }
------------2、把函数类上传到服务器上---------------
确保java代码在本地是编译通过的,然后上传到服务器上。
建议上传到这个位置:/opt/vertica/sdk/ ,后面以此路径为准。
------------3、配置java编译器环境---------------
安装Jave-devel。 通过如下命令查看已经安装的jdk版本: rpm -qa|grep java 如果是Centos,一般情况下会列出两个,例如: java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64 java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64 通过如下命令卸载掉: rpm -e --nodeps java-1.8.0-openjdk-1.8.0.282.b08-1.el7_9.x86_64 rpm -e --nodeps java-1.8.0-openjdk-headless-1.8.0.282.b08-1.el7_9.x86_64 然后安装新的java-devel: yum -y install java-1.8.0-openjdk-devel.x86_64 测试一下: java -version javac -version
------------4、编译源代码---------------
cd /opt/vertica/sdk javac -g -cp /opt/vertica/bin/VerticaSDK.jar /opt/vertica/sdk/BuildInfo.java -d . javac -g -cp /opt/vertica/bin/VerticaSDK.jar FromBase64Function.java -d . javac -g -cp /opt/vertica/bin/VerticaSDK.jar ToBase64Function.java -d . jar cf TebonVerticaFunctions.jar ./tebon/vertica/*.class ./com/vertica/sdk/*.class
------------5、注册类库和函数---------------
用vsql登录vertica,执行如下SQL:
SELECT SET_CONFIG_PARAMETER('JavaBinaryForUDx','/usr/bin/java');
DROP LIBRARY TebonVerticaFunctions CASCADE; CREATE LIBRARY TebonVerticaFunctions AS '/opt/vertica/sdk/TebonVerticaFunctions.jar' LANGUAGE 'JAVA'; CREATE FUNCTION tebon_to_base64 AS language 'java' NAME 'tebon.vertica.ToBase64Function' LIBRARY TebonVerticaFunctions; CREATE FUNCTION tebon_from_base64 AS language 'java' NAME 'tebon.vertica.FromBase64Function' LIBRARY TebonVerticaFunctions;
------------6、测试---------------
dbadmin=> select tebon_to_base64('hello123你好'), tebon_from_base64('aGVsbG8xMjPkvaDlpb0=');
tebon_to_base64 | tebon_from_base64
----------------------+-------------------
aGVsbG8xMjPkvaDlpb0= | hello123你好
(1 row)
到这里就结束了。
如果java代码执行有问题,可以看错误日志,位于这里(记得把/home/dbadmin/bigdata换成你自己的数据位置):
/home/dbadmin/bigdata/v_bigdata_node0001_catalog/UDxLogs/UDxFencedProcessesJava.log
作者:Lave Zhang
出处:http://www.cnblogs.com/lavezhang/
本文版权归作者所有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利。