Custom Hive UDF
1、Add the dependency jars (pom.xml)

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.mengyao.dataformat</groupId>
    <artifactId>hortonworks</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>hortonworks</name>
    <url>http://maven.apache.org</url>

    <repositories>
        <!-- hortonworks -->
        <repository>
            <releases>
                <enabled>true</enabled>
                <updatePolicy>always</updatePolicy>
                <checksumPolicy>warn</checksumPolicy>
            </releases>
            <snapshots>
                <enabled>false</enabled>
                <updatePolicy>never</updatePolicy>
                <checksumPolicy>fail</checksumPolicy>
            </snapshots>
            <id>HDPReleases</id>
            <name>HDP Releases</name>
            <url>http://repo.hortonworks.com/content/repositories/releases/</url>
            <layout>default</layout>
        </repository>
        <!-- cloudera -->
        <!--
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        -->
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit.version>4.10</junit.version>
        <hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version>
        <hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version>
        <slf4j.version>1.7.10</slf4j.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.mortbay.jetty</groupId>
            <artifactId>jetty</artifactId>
            <version>6.1.26</version>
        </dependency>
        <!-- HortonWorks Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <!-- Hortonworks Hive -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hortonworks.hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hortonworks.hive.version}</version>
        </dependency>
        <!-- slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
    </dependencies>
</project>

2、Write the custom Hive UDF class

package com.mengyao.hadoop.hortonworks.hive.udf;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * Splits a Chinese address string into province/city/district/county/street/road fields.
 */
public class AddrSplitUDF extends UDF {

    public static class AddrBean implements Writable {
        private String province;
        private String city;
        private String region;
        private String county;
        private String street;
        private String road;
        private String other;
        private String make;

        @Override
        public void readFields(DataInput in) throws IOException {
            this.province = in.readUTF();
            this.city = in.readUTF();
            this.region = in.readUTF();
            this.county = in.readUTF();
            this.street = in.readUTF();
            this.road = in.readUTF();
            this.other = in.readUTF();
            this.make = in.readUTF();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(province);
            out.writeUTF(city);
            out.writeUTF(region);
            out.writeUTF(county);
            out.writeUTF(street);
            out.writeUTF(road);
            out.writeUTF(other);
            out.writeUTF(make);
        }

        public AddrBean() {
        }

        public void set(String province, String city, String region, String county,
                String street, String road, String other, String make) {
            this.province = province;
            this.city = city;
            this.region = region;
            this.county = county;
            this.street = street;
            this.road = road;
            this.other = other;
            this.make = make;
        }

        public String getProvince() { return province; }
        public void setProvince(String province) { this.province = province; }
        public String getCity() { return city; }
        public void setCity(String city) { this.city = city; }
        public String getRegion() { return region; }
        public void setRegion(String region) { this.region = region; }
        public String getCounty() { return county; }
        public void setCounty(String county) { this.county = county; }
        public String getStreet() { return street; }
        public void setStreet(String street) { this.street = street; }
        public String getRoad() { return road; }
        public void setRoad(String road) { this.road = road; }
        public String getOther() { return other; }
        public void setOther(String other) { this.other = other; }
        public String getMake() { return make; }
        public void setMake(String make) { this.make = make; }

        @Override
        public String toString() {
            return province + "\t" + city + "\t" + region + "\t" + county + "\t"
                    + street + "\t" + road + "\t" + other + "\t" + make;
        }
    }

    /** Cuts the raw address on the 省/市/区/县/街/路 suffixes and fills an AddrBean. */
    public static AddrBean splitAddr(String addrStr) {
        Pattern mpattern = Pattern.compile("(((.*省)|(.*市)|(.*区)|(.*县)|(.*街)|(.*路)).*?|.*)");
        Matcher mmatcher = mpattern.matcher(addrStr);
        String str = "";
        AddrBean addr = new AddrBean();
        while (mmatcher.find()) {
            str = mmatcher.group();
            if (str.length() > 0) {
                if (str.endsWith("省")) {
                    addr.setProvince(str);
                } else if (str.endsWith("市")) {
                    addr.setCity(str);
                } else if (str.endsWith("区")) {
                    addr.setRegion(str);
                } else if (str.endsWith("县")) {
                    addr.setCounty(str);
                } else if (str.endsWith("街")) {
                    addr.setStreet(str);
                } else if (str.endsWith("路")) {
                    addr.setRoad(str);
                } else {
                    addr.setOther(str);
                }
            }
        }
        return addr;
    }

    /** Hive calls evaluate() once per row; returns the tab-separated split result. */
    public Text evaluate(final Text addr) {
        if (null == addr) {
            return null;
        }
        AddrBean splitAddr = splitAddr(addr.toString());
        if (null == splitAddr) {
            return null;
        }
        return new Text(splitAddr.toString());
    }
}
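Since junit is already declared in the pom from step 1, the splitting logic can be sanity-checked locally before the jar is built in the next step. The test class below is a minimal sketch; the sample address and the expected field values are illustrative assumptions, not part of the original write-up.

package com.mengyao.hadoop.hortonworks.hive.udf;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import org.apache.hadoop.io.Text;
import org.junit.Test;

public class AddrSplitUDFTest {

    @Test
    public void splitsSuffixesIntoFields() {
        // Illustrative sample address (not from the original post).
        AddrSplitUDF.AddrBean addr = AddrSplitUDF.splitAddr("北京市朝阳区建国路88号");
        assertEquals("北京市", addr.getCity());
        assertEquals("朝阳区", addr.getRegion());
        assertEquals("建国路", addr.getRoad());
        assertEquals("88号", addr.getOther());
        assertNull(addr.getProvince()); // no 省 segment in this address
    }

    @Test
    public void nullInputReturnsNull() {
        // evaluate() must tolerate NULL columns coming from Hive.
        assertNull(new AddrSplitUDF().evaluate(null));
    }

    @Test
    public void evaluateReturnsTabSeparatedText() {
        Text result = new AddrSplitUDF().evaluate(new Text("北京市朝阳区建国路88号"));
        // AddrBean.toString() emits eight tab-separated fields.
        assertEquals(8, result.toString().split("\t", -1).length);
    }
}

Running mvn test before packaging catches regex regressions without a round trip to the cluster.

3、Package it as a jar file: splitAddr-0.0.1-SNAPSHOT.jar

4、Register the custom UDF in the Hive session (temporary)

4.1: Add a temporary function inside the Hive session

hive
hive> add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar;
Added [/home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar] to class path
Added resources: [/home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar]
hive> create temporary function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';
OK
Time taken: 0.444 seconds

Verify: select splitAddr(shop_addr) from rtc_nuomi limit 10;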
4.2: Load an initialization file when starting Hive

vim init_func_splitAddr
    add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar;
    create temporary function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';

hive -i init_func_splitAddr

Verify: select splitAddr(shop_addr) from rtc_nuomi limit 10;
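Because the pom also pulls in hive-jdbc, the same registration and verification can be driven from Java against HiveServer2. The sketch below is an illustration under stated assumptions: the class name SplitAddrJdbcDemo, the host hiveserver2-host, port 10000, and the empty credentials are placeholders, and the add jar path must be visible on the HiveServer2 machine.

package com.mengyao.hadoop.hortonworks.hive.udf;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class SplitAddrJdbcDemo {

    public static void main(String[] args) throws Exception {
        // HiveServer2 JDBC driver shipped with the hive-jdbc artifact.
        Class.forName("org.apache.hive.jdbc.HiveDriver");

        // Placeholder host/port/user; adjust to the actual cluster.
        try (Connection conn = DriverManager.getConnection(
                "jdbc:hive2://hiveserver2-host:10000/default", "rta", "");
             Statement stmt = conn.createStatement()) {

            // Temporary functions are session-scoped, so add jar and
            // create temporary function must run on this same connection.
            stmt.execute("add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar");
            stmt.execute("create temporary function splitAddr as "
                    + "'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF'");

            // Same verification query as in step 4.1.
            try (ResultSet rs = stmt.executeQuery(
                    "select splitAddr(shop_addr) from rtc_nuomi limit 10")) {
                while (rs.next()) {
                    System.out.println(rs.getString(1));
                }
            }
        }
    }
}

Keeping all three statements on one connection matters because the temporary function disappears as soon as that session closes.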