Hive自定义UDTF
需求: 原始数据为 F8 楼梯 MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723))) 我自定义的UDTF需要把POI数据使用0、1;1、2、2、3;3、4...规则多行输出,如下: F8 楼梯 27.7363472618163 -76.4068597634723 27.7363472618163 -80.0131241459239 F8 楼梯 27.7363472618163 -80.0131241459239 19.72894019261 -80.0118552827063 F8 楼梯 19.72894019261 -80.0118552827063 19.502938400954 -76.4055550890202 F8 楼梯 19.502938400954 -76.4055550890202 27.7363472618163 -76.4068597634723 如下使用自定义UDTF实现 1、添加依赖jar <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.mengyao.dataformat</groupId> <artifactId>hortonworks</artifactId> <version>0.0.1-SNAPSHOT</version> <packaging>jar</packaging> <name>hortonworks</name> <url>http://maven.apache.org</url> <repositories> <!-- hortonworks --> <repository> <releases> <enabled>true</enabled> <updatePolicy>always</updatePolicy> <checksumPolicy>warn</checksumPolicy> </releases> <snapshots> <enabled>false</enabled> <updatePolicy>never</updatePolicy> <checksumPolicy>fail</checksumPolicy> </snapshots> <id>HDPReleases</id> <name>HDP Releases</name> <url>http://repo.hortonworks.com/content/repositories/releases/</url> <layout>default</layout> </repository> <!-- cloudera --> <!-- <repository> <id>cloudera</id> <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> </repository> --> </repositories> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <junit.version>4.10</junit.version> <hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version> <hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version> <slf4j.version>1.7.10</slf4j.version> </properties> <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>${junit.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>jdk.tools</groupId> <artifactId>jdk.tools</artifactId> <version>1.7</version> <scope>system</scope> <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath> </dependency> <dependency> <groupId>org.mortbay.jetty</groupId> <artifactId>jetty</artifactId> <version>6.1.26</version> </dependency> <!-- HortonWorks Hadoop --> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> <version>${hortonworks.hadoop.version}</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-hdfs</artifactId> <version>${hortonworks.hadoop.version}</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> <version>${hortonworks.hadoop.version}</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-jobclient</artifactId> <version>${hortonworks.hadoop.version}</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-common</artifactId> <version>${hortonworks.hadoop.version}</version> </dependency> <!-- Hortonworks Hive --> <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-jdbc</artifactId> <version>${hortonworks.hive.version}</version> </dependency> <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-exec</artifactId> <version>${hortonworks.hive.version}</version> </dependency> <!-- slf4j --> <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-api</artifactId> <version>${slf4j.version}</version> </dependency> <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> <version>${slf4j.version}</version> </dependency> </dependencies> </project> 2、自定义UDTF package com.mengyao.hadoop.hortonworks.hive.udf; import java.util.ArrayList; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; public class Udf1 extends GenericUDTF { @Override public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { if (args.length!=3) { throw new UDFArgumentLengthException("参数小于3个字段"); } if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { throw new UDFArgumentException("参数不是基本类型"); } ArrayList<String> fieldNames = new ArrayList<String>(); ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(); fieldNames.add("floor"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); fieldNames.add("poiName"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); fieldNames.add("x1"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); fieldNames.add("y1"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); fieldNames.add("x2"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); fieldNames.add("y2"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs); } @Override public void close() throws HiveException { } @Override public void process(Object[] args) throws HiveException { final String floor = args[0].toString(); final String poiNameCn = args[1].toString(); final String geom = args[2].toString(); String input = geom.replace("MULTIPOLYGON(((", "").replace(")))", ""); //5 String[] fields = input.split(","); for(int i=0; i<fields.length; i++) { try { String[] arr = new String[6]; arr[0]=floor; arr[1]=poiNameCn; String[] tmp1 = fields[i].split(" "); arr[2]=tmp1[0]; arr[3]=tmp1[1]; if (i+1<fields.length) { String[] tmp2 = fields[i+1].split(" "); arr[4]=tmp2[0]; arr[5]=tmp2[1]; } forward(arr); } catch (Exception e) { continue; } } } } 3、原始数据 hive> select floor_id,poi_name_cn,geom from tmp_source_poi limit 10; OK F8 楼梯 MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723))) F8 避风塘 MULTIPOLYGON(((142.38873291015625 -89.830322265625,142.38848876953125 -92.31036376953125,142.3878173828125 -95.64105224609375,151.43511962890625 -95.62530517578125,151.4322509765625 -114.43475341796875,145.563720703125 -114.43389892578125,140.51824951171875 -114.43310546875,140.51849365234375 -111.59051513671875,140.52069091796875 -98.031005859375,125.305908203125 -98.02862548828125,125.3070068359375 -92.30804443359375,128.85321044921875 -92.30859375,138.40338134765625 -92.309814453125,138.40386962890625 -89.82965087890625,142.38873291015625 -89.830322265625))) F8 咖啡时间 MULTIPOLYGON(((73.9320653229952 -73.1165769445031,73.9267330393195 -99.5226487368938,60.0283501408994 -99.5200368016607,42.2555900737643 -99.5584639401271,42.2556011453271 -95.2023675085028,39.3781932592392 -95.2019115369498,38.6626852564514 -95.2017981567676,38.6649185493588 -81.1082055937875,41.4105622805655 -81.108640672537,41.4118176437914 -73.1864483631389,73.9320653229952 -73.1165769445031))) F8 电梯 MULTIPOLYGON(((27.7363472618163 -73.7509837455356,27.7363472618163 -76.4068597521423,27.0886035524309 -76.4067571138262,19.502938400954 -76.4055550890202,19.336468026042 -73.7496526911885,27.7363472618163 -73.7509837455356))) F8 电梯 MULTIPOLYGON(((71.791599329561 -120.789716028472,71.7846846170723 -123.743601208178,63.5224315151572 -123.589247200859,63.5290417298674 -120.765438285686,71.791599329561 -120.789716028472))) F8 电梯 MULTIPOLYGON(((151.438877351582 -71.140565793033,143.380713831633 -71.1289797308272,143.41783554107 -67.7106754227564,151.439391598105 -67.8952907441503,151.438877351582 -71.140565793033))) F8 自动扶梯 MULTIPOLYGON(((129.16515929997 -72.9842409176565,129.16515929997 -69.2177793933323,136.625542271882 -69.2177793933323,136.599083889276 -72.9842409176565,129.16515929997 -72.9842409176565))) F8 自动扶梯 MULTIPOLYGON(((90.0243225693702 -73.1198265228713,90.0237087868154 -76.1593672441145,83.4253928624093 -76.2595438690263,83.4661638364195 -73.1185022099924,90.0243225693702 -73.1198265228713))) F8 卡乐时代 MULTIPOLYGON(((56.796247374266386 -117.50865583121777,56.6748490780592 -117.49650815501809,48.22472807019949 -117.49516914784908,48.26815593987703 -123.26608142256737,47.20688885077834 -123.24331614375114,47.17912720888853 -121.70872777327895,23.973091162741184 -120.98440674319863,22.212835498154163 -95.15799333155155,27.736347261816263 -95.15799333155155,27.736347261816263 -97.24820585176349,28.933990716934204 -97.24888809770346,29.609365351498127 -97.24910343065858,39.72476739436388 -97.26569728925824,39.74184823036194 -100.99478274583817,56.65072625130415 -100.9862755574286,56.796247374266386 -117.50865583121777))) F8 一味一诚 MULTIPOLYGON(((104.10507772862911 -113.762516528368,104.10343369841576 -124.07710339874029,71.78468461707234 -123.74360121414065,71.80045375600457 -117.0072059892118,80.00307461991906 -117.11536817625165,80.00307461991906 -111.21669100224972,85.23863035440445 -113.57487217336893,104.10507772862911 -113.762516528368))) Time taken: 0.052 seconds, Fetched: 10 row(s) hive> 4、执行UDTF验证结果 hive> select poipoint(floor_id,poi_name_cn,geom) as (floor,poiName,x1,y1,x2,y2) from tmp_source_poi limit 10; OK F8 楼梯 27.7363472618163 -76.4068597634723 27.7363472618163 -80.0131241459239 F8 楼梯 27.7363472618163 -80.0131241459239 19.72894019261 -80.0118552827063 F8 楼梯 19.72894019261 -80.0118552827063 19.502938400954 -76.4055550890202 F8 楼梯 19.502938400954 -76.4055550890202 27.7363472618163 -76.4068597634723 F8 楼梯 27.7363472618163 -76.4068597634723 NULL NULL F8 避风塘 142.38873291015625 -89.830322265625 142.38848876953125 -92.31036376953125 F8 避风塘 142.38848876953125 -92.31036376953125 142.3878173828125 -95.64105224609375 F8 避风塘 142.3878173828125 -95.64105224609375 151.43511962890625 -95.62530517578125 F8 避风塘 151.43511962890625 -95.62530517578125 151.4322509765625 -114.43475341796875 F8 避风塘 151.4322509765625 -114.43475341796875 145.563720703125 -114.43389892578125 Time taken: 0.059 seconds, Fetched: 10 row(s) hive>