udtf一进多出案例2

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor for the Hive user-defined function jar (artifact: hive_function). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>hive_function</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>2.3.8</version>
            <!-- The Hive runtime already ships hive-exec, so it is only needed at
                 compile time and must not be bundled into the function jar. -->
            <scope>provided</scope>
            <!-- Exclude one jar from hive-exec's transitive dependencies.
                 NOTE(review): presumably excluded because this artifact often cannot
                 be resolved from the default repositories; confirm before removing. -->
            <exclusions>
                <exclusion>
                    <groupId>org.pentaho</groupId>
                    <artifactId>pentaho-aggdesigner-algorithm</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <!-- 1.2.83 is the release recommended by the fastjson security notice;
                 it stays API-compatible with the 1.2.x line used by the UDTF.
                 This dependency is NOT provided by Hive and has to be packaged
                 into the jar that is registered with "create function". -->
            <version>1.2.83</version>
        </dependency>
    </dependencies>
</project>

java编写

package udtf;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * Hive UDTF that expands one JSON object string into a single output row with
 * the columns {@code name} (string), {@code age} (int), {@code sex} (string)
 * and {@code phone} (string).
 *
 * <p>Keys that are absent from the JSON object produce NULL in the matching
 * column. A NULL input value, or input that parses to no JSON object, produces
 * no output row at all.
 */
public class JSONDataParseUDTF extends GenericUDTF {
    /**
     * Declares the output schema of the function.
     *
     * @param argOIs inspectors describing the arguments the function was called with
     * @return a struct inspector for the four output columns, in forwarding order
     * @throws UDFArgumentException if the function is not called with exactly one argument
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Fail at query-compile time instead of with an ArrayIndexOutOfBoundsException
        // on the first row when the caller passes no argument.
        if (argOIs.getAllStructFieldRefs().size() != 1) {
            throw new UDFArgumentException(
                    "JSONDataParseUDTF takes exactly one argument: the JSON string to parse");
        }

        List<String> columnNames = new ArrayList<String>();
        columnNames.add("name");
        columnNames.add("age");
        columnNames.add("sex");
        columnNames.add("phone");

        // Inspector order must match columnNames and the order used in process().
        List<ObjectInspector> objectInspectors = new ArrayList<ObjectInspector>();
        objectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        objectInspectors.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
        objectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        objectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, objectInspectors);
    }

    /**
     * Parses the first argument as a JSON object and forwards one row built from
     * its {@code name}, {@code age}, {@code sex} and {@code phone} entries.
     *
     * @param args the call arguments; only {@code args[0]} is read, via {@code toString()}
     * @throws HiveException if forwarding the row fails
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // A NULL column value would otherwise fail with a NullPointerException.
        if (args == null || args.length == 0 || args[0] == null) {
            return;
        }

        JSONObject jsonObject = JSON.parseObject(args[0].toString());
        // NOTE(review): fastjson 1.2.x appears to return null rather than throw for
        // blank or literal "null" input - confirm; guarded here to avoid an NPE.
        if (jsonObject == null) {
            return;
        }

        // getString (not get) so that non-string JSON values, e.g. a numeric phone,
        // are converted to String as the declared javaStringObjectInspector expects.
        List<Object> line = new ArrayList<Object>(4);
        line.add(jsonObject.getString("name"));
        line.add(jsonObject.getInteger("age"));
        line.add(jsonObject.getString("sex"));
        line.add(jsonObject.getString("phone"));
        forward(line);
    }

    /**
     * Nothing to release: the function keeps no state between rows.
     *
     * @throws HiveException never thrown by this implementation
     */
    @Override
    public void close() throws HiveException {
        // Intentionally empty.
    }
}

打jar包

因为这里用到了fastjson这个第三方工具包,直接用maven打出的jar包里不包含这个依赖,所以不能用默认的maven方式打包,需要把fastjson一起打进jar包







在shell中操作

数据
[root@node1 data]# cat aa.txt 
{"name":"zs","age":20,"sex":"man","phone":"13888888888"}
{"name":"ls","age":21,"sex":"woman","phone":"13123148888"}
{"name":"ww","age":22,"sex":"man","phone":"1388883456"}
{"name":"ml","age":23,"sex":"woman","phone":"1388883456"}
{"name":"zb","age":24,"sex":"man","phone":"1388885678"}
{"name":"wb","age":25,"sex":"woman","phone":"13888343488"}
{"name":"lb","age":26,"sex":"man","phone":"1388881188"}


hive (default)> create temporary function parse_json_data as "udtf.JSONDataParseUDTF" using jar "hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar";
Added [/tmp/d0389ef2-b8cf-4739-b438-d9c595057ef5_resources/hive_function-1.0-SNAPSHOT.jar] to class path
Added resources: [hdfs://node1:9000/hive_function-1.0-SNAPSHOT.jar]
FAILED: Class udtf.JSONDataParseUDTF not found
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.FunctionTask


hive (default)> create table user_info(userjson string) row format delimited fields terminated by '\n';
OK
Time taken: 4.959 seconds


hive (default)> load data local inpath "/opt/data/aa.txt" into table user_info;
Loading data to table default.user_info
OK
Time taken: 2.66 seconds


hive (default)> select * from user_info;
OK
user_info.userjson
{"name":"zs","age":20,"sex":"man","phone":"13888888888"}
{"name":"ls","age":21,"sex":"woman","phone":"13123148888"}
{"name":"ww","age":22,"sex":"man","phone":"1388883456"}
{"name":"ml","age":23,"sex":"woman","phone":"1388883456"}
{"name":"zb","age":24,"sex":"man","phone":"1388885678"}
{"name":"wb","age":25,"sex":"woman","phone":"13888343488"}
{"name":"lb","age":26,"sex":"man","phone":"1388881188"}
Time taken: 4.626 seconds, Fetched: 7 row(s)


hive (default)> create temporary function parse_json_data as "udtf.JSONDataParseUDTF" using jar "hdfs://node1:9000/hive_function.jar";
Added [/tmp/d0389ef2-b8cf-4739-b438-d9c595057ef5_resources/hive_function.jar] to class path
Added resources: [hdfs://node1:9000/hive_function.jar]
OK
Time taken: 0.273 seconds


hive (default)> select parse_json_data(userjson) from user_info;
OK
name	age	sex	phone
zs	20	man	13888888888
ls	21	woman	13123148888
ww	22	man	1388883456
ml	23	woman	1388883456
zb	24	man	1388885678
wb	25	woman	13888343488
lb	26	man	1388881188
Time taken: 1.129 seconds, Fetched: 7 row(s)
posted @ 2022-08-04 16:12  jsqup  阅读(22)  评论(0)  编辑  收藏  举报