Hive UDF 解析 JSON 字符串

有一张 hive 表,某个字段的数据如下:

[{"first_tag_id":1,"first_tag_name":"食品","second_tag_id":2,"second_tag_name":"酒水","third_tag_id":3,"third_tag_name":"酒类","fourth_tag_id":4,"fourth_tag_name":"啤酒","weight":0.37472842542476514},{"first_tag_id":5,"first_tag_name":"医药","second_tag_id":6,"second_tag_name":"xxx","third_tag_id":7,"third_tag_name":"yyy","fourth_tag_id":8,"fourth_tag_name":"zzz","weight":0.15397812328319802}]

它是json数组,通过 explode(自定义UDF),将数组中的每个元素打平成“row”,然后再使用 get_json_object 解析每个 tag

SELECT query_normal AS query_normal,
          get_json_object(item, "$.first_tag_id") AS first_tag_id,
          get_json_object(item, "$.second_tag_id") AS second_tag_id,
          get_json_object(item, "$.third_tag_id") AS third_tag_id,
          get_json_object(item, "$.fourth_tag_id") AS fourth_tag_id,
          get_json_object(item, "$.first_tag_name") AS first_tag_name,
          get_json_object(item, "$.second_tag_name") AS second_tag_name,
          get_json_object(item, "$.third_tag_name") AS third_tag_name,
          get_json_object(item, "$.fourth_tag_name") AS fourth_tag_name,
          get_json_object(item, "$.weight") AS weight

   FROM mart_xxx.xxx LATERAL VIEW explode(JsonHiveUDF(classify)) category_table AS item
   WHERE dt = '${now.delta(1).datekey}'

explode 函数参考:https://blog.csdn.net/bitcarmanlee/article/details/51926530
自定义UDF代码如下:

public final class JsonHiveUDF extends UDF {
    private final static Map<Character, Character> signMap;

    static {
        Map<Character, Character> signMapModifiable = new HashMap<Character, Character>();
        signMapModifiable.put('[', ']');
        signMapModifiable.put('{', '}');
        signMapModifiable.put('"', '"');
        signMap = Collections.unmodifiableMap(signMapModifiable);
    }

    private UDFJson udfJson = new UDFJson();

    public List<Text> evaluate(String jsonArray) {
        List<Text> result = new ArrayList<Text>();

        if (jsonArray == null) {
            return null;
        }

        if (jsonArray.length() < 1 || jsonArray.charAt(0) != '[') {
            result.add(new Text(jsonArray));
            return result;
        }

        // 结束符栈
        Stack<Character> stack = new Stack<Character>();

        int begin = -1;
        int length = jsonArray.length() - 1;

        for (int i = 1; i < length; i++) {
            char chr = jsonArray.charAt(i);

            // 跳过转义符
            if (chr == '\\') {
                i++;
                continue;
            }

            // 双引号特殊处理
            if (!stack.isEmpty() && stack.peek() == '"' && chr != '"') {
                continue;
            }

            if (!stack.isEmpty() && signMap.containsValue(chr)) {
                if (stack.peek() == chr) {
                    stack.pop();
                    if (stack.isEmpty()) {
                        result.add(parseString(jsonArray.substring(begin, i + 1)));
                    }
                    continue;
                } else if (!signMap.containsKey(chr) || signMap.get(chr) != chr) {
                    // 符号不匹配则直接返回
                    return result;
                }
            }

            if (signMap.containsKey(chr)) {
                if (stack.isEmpty()) {
                    begin = i;
                }
                // 入栈结束符
                stack.push(signMap.get(chr));
            }
        }

        return result;
    }

    public List<Text> evaluate(String jsonString, String pathString) {
        Text sourceText = udfJson.evaluate(jsonString, pathString);

        if (sourceText == null) {
            return null;
        }

        return evaluate(sourceText.toString());
    }

    private Text parseString(String str) {
        char firstStr = str.charAt(0);
        if (firstStr == '"') {
            str = str.substring(1, str.length() - 1);
            str = StringEscapeUtils.unescapeJavaScript(str);
        }

        return new Text(str);
    }

}
posted @ 2022-04-25 17:44  大熊猫同学  阅读(464)  评论(0编辑  收藏  举报