Parsing JSON Strings with a Hive UDF
There is a Hive table in which one column holds data like the following:
[{"first_tag_id":1,"first_tag_name":"食品","second_tag_id":2,"second_tag_name":"酒水","third_tag_id":3,"third_tag_name":"酒类","fourth_tag_id":4,"fourth_tag_name":"啤酒","weight":0.37472842542476514},{"first_tag_id":5,"first_tag_name":"医药","second_tag_id":6,"second_tag_name":"xxx","third_tag_id":7,"third_tag_name":"yyy","fourth_tag_id":8,"fourth_tag_name":"zzz","weight":0.15397812328319802}]
The value is a JSON array. The approach is to split the array with a custom UDF, flatten each element into its own row via explode, and then parse the individual tag fields with get_json_object:
SELECT query_normal AS query_normal,
       get_json_object(item, "$.first_tag_id") AS first_tag_id,
       get_json_object(item, "$.second_tag_id") AS second_tag_id,
       get_json_object(item, "$.third_tag_id") AS third_tag_id,
       get_json_object(item, "$.fourth_tag_id") AS fourth_tag_id,
       get_json_object(item, "$.first_tag_name") AS first_tag_name,
       get_json_object(item, "$.second_tag_name") AS second_tag_name,
       get_json_object(item, "$.third_tag_name") AS third_tag_name,
       get_json_object(item, "$.fourth_tag_name") AS fourth_tag_name,
       get_json_object(item, "$.weight") AS weight
FROM mart_xxx.xxx
LATERAL VIEW explode(JsonHiveUDF(classify)) category_table AS item
WHERE dt = '${now.delta(1).datekey}'
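For the sample value above, JsonHiveUDF splits the array into two JSON object strings, explode fans each one out into its own row, and the query therefore returns two rows, with first_tag_id 1 and 5. Note that before the query can run, the UDF has to be registered in the session, typically with ADD JAR followed by CREATE TEMPORARY FUNCTION JsonHiveUDF AS '<fully qualified class name>' (the jar path and package name depend on your deployment).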
For a reference on the explode function, see: https://blog.csdn.net/bitcarmanlee/article/details/51926530
The custom UDF is implemented as follows:
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFJson;
import org.apache.hadoop.io.Text;

public final class JsonHiveUDF extends UDF {

    // Maps each opening delimiter to the closing delimiter that ends it.
    private final static Map<Character, Character> signMap;

    static {
        Map<Character, Character> signMapModifiable = new HashMap<Character, Character>();
        signMapModifiable.put('[', ']');
        signMapModifiable.put('{', '}');
        signMapModifiable.put('"', '"');
        signMap = Collections.unmodifiableMap(signMapModifiable);
    }

    private UDFJson udfJson = new UDFJson();

    /**
     * Splits a top-level JSON array into its elements, one Text per element.
     * A non-array input is returned unchanged as a single-element list.
     */
    public List<Text> evaluate(String jsonArray) {
        List<Text> result = new ArrayList<Text>();
        if (jsonArray == null) {
            return null;
        }
        if (jsonArray.length() < 1 || jsonArray.charAt(0) != '[') {
            result.add(new Text(jsonArray));
            return result;
        }
        // Stack of closing delimiters we still expect to see.
        Stack<Character> stack = new Stack<Character>();
        int begin = -1;
        // Scan between the outer '[' and ']'.
        int length = jsonArray.length() - 1;
        for (int i = 1; i < length; i++) {
            char chr = jsonArray.charAt(i);
            // Skip the character following an escape.
            if (chr == '\\') {
                i++;
                continue;
            }
            // Inside a string literal, ignore everything except the closing quote.
            if (!stack.isEmpty() && stack.peek() == '"' && chr != '"') {
                continue;
            }
            if (!stack.isEmpty() && signMap.containsValue(chr)) {
                if (stack.peek() == chr) {
                    stack.pop();
                    if (stack.isEmpty()) {
                        // A complete top-level element has just been closed.
                        result.add(parseString(jsonArray.substring(begin, i + 1)));
                    }
                    continue;
                } else if (!signMap.containsKey(chr) || signMap.get(chr) != chr) {
                    // Mismatched delimiter: give up and return what we have so far.
                    return result;
                }
            }
            if (signMap.containsKey(chr)) {
                if (stack.isEmpty()) {
                    // Start of a new top-level element.
                    begin = i;
                }
                // Push the closing delimiter we now expect.
                stack.push(signMap.get(chr));
            }
        }
        return result;
    }

    /**
     * First extracts a JSON value with Hive's built-in get_json_object logic,
     * then splits the result if it is an array.
     */
    public List<Text> evaluate(String jsonString, String pathString) {
        Text sourceText = udfJson.evaluate(jsonString, pathString);
        if (sourceText == null) {
            return null;
        }
        return evaluate(sourceText.toString());
    }

    // Strips the surrounding quotes from a string element and unescapes it.
    private Text parseString(String str) {
        char firstChar = str.charAt(0);
        if (firstChar == '"') {
            str = str.substring(1, str.length() - 1);
            str = StringEscapeUtils.unescapeJavaScript(str);
        }
        return new Text(str);
    }
}
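To sanity-check the splitting logic outside of Hive, a small driver like the one below can be used. This is only a sketch: the JsonHiveUDFDemo class and the trimmed sample input are not part of the original post, and it assumes JsonHiveUDF sits in the same package and is on the classpath together with the Hive, Hadoop, and commons-lang jars.

import java.util.List;

import org.apache.hadoop.io.Text;

public class JsonHiveUDFDemo {
    public static void main(String[] args) {
        JsonHiveUDF udf = new JsonHiveUDF();
        // Trimmed version of the sample column value from above.
        String json = "[{\"first_tag_id\":1,\"first_tag_name\":\"食品\"},"
                + "{\"first_tag_id\":5,\"first_tag_name\":\"医药\"}]";
        List<Text> items = udf.evaluate(json);
        // Expected output, one top-level array element per line:
        //   {"first_tag_id":1,"first_tag_name":"食品"}
        //   {"first_tag_id":5,"first_tag_name":"医药"}
        for (Text item : items) {
            System.out.println(item);
        }
    }
}

Each top-level element comes back as a single Text, which is exactly the shape explode needs in order to fan the elements out into rows.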