Hive中自定义序列化器(带编码)

hive SerDe的简介

https://www.jianshu.com/p/afee9acba686

问题

数据文件为文本文件,每一行为固定格式,每一列的长度固定或在限定范围内,因此考虑采用Hive提供的RegexSerDe来解析记录;使用后发现Hive查询出的中文字段乱码。

解决过程

serialization.encoding=GBK

Hadoop中文件默认按UTF-8编码,Hive做序列化/反序列化时也默认按UTF-8解析,而数据文件是GBK编码,所以必然乱码。从网上查了下,解决方案是建表时指定SerDe的"serialization.encoding"="GBK",然而这并没有解决我的问题。

源码

Hive建表格式为ROW FORMAT,不指定SerDe时,默认用的是org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe,继承了org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe,而该类确实可以通过设置"serialization.encoding"="GBK"来解决hive读取gbk文件乱码的问题,代码如下:

//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by Fernflower decompiler)
//

package org.apache.hadoop.hive.serde2;

import com.google.common.base.Charsets;
import java.nio.charset.Charset;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for SerDes that honour the {@code serialization.encoding} table property.
 *
 * <p>{@link #serialize} and {@link #deserialize} are final: they delegate the real work to
 * {@link #doSerialize} / {@link #doDeserialize} and, whenever the configured charset is not
 * UTF-8, pass the data through {@link #transformFromUTF8} / {@link #transformToUTF8}.
 */
public abstract class AbstractEncodingAwareSerDe extends AbstractSerDe {
    private static final Logger LOG = LoggerFactory.getLogger(AbstractEncodingAwareSerDe.class);

    /** Charset selected by {@link #initialize}; stays {@code null} until that method has run. */
    protected Charset charset;

    public AbstractEncodingAwareSerDe() {
    }

    /**
     * Resolves {@link #charset} from the {@code serialization.encoding} table property,
     * defaulting to UTF-8, and logs a warning for ISO-8859-1 and US-ASCII.
     *
     * @param conf not used by this method
     * @param tbl  table properties to read the encoding name from
     * @throws SerDeException declared by the signature; not thrown directly here
     * @deprecated
     */
    @Deprecated
    public void initialize(Configuration conf, Properties tbl) throws SerDeException {
        String encodingName = tbl.getProperty("serialization.encoding", "UTF-8");
        this.charset = Charset.forName(encodingName);

        boolean isLatin1 = this.charset.equals(Charsets.ISO_8859_1);
        boolean isAscii = this.charset.equals(Charsets.US_ASCII);
        if (isLatin1 || isAscii) {
            LOG.warn("The data may not be properly converted to target charset " + this.charset);
        }
    }

    /**
     * Serializes {@code obj} through {@link #doSerialize} and, unless the configured charset
     * is UTF-8, post-processes the result with {@link #transformFromUTF8}.
     *
     * @param obj          the row object to serialize
     * @param objInspector inspector describing {@code obj}
     * @return the serialized row
     * @throws SerDeException if {@link #doSerialize} fails
     */
    public final Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
        Writable serialized = this.doSerialize(obj, objInspector);
        if (this.charset.equals(Charsets.UTF_8)) {
            return serialized;
        }
        return this.transformFromUTF8(serialized);
    }

    /** Hook applied to serialized output when the configured charset is not UTF-8. */
    protected abstract Writable transformFromUTF8(Writable blob);

    /** Performs the actual serialization; called by {@link #serialize} before any transform. */
    protected abstract Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException;

    /**
     * Deserializes {@code blob} through {@link #doDeserialize}, first passing it through
     * {@link #transformToUTF8} unless the configured charset is UTF-8.
     *
     * @param blob the raw row
     * @return the deserialized row object
     * @throws SerDeException if {@link #doDeserialize} fails
     */
    public final Object deserialize(Writable blob) throws SerDeException {
        Writable input = this.charset.equals(Charsets.UTF_8) ? blob : this.transformToUTF8(blob);
        return this.doDeserialize(input);
    }

    /** Hook applied to raw input when the configured charset is not UTF-8. */
    protected abstract Writable transformToUTF8(Writable blob);

    /** Performs the actual deserialization; called by {@link #deserialize} after any transform. */
    protected abstract Object doDeserialize(Writable blob) throws SerDeException;
}

继续查看org.apache.hadoop.hive.serde2.RegexSerDe,发现并没有用到serialization.encoding,难怪设置了也没有用,源码就不贴了

解决

解决方法也很简单,自定义类EncodingAwareRegexSerDe继承RegexSerDe,实现转UTF8的功能,代码如下:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.RegexSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Properties;


/**
 * A {@link RegexSerDe} that honours the {@code serialization.encoding} table property on read.
 *
 * <p>When the configured charset is not UTF-8, each incoming row is decoded from that charset
 * and rebuilt as a new {@link Text} before being handed to
 * {@link RegexSerDe#deserialize(Writable)}. UTF-8 rows are passed through untouched.
 *
 * <p>An unknown or malformed charset name is rejected in {@link #initialize} with a
 * {@link SerDeException}, instead of silently turning every row into an empty string.
 */
@SerDeSpec(
        schemaProps = {"columns", "columns.types", "input.regex", "input.regex.case.insensitive","serialization.encoding"}
)
public class EncodingAwareRegexSerDe extends RegexSerDe {
    public static final Logger LOG = LoggerFactory.getLogger(EncodingAwareRegexSerDe.class.getName());

    /**
     * Trimmed value of the {@code serialization.encoding} property. Kept for subclasses;
     * decoding itself uses the {@link Charset} resolved from it in {@link #initialize}.
     */
    protected String charsetName;

    /** Source charset, resolved once per table rather than once per row. */
    private Charset charset = StandardCharsets.UTF_8;

    public EncodingAwareRegexSerDe(){
        super();
    }

    /**
     * Initializes the underlying {@link RegexSerDe} and resolves the source charset.
     *
     * @param conf passed through to {@link RegexSerDe}
     * @param tbl  table properties; {@code serialization.encoding} defaults to UTF-8
     * @throws SerDeException if the parent initialization fails, or if the configured
     *                        charset name is illegal or not supported by this JVM
     */
    @Override
    public void initialize(Configuration conf, Properties tbl) throws SerDeException {
        super.initialize(conf, tbl);
        this.charsetName = tbl.getProperty("serialization.encoding", "UTF-8").trim();
        try {
            this.charset = Charset.forName(this.charsetName);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            throw new SerDeException(
                    "Unsupported serialization.encoding '" + this.charsetName + "'", e);
        }
    }

    /**
     * Deserializes one row, transcoding it to UTF-8 first when necessary.
     *
     * @param blob the raw row; must be a {@link Text} when transcoding is required
     * @return whatever {@link RegexSerDe#deserialize(Writable)} returns for the row
     * @throws SerDeException if the parent deserializer fails
     */
    @Override
    public Object deserialize(Writable blob) throws SerDeException {
        if (StandardCharsets.UTF_8.equals(this.charset)) {
            // Already UTF-8: skip the decode/re-encode round trip.
            return super.deserialize(blob);
        }
        return super.deserialize(transformTextToUtf8((Text) blob, this.charset));
    }

    /** Decodes the first {@code getLength()} bytes of {@code text} and wraps them in a new Text. */
    private static Text transformTextToUtf8(Text text, Charset encoding) {
        // Only the first getLength() bytes are decoded, exactly as the original code did.
        String decoded = new String(text.getBytes(), 0, text.getLength(), encoding);
        return new Text(decoded);
    }
}

使用自定义序列化器

将上述自定义的类打成jar包后,即可使用

操作hive shell
-- Register the jar that contains the custom EncodingAwareRegexSerDe class.
hive> add jar /home/dw_hbkal/przhang/hive-custom-serdes-1.0-SNAPSHOT.jar;
-- Create an external, partitioned table whose rows are parsed by the custom SerDe.
CREATE EXTERNAL TABLE IF NOT EXISTS test_tooldb.ind01acoma_tmp(
acq_ins_id_cd          STRING,
fwd_settle_at          DECIMAL(12, 0),
repl_at                DECIMAL(12, 0),
-- NOTE: the "......" line is the article's placeholder for omitted columns, not valid HiveQL;
-- replace it with the real column list before running.
......
card_accptr_nm_addr    STRING,
resv5                  STRING
)PARTITIONED BY(ins_id_cd STRING, hp_settle_dt STRING)
 ROW FORMAT SERDE 'com.unionpay.bigdataTest.hive.serdes.EncodingAwareRegexSerDe'
      with serdeproperties (
-- input.regex: each parenthesised group matches one fixed-width (or length-bounded) field.
-- Presumably there is one group per table column; verify against the full column list.
      "input.regex"="(.{11}) (.{11}) (.{6}) (.{10}) (.{19}) (.{12}) (.{12}) (.{12}) (.{4}) (.{6}) (.{4}) (.{8}) (.{15}) (.{12}) (.{2}) (.{6}) (.{11}) (.{6}) (.{2}) (.{3}) (.{12}) (.{12}) (.{12}) (.{1}) (.{3}) (.{1}) (.{1}) (.{10}) (.{11}) (.{1}) (.{2}) (.{2}) (.{12}) (.{1})(.{2})(.{1})(.{1})(.{2})(.{1})(.{1})(.{2})(.{1})(.{2}) (.{11}) (.{11}) (.{1}) (.{1}) (.{4}) (.{2}) (.{1,40}) (.{3}) (.{9}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{11}) (.{9}) (.{9}) (.{9}) (.{9}) (.{19}) (.{2}) (.{40}) (.{4}) (.{1}) (.{2}) (.{10}) (.{6}) (.{1}) (.{12}) (.{193})",
-- serialization.encoding: read by EncodingAwareRegexSerDe.initialize as the source charset.
      "serialization.encoding"="GBK" 
)
      STORED AS TEXTFILE
      LOCATION '/user/dw_hbkal/db/test_tooldb/ind01acoma_tmp';
-- Load the local data file into one partition, overwriting that partition's existing contents.
load data local inpath '/home/dw_hbkal/przhang/IND18071032ACOMA' overwrite into table test_tooldb.ind01acoma_tmp partition(ins_id_cd='01055800',hp_settle_dt='20180710');

posted @ 2020-09-18 13:58  远去的列车  阅读(1602)  评论(0)  编辑  收藏  举报