Spark的dataframe转rdd为实体类通用工具类

需求解决问题

每次读取Hive表或其他数据源获取数据后,想对其进行RDD操作时,无论要封装成哪个类,都需要通过 df.rdd.map(row => row.getString(0)) 这样的方式逐列取值,非常麻烦,所以可以实现一个通用的转换方式。

一、DataFrame转为RDD的通用方法

  /**
   * Converts every row of a DataFrame into an `Array[Any]` whose elements follow the
   * column order of the schema.
   *
   * One extractor function per column is obtained from `squareRowkey2`. The extractors
   * depend only on the schema, so they are built once, outside the per-row closure.
   *
   * @param frame the DataFrame to convert
   * @return an RDD holding one array per row; a cell whose extractor returns null is
   *         emitted as the string "null"
   */
  def dataFrameToRdd(frame: DataFrame): RDD[Array[Any]] = {
    // Built once per call rather than once per row.
    val extractors: Array[Row => Any] =
      frame.schema.toArray.zipWithIndex.map(fieldWithIndex => squareRowkey2(fieldWithIndex))

    frame.rdd.map { row =>
      extractors.map { extract =>
        val value = extract(row)
        // String.valueOf(null) produces the text "null"; kept so existing consumers see
        // the same output as before.
        if (value == null) String.valueOf(value) else value
      }
    }
  }

  对类型进行判断

/**
   * 根据schema信息进行判断与封装
   *
   * @param dataType
   * @return
   */
  //封装rowkey
  def squareRowkey2(dataType: (StructField, Int)): (Row) => Any = {
    val (structField, index) = dataType
    structField.dataType match {
      case StringType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getString(index)
      case LongType =>
        (row: Row) =>if (row.isNullAt(index)) null else row.getLong(index)
      case FloatType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getFloat(index)
      case DoubleType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getDouble(index)
      case IntegerType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getInt(index)
      case BooleanType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getBoolean(index)
      case DateType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getDate(index)
      case TimestampType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getTimestamp(index)
      case BinaryType =>
        (row: Row) => if (row.isNullAt(index)) null else row.getAs[Array[Byte]](index)
      case ArrayType(elementType, containsNull) =>
        (row: Row) => {
          val value: mutable.WrappedArray[_ >: Integer with String <: io.Serializable with Comparable[_ >: Integer with String]] = elementType match {
            case IntegerType => {
              row.getAs[mutable.WrappedArray[Integer]](index)
            }
            case StringType => {
              row.getAs[mutable.WrappedArray[String]](index)
            }
            case _ => row.getAs[mutable.WrappedArray[String]](index)
          }
          //这儿必须转换为java的list 防止map转json字符串不符合要求
          if (value == null) {
            util.Collections.emptyList()
          }
          JavaConversions.bufferAsJavaList(value.toBuffer)
        }
      case StructType(fields) =>
        (row: Row) => row.getAs[mutable.Map[String, String]](index)
      case _ =>
        (row: Row) => row.getString(index)
    }
  }

 二、rdd转实体对象

大多数情况下都是将数据封装为case class或者普通对象。

  /**
   * Turns row arrays (as produced by `dataFrameToRdd`) into instances of `clazz`.
   *
   * Each array is zipped with the schema's column names into a java.util.HashMap,
   * serialised with `GsonUtil.toJsonString`, and parsed back with `GsonUtil.GsonToBean`.
   *
   * @param frame   DataFrame whose schema supplies the column names
   * @param clazz   target entity class
   * @param hiveRdd row arrays; elements are paired positionally with the schema's columns
   * @tparam U target entity type
   * @return an RDD of the parsed entities
   */
  def dataFrameToEntity[U: ClassTag](frame: DataFrame, clazz: Class[U], hiveRdd: RDD[Array[Any]]): RDD[U] = {
    // Only the column names are needed per row; resolve them once, outside the closure.
    val fieldNames: Array[String] = frame.schema.toArray.map(_.name)

    hiveRdd.map { array =>
      val map = new util.HashMap[String, Any]()
      fieldNames.zip(array).foreach { case (k, v) => map.put(k, v) }
      val str = GsonUtil.toJsonString(map)
      // GsonToBean is the helper's JSON-to-object conversion.
      val value: U = GsonUtil.GsonToBean(str, clazz)
      value
    }
  }

 使用:

    // Read the "user" table; declared lazy, so the expression is evaluated on first use of df.
    lazy val df: DataFrame =spark.read.table("user")
    // Convert the DataFrame to an RDD of row arrays, then map those arrays onto User entities.
    val userRdd: RDD[Array[Any]] = RddUtils.dataFrameToRdd(df)
    val userRDD2: RDD[User] = RddUtils.dataFrameToEntity(df, classOf[User], userRdd)

  

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.TypeReference;
import com.alibaba.fastjson.serializer.SerializerFeature;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
import org.apache.commons.lang3.StringUtils;

import java.lang.reflect.Field;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * @ClassName: com.xxx.sbc.dw.spark.submit.service.GsonUtil
 * @Description: JSON utility class
 * @Author: imp
 * @Time: 2020/10/23 10:01
 * @Version: 1.0
 */

public class GsonUtil {

private static Gson gson = null;

//判断gson对象是否存在了,不存在则创建对象
static {
    if (gson == null) {
        //gson = new Gson();            //当使用GsonBuilder方式时属性为空的时候输出来的json字符串是有键值key的,显示形式是"key":null,而直接new出来的就没有"key":null的
        gson = new GsonBuilder().setDateFormat("yyyy-MM-dd HH:mm:ss").create();
    }
}

// Private no-arg constructor: prevents instantiation from outside this class.
private GsonUtil() {
    // intentionally empty
}

/**
 * Serialises an object to JSON text using the shared Gson instance.
 *
 * @param object the object to serialise
 * @return the JSON string, or null when the shared Gson instance is null
 */
public static String GsonString(Object object) {
    return gson == null ? null : gson.toJson(object);
}

/**
 * 将json转成特定的cls的对象
 *
 * @param gsonString
 * @param cls
 * @return
 */
public static &lt;T&gt; T GsonToBean(String gsonString, Class&lt;T&gt; cls) {
    T t = null;
    if (StringUtils.isNotEmpty(gsonString)) {
        //传入json对象和对象类型,将json转成对象
        t = JSONObject.parseObject(gsonString, cls);
    }
    return t;
}

/**
 * json字符串转成list
 *
 * @param gsonString
 * @param cls
 * @return
 */
public static &lt;T&gt; List&lt;T&gt; GsonToList(String gsonString, Class&lt;T&gt; cls) {
    List&lt;T&gt; list = null;
    if (gson != null) {
        //根据泛型返回解析指定的类型,TypeToken&lt;List&lt;T&gt;&gt;{}.getType()获取返回类型
        list = gson.fromJson(gsonString, new TypeToken&lt;List&lt;T&gt;&gt;() {
        }.getType());
    }
    return list;
}

/**
 * json字符串转成list中有map的
 *
 * @param gsonString
 * @return
 */
public static &lt;T&gt; List&lt;Map&lt;String, T&gt;&gt; GsonToListMaps(String gsonString) {
    List&lt;Map&lt;String, T&gt;&gt; list = null;
    if (gson != null) {
        list = gson.fromJson(gsonString,
                new TypeToken&lt;List&lt;Map&lt;String, T&gt;&gt;&gt;() {
                }.getType());
    }
    return list;
}


public static &lt;T&gt; List&lt;Map&lt;String, T&gt;&gt; gsonToListMaps(String str) {
    List&lt;Map&lt;String, T&gt;&gt; list = null;
    if (gson != null) {
        list = gson.fromJson(str,
                new TypeToken&lt;List&lt;Map&lt;String, T&gt;&gt;&gt;() {
                }.getType());
    }
    return list;
}

/**
 * json字符串转成map的
 *
 * @param gsonString
 * @return
 */
public static &lt;T&gt; Map&lt;String, String&gt; GsonToMaps(String gsonString) {
    Map&lt;String, String&gt; map = null;
    if (gson != null) {
        map = gson.fromJson(gsonString, new TypeToken&lt;Map&lt;String, String&gt;&gt;() {
        }.getType());
    }
    return map;
}

/**
 * Reports whether the given object can be serialised by the shared Gson instance without
 * an exception being thrown. Note that this checks serialisability of an object, not
 * whether a string contains valid JSON.
 *
 * @param object the object to try to serialise
 * @return true when {@code gson.toJson(object)} completes, false when it throws
 */
public static Boolean isJson(Object object) {
    try {
        gson.toJson(object);
        return true;
    } catch (Exception e) {
        // java.util.Formatter uses %s, not "{}"; String.valueOf keeps a null argument from
        // throwing inside the handler.
        System.err.format("%s is not json%n", String.valueOf(object));
        return false;
    }
}

/**
 * Serialises an object to a JSON string with fastjson, passing the
 * {@code DisableCircularReferenceDetect} serializer feature.
 *
 * @param o the object to serialise
 * @return the JSON text produced by fastjson
 */
public static String toJsonString(Object o) {
    final String json = JSON.toJSONString(o, SerializerFeature.DisableCircularReferenceDetect);
    return json;
}

/**
 * json转为map
 *
 * @param json
 * @return
 */
public static Map&lt;String, String&gt; jsonToMap(String json) {
    return JSON.parseObject(json, new TypeReference&lt;LinkedHashMap&lt;String, String&gt;&gt;() {
    });
}

public static Map&lt;String, Object&gt; entityToMap(Object obj) throws IllegalAccessException {
    Map&lt;String, Object&gt; map = new LinkedHashMap&lt;String, Object&gt;();
    Class&lt;?&gt; clazz = obj.getClass();
    System.out.println(clazz);
    for (Field field : clazz.getDeclaredFields()) {
        field.setAccessible(true);
        String fieldName = field.getName();
        Object value = field.get(obj);
        if (value == null) {
            value = "";
        }
        map.put(fieldName, value);
    }
    return map;
}

  

 

posted @ 2020-11-04 23:46  夜半钟声到客船  阅读(686)  评论(1编辑  收藏  举报