Spark 读取 HDFS 上的 CSV 文件并写入 Hive

package com.grady

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SaveMode, SparkSession}

/**
 * Loads a CSV file and inserts its rows into the Hive table `jiang.student`.
 *
 * The file is looked up under `/tmp/jiang/`; the path carries no scheme, so
 * which filesystem it lands on (presumably HDFS, going by the variable name)
 * depends on the cluster configuration.
 */
object CsvToHive {

  /** File name used when none is supplied on the command line. */
  private val DefaultFileName = "test1.csv"

  /**
   * Entry point: builds a Hive-enabled [[SparkSession]], runs the import and
   * always stops the session afterwards.
   *
   * @param args optional; `args(0)`, when present, is the CSV file name to load
   *             from `/tmp/jiang/`. Defaults to `test1.csv`.
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val spark: SparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()

    try {
      val fileName = args.headOption.getOrElse(DefaultFileName)
      csvToHive(spark, fileName)
    } finally {
      // Release the session even when the import fails.
      spark.stop()
    }
  }

  /**
   * Reads `/tmp/jiang/<fileName>` as comma-separated values and writes the rows
   * into `jiang.student`, replacing the data already in the table.
   *
   * Columns 0 and 3 are converted to `Int`; columns 1, 2 and 4 are passed
   * through as read. Every row therefore needs at least five columns, and
   * columns 0 and 3 must be non-null and parseable as integers, otherwise the
   * job fails at execution time.
   *
   * @param spark    active session with Hive support enabled
   * @param fileName name of the CSV file inside `/tmp/jiang/`
   */
  def csvToHive(spark: SparkSession, fileName: String): Unit = {
    val hdfsPath = s"/tmp/jiang/${fileName}"
    println(s"hdfsPath=${hdfsPath}")

    // No "header" option is set, so the first line of the file is read as data.
    val csvRDD = spark.read
      .format("csv")
      .option("sep", ",")
      .load(hdfsPath)
      .rdd

    // Debug dump of every raw row. This runs on the executors, so outside of
    // local mode the output ends up in the executor logs, not the driver's.
    csvRDD.foreach(println)

    val dataRDD = csvRDD.map(r => Row(r(0).toString.toInt, r(1), r(2), r(3).toString.toInt, r(4)))

    // SchemaType is defined elsewhere in the project; it is assumed to describe
    // five fields matching the Row built above -- TODO confirm.
    val schema = SchemaType.getStudentSchema()

    val csvDF = spark.createDataFrame(dataRDD, schema)
    // insertInto matches columns by position, not by name.
    csvDF.write.mode(SaveMode.Overwrite)
      .format("Hive")
      .insertInto("jiang.student")
  }

}

执行:spark-submit --master local[2] --num-executors 10 --class com.grady.CsvToHive /app/data/appdeploy/usehive1-1.0-SNAPSHOT.jar
(注:`--num-executors` 只在 YARN / Kubernetes 模式下生效,在 `local[2]` 模式下会被忽略,并行度由 `local[N]` 中的 N 决定。)

posted @ 2022-02-08 17:48  明月照江江  阅读(675)  评论(0编辑  收藏  举报