一个 Spark SQL 和 DataFrames 的故事

package com.lin.spark

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
  * Walk-through of the basic Spark SQL / DataFrame / Dataset APIs.
  *
  * Runs four independent examples against a local SparkSession:
  * basic DataFrame operations, Dataset creation, schema inference from a
  * case class, and programmatic schema construction.
  *
  * Created by Yaooo on 2019/6/8.
  */
object SparkSQLExample {

  /** Record type used to give the example data a typed schema. */
  final case class Person(name: String, age: Long)

  // Input files shared by the examples (paths are relative to the working directory).
  private val PeopleTextPath = "src/main/resources/people.txt"
  private val PersonJsonPath = "src/main/resources/person.json"

  /**
    * Entry point: builds a local SparkSession, runs every example in turn and
    * always stops the session afterwards.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark SQL")
      // NOTE(review): placeholder option; the key looks like a typo of
      // "spark.some.config.option" from the Spark docs — confirm before renaming.
      .config("spark.come.config.option", "some-value")
      .master("local[2]")
      .getOrCreate()

    // Release the session's resources even when one of the examples throws.
    try {
      runBasicDataFrameExample(spark)
      runDatasetCreationExample(spark)
      runInferSchemaExample(spark)
      runProgrammaticSchemaExample(spark)
    } finally {
      spark.stop()
    }
  }

  /**
    * Builds a DataFrame from a text file using a schema assembled at runtime
    * (every column typed as a nullable string), registers it as the temp view
    * `people` and prints the first column of every row.
    *
    * Each input line is split on "," and its first two fields are used.
    *
    * @param spark the active session
    */
  private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
    // Needed for the String encoder used by `map` below.
    import spark.implicits._

    val lines = spark.sparkContext.textFile(PeopleTextPath)

    // The schema is described by a string and turned into one StructField per name.
    val schemaString = "name age"
    val schema = StructType(
      schemaString
        .split(" ")
        .map(fieldName => StructField(fieldName, StringType, nullable = true))
    )

    val rowRDD = lines
      .map(_.split(","))
      .map(fields => Row(fields(0), fields(1).trim))

    val peopleDF = spark.createDataFrame(rowRDD, schema)
    peopleDF.createOrReplaceTempView("people")

    val results = spark.sql("select * from people")
    results.map(row => s"Name : ${row(0)}").show()
  }

  /**
    * Builds a DataFrame whose schema is derived from the [[Person]] case class,
    * registers it as the temp view `people`, selects the rows with an age
    * between 13 and 19 and prints them in several forms.
    *
    * Each input line is split on "," and its first two fields are used; the
    * second field must parse as an integer.
    *
    * @param spark the active session
    */
  private def runInferSchemaExample(spark: SparkSession): Unit = {
    import spark.implicits._

    val personDF = spark.sparkContext
      .textFile(PeopleTextPath)
      .map(_.split(","))
      // Parse straight to Long so the value matches Person.age without widening.
      .map(attributes => Person(attributes(0), attributes(1).trim.toLong))
      .toDF()

    personDF.createOrReplaceTempView("people")

    val teenagersDF = spark.sql("select * from people where age between 13 and 19")
    teenagersDF.show()

    // Columns can be read by position ...
    teenagersDF.map(teenager => s"name: ${teenager(0)}").show()
    // ... or by field name.
    teenagersDF.map(teenager => s"Name: ${teenager.getAs[String]("name")}").show()

    // Explicit Kryo encoder for the Map[String, Any] rows produced below.
    implicit val mapEncoder: org.apache.spark.sql.Encoder[Map[String, Any]] =
      org.apache.spark.sql.Encoders.kryo[Map[String, Any]]

    teenagersDF
      .map(teenager => teenager.getValuesMap[Any](List("name", "age")))
      .collect()
      .foreach(println)
  }

  /**
    * Creates Datasets from a case-class sequence, from a primitive sequence
    * and from a JSON file mapped onto [[Person]], printing each result.
    *
    * @param spark the active session
    */
  private def runDatasetCreationExample(spark: SparkSession): Unit = {
    import spark.implicits._

    // toDS() keeps the element type, giving a Dataset[Person] rather than an untyped DataFrame.
    val caseClassDS = Seq(Person("Andy", 18)).toDS()
    caseClassDS.show()

    val primitiveDS = Seq(1, 2, 3).toDS()
    primitiveDS.map(_ + 1).collect().foreach(println)

    // The JSON columns are matched to the Person fields by name.
    val personDS = spark.read.json(PersonJsonPath).as[Person]
    personDS.show()
  }

  /**
    * Demonstrates untyped DataFrame operations (select, filter, groupBy) on a
    * JSON file and queries the data through a global temp view.
    *
    * @param spark the active session
    */
  private def runBasicDataFrameExample(spark: SparkSession): Unit = {
    // Enables the $"column" syntax.
    import spark.implicits._

    val df = spark.read.json(PersonJsonPath)
    df.show()
    df.printSchema()
    df.select("name").show()
    df.select($"name", $"age" + 1).show()
    df.filter($"age" > 21).show()
    df.groupBy($"age").count().show()

    // Alternative using a regular temp view, left disabled:
    // df.createOrReplaceTempView("people")
    // val sqlDF = spark.sql("select * from people")
    // sqlDF.show()

    // A global temp view is queried through the `global_temp` qualifier.
    df.createOrReplaceGlobalTempView("people")
    spark.sql("select * from global_temp.people").show()
  }
}

 

posted @ 2019-06-09 09:47  消失的白桦林  阅读(287)  评论(0)  编辑  收藏  举报