SparkSQL读写外部数据源--数据分区
import com.twq.dataset.Utils._ import org.apache.spark.sql.{SaveMode, SparkSession} object FilePartitionTest { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("FilePartitionTest") .getOrCreate() val sessions = spark.read.parquet(s"${BASE_PATH}/trackerSession") sessions.show() sessions.printSchema() sessions.createOrReplaceTempView("non_partition_table") spark.sql("select * from non_partition_table where day = 20170903").show() //对数据按照年月日进行分区 sessions.write.mode(SaveMode.Overwrite).partitionBy("cookie").parquet(s"${BASE_PATH}/trackerSession_partition") val partitionDF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition") partitionDF.show() partitionDF.printSchema() //用sql查询某20170903这天的数据 partitionDF.createOrReplaceTempView("partition_table") spark.sql("select * from partition_table where cookie='cookie1'").show() //取20170903这天的数据 val day03DF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition/year=2017/month=201709/day=20170903") day03DF.show() day03DF.printSchema() //bucket只能用于hive表中 //而且只用于parquet、json和orc文件格式的文件数据 sessions.write .partitionBy("year") .bucketBy(24, "cookie") .saveAsTable("session") spark.stop() } }