Spark SQL Technical Notes and Common Operations (Part 2)
Parquet supports schema merging:
1. It applies to multiple Parquet files whose schemas are mutually compatible; such compatible schemas can be merged into a single one.
2. There are two ways to enable automatic merging:
1) Set the data source option mergeSchema to true when reading the Parquet files (this is what the example below does).
2) Use SQLContext.setConf() to set spark.sql.parquet.mergeSchema to true (see the sketch after the example).
package day02

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object Merge {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Merge").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // implicit conversions (needed for toDF)
    import sqlContext.implicits._

    val studentsWithNameAge = Array(("JIEK", 19), ("TOM", 34))
    val studentsWithNameAgeDF = sc.parallelize(studentsWithNameAge, 2).toDF("name", "age")
    // append to a Parquet directory
    studentsWithNameAgeDF.write.mode("append").format("parquet")
      .save("hdfs://master:9000/students")

    val studentsWithNameGrade = Array(("MERRAY", "A"), ("MATH", "B"))
    val studentsWithNameGradeDF = sc.parallelize(studentsWithNameGrade, 2).toDF("name", "grade")
    // append to the same Parquet directory
    studentsWithNameGradeDF.write.mode("append").format("parquet")
      .save("hdfs://master:9000/students")

    // The first DataFrame's schema (name, age) differs from the second's (name, grade),
    // so read with mergeSchema to merge the two compatible schemas
    val students = sqlContext.read.option("mergeSchema", "true")
      .parquet("hdfs://master:9000/students")
    students.show()
    students.printSchema()
    /*
    +------+----+-----+
    |  name| age|grade|
    +------+----+-----+
    |MERRAY|null|    A|
    |  JIEK|  19| null|
    |  MATH|null|    B|
    |   TOM|  34| null|
    +------+----+-----+

    root
     |-- name: string (nullable = true)
     |-- age: integer (nullable = true)
     |-- grade: string (nullable = true)
    */
  }
}
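For reference, point 2 of the list above (enabling merging globally instead of per read) would look roughly like the following sketch, assuming the same sqlContext and HDFS path as in the example:

// Enable Parquet schema merging globally for this SQLContext
sqlContext.setConf("spark.sql.parquet.mergeSchema", "true")

// Subsequent Parquet reads merge compatible schemas without the per-read option
val mergedStudents = sqlContext.read.parquet("hdfs://master:9000/students")
mergedStudents.printSchema()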
Spark SQL operations on Hive
The difference between saveAsTable and registerTempTable:
1. Spark can save data into Hive tables. saveAsTable writes a DataFrame to a Hive table: it creates the table metadata and also materializes the data into the Hive table.
2. registerTempTable only creates a temporary table; nothing is materialized, so the table is lost when the Spark application restarts (see the sketch after the Hive example below).
3. To use HiveContext, copy hive-site.xml into spark/conf and the MySQL connector JAR into spark/jars.
package day02

import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object hiveSparkSql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("hive-sql").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // HiveContext is constructed from the SparkContext
    val hiveContext = new HiveContext(sc)

    hiveContext.sql("drop table if exists student_infos")
    hiveContext.sql("create table if not exists student_infos(name string, age int) " +
      "row format delimited fields terminated by '\\t'")
    hiveContext.sql("load data local inpath '/home/hadoop/student_infos.txt' " +
      "into table student_infos")

    hiveContext.sql("drop table if exists student_scores")
    hiveContext.sql("create table if not exists student_scores(name string, score int) " +
      "row format delimited fields terminated by '\\t'")
    hiveContext.sql("load data local inpath '/home/hadoop/student_scores.txt' " +
      "into table student_scores")

    // Join the two tables and keep students with a score of at least 80
    val goodStudentDF = hiveContext.sql("select si.name, si.age, ss.score " +
      "from student_infos si join student_scores ss on si.name = ss.name " +
      "where ss.score >= 80")

    hiveContext.sql("drop table if exists good_student_infos")
    // saveAsTable creates the Hive table metadata and materializes the data into it
    goodStudentDF.write.mode(SaveMode.Overwrite).saveAsTable("good_student_infos")

    // Read the saved table back from Hive and print every row
    val goodStudentRows = hiveContext.table("good_student_infos").collect()
    for (goodStudentRow <- goodStudentRows) {
      println(goodStudentRow)
    }
  }
}
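By contrast, registerTempTable from point 2 of the list above only registers the DataFrame under a name for SQL queries inside the current application; nothing is written to Hive. A minimal sketch, reusing goodStudentDF and hiveContext from the example above (the table name good_student_temp is made up for illustration):

// Register an in-memory temporary table; no metadata or data is written to Hive,
// so it disappears when the Spark application stops
goodStudentDF.registerTempTable("good_student_temp")

// The temporary table can be queried with SQL for the lifetime of this application only
hiveContext.sql("select name, score from good_student_temp where score >= 90").show()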
Several ways to work with JDBC sources from Scala
package Day3

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object Jdbc {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[4]").setAppName("JDBC")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    val lowerBound = 1       // lower bound of the partition column
    val upperBound = 100000  // upper bound of the partition column
    val numPartitions = 5    // number of partitions
    val url = "jdbc:mysql://localhost:3306/my_db?user=root&password=x"
    val prop = new Properties() // extra JDBC connection properties

    // 1) Partitioned read: give a numeric partition column plus its bounds and the
    //    number of partitions; useful when the table is too large for one partition
    val df = sqlContext.read.jdbc(url, "student_info", "age",
      lowerBound, upperBound, numPartitions, prop)

    // 2) Predicate read: each predicate string becomes the WHERE clause of one partition
    val predicates = Array[String]("age < 19")
    val df1 = sqlContext.read.jdbc(url, "student_info", predicates, prop)

    // 3) Generic load(): format("jdbc") with options; load() also supports json, orc, etc.
    //    "dbtable" names the table to read; this is equivalent to the jdbc() calls above
    val df2 = sqlContext.read.format("jdbc").options(
      Map("url" -> "jdbc:mysql://localhost:3306/my_db?user=root&password=x",
        "dbtable" -> "student_info")).load()

    df.collect().foreach(println)
    /*
    [Len,18]
    [Jieaak,14]
    [Qang,57]
    */
    df1.collect().foreach(println)
    /*
    [Len,18]
    [Jieaak,14]
    */
    df2.collect().foreach(println)
    /*
    [Len,18]
    [Jieaak,14]
    [Qang,57]
    */
  }
}
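Writing a DataFrame back over JDBC goes through DataFrameWriter.jdbc; a minimal sketch reusing url and prop from the example above (the target table student_info_backup is hypothetical):

import org.apache.spark.sql.SaveMode

// Write df back to MySQL over JDBC; SaveMode.Overwrite drops and recreates the target table
df.write.mode(SaveMode.Overwrite).jdbc(url, "student_info_backup", prop)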