Summary of Common Spark Operators
RDD Programming API:
1. All transformations on an RDD are lazily evaluated: they do not compute a result immediately, they only record the transformation that should be applied to the base dataset (for example, a file). The transformations actually run only when an action is triggered that requires a result to be returned to the Driver.
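A minimal sketch of this lazy behavior (the object name, numbers, and local-mode setup are illustrative, mirroring the examples that follow):

package day02

import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch of lazy evaluation: map only records lineage,
// the job actually runs when the collect action is called.
object LazyEvalSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val nums = sc.parallelize(Array(1, 2, 3, 4, 5), 1)

    // Transformation: nothing is computed here, Spark just remembers "multiply by 2".
    val doubled = nums.map(num => num * 2)

    // Action: only now is a job submitted and the result returned to the Driver.
    println(doubled.collect().mkString(",")) // 2,4,6,8,10

    sc.stop()
  }
}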
Common Spark operators (Part 1)
package day02

import org.apache.spark.{SparkConf, SparkContext}

object RddTest {

  def main(args: Array[String]): Unit = {
    // Run one of these at a time; the rest stay commented out.
    // map()
    // filter()
    // flatmap()
    // groupbykey()
    // reducebykey()
    // sortByKey()
    join()
  }

  def map(): Unit = {
    // Multiply every element of the collection by 2.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5)
    val numberRDD = sc.parallelize(number, 1)
    // map takes a function object: each num is mapped to num * 2.
    val multipmentRdd = numberRDD.map { num => num * 2 }
    // Print every num.
    multipmentRdd.foreach { num => println(num) }
  }

  def filter(): Unit = {
    // filter: filtering operation.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5)
    val numberRDD = sc.parallelize(number, 1)
    // {} and () are interchangeable here.
    // Keep only the even numbers: filter traverses the whole collection,
    // and its argument is a predicate function.
    val evennumRDD = numberRDD.filter { num => num % 2 == 0 }
    evennumRDD.foreach { num => println(num) }
  }

  def flatmap(): Unit = {
    // Split each line of text into individual words.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val lineArry = Array("hello java", "hello python", "hello R", "hello you")
    val lines = sc.parallelize(lineArry, 1)
    val words = lines.flatMap { line => line.split(" ") }
    words.foreach { word => println(word) }
  }

  def groupbykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val corelist = Array(
      Tuple2("class1", 34),
      Tuple2("class2", 26),
      Tuple2("class1", 69),
      Tuple2("class2", 87)
    )
    val cores = sc.parallelize(corelist, 1)
    val groupscore = cores.groupByKey()
    groupscore.foreach { score =>
      println(score._1)
      score._2.foreach(sing => println(sing))
    }
    /*
    class1
    34
    69
    class2
    26
    87
    */
  }

  def reducebykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val corelist = Array(
      Tuple2("class1", 34),
      Tuple2("class2", 26),
      Tuple2("class1", 69),
      Tuple2("class2", 87)
    )
    val scores = sc.parallelize(corelist, 1)
    // Values with the same key are combined, so each key ends up with exactly one record.
    val totalScore = scores.reduceByKey(_ + _)
    totalScore.foreach(classScore => println(classScore._1 + " " + classScore._2))
    /*
    class1 103
    class2 113
    */
  }

  def sortByKey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val sortlist = Array(
      Tuple2(3, "xiaoming"), Tuple2(113, "xiaoqiang"), Tuple2(132, "xiaolv"), Tuple2(43, "xiaoxiao")
    )
    val scores = sc.parallelize(sortlist, 1)
    val sortedScore = scores.sortByKey()
    sortedScore.foreach { sortedScore =>
      println(sortedScore._1 + " " + sortedScore._2)
    }
    /*
    3 xiaoming
    43 xiaoxiao
    113 xiaoqiang
    132 xiaolv
    */
  }

  def join(): Unit = {
    // Join the two RDDs and print every student's score.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val studentList = Array(
      Tuple2(1, "loe"),
      Tuple2(2, "jiek"),
      Tuple2(3, "tom")
    )
    val scoreList = Array(
      Tuple2(1, 23),
      Tuple2(2, 35),
      Tuple2(3, 24)
    )
    val students = sc.parallelize(studentList)
    val scores = sc.parallelize(scoreList)
    val studentsScores = students.join(scores)
    studentsScores.foreach(studentsScores => {
      println("student id: " + studentsScores._1)
      println("student name: " + studentsScores._2._1)
      println("student score: " + studentsScores._2._2)
      println("-------------------------------")
    })
    /*
    student id: 1
    student name: loe
    student score: 23
    -------------------------------
    student id: 3
    student name: tom
    student score: 24
    -------------------------------
    student id: 2
    student name: jiek
    student score: 35
    -------------------------------
    */
  }
}
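The operators above can also be chained together. The following minimal word-count sketch (the object name WordCountSketch is illustrative, and the input lines reuse the ones from flatmap()) combines flatMap, map, reduceByKey, and sortByKey:

package day02

import org.apache.spark.{SparkConf, SparkContext}

// Sketch: chaining flatMap -> map -> reduceByKey -> sortByKey into a word count.
object WordCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.parallelize(Array("hello java", "hello python", "hello R", "hello you"), 1)
    val counts = lines
      .flatMap(_.split(" "))   // split each line into words
      .map(word => (word, 1))  // map each word to a (word, 1) pair
      .reduceByKey(_ + _)      // sum the counts per word
      .map(_.swap)             // swap to (count, word) so we can sort by count
      .sortByKey(false)        // descending by count
    counts.foreach(println)    // e.g. (4,hello), (1,java), ...
    sc.stop()
  }
}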
Common action operations
package day02

import org.apache.spark.{SparkConf, SparkContext}

object ActionRDD {

  def main(args: Array[String]): Unit = {
    // Run one of these at a time; the rest stay commented out.
    // reduce()
    // countBykey()
    // collect()
    take()
  }

  def reduce(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArry = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(numberArry, 1)
    // Accumulate all elements into a single sum.
    val sum = numbers.reduce(_ + _)
    println(sum)
  }

  def collect(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArry = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(numberArry, 1)
    val doubleNumbes = numbers.map(num => num * 2)
    // Printing the RDD itself only shows its description, not its data:
    println(doubleNumbes) // MapPartitionsRDD[1] at map at ActionRDD.scala:24
    println("------------------")
    // collect pulls the data of the distributed doubleNumbes RDD back to the Driver;
    // for processing on the cluster the foreach action is normally used instead.
    val doubleNumerArry = doubleNumbes.collect()
    for (num <- doubleNumerArry) {
      println(num) // 2 4 6 8 10 12 14 16 18 20
    }
  }

  def countBykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val studentlist = Array(
      Tuple2("class1", "jiek"),
      Tuple2("class2", "tpm"),
      Tuple2("class1", "root"),
      Tuple2("class2", "user")
    )
    val students = sc.parallelize(studentlist, 1)
    // Count the number of elements per key.
    val studentconut = students.countByKey()
    println(studentconut)
    println("------------------------")
  }

  def take(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArry = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(numberArry, 1)
    val doubleNumbes = numbers.map(num => num * 2)
    // take does not sort; it simply returns the first three elements.
    val top3Nums = doubleNumbes.take(3)
    for (num <- top3Nums) {
      println(num) // 2 4 6
    }
  }
}
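As the comment in collect() notes, collect brings the entire RDD back to the Driver, so for large datasets foreach (processed on the executors) or take (fetch only a few elements) is usually preferred. A minimal sketch of that trade-off (object name and data are illustrative):

package day02

import org.apache.spark.{SparkConf, SparkContext}

// Sketch: prefer foreach / take over collect for large RDDs,
// because collect copies every element into the Driver's memory.
object CollectVsForeach {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numbers = sc.parallelize(1 to 10, 1)

    // Runs on the executors; no data is shipped back to the Driver.
    numbers.foreach(num => println(num))

    // Brings only the first 3 elements back to the Driver.
    numbers.take(3).foreach(println)

    // Brings ALL elements back to the Driver; fine here, risky for large data.
    numbers.collect().foreach(println)

    sc.stop()
  }
}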
Summary of the join operators
package day02

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object joinOperation {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val namelist = Array(Tuple2(1, "xiao"), Tuple2(2, "cww"), Tuple2(3, "wd"), Tuple2(4, "wd"))
    val scorelist = Array(Tuple2(1, 123), Tuple2(2, 34), Tuple2(3, 87))

    // makeRDD turns a local collection into an RDD; 3 means three partitions are created.
    // parallelize does the same thing and is the usual first step for building an RDD from a sequence.
    // RDD[(Int, String)]: Int is the key type of NameRDD, String is its value type.
    val NameRDD: RDD[(Int, String)] = sc.makeRDD(namelist, 3)
    val ScoreRDD = sc.parallelize(scorelist, 1)

    // RDD[(Int, (Int, String))]: the key, then (score from ScoreRDD, name from NameRDD).
    // join is an inner join, so Tuple2(4, "wd") has no match and is dropped.
    val resultRDD: RDD[(Int, (Int, String))] = ScoreRDD.join(NameRDD)
    // leftOuterJoin keeps every record of the left-hand RDD.
    val leftOuterJoinResultRDD = NameRDD.leftOuterJoin(ScoreRDD)

    resultRDD.foreachPartition(x => {
      while (x.hasNext) {
        val log = x.next
        val id = log._1
        val name = log._2._2
        val core = log._2._1
        println("id: " + id + "\t name:" + name + "\t core:" + core)
        /*
        id: 1    name:xiao    core:123
        id: 2    name:cww     core:34
        id: 3    name:wd      core:87
        */
      }
    })

    leftOuterJoinResultRDD.foreachPartition(x => {
      while (x.hasNext) {
        val log = x.next
        val id = log._1
        val name = log._2._1
        val core = log._2._2
        println("id: " + id + "\t name:" + name + "\t core:" + core)
        /*
        id: 3    name:wd      core:Some(87)
        id: 4    name:wd      core:None
        id: 1    name:xiao    core:Some(123)
        id: 2    name:cww     core:Some(34)
        */
      }
    })
  }
}
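Because leftOuterJoin wraps the right-hand value in an Option (the Some(87) / None values in the output above), a common next step is to unwrap it with a pattern match or getOrElse. A minimal sketch, intended to sit inside the same main method so it can reuse NameRDD and ScoreRDD; the default score 0 is an illustrative assumption:

// Sketch: unwrapping the Option produced by leftOuterJoin.
// Continues from the NameRDD / ScoreRDD defined above.
val withDefaults = NameRDD.leftOuterJoin(ScoreRDD).map {
  case (id, (name, Some(score))) => (id, name, score)
  case (id, (name, None))        => (id, name, 0) // no score recorded for this id
}
withDefaults.foreach { case (id, name, score) =>
  println("id: " + id + "\t name:" + name + "\t core:" + score)
}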
More common operators: mapPartitionsWithIndex, zipWithIndex, zip, and countByValue
package day02

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer

object Transformationses {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("test")
    val sc = new SparkContext(conf)
    val arr = Array("ABC", "Abc1", "Abc10")
    val rdds = sc.parallelize(arr, 3)
    val rdd = sc.parallelize(arr, 3)

    // mapPartitionsWithIndex: process each partition with access to its index.
    val rdd1 = rdds.mapPartitionsWithIndex((index, iter) => {
      val list = new ListBuffer[String]()
      while (iter.hasNext) {
        list.+=("rdds partition index = " + index + ",value = " + iter.next())
      }
      list.iterator // return an iterator
    }, true)
    rdd1.foreach(println)
    /*
    rdds partition index = 0,value = ABC
    rdds partition index = 1,value = Abc1
    rdds partition index = 2,value = Abc10
    */
    println("--------------------")

    // zipWithIndex: pair each element with its global index.
    rdd.zipWithIndex().foreach(println)
    /*
    (ABC,0)
    (Abc1,1)
    (Abc10,2)
    */

    // zip: pair the two RDDs element by element.
    rdd.zip(rdds).foreach(println)
    /*
    (ABC,ABC)
    (Abc1,Abc1)
    (Abc10,Abc10)
    */

    // countByValue: count how many times each value occurs.
    rdd.countByValue().foreach(println)
    /*
    (ABC,1)
    (Abc1,1)
    (Abc10,1)
    */

    sc.stop()
  }
}
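One point worth noting here: zipWithIndex assigns a global index across all partitions, while zip pairs elements positionally and assumes both RDDs have the same number of partitions and the same number of elements per partition. A minimal self-contained sketch (object name and data are illustrative):

package day02

import org.apache.spark.{SparkConf, SparkContext}

// Sketch: zipWithIndex gives global indices; zip pairs elements positionally
// and assumes matching partition counts and per-partition element counts.
object ZipSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("test")
    val sc = new SparkContext(conf)
    val left = sc.parallelize(Array("a", "b", "c", "d"), 2)
    val right = sc.parallelize(Array(1, 2, 3, 4), 2)

    left.zipWithIndex().foreach(println) // (a,0) (b,1) (c,2) (d,3)
    left.zip(right).foreach(println)     // (a,1) (b,2) (c,3) (d,4)

    // Zipping RDDs with different partition counts fails at runtime.
    sc.stop()
  }
}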
Building the jar with Maven and submitting the job to the cluster
sudo ./bin/spark-submit --class day02.ActionRDD --executor-memory 20M --executor-cores 1 /home/hadoop/spark-1.4.0-bin-hadoop2.3/lib/sfd-1.0-SNAPSHOT.jar