Spark 关联表

Union

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demonstrates `union` followed by `distinct` on two in-memory integer RDDs.
  *
  * The job runs on a local master and prints every distinct element of the
  * merged RDD. The SparkContext is always stopped before `main` returns.
  */
object Demo8Union {
  /**
    * Entry point of the demo.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "map" looks copied from an earlier demo; kept unchanged.
    val conf: SparkConf = new SparkConf()
      .setAppName("map")
      .setMaster("local")

    // Spark context object
    val sc = new SparkContext(conf)

    // Release the context even when the job fails; the original never called stop().
    try {
      val rdd1: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6))
      val rdd2: RDD[Int] = sc.parallelize(List(4, 5, 6, 7, 8, 9))

      // union: merges two RDDs whose element types must match; duplicates are kept.
      val unionRDD: RDD[Int] = rdd1.union(rdd2)

      // distinct: removes the duplicates that union left in place.
      val distinctRDD: RDD[Int] = unionRDD.distinct()

      distinctRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }

}

Join

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demonstrates `join`, `leftOuterJoin` and `fullOuterJoin` on key-value RDDs.
  *
  * Reads a students file and a scores file as text, keys every line by its
  * first comma-separated field, joins the two RDDs three different ways and
  * prints the result of each join. The SparkContext is always stopped before
  * `main` returns.
  */
object Demo9Join {
  /**
    * Entry point of the demo.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "map" looks copied from an earlier demo; kept unchanged.
    val conf: SparkConf = new SparkConf()
      .setAppName("map")
      .setMaster("local")

    // Spark context object
    val sc = new SparkContext(conf)

    // Release the context even when a job fails; the original never called stop().
    try {
      // Read the students table
      val students: RDD[String] = sc.textFile("data/students.txt")
      // Read the scores table
      val scores: RDD[String] = sc.textFile("data/score.txt")

      // Convert both RDDs to key-value form, keyed by the first comma-separated
      // field (the student id, per the original comments).
      // Key: student id, value: the whole student line.
      val studentkvRDD: RDD[(String, String)] = students.map { student =>
        (student.split(",")(0), student)
      }

      // Key: student id, value: the whole score line.
      val scoreKVRDD: RDD[(String, String)] = scores.map { score =>
        (score.split(",")(0), score)
      }

      // Placeholder printed for the side of an outer join that has no match.
      val missing: String = "默认"

      /**
        * join: an inner join by default; records are matched on the key.
        */
      val innerJoinRDD: RDD[(String, (String, String))] = studentkvRDD.join(scoreKVRDD)

      // Tidy up the joined records: field 1 of the student line is read as the
      // name and field 2 of the score line as the score.
      val resultRDD: RDD[(String, String)] = innerJoinRDD.map {
        case (_, (studentInfo, scoreInfo)) =>
          val name: String = studentInfo.split(",")(1)
          val score: String = scoreInfo.split(",")(2)
          (name, score)
      }

      // Transformations are lazy: without an action the inner join never ran.
      resultRDD.foreach(println)

      /**
        * leftOuterJoin: the left side is the base; when the right side has no
        * match its value is None (an Option, not null).
        */
      val leftOuterJoinRDD: RDD[(String, (String, Option[String]))] =
        studentkvRDD.leftOuterJoin(scoreKVRDD)

      // Matched rows print the score line, unmatched rows print the placeholder.
      val leftOuterResultRDD: RDD[String] = leftOuterJoinRDD.map {
        case (_, (studentInfo, scoreInfo)) =>
          studentInfo + "\t" + scoreInfo.getOrElse(missing)
      }

      leftOuterResultRDD.foreach(println)

      /**
        * fullOuterJoin: keeps keys from both sides; whichever side is absent is None.
        */
      val fullOuterJoinRDD: RDD[(String, (Option[String], Option[String]))] =
        studentkvRDD.fullOuterJoin(scoreKVRDD)

      // getOrElse covers every Option combination, so the mapping is total
      // (the original pattern match omitted the (None, None) case).
      val fullOuterResultRDD: RDD[String] = fullOuterJoinRDD.map {
        case (_, (studentInfo, scoreInfo)) =>
          studentInfo.getOrElse(missing) + "\t" + scoreInfo.getOrElse(missing)
      }

      // The original discarded this mapped RDD without an action, so the full
      // outer join was never executed.
      fullOuterResultRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }
}

posted @ 2021-07-16 22:15 坤坤无敌阅读(180) 评论(0) 收藏举报

刷新页面返回顶部

Spark 关联表

公告