1. Demo1Sess: creating a SparkSession, reading JSON/CSV, SQL vs. the DataFrame DSL, and writing results
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object Demo1Sess {
  def main(args: Array[String]): Unit = {
    // SparkSession is the entry point for Spark SQL
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("Demo1Sess")
      .config("spark.sql.shuffle.partitions", 3)
      .getOrCreate()

    // Read a JSON file; the schema is inferred from the data
    val stuDF: DataFrame = spark
      .read
      .format("json")
      .load("spark/data/students.json")
    stuDF.show()

    // Read a CSV file with an explicit schema
    val stucsDF: DataFrame = spark
      .read
      .format("csv")
      .schema("id String,name String,age Int,gender String,clazz String")
      .load("scala/data/students.txt")
    stucsDF.show()

    // Register a temporary view and query it with SQL
    stucsDF.createOrReplaceTempView("stu")
    val ageDF: DataFrame = spark.sql("select * from stu where age=22")
    ageDF.show()

    // The same kind of query written with the DataFrame DSL
    val dslDF: DataFrame = stucsDF.where("age=23")
      .select("name", "age", "clazz")
    dslDF.show()

    // Count students per class and save the result (default output format: Parquet)
    stucsDF.groupBy("clazz")
      .count()
      .write
      .mode(SaveMode.Overwrite)
      .save("spark/data/clazz_cnt")
  }
}
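Because no format is given, the save() call above writes Parquet. As a minimal follow-up sketch, the result can be read back and re-written as CSV with a header; the object name ReadClazzCnt and the output path spark/data/clazz_cnt_csv are illustrative, not part of the original notes.

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object ReadClazzCnt {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("ReadClazzCnt")
      .getOrCreate()

    // The earlier save() wrote Parquet because no format was specified
    val cntDF: DataFrame = spark.read
      .format("parquet")
      .load("spark/data/clazz_cnt")

    // Write the same per-class counts out as CSV with a header row
    cntDF.write
      .mode(SaveMode.Overwrite)
      .format("csv")
      .option("header", "true")
      .save("spark/data/clazz_cnt_csv")
  }
}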
2. Demo2CreateDF: creating DataFrames from JSON, CSV, JDBC, Parquet, and an RDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object Demo2CreateDF {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("Demo2CreateDF")
      .config("spark.sql.shuffle.partitions", 3)
      .getOrCreate()

    // 1. From a JSON file (schema inferred)
    val jsonDF: DataFrame = spark.read
      .format("json")
      .load("spark/data/students.json")

    // 2. From a CSV file with an explicit schema
    val csvDF: DataFrame = spark.read
      .format("csv")
      .option("sep", ",")
      .schema("id String,name String,age Int,gender String,clazz String")
      .load("scala/data/students.txt")

    // 3. From a MySQL table over JDBC
    val jdbcDF: DataFrame = spark.read
      .format("jdbc")
      .option("url", "jdbc:mysql://master:3306/student")
      .option("dbtable", "student")
      .option("user", "root")
      .option("password", "123456")
      .load()

    // 4. From a Parquet directory (the schema is stored with the data)
    spark.read
      .format("parquet")
      .load("spark/data/stu_parquet")

    // 5. From an RDD: parse each line into the Student case class defined below
    val stuRDD: RDD[String] = spark.sparkContext.textFile("scala/data/students.txt")
    val stuRDD2: RDD[Student] = stuRDD.map(line => {
      val splits: Array[String] = line.split(",")
      val id: String = splits(0)
      val name: String = splits(1)
      val age: String = splits(2)
      val gender: String = splits(3)
      val clazz: String = splits(4)
      Student(id, name, age, gender, clazz)
    })

    // toDF() needs the implicit encoders provided by the session
    import spark.implicits._
    val sDF: DataFrame = stuRDD2.toDF()
    sDF.show()

    // Going the other way: DataFrame -> RDD[Row]
    val rdd: RDD[Row] = sDF.rdd
    rdd.foreach(row => {
      val id: String = row.getAs[String]("id")
      val name: String = row.getAs[String]("name")
      println(s"$id,$name")
    })
  }

  case class Student(id: String, name: String, age: String, gender: String, clazz: String)
}
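Besides toDF() on an RDD of case classes, a DataFrame can also be built from an RDD[Row] plus an explicit StructType. A minimal sketch, assuming the same students.txt layout as above; the object name Demo2CreateDFSchema is made up for illustration.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object Demo2CreateDFSchema {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("Demo2CreateDFSchema")
      .getOrCreate()

    // Build Row objects from the raw text file
    val rowRDD: RDD[Row] = spark.sparkContext
      .textFile("scala/data/students.txt")
      .map(line => {
        val splits: Array[String] = line.split(",")
        Row(splits(0), splits(1), splits(2).toInt, splits(3), splits(4))
      })

    // Explicit schema instead of a case class
    val schema: StructType = StructType(Seq(
      StructField("id", StringType),
      StructField("name", StringType),
      StructField("age", IntegerType),
      StructField("gender", StringType),
      StructField("clazz", StringType)
    ))

    val stuDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    stuDF.show()
  }
}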
3. DFapi: DataFrame DSL operators (where, filter, select, groupBy, agg, join)
import org.apache.spark.sql.{DataFrame, SparkSession}

object DFapi {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("DFapi")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()
    import spark.implicits._

    val stuDF: DataFrame = spark.read
      .format("csv")
      .option("sep", ",")
      .schema("id String,name String,age String,gender String,clazz String")
      .load("scala/data/students.txt")

    // Cache the DataFrame because it is reused several times below
    stuDF.cache()

    // Three equivalent ways to filter; these are lazy transformations,
    // so nothing is computed until an action such as show() is called
    stuDF.where("age>23")
    stuDF.where($"age" > 23)
    stuDF.filter(row => {
      val age: String = row.getAs[String]("age")
      age.toInt > 23
    })

    // select with a column expression and an alias
    stuDF.select($"id", $"name", $"age" + 100 as "newage")

    // Count rows per class
    stuDF.groupBy($"clazz")
      .count()
      .show()

    // Aggregate functions come from org.apache.spark.sql.functions
    import org.apache.spark.sql.functions._
    stuDF.groupBy($"clazz", $"gender")
      .agg(count($"gender"))
      .show()

    // Distinct count of students per class
    stuDF.groupBy($"clazz")
      .agg(countDistinct($"id") as "去重人数")
      .show()

    // The same distinct count written as SQL on a temporary view
    stuDF.createOrReplaceTempView("stu")
    spark.sql(
      """
        |select clazz,count(distinct id)
        |from stu
        |group by clazz
      """.stripMargin
    ).show()

    // Left join the score data on the student id
    val scoreDF: DataFrame = spark.read
      .format("csv")
      .schema("sid String,sub_id String,score Int")
      .load("scala/data/score.txt")
    stuDF.join(scoreDF, $"id" === $"sid", "left").show()

    stuDF.unpersist()
  }
}
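Building on the join above, the joined result can be aggregated per class. A hedged sketch combining join, groupBy, and agg; the object name DFJoinAgg and the column aliases total_score and avg_score are illustrative additions, while the paths and schemas reuse the ones from DFapi.

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

object DFJoinAgg {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("DFJoinAgg")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()
    import spark.implicits._

    val stuDF: DataFrame = spark.read
      .format("csv")
      .schema("id String,name String,age String,gender String,clazz String")
      .load("scala/data/students.txt")

    val scoreDF: DataFrame = spark.read
      .format("csv")
      .schema("sid String,sub_id String,score Int")
      .load("scala/data/score.txt")

    // Join on the student id, then aggregate the scores per class
    stuDF.join(scoreDF, $"id" === $"sid", "inner")
      .groupBy($"clazz")
      .agg(
        sum($"score") as "total_score",
        round(avg($"score"), 2) as "avg_score"
      )
      .show()
  }
}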
4. DianXin: top counties per city with a window function
import org.apache.spark.sql.{DataFrame, SparkSession}

object DianXin {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("DianXin")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()

    val dxDF: DataFrame = spark.read
      .format("csv")
      .option("sep", ",")
      .schema("mdn String,grid_id String,city_id String,county_id String,t String,start_time String,end_time String,date String")
      .load("spark/data/dianxin_data")

    dxDF.createOrReplaceTempView("dx")

    // For each city, take the two counties with the most distinct visitors (mdn):
    // 1) count distinct mdn per (city_id, county_id)
    // 2) rank counties within each city by that count
    // 3) keep ranks below 3
    spark.sql(
      """
        |select tt1.city_id, tt1.county_id, tt1.sum, tt1.rk
        |from (
        |  select t1.city_id, t1.county_id, t1.sum,
        |         row_number() over (partition by t1.city_id order by t1.sum desc) as rk
        |  from (
        |    select city_id, county_id, count(distinct mdn) as sum
        |    from dx
        |    group by city_id, county_id
        |  ) t1
        |) tt1
        |where tt1.rk < 3
      """.stripMargin
    ).show()
  }
}
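The same top-two-counties-per-city query can be written with the DataFrame DSL using Window from org.apache.spark.sql.expressions. A minimal sketch under the same schema and path as DianXin; the object name DianXinDSL and the column aliases flow and rk are illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

object DianXinDSL {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("DianXinDSL")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()
    import spark.implicits._

    val dxDF: DataFrame = spark.read
      .format("csv")
      .schema("mdn String,grid_id String,city_id String,county_id String,t String,start_time String,end_time String,date String")
      .load("spark/data/dianxin_data")

    // Distinct visitors per (city, county), then rank counties within each city
    dxDF.groupBy($"city_id", $"county_id")
      .agg(countDistinct($"mdn") as "flow")
      .withColumn("rk", row_number().over(Window.partitionBy($"city_id").orderBy($"flow".desc)))
      .where($"rk" < 3)
      .show()
  }
}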