Code
1.employee.json
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, StructField, StringType, FloatType}
import org.apache.spark.sql.functions.col
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
// Load the JSON file into a DataFrame
val df1 = spark.read.format("json").load("file:///home/gbc/employee.json")
// (1) Show all records
df1.show()
// (2) Show all records with duplicates removed
df1.distinct().show()
// (3) Show all records without the id column
df1.drop("id").show()
// (4) Filter records with age > 30
df1.filter(col("age") > 30).show()
// (5) Group records by age and count each group
df1.groupBy("age").count().show()
// (6) Sort records by name in ascending order
df1.sort(col("name").asc).show()
// (7) Take the first 3 rows
df1.take(3)
// (8) Select the name column, aliased as username
df1.select(col("name").as("username")).show()
// (9) Average of the age column
df1.agg("age" -> "avg").show()
// (10) Minimum of the age column
df1.agg("age" -> "min").show()
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{avg, min, col}
object JsonExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("JsonExample")
val sc = new SparkContext(conf)
val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
import spark.implicits._
// Read the file as an RDD of JSON lines
val jsonFile = sc.textFile("/home/gbc/employee.json")
// Let Spark parse the JSON lines directly into a DataFrame
val df = spark.read.json(jsonFile.toDS())
// Run the queries
// (1) Show all records
df.show()
// (2) Show all records with duplicates removed
df.distinct().show()
// (3) Show all records without the id column
df.drop("id").show()
// (4) Filter records with age > 30
df.filter(df("age") > 30).show()
// (5) Group records by age
df.groupBy("age").count().show()
// (6) Sort records by name in ascending order
df.sort("name").show()
// (7) Take the first 3 rows
df.limit(3).show()
// (8) Select the name column, aliased as username
df.select(col("name").alias("username")).show()
// (9) Average of the age column
df.agg(avg("age")).show()
// (10) Minimum of the age column
df.agg(min("age")).show()
}
}
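To run JsonExample as a standalone application rather than in spark-shell, package it (for example with sbt) and hand the jar to spark-submit; the jar path below is only a placeholder:
spark-submit --class JsonExample --master local[*] /path/to/jsonexample.jar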
2.Coffee Chain.csv
import org.apache.spark.sql.{SparkSession, functions}
object CoffeeChain {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("CoffeeChain")
.master("local[*]") // 在本地的所有CPU核心上运行
.getOrCreate()
import spark.implicits._
// Load the CSV file, inferring column types so the numeric columns can be aggregated
val df = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/gbc/Coffee Chain.csv")
// Sales-volume ranking of the coffee chain, in descending order of sales volume
df.groupBy("Market", "Product Type", "Product", "Type").agg(functions.sum("Marketing").alias("Total Marketing")).orderBy(functions.col("Total Marketing").desc).show()
// Relationship between coffee sales volume and state, in descending order
df.groupBy("State", "Product Type", "Product", "Type").agg(functions.sum("Marketing").alias("Total Marketing")).orderBy(functions.col("Total Marketing").desc).show()
// Average profit and price (Cogs) of each coffee, in descending order of average profit
df.groupBy("Product Type", "Product", "Type").agg(functions.avg("Profit").alias("Avg Profit"), functions.avg("Cogs").alias("Avg Cogs")).orderBy(functions.col("Avg Profit").desc).show()
// Relationship between market size, market region and sales volume, in descending order of total sales
df.groupBy("Market Size", "Market", "Product Type", "Product", "Type").agg(functions.sum("Marketing").alias("Total Marketing")).orderBy(functions.col("Total Marketing").desc).show()
// Relationship between coffee attributes and average price, average profit, sales volume and other costs
df.groupBy("Product Type", "Product", "Type").agg(functions.avg("Cogs").alias("Avg Cogs"), functions.avg("Profit").alias("Avg Profit"), functions.sum("Marketing").alias("Total Marketing")).orderBy(functions.col("Total Marketing").desc).show()
}
}
3.Sport.txt
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, countDistinct, col}
val spark = SparkSession.builder()
.appName("Sport数据分析")
.master("local[*]") // run locally, using all CPU cores
.getOrCreate()
val path = "/home/gbc/Sport.txt"
// Read the file as CSV with a header row; infer column types and trim the spaces after the commas
val df = spark.read.option("header", "true").option("inferSchema", "true").option("ignoreLeadingWhiteSpace", "true").csv(path)
// Average score for each event
val avg_scores = df.select("比赛项目", "成绩").groupBy("比赛项目").agg(avg("成绩"))
avg_scores.show()
// Number of distinct ranks achieved by each class
val class_ranks = df.select("班级", "名次").groupBy("班级").agg(countDistinct("名次").alias("rank_count"))
class_ranks.show()
// Keep only the sprint events
val track_events = df.filter(col("比赛项目").isin("100米短跑", "200米短跑"))
track_events.show()
// Number of distinct ranks held by each athlete in each sprint event
val top_ranked_track_athletes = track_events.groupBy("比赛项目", "运动员").agg(countDistinct("名次").alias("rank_count"))
top_ranked_track_athletes.show()
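The last query only counts how many distinct ranks each sprinter holds. If the goal is to pick out the best-ranked athlete in each sprint event, a window function is one way to do it; the sketch below is an assumption about that intent and reuses the columns already loaded.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number
val byEvent = Window.partitionBy("比赛项目").orderBy(col("名次").asc)
val bestPerEvent = track_events.withColumn("rn", row_number().over(byEvent)).filter(col("rn") === 1).drop("rn")
bestPerEvent.show()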
User Activity Data.txt:
UserID, Timestamp, EventType, EventContent
1, 2023-04-15 08:30:00, Click, Product A
2, 2023-04-15 08:35:00, View, Product B
1, 2023-04-15 08:40:00, Add to Cart, Product A
3, 2023-04-15 08:42:00, Click, Product C
2, 2023-04-15 08:50:00, Purchase, Product B
1, 2023-04-15 08:55:00, Click, Product D
4, 2023-04-15 08:57:00, View, Product E
2, 2023-04-15 09:00:00, Click, Product F
3, 2023-04-15 09:05:00, Add to Cart, Product C
5, 2023-04-15 09:10:00, Click, Product G
1, 2023-04-15 08:30:00, Click, Product A
2, 2023-04-15 08:35:00, View, Product B
1, 2023-04-15 08:40:00, Add to Cart, Product A
3, 2023-04-15 08:42:00, Click, Product C
2, 2023-04-15 08:50:00, Purchase, Product B
1, 2023-04-15 08:55:00, Click, Product D
4, 2023-04-15 08:57:00, View, Product E
2, 2023-04-15 09:00:00, Click, Product F
3, 2023-04-15 09:05:00, Add to Cart, Product C
5, 2023-04-15 09:10:00, Click, Product G
1, 2023-04-15 09:20:00, Purchase, Product D
2, 2023-04-15 09:25:00, Click, Product F
3, 2023-04-15 09:30:00, View, Product C
1, 2023-04-15 09:35:00, Click, Product A
2, 2023-04-15 09:40:00, Add to Cart, Product F
4, 2023-04-15 09:45:00, Click, Product E
1, 2023-04-15 09:50:00, View, Product D
2, 2023-04-15 09:55:00, Click, Product B
3, 2023-04-15 10:00:00, Purchase, Product C
5, 2023-04-15 10:05:00, Click, Product G
User Info Data.txt:
UserID, UserName
1, Alice
2, Bob
3, Carol
4, Dave
5, Eve
Sport.txt (竞赛结果.csv):
比赛项目, 班级, 运动员, 成绩, 名次
100米短跑, A班, 张三, 12.45, 1
100米短跑, B班, 李四, 12.62, 2
100米短跑, C班, 王五, 12.75, 3
100米短跑, A班, 李华, 12.82, 4
100米短跑, C班, 王明, 13.05, 5
200米短跑, A班, 张三, 21.81, 2
200米短跑, A班, 刘强, 21.10, 1
200米短跑, C班, 王五, 22.35, 3
200米短跑, C班, 王明, 22.45, 4
200米短跑, B班, 李四, 22.60, 5
跳高, A班, 张三, 1.85, 2
跳高, B班, 李四, 1.90, 1
跳高, C班, 王五, 1.75, 3
铅球, A班, 张三, 12.34, 1
铅球, C班, 王明, 11.92, 2
铅球, C班, 王五, 11.50, 3
跳远, A班, 张三, 7.05, 2
跳远, B班, 李四, 6.95, 1
跳远, B班, 李华, 6.80, 3
import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.sql.functions.col
object Sports数据分析 {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("Sports数据分析")
.master("local[*]")
.getOrCreate()
import spark.implicits._
// Read the dataset; infer column types and trim the spaces after the commas
val dataset = spark.read.option("header", "true").option("inferSchema", "true").option("ignoreLeadingWhiteSpace", "true").csv("竞赛结果.csv")
dataset.show()
// Average score for every event
val averageScores = dataset.groupBy("比赛项目").agg(functions.avg("成绩"))
averageScores.show()
// Sum of the rank values for each class
val classRankCounts = dataset.groupBy("班级").agg(functions.sum("名次").alias("总名次"))
classRankCounts.show()
// Classes whose rank total equals 1, 2 or 3
val firstClass = classRankCounts.filter(col("总名次") === 1)
val secondClass = classRankCounts.filter(col("总名次") === 2)
val thirdClass = classRankCounts.filter(col("总名次") === 3)
firstClass.show()
secondClass.show()
thirdClass.show()
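// Note: the three filters above only match a class whose summed rank happens to equal 1, 2 or 3.
// If the intent is instead to count how many first, second and third places each class won
// (an assumption, not something stated in the original), a variant on the same columns could be:
val podiumCounts = dataset.filter(col("名次") <= 3).groupBy("班级", "名次").count().orderBy(col("班级"), col("名次"))
podiumCounts.show()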
// Keep only the sprint events and count their results
val trackAndFieldDataset = dataset.filter(col("比赛项目").isin("100米短跑", "200米短跑"))
trackAndFieldDataset.show()
// Number of distinct athletes per sprint event and class
val top3Counts = trackAndFieldDataset.groupBy("比赛项目", "班级").agg(functions.countDistinct("运动员").alias("个人数量"))
top3Counts.show()
}
}
import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.sql.functions.col
object SportsDataAnalysis {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("Sports Data Analysis")
.master("local[*]")
.getOrCreate()
import spark.implicits._
// Read the user info dataset; ignoreLeadingWhiteSpace trims the spaces that follow the commas in the sample data
val userInfo = spark.read.format("csv").option("header", "true").option("ignoreLeadingWhiteSpace", "true").load("User Info Data.txt")
val userInfoWithID = userInfo.withColumn("UserID", col("UserID").cast("string"))
// Read the user activity dataset
val userActivity = spark.read.format("csv").option("header", "true").option("ignoreLeadingWhiteSpace", "true").load("User Activity Data.txt")
// Sort by UserID ascending and Timestamp descending, then keep the first 10 records
val sortedAndFiltered1 = userActivity.orderBy(col("UserID").asc, col("Timestamp").desc).limit(10)
sortedAndFiltered1.show()
// Sort by UserID descending and EventType ascending, then keep the first 10 records
val sortedAndFiltered2 = userActivity.orderBy(col("UserID").desc, col("EventType").asc).limit(10)
sortedAndFiltered2.show()
// Remove duplicates and count events per event type
val uniqueEvents = userActivity.select("UserID", "EventType", "EventContent").distinct()
val eventCounts = uniqueEvents.groupBy("EventType").count()
eventCounts.show()
// Join the two datasets and format the report: user name, event type, event content and event count
val joined = userInfoWithID.join(userActivity, "UserID")
val formattedReport = joined.groupBy("UserName", "EventType", "EventContent")
.agg(functions.count("EventContent").alias("事件数量"))
.select(col("UserName").alias("用户名称"), col("EventType").alias("事件类型"), col("EventContent").alias("事件内容"), col("事件数量"))
formattedReport.show()
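// The Timestamp column above is read as a plain string; its yyyy-MM-dd HH:mm:ss format happens to
// sort correctly as text, but casting it to a real timestamp is safer. A minimal sketch of that cast:
val withTs = userActivity.withColumn("Timestamp", functions.to_timestamp(col("Timestamp"), "yyyy-MM-dd HH:mm:ss"))
withTs.orderBy(col("UserID").asc, col("Timestamp").desc).show(10)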
}
}