Commonly used Spark syntax
Driver: the Driver is the program that launches a Spark Application, i.e., the main program in which we write our Spark code. There is exactly one Driver per application, and it is responsible for creating the SparkContext object; the SparkContext in turn creates the RDDs (Spark's basic data structure, an abstract logical concept).
The Driver's other responsibility is to assign tasks to the Executors for execution. Task assignment mainly follows the data-locality principle: if the data lives on the machine where an Executor runs, the task is preferentially dispatched to that Executor.
In short: the application class that calls new SparkContext can be regarded as the Driver, and the Driver's job is to distribute tasks to the Executors, which perform the computation.
Executor: an Executor is the compute resource that actually runs tasks in Spark; you can think of it as a distributed CPU. Each worker machine may host several Executors, and each Executor can run multiple tasks in parallel on its CPU cores, so a cluster contains many Executors in total. Executors carry out the actual computation of Spark operators such as map.
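To make this division of labor concrete, here is a minimal, self-contained sketch; the object name, app name and the local[2] master are assumptions for illustration only and are not part of the project code that follows. Everything in main runs in the Driver process, while the function passed to map is serialized and executed on the Executors.

import org.apache.spark.sql.SparkSession

object DriverExecutorSketch {
  def main(args: Array[String]): Unit = {
    // this main method runs in the Driver: it creates the SparkSession/SparkContext and defines the job
    val spark = SparkSession.builder()
      .appName("driver-executor-sketch") // assumed name, for illustration only
      .master("local[2]")                // assumed master; on a real cluster this comes from spark-submit
      .getOrCreate()

    val numbers = spark.sparkContext.parallelize(1 to 10, numSlices = 2)

    // the function passed to map is serialized by the Driver and executed on the Executors,
    // preferably on the nodes that already hold each partition of the data (data locality)
    val doubled = numbers.map(_ * 2)

    // collect() is an action: the Executors compute their partitions and send the results back to the Driver
    println(doubled.collect().mkString(", "))

    spark.stop()
  }
}

On a real cluster the same code runs unchanged; the only difference is that the Executors live in separate JVMs on the worker nodes instead of inside the local JVM.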
package com.ts.app.task.Ld
import com.ts.config.{MysqlConnect, SparkBuilder}
import com.ts.model.calendar.BusinessCalendarModel
import com.ts.model.location.{RdcSaleQtyModel, SomeMysqlDataModel}
import com.ts.utils.UdfUtil
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DateType, DecimalType, IntegerType, StringType}
import org.apache.spark.sql.{Row, SparkSession}
import scala.collection.mutable.ListBuffer
/**
* @author: wujiajian
* @time: 2021/7/5
* @description:
*/
object SparkDemo {
def main(args: Array[String]): Unit = {
// val spark = SparkSession.builder()
// .appName(this.getClass.getSimpleName)
// .master("local[2]")
// .config("dfs.client.use.datanode.hostname", "true")
// .enableHiveSupport()
// .getOrCreate()
val spark = SparkBuilder.getSparkSession("sf-app-gtm-dpt-sparkHiveTest")
spark.sql("use loreal_deploy").collect().foreach(println)
test28(spark)
spark.stop();
}
def test28(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("loreal", "CN30_A006", "93102", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "7"),
("loreal", "CN30_A006", "99103", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99103", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99103", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99104", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99104", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99104", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99102", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99102", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99102", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "2905", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "1", "3", "5"),
("loreal", "CN30_A006", "2905", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
("loreal", "CN30_A006", "2905", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
("loreal", "CN30_A006", "2907", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "1", "3", "5"),
("loreal", "CN30_A006", "2907", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
("loreal", "CN30_A006", "2907", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
("loreal", "CN30_A006", "99105", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
("loreal", "CN30_A006", "99105", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10")
)).toDF("company_code", "channel_code", "store_code", "warehouse_code", "brand_code", "business_day_string", "soq_day", "from_date", "end_date", "calendar_type", "calendar_day", "lead_time")
val orderCalendarDF = df.filter("calendar_day is not null") //rows with an abnormal shipping calendar are not replenished
.groupBy("company_code", "channel_code", "warehouse_code", "store_code", "brand_code", "soq_day", "business_day_string", "calendar_type", "calendar_day", "lead_time")
//collect_list, date_format
.agg(collect_list(struct(date_format($"from_date", "yyyy-MM-dd"), date_format($"end_date", "yyyy-MM-dd"))).as("no_ship_day_list"))
orderCalendarDF.show()
//UdfUtil
val resultDF = orderCalendarDF.withColumn("ship_day", UdfUtil.getShipDateByDateUdf(col("calendar_day"), col("calendar_type"), col("business_day_string"), col("no_ship_day_list")))
//the 3-argument call below would simply overwrite the ship_day column computed above, so it is left commented out for reference
// .withColumn("ship_day", UdfUtil.getShipDateByDateUdf(col("calendar_day"), col("calendar_type"), col("business_day_string")))
.withColumn("receive_day", UdfUtil.dateAddUdf(col("ship_day").cast(DateType), col("lead_time").cast(IntegerType)))
//lead
.withColumn("next_receive_day", lead($"receive_day", 1, "").over(Window.partitionBy("company_code", "channel_code", "store_code", "brand_code").orderBy("receive_day", "business_day_string")))
//row_number
.withColumn("rank", row_number().over(Window.partitionBy("company_code", "channel_code", "brand_code", "store_code").orderBy("business_day_string")))
.filter("rank = 1")
resultDF.show()
}
def test27(spark: SparkSession): Unit = {
import spark.implicits._
val itemInventoryDF = spark.sparkContext.parallelize(Seq(
("31", "LA1111", 20, "2021-09-24"),
("31", "LA1111", 20, "2021-09-25"),
("31", "LA1111", 20, "2021-09-28"),
("31", "LA1111", 20, "2021-09-21"),
("31", "LA1111", 20, "2021-09-23"),
("31", "LA1111", 20, "2021-09-30"),
("31", "LA1111", 20, "2021-09-27"),
("31", "LA1111", 20, "2021-09-26"),
("31", "LA1111", 20, "2021-09-22"),
("31", "LA1111", 20, "2021-09-29")
)).toDF("brand_code", "item_code", "inv_qty", "business_day_string")
val itemDemandFcstDF = spark.sparkContext.parallelize(Seq(
("31", "LA1111", 4, "2021-09-24"),
("31", "LA1111", 5, "2021-09-25"),
("31", "LA1111", 8, "2021-09-28"),
("31", "LA1111", 1, "2021-09-21"),
("31", "LA1111", 3, "2021-09-23"),
("31", "LA1111", 10, "2021-09-30"),
("31", "LA1111", 7, "2021-09-27"),
("31", "LA1111", 6, "2021-09-26"),
("31", "LA1111", 2, "2021-09-22"),
("31", "LA1111", 15, "2021-10-05"),
("31", "LA1111", 9, "2021-09-29"),
("31", "LA1111", 14, "2021-10-04"),
("31", "LA1111", 13, "2021-10-03"),
("31", "LA1111", 11, "2021-10-01"),
("31", "LA1111", 12, "2021-10-02"),
("31", "LA1111", 16, "2021-10-06")
)).toDF("brand_code", "item_code", "fcst_qty", "fcst_day_string")
//coverage_days at the item level
itemInventoryDF.join(itemDemandFcstDF, Seq("brand_code", "item_code"), "left")
.selectExpr("brand_code", "item_code", "ifnull(inv_qty,0) as inv_qty", "fcst_qty", "fcst_day_string", "business_day_string")
//keep only forecast days within [business_day, business_day + 5 days] (UdfUtil.dateAddUdf)
.filter($"fcst_day_string".>=($"business_day_string") and $"fcst_day_string".<=(UdfUtil.dateAddUdf($"business_day_string".cast(DateType), lit(5).cast(IntegerType))))
.selectExpr("brand_code", "item_code", "inv_qty", "fcst_qty", "fcst_day_string", "business_day_string")
.groupBy("brand_code", "item_code", "inv_qty", "business_day_string")
//approach 1: combine the columns into a struct, sort_array, then extract the qty field
.agg(concat_ws(",", sort_array(collect_list(struct("fcst_day_string", "fcst_qty"))).getField("fcst_qty")).as("fcst_list"))
//approach 2: sort inside a UDF (see sortUdf below)
// .agg(collect_list(struct("fcst_day_string", "fcst_qty")) as "fcst_day_qty")
// .select(col("brand_code"), col("item_code"), col("inv_qty"), col("business_day_string"), sortUdf(col("fcst_day_qty")).alias("fcst_list"))
//approach 3: sort "date:qty" strings, then strip the date prefix with a regex
// .agg(concat_ws(",", sort_array(collect_list(concat_ws(":", col("fcst_day_string"), col("fcst_qty"))))).as("list"))
// .withColumn("fcst_list", regexp_replace(col("list"), "\\d{4}-\\d{2}-\\d{2}\\:", ""))
//approach 4: build a running list over a window and keep the max; less efficient
// .withColumn("list", concat_ws(",", collect_list($"fcst_qty") over Window.partitionBy("brand_code", "item_code", "inv_qty", "business_day_string").orderBy("fcst_day_string")))
// .groupBy("brand_code", "item_code", "inv_qty", "business_day_string")
// .agg(max($"list"))
.show()
}
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
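//UDF used by "approach 2" in test27: sorts the (fcst_day_string, fcst_qty) structs by date and returns the quantities in date order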
val sortUdf = udf((rows: Seq[Row]) => {
val tuples = rows.map { case Row(dayStr: String, value: Int) => (dayStr, value) }
val tuples1 = tuples.sortBy { case (dayStr, _) => dayStr } //sort ascending by the date string
tuples1.map { case (dayStr, value) => value }
})
def test26(): Unit = {
val list = List((-1, -1), (-1, 2), (1, 2), (-1, -3), (1, -1), (0, 1), (1, 0), (0, 0))
val list2 = list.filter(x => {
x._1 <= 0 || x._2 <= 0
})
list2.foreach(println)
}
def test25(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("G1", "1221", null),
("G3", "2222", null),
("G2", "3333", null),
("G2", "1111", "第一轮")
)).toDF("item_code", "store_code", "order_name")
df.selectExpr("item_code", "order_name", "ifnull(order_name,concat('新店订单-', store_code))").show()
}
def test24(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
(1, "g1", "Boutique", "Y", "1", "0122"),
(2, "g1", "Department", "N", "2", "0123"),
(3, "g1", "Boutique", "Y", "3", "0124"),
(4, "g1", "Department", "N", "4", "0120"),
(5, "g1", "Boutique", "Y", "3", "0110")
)).toDF("id", "item_code", "store_class_code2", "store_class_code1", "counter_level_sort", "store_code")
val allocationItemMap = df.rdd.groupBy(r => {
r.getAs("item_code").toString
})
allocationItemMap.flatMap(lines => {
val resultList = ListBuffer[Row]()
lines._2.toList.sortBy(row => (
row.getAs("store_class_code2").toString, //boutique
row.getAs("store_class_code1").toString, //is-key
row.getAs("counter_level_sort").toString, //level
row.getAs("store_code").toString))(Ordering.Tuple4(Ordering.String, Ordering.String.reverse, Ordering.String, Ordering.String.reverse))
.foreach(row => {
println(row.getAs[Int]("id"))
})
resultList
}).foreach(println)
}
def test23(spark: SparkSession): Unit = {
val driver = MysqlConnect.getDbDriver("CN30")
val batchLogDF = SomeMysqlDataModel.getBatchLog(spark, "20210810131012000").selectExpr("decision_day")
println(batchLogDF.take(1)(0).getString(0))
}
def test22(spark: SparkSession): Unit = {
val df = BusinessCalendarModel.getLastWeekCalendarDay(spark, "2021-01-01", "channel_code = 'A005'")
df.show()
}
def test21(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("wh_9901", "G01", "2021-06-22 12:21:21"),
("wh_9901", "G01", "2021-06-23 12:21:21"),
("wh_9901", "G01", "2021-06-24 12:21:21"),
("wh_9901", "G01", "2021-06-26 12:21:21"),
("wh_9902", "G01", "2021-06-25 12:21:21"),
("wh_9902", "G01", "2021-06-27 12:21:21"),
("wh_9903", "G01", "2021-06-29 12:21:21")
)).toDF("location_code", "item_code", "receive_day")
df.select(date_format(col("receive_day"), "yyyy-MM-dd")).show()
}
def test20(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
(20, "1,2,1,2,1,2,2,2,2,2,1,1"),
(10, "1,1,1,1,1"),
(12, "1,1,1,1,,1,1,1,1,1")
)).toDF("inv_all_value", "fcst_list")
val resultDF = df.withColumn("coverage_days", UdfUtil.coverageDaysUdf(lit(0), col("fcst_list"), col("inv_all_value")))
resultDF.show()
}
def test19(spark: SparkSession): Unit = {
val businessMonthStringList = BusinessCalendarModel.getBusinessMonthListString(spark, "2022-01-01", "2022-01-01", "channel_code = 'A005'")
val monthPartitionConditionStr = s"business_month_string in (${businessMonthStringList})"
println(monthPartitionConditionStr)
}
def test18(spark: SparkSession): Unit = {
val rdc91AvgSaleDF = RdcSaleQtyModel.calculateRdcScopeAvgSale(spark, "2021-08-01", "2021-08-04", 2, "1=1", "channel_code = 'CN10_A005'", "CN10_A005")
.selectExpr("company_code", "channel_code", "location_code", "item_code", "business_day_string", "total_sale_qty", "avg_sale_qty_his")
.na.fill("", cols = Array("company_code", "channel_code", "location_code", "item_code", "business_day_string"))
rdc91AvgSaleDF.show()
}
def test17(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("CN30", "A005"),
("CN30", "A007"),
("CN10", "A005")
)).toDF("division_code", "channel_code")
val result = df.filter("division_code <> 'CN10'").groupBy("division_code")
.agg(concat_ws(",", collect_set($"channel_code")).as("channel_code_list"))
println(result.take(1)(0).getString(1))
}
def test16(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("wh_9901", "G01", "2021-06-22"),
("wh_9901", "G01", "2021-06-23"),
("wh_9901", "G01", "2021-06-24"),
("wh_9901", "G01", "2021-06-26"),
("wh_9902", "G01", "2021-06-25"),
("wh_9902", "G01", "2021-06-27"),
("wh_9903", "G01", "2021-06-29")
)).toDF("location_code", "item_code", "receive_day")
df.withColumn("firstDayOfMonth", UdfUtil.getFirstDayStrOfMonthUdf(col("receive_day")))
.withColumn("lastDayOfMonth", UdfUtil.getLastDayStrOfMonthUdf(col("receive_day")))
.withColumn("1st_receive", lead(col("receive_day"), 1, "9999-01-01").over(Window.partitionBy("location_code", "item_code").orderBy("receive_day")))
.withColumn("next_receive_day", when($"1st_receive".equalTo("9999-01-01"), col("lastDayOfMonth")).otherwise(col("1st_receive")))
.withColumn("rank", row_number().over(Window.partitionBy("location_code", "item_code").orderBy($"receive_day".asc_nulls_last)))
.filter("rank == 1")
.show()
}
def test15(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("wh_9901", "G01", "2021-06-22", "6"),
("wh_9901", "G01", "2021-06-23", "4"),
("wh_9901", "G02", "2021-06-24", "2"),
("wh_9901", "G02", "2021-06-22", "1"),
("wh_9901", "G03", "2021-06-24", "2"),
("wh_9902", "G01", "2021-06-23", "4"),
("wh_9902", "G01", "2021-06-24", "2"),
("wh_9902", "G02", "2021-06-24", "3"),
("wh_9903", "G02", "2021-06-22", "6"),
("wh_9903", "G01", "2021-06-24", "5"),
("wh_9903", "G02", "2021-06-22", "3"),
("wh_9903", "G03", "2021-06-22", "2")
)).toDF("location_code", "item_code", "business_day_string", "item_qty")
df.sort("business_day_string").groupBy("location_code", "item_code")
.agg(concat_ws(",", collect_list($"item_qty")).as("forecast_qty"),
collect_list($"item_qty").as("forecast_qty_list"))
.show()
}
def test14(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("wh_9901", "G01", "2021-06-22", "6"),
("wh_9901", "G01", "2021-06-23", "4"),
("wh_9901", "G02", "2021-06-24", "2"),
("wh_9901", "G02", "2021-06-22", "1"),
("wh_9901", "G03", "2021-06-24", "2"),
("wh_9902", "G01", "2021-06-23", "4"),
("wh_9902", "G01", "2021-06-24", "2"),
("wh_9902", "G02", "2021-06-24", "3"),
("wh_9903", "G02", "2021-06-22", "6"),
("wh_9903", "G01", "2021-06-24", "5"),
("wh_9903", "G02", "2021-06-22", "3"),
("wh_9903", "G03", "2021-06-22", "2")
)).toDF("location_code", "item_code", "rdd_day_string", "item_qty")
var itemSubstituteDF = Seq(
("G01", "G02", "2021-05-24")
).toDF("item_code", "item_code_latest", "business_day_string")
itemSubstituteDF = itemSubstituteDF.select("item_code", "item_code_latest")
df.join(itemSubstituteDF, Seq("item_code"), "left")
.selectExpr("location_code", "rdd_day_string", "if(item_code_latest is null, item_code, item_code_latest) as item_code", "item_qty")
.groupBy("location_code", "item_code", "rdd_day_string")
.agg(sum("item_qty").as("intransit_qty"))
.na.fill(value = 0, cols = Array("intransit_qty"))
.groupBy("location_code", "item_code")
// .agg(concat_ws(";", collect_list(concat($"rdd_day_string", lit(":"), $"intransit_qty"))) as "intransit_detail")
.agg(concat_ws(";", collect_list(concat_ws(":", $"rdd_day_string", $"intransit_qty"))) as "intransit_detail")
.show()
}
def test13(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sparkContext.parallelize(Seq(
("G1111", "st_1001", "6"),
("G1111", "st_1002", "1"),
("G1111", "st_1003", "2"),
("G2222", "st_1001", "2"),
("G2222", "st_1002", "3"),
("G2222", "st_1003", "3"),
("G3333", "st_1001", "2"),
("G3333", "st_1003", "1")
)).toDF("location_code", "item_code", "fcst_qty")
val allocationItemMap = df.rdd.groupBy(row => {
val location_code: String = row.getAs("location_code").toString
val item_code: String = row.getAs("item_code").toString
(location_code, item_code)
})
allocationItemMap.flatMap(line => {
val resultList = ListBuffer[Row]()
val list = line._2.to[ListBuffer]
list.foreach(row => {
//append one extra field to each Row
val allocationInventoryQty: BigDecimal = 23
row.schema.add("allocationInventoryQty", IntegerType, true) //note: add only returns a new schema; the Row's own schema is not modified
val newRow = Row.fromSeq(row.toSeq ++ Array[Any](allocationInventoryQty))
resultList.append(newRow)
})
resultList
}).foreach(println)
}
def test12(spark: SparkSession): Unit = {
import spark.implicits._
val data = spark.sparkContext.parallelize(Seq(
("G1111", "st_1001", "12"),
("G1111", "st_1002", "1"),
("G1111", "st_1003", "2"),
("G2222", "st_1001", "2"),
("G2222", "st_1002", "3"),
("G2222", "st_1003", "3"),
("G3333", "st_1001", "2"),
("G3333", "st_1003", "1")
)).toDF("location_code", "item_code", "fcst_qty")
val toMap = udf((header: String, line: Seq[String]) => {
Map(header -> line)
})
val toMapList = udf((header: String, line: Seq[String]) => {
line.map(l => Map(header -> l)).toList
})
val grouped = data.groupBy("location_code", "item_code").agg(collect_list("fcst_qty").alias("fcst_qty"))
grouped.withColumn("headerLineMapGroup", toMap($"item_code", $"fcst_qty"))
.drop("location_code", "item_code", "fcst_qty")
.show(false)
grouped.withColumn("headerLineMapGroupList", toMapList($"item_code", $"fcst_qty"))
.drop("item_code", "fcst_qty")
.show(false)
}
def test11(spark: SparkSession): Unit = {
//map operation
println(List(1, 2, 3, 4, 6) map (_ + 1)) //add 1 to each element, producing a new list: List(2, 3, 4, 5, 7)
/** Parse the raw data: each comma-separated line carries the student id (field 1), Chinese/math/English scores (fields 2-4), class id (field 5) and department id (field 6) */
val studentsScore = spark.sparkContext.textFile("file:///D:/data.txt").map(_.split(","))
val groups = studentsScore.map(scoreInfo => (scoreInfo(1), scoreInfo(2).toInt, scoreInfo(3).toInt, scoreInfo(4).toInt, scoreInfo(5), scoreInfo(6)))
/** Group by (department id, class id) and take the top 3 per subject */
val topK = groups.groupBy(item => (item._6, item._5)).map(subG => {
val (departmentId, classId) = subG._1
//top 3 in Chinese
val languageTopK = subG._2.toList.sortBy(_._2)(Ordering.Int.reverse).take(3).map(item => item._2 + "分:学号" + item._1)
//top 3 in math
val mathTopK = subG._2.toList.sortBy(_._3)(Ordering.Int.reverse).take(3).map(item => item._3 + "分:学号" + item._1)
//top 3 in the foreign language
val englishTopK = subG._2.toList.sortBy(_._4)(Ordering.Int.reverse).take(3).map(item => item._4 + "分:学号" + item._1)
(departmentId, classId, Map("语文前3" -> languageTopK, "数学前3" -> mathTopK, "外语前3" -> englishTopK))
})
/** Print the results */
topK.foreach(println)
}
def test10(spark: SparkSession): Unit = {
import org.apache.spark.sql.functions._
import spark.implicits._
val rdcScopeAllDaySaleDF = Seq(("wh_9901", "A1", "12", "2021-05-23"),
("wh_9901", "A1", "23", "2021-02-21"),
("wh_9901", "B1", "1", "2021-05-23"),
("wh_9901", "B2", "3", "2021-05-23"),
("wh_9901", "A1", "2", "2021-02-20")).toDF("location_code", "item_code", "item_qty", "business_day_string")
val itemSubstituteDF = Seq(("A1", "B1", "2021-05-23"),
// ("B1","B1","2021-05-23"),
("A1", "B2", "2021-05-24")
).toDF("item_code", "item_code_latest", "business_day_string")
rdcScopeAllDaySaleDF.where($"business_day_string".between("2021-02-21", "2021-05-23"))
.join(itemSubstituteDF.where($"business_day_string".equalTo("2021-05-23")), Seq("item_code"), "left")
.selectExpr("location_code", "if(item_code_latest is null, item_code, item_code_latest) as item_code", "item_qty")
.groupBy("location_code", "item_code")
.agg(sum("item_qty").as("total_sale_qty"))
.withColumn("avg_sale_qty_his", col("total_sale_qty").divide(91))
.show()
}
def test9(spark: SparkSession): Unit = {
import spark.implicits._
val storeQtyDF = Seq(("d1", "b1", "c1", "s1", 100, 1),
("d1", "b1", "c2", "s1", 100, 2),
("d1", "b2", "c3", "s1", 100, 3),
("d1", "b2", "c4", "s1", 40, 10),
("d2", "b3", "c5", "s1", 50, 10),
("d2", "b3", "c6", "s1", 200, 3),
("d1", "b1", "c1", "s2", 100, 7),
("d1", "b1", "c2", "s2", 200, 4),
("d1", "b2", "c3", "s2", 300, 3),
("d1", "b2", "c4", "s2", 500, 2),
("d2", "b3", "c5", "s2", 100, 11),
("d2", "b3", "c6", "s2", 200, 6)).toDF("division", "brand", "category", "store", "sale_qty", "price")
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
val storeQtyGroupDF = storeQtyDF
.withColumn("sale_value", col("sale_qty").multiply(col("price")))
.groupBy("division", "brand", "category", "store")
.agg(sum("sale_qty").as("sale_qty"), sum("sale_value").as("sale_value"))
storeQtyGroupDF.show()
val d = "2021-05-13"
storeQtyGroupDF
.withColumn("category_qty_value", sum("sale_value").over(Window.partitionBy("category")))
.withColumn("brand_qty_value", sum("sale_value").over(Window.partitionBy("brand")))
.withColumn("division_qty_value", sum("sale_value").over(Window.partitionBy("division")))
.withColumn("category_percent", format_number(col("sale_value").divide(col("category_qty_value")).multiply(100), 6))
.withColumn("brand_percent", format_number(col("sale_value").divide(col("brand_qty_value")).multiply(100), 6))
.withColumn("division_percent", format_number(col("sale_value").divide(col("division_qty_value")).multiply(100), 6))
.withColumn("business_day_string", lit(d))
.show()
}
def test8(spark: SparkSession): Unit = {
import org.apache.spark.sql.functions._
import spark.implicits._
val orderDF = Seq(
("新乡市", "华北", "上海市", 2),
("南昌市", "华北", "天津市", 3),
("抚州市", "华北", "温州市", 36),
("娄底市", "华东", "中山市", 1),
("广州市", "华东", "孝感市", 3),
("天门市", "华东", "岳阳市", 1),
("长春市", "华南", "沈阳市", 6),
("成都市", "东北", "眉山市", 10),
("广州市", "东北", "怀集县", 2),
("北海市", "东北", "广州市", 8)).toDF("from_city_name", "district", "to_city_name", "total_num")
orderDF.withColumn("percent", format_number(col("total_num").divide(sum("total_num").over).multiply(100), 5))
.show()
}
def test7(spark: SparkSession): Unit = {
import spark.implicits._
val orders = Seq(
("深圳", "钻石会员", "钻石会员1个月", 25),
("深圳", "钻石会员", "钻石会员1个月", 25),
("深圳", "钻石会员", "钻石会员3个月", 70),
("深圳", "钻石会员", "钻石会员12个月", 300),
("深圳", "铂金会员", "铂金会员3个月", 60),
("深圳", "铂金会员", "铂金会员3个月", 60),
("深圳", "铂金会员", "铂金会员6个月", 120),
("深圳", "黄金会员", "黄金会员1个月", 15),
("深圳", "黄金会员", "黄金会员1个月", 15),
("深圳", "黄金会员", "黄金会员3个月", 45),
("深圳", "黄金会员", "黄金会员12个月", 180),
("北京", "钻石会员", "钻石会员1个月", 25),
("北京", "钻石会员", "钻石会员2个月", 30),
("北京", "铂金会员", "铂金会员3个月", 60),
("北京", "黄金会员", "黄金会员3个月", 45),
("上海", "钻石会员", "钻石会员1个月", 25),
("上海", "钻石会员", "钻石会员1个月", 25),
("上海", "铂金会员", "铂金会员3个月", 60),
("上海", "黄金会员", "黄金会员3个月", 45)
)
//convert the Seq into a DataFrame
val memberDF = orders.toDF("area", "memberType", "product", "price")
memberDF.groupBy("area", "memberType", "price")
.count().withColumnRenamed("count", "cnt1")
.groupBy("area", "memberType")
.count().withColumnRenamed("count", "cnt2")
.show()
memberDF.createTempView("orderTempTable")
spark.sql("select area,memberType,product,sum(price) as total " +
"from orderTempTable group by area,memberType,product grouping sets((area,memberType),(memberType,product))").show()
spark.sql("select area,memberType,product,sum(price) as total " +
"from orderTempTable group by area,memberType,product with rollup").show()
}
def test6(spark: SparkSession) {
import org.apache.spark.sql.functions._
val df = spark.createDataFrame(Seq(
("2018-01", "项目1", 100), ("2018-01", "项目2", 200), ("2018-01", "项目3", 300),
("2018-02", "项目1", 1000), ("2018-02", "项目2", 2000), ("2018-03", "项目x", 999)
)).toDF("年月", "项目", "收入")
df.groupBy("年月")
.pivot("项目", Seq("项目1", "项目2", "项目3", "项目x"))
.agg(sum("收入"))
.na.fill(0).show()
}
def test5(spark: SparkSession) {
import spark.implicits._
val source = Seq(
("G1", "Off POG", 1),
("G2", "Off POG", 2),
("G3", "normal", 3),
("G4", null, 3)
).toDF("item_code", "item_status", "sale_value")
source.createTempView("kpi_view")
spark.sql(s"select item_code,item_status,IF(isnull(item_status) or item_status = 'Off POG',0, sale_value/100) as weigth " +
s"from kpi_view").show()
println(UdfUtil.getDataDelayDay("CN10"))
}
def test4(spark: SparkSession): Unit = {
val df = spark.createDataFrame(Seq(
(1, "A"),
(2, "B"),
(3, null)
)).toDF("id", "source")
import spark.implicits._
df.show()
df.filter($"source".notEqual("A")).show()
df.filter($"source".notEqual("A").or($"source".isNull)).show()
}
def test3(spark: SparkSession): Unit = {
import spark.implicits._
val left = Seq(("loreal", 1002, "G234238", 2), ("loreal", 1001, "G27453453", 1)).toDF("company_code", "location_code", "item_code", "item_qty")
val right = Seq(("loreal", 1001, "G27453453", 3), ("loreal", 1003, "Y236423", 5)).toDF("company_code", "location_code", "item_code", "item_qty")
left.as("a").join(right.as("b"), Seq("company_code", "location_code", "item_code"), "full")
.selectExpr("company_code", "location_code", "item_code", "a.item_qty as a_qty", "b.item_qty as b_qty")
.na.fill(value = 0, cols = Array("a_qty", "b_qty"))
// .withColumn("total", $"a_qty" + $"b_qty")
.withColumn("total", UdfUtil.addDoubleUdf($"a_qty", $"b_qty"))
.show()
}
def test2(spark: SparkSession): Unit = {
val sc = spark.sparkContext
//build a basic key-value RDD of (ID, name), with IDs 1, 2, 3, 4
val rdd1 = sc.makeRDD(Array(("1", "Spark"), ("2", "Hadoop"), ("3", "Scala"), ("4", "Java")), 2)
//build a key-value RDD of (ID, salary), with IDs 1, 2, 3, 5
val rdd2 = sc.makeRDD(Array(("1", "30K"), ("2", "15K"), ("3", "25K"), ("5", "10K")), 2)
println("//下面做Join操作,预期要得到(1,×)、(2,×)、(3,×)")
val joinRDD = rdd1.join(rdd2).collect.foreach(println)
println("//下面做leftOutJoin操作,预期要得到(1,×)、(2,×)、(3,×)、(4,×)")
val leftJoinRDD = rdd1.leftOuterJoin(rdd2).collect.foreach(println)
println("//下面做rightOutJoin操作,预期要得到(1,×)、(2,×)、(3,×)、(5,×)")
val rightJoinRDD = rdd1.rightOuterJoin(rdd2).collect.foreach(println)
}
def test1(spark: SparkSession): Unit = {
val textRdd = spark.sparkContext.parallelize(Seq("hadoop spark", "hadoop flume", "spark sqoop"))
val splitRdd = textRdd.flatMap(_.split(" "))
val tupleRdd = splitRdd.map((_, 1))
// tupleRdd.reduceByKey((curr,agg)=> curr + agg)
val reduceRdd = tupleRdd.reduceByKey(_ + _)
reduceRdd.map(item => s"${item._1} ,${item._2}").collect().foreach(println(_))
}
}