Titanic Challenge with Spark ML (Top 6%)

The code below was rewritten from the Python solution I previously used for this Kaggle competition. It only implements the training stage; the test set still needs to go through the same transformations and be scored (a rough sketch of that step is given after the listing below).
Scala 2.11.12
Spark 2.2.2

package ML.Titanic

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.Bucketizer
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.OneHotEncoder
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{TrainValidationSplit, TrainValidationSplitModel}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.types._


/**
  * GBTClassifier for predicting passenger survival on the Titanic
  */
object TitanicChallenge {

  def main(args: Array[String]) {

    val spark = SparkSession.builder
      .master("local[*]")
      .appName("example")
      .config("spark.sql.shuffle.partitions", 20)
      .config("spark.default.parallelism", 20)
      .config("spark.driver.memory", "4G")
      .config("spark.memory.fraction", 0.75)
      .getOrCreate()
    val sc = spark.sparkContext

    spark.sparkContext.setLogLevel("ERROR")

    val schemaArray = StructType(Array(
      StructField("PassengerId", IntegerType, true),
      StructField("Survived", IntegerType, true),
      StructField("Pclass", IntegerType, true),
      StructField("Name", StringType, true),
      StructField("Sex", StringType, true),
      StructField("Age", FloatType, true),
      StructField("SibSp", IntegerType, true),
      StructField("Parch", IntegerType, true),
      StructField("Ticket", StringType, true),
      StructField("Fare", FloatType, true),
      StructField("Cabin", StringType, true),
      StructField("Embarked", StringType, true)
    ))

    val path = "Titanic/"
    val df = spark.read
      .option("header", "true")
      .schema(schemaArray)
      .csv(path + "train.csv")
      .drop("PassengerId")
//    df.cache()

    val utils = new TitanicChallenge(spark)
    val df2 = utils.transCabin(df)
    val df3 = utils.transTicket(sc, df2)
    val df4 = utils.transEmbarked(df3)
    val df5 = utils.extractTitle(sc, df4)
    val df6 = utils.transAge(sc, df5)
    val df7 = utils.categorizeAge(df6)
    val df8 = utils.createFellow(df7)
    val df9 = utils.categorizeFellow(df8)
    val df10 = utils.extractFName(df9)
    val df11 = utils.transFare(df10)

    val prePipelineDF = df11.select("Survived", "Pclass", "Sex",
      "Age_categorized", "fellow_type", "Fare_categorized",
      "Embarked", "Cabin", "Ticket",
      "Title", "family_type")

//    prePipelineDF.show(1)
//    +--------+------+----+---------------+-----------+----------------+--------+-----+------+-----+-----------+
//    |Survived|Pclass| Sex|Age_categorized|fellow_type|Fare_categorized|Embarked|Cabin|Ticket|Title|family_type|
//    +--------+------+----+---------------+-----------+----------------+--------+-----+------+-----+-----------+
//    |       0|     3|male|            3.0|      Small|             0.0|       S|    U|     0|   Mr|          0|
//    +--------+------+----+---------------+-----------+----------------+--------+-----+------+-----+-----------+

    val (df_indexed, colsTrain) = utils.index_onehot(prePipelineDF)
    df_indexed.cache()

    // Train the model
    val validatorModel = utils.trainData(df_indexed, colsTrain)

    // Print the best model's parameters
    val bestModel = validatorModel.bestModel
    println(bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap)

    // Print each candidate model's metric and parameter combination
    val paramsAndMetrics = validatorModel.validationMetrics
      .zip(validatorModel.getEstimatorParamMaps)
      .sortBy(-_._1)
    paramsAndMetrics.foreach { case (metric, params) =>
      println(metric)
      println(params)
      println()
    }

    validatorModel.write.overwrite().save(path + "Titanic_gbtc")

    spark.stop()
  }
}

class TitanicChallenge(private val spark: SparkSession) extends Serializable {

  import spark.implicits._

  // Cabin: fill nulls with "U" and keep only the first letter
  def transCabin(df: Dataset[Row]): Dataset[Row] = {
    df.na.fill("U", Seq("Cabin"))
      .withColumn("Cabin", substring($"Cabin", 0, 1))
  }

  // Ticket: extract the numeric ticket number, bucket it, and merge rare buckets
  def transTicket(sc: SparkContext, df: Dataset[Row]): Dataset[Row] = {

    // Extract the ticket number, e.g. 21171 from "A/5 21171"
    val medDF1 = df.withColumn("Ticket", split($"Ticket", " "))
      .withColumn("Ticket", $"Ticket"(size($"Ticket").minus(1)))
      .filter($"Ticket" =!= "LINE")//去掉某种特殊的船票

    // Bucket the ticket number: fewer than 4 digits becomes "1"; exactly 4 digits becomes the first digit followed by "0"; more than 4 digits keeps the first three digits, e.g. 21171 becomes 211
    val ticketTransUdf = udf((ticket: String) => {
      if (ticket.length < 4) {
        "1"
      } else if (ticket.length == 4){
        ticket(0)+"0"
      } else {
        ticket.slice(0, 3)
      }
    })
    val medDF2 = medDF1.withColumn("Ticket", ticketTransUdf($"Ticket"))

    // Merge categories that occur at most 5 times into "0": first collect the list of rare categories, then remap them with a UDF
    val filterList = medDF2.groupBy($"Ticket").count()
      .filter($"count" <= 5)
      .map(row => row.getString(0))
      .collect.toList

    val filterList_bc = sc.broadcast(filterList)

    val ticketTransAdjustUdf = udf((subticket: String) => {
      if (filterList_bc.value.contains(subticket)) "0"
      else subticket
    })

    medDF2.withColumn("Ticket", ticketTransAdjustUdf($"Ticket"))
  }

  // Embarked: fill nulls with "S"
  def transEmbarked(df: Dataset[Row]): Dataset[Row] = {
    df.na.fill("S", Seq("Embarked"))
  }

  def extractTitle(sc: SparkContext, df: Dataset[Row]): Dataset[Row] = {
    val regex = ".*, (.*?)\\..*"

    // Map raw titles to broader categories
    val titlesMap = Map(
      "Capt"-> "Officer",
      "Col"-> "Officer",
      "Major"-> "Officer",
      "Jonkheer"-> "Royalty",
      "Don"-> "Royalty",
      "Sir" -> "Royalty",
      "Dr"-> "Officer",
      "Rev"-> "Officer",
      "the Countess"->"Royalty",
      "Mme"-> "Mrs",
      "Mlle"-> "Miss",
      "Ms"-> "Mrs",
      "Mr" -> "Mr",
      "Mrs" -> "Mrs",
      "Miss" -> "Miss",
      "Master" -> "Master",
      "Lady" -> "Royalty"
    )

    val titlesMap_bc = sc.broadcast(titlesMap)

    df.withColumn("Title", regexp_extract(($"Name"), regex, 1))
      .na.replace("Title", titlesMap_bc.value)
  }

  // Fill missing Age with the mean age of the corresponding (Title, Pclass) group.
  // First build the group key and collect a map of mean ages per group, then broadcast it;
  // when Age is null, a UDF looks up the fill value.
  def transAge(sc: SparkContext, df: Dataset[Row]): Dataset[Row] = {
    val medDF = df.withColumn("Pclass_Title_key", concat($"Title", $"Pclass"))
    val meanAgeMap = medDF.groupBy("Pclass_Title_key")
      .mean("Age")
      .map(row => (row.getString(0), row.getDouble(1)))
      .collect().toMap

    val meanAgeMap_bc = sc.broadcast(meanAgeMap)

    val fillAgeUdf = udf((comb_key: String) => meanAgeMap_bc.value.getOrElse(comb_key, 0.0))

    medDF.withColumn("Age", when($"Age".isNull, fillAgeUdf($"Pclass_Title_key")).otherwise($"Age"))
  }

  // Bucketize Age
  def categorizeAge(df: Dataset[Row]): Dataset[Row] = {
    val ageBucketBorders = 0.0 +: (10.0 to 60.0 by 5.0).toArray :+ 150.0
    val ageBucketer = new Bucketizer().setSplits(ageBucketBorders).setInputCol("Age").setOutputCol("Age_categorized")
    ageBucketer.transform(df).drop("Pclass_Title_key")
  }

  // fellow: number of companions on board, SibSp + Parch
  def createFellow(df: Dataset[Row]): Dataset[Row] = {
    df.withColumn("fellow", $"SibSp" + $"Parch")
  }

  // fellow_type: bucket fellow into categories. This could also be left to the pipeline stage instead.
  def categorizeFellow(df: Dataset[Row]): Dataset[Row] = {
    df.withColumn("fellow_type", when($"fellow" === 0, "Alone")
      .when($"fellow" <= 3, "Small")
      .otherwise("Large"))
  }

  def extractFName(df: Dataset[Row]): Dataset[Row] = {

    // Make sure the DataFrame has the Survived and fellow columns
    if (!df.columns.contains("Survived") || !df.columns.contains("fellow")){
      throw new IllegalArgumentException(
        """
          |Check that the argument is the training set and that it contains a column named "fellow"
        """.stripMargin)
    }

    // FName: extract the family name, e.g. Johnston from "Johnston, Miss. Catherine Helen ""Carrie""".
    // Spark's CSV reader can leave stray quote characters in such values, so split on the quote as well as on the comma.
    val medDF = df
      .withColumn("FArray", split($"Name", ","))
      .withColumn("FName", expr("FArray[0]"))
      .withColumn("FArray", split($"FName", "\""))
      .withColumn("FName", $"FArray"(size($"FArray").minus(1)))

    // family_type: three classes. 1 for families where a woman under 60 died, 2 for families where an adult man (18+) survived, 0 for everything else.
    val femaleDiedFamily_filter = $"Sex" === "female" and $"Age" < 60 and $"Survived" === 0 and $"fellow" > 0

    val maleSurvivedFamily_filter = $"Sex" === "male" and $"Age" >= 18 and $"Survived" === 1 and $"fellow" > 1

    val resDF = medDF.withColumn("family_type", when(femaleDiedFamily_filter, 1)
      .when(maleSurvivedFamily_filter, 2).otherwise(0))

    // familyTable: the family classification list, used later to transform the test set. Saved in the form ${FName}_${family_type}.
    resDF.filter($"family_type".isin(1,2))
      .select(concat($"FName", lit("_"), $"family_type"))
      .dropDuplicates()
      .write.format("text").mode("overwrite").save("familyTable")

    // To collect the result directly into a Map, the code below can be used.
    // It first aggregates each partition with mapPartitions, which reduces the risk of
    // overwhelming the driver with a plain collect.
    // The default Set guards against a partition that happens to be empty (probably rare);
    // without it the element type would widen to Set[_ >: Tuple] and flatten would no longer work.

    // val familyMap = resDF
    //   .filter($"family_type" === 1 || $"family_type" === 2)
    //   .select("FName", "family_type")
    //   .rdd
    //   .mapPartitions { iter =>
    //     if (iter.nonEmpty) Iterator(iter.map(row => (row.getString(0), row.getInt(1))).toSet)
    //     else Iterator(Set(("default", 9)))
    //   }
    //   .collect()
    //   .flatten
    //   .toMap

    resDF
  }

  // Fare: first drop rows with a missing Fare (the test set has one; with more missing values
  // it could be imputed from title, age, etc., as Age is), then bucketize Fare by quantiles.
  def transFare(df: Dataset[Row]): Dataset[Row] = {

    val medDF = df.na.drop("any", Seq("Fare"))
    val fareBucketer = new QuantileDiscretizer()
      .setInputCol("Fare")
      .setOutputCol("Fare_categorized")
      .setNumBuckets(4)

    fareBucketer.fit(medDF).transform(medDF)
  }

  def index_onehot(df: Dataset[Row]): (Dataset[Row], Array[String]) = {
    val stringCols = Array("Sex","fellow_type", "Embarked", "Cabin", "Ticket", "Title")
    val subOneHotCols = stringCols.map(cname => s"${cname}_index")
    val index_transformers: Array[org.apache.spark.ml.PipelineStage] = stringCols.map(
      cname => new StringIndexer()
        .setInputCol(cname)
        .setOutputCol(s"${cname}_index")
        .setHandleInvalid("skip")
    )


    val oneHotCols = subOneHotCols ++ Array("Pclass", "Age_categorized", "Fare_categorized", "family_type")
    val vectorCols = oneHotCols.map(cname => s"${cname}_encoded")
    val encode_transformers: Array[org.apache.spark.ml.PipelineStage] = oneHotCols.map(
      cname => new OneHotEncoder()
        .setInputCol(cname)
        .setOutputCol(s"${cname}_encoded")
    )

    val pipelineStage = index_transformers ++ encode_transformers
    val index_onehot_pipeline = new Pipeline().setStages(pipelineStage)
    val index_onehot_pipelineModel = index_onehot_pipeline.fit(df)

    val resDF = index_onehot_pipelineModel.transform(df).drop(stringCols:_*).drop(subOneHotCols:_*)
    println(resDF.columns.size) // sanity check: column count after indexing/encoding
    (resDF, vectorCols)
  }

  def trainData(df: Dataset[Row], vectorCols: Array[String]): TrainValidationSplitModel = {
    // Assemble the feature vector and build the ML pipeline: VectorAssembler (features) + GBTClassifier
    val vectorAssembler = new VectorAssembler()
      .setInputCols(vectorCols)
      .setOutputCol("features")

    val gbtc = new GBTClassifier()
      .setLabelCol("Survived")
      .setFeaturesCol("features")
      .setPredictionCol("prediction")

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, gbtc))

    val paramGrid = new ParamGridBuilder()
      .addGrid(gbtc.stepSize, Seq(0.1))
      .addGrid(gbtc.maxDepth, Seq(5))
      .addGrid(gbtc.maxIter, Seq(20))
      .build()

    val multiclassEval = new MulticlassClassificationEvaluator()
      .setLabelCol("Survived")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")

    val tvs = new TrainValidationSplit()
      .setTrainRatio(0.75)
      .setEstimatorParamMaps(paramGrid)
      .setEstimator(pipeline)
      .setEvaluator(multiclassEval)

    tvs.fit(df)
  }
}
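
As noted at the top, the remaining work is to apply the same transformations to the test set and generate predictions. Below is a minimal sketch of that last step, under a few assumptions: the feature-engineering methods above are refactored so they run without the Survived column, the StringIndexer/OneHotEncoder pipeline fitted on the training data is reused to transform the test data rather than fitting a new one, and testFeatures stands for the fully transformed test DataFrame with PassengerId retained. testFeatures and the submission path are hypothetical names, not part of the original code.

import org.apache.spark.ml.tuning.TrainValidationSplitModel
import org.apache.spark.sql.functions.col

// Hypothetical sketch: load the saved model and score the (already transformed) test set.
val savedModel = TrainValidationSplitModel.load("Titanic/Titanic_gbtc")

val submission = savedModel.transform(testFeatures)
  .select(col("PassengerId"), col("prediction").cast("int").as("Survived"))

// Write a single-file, Kaggle-style submission (PassengerId, Survived).
submission.coalesce(1)
  .write.option("header", "true")
  .mode("overwrite")
  .csv("Titanic/submission")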