package com.fiveonevv.app.Model
import java.io.{FileInputStream, IOException, ObjectInputStream}
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, FeatureType}
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, Node}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
class GBDTPreprocessor extends Serializable {
/**
* Recursively collect the ids of all leaf nodes under the given node.
* @param node root node of a (sub)tree
* @return ids of the tree's leaf nodes
*/
def getLeafNodes(node: Node): Array[Int] = {
var treeLeafNodes = new Array[Int](0)
if (node.isLeaf) {
treeLeafNodes = treeLeafNodes :+ node.id
} else {
treeLeafNodes = treeLeafNodes ++ getLeafNodes(node.leftNode.get)
treeLeafNodes = treeLeafNodes ++ getLeafNodes(node.rightNode.get)
}
treeLeafNodes
}
/**
* Walk the tree from the given node until a leaf is reached.
* @param node tree node
* @param features feature vector of one sample
* @return id of the leaf node the sample falls into
*/
def predictModify(node: Node, features: Vector): Int = {
val split = node.split
if (node.isLeaf) {
node.id
} else {
// check whether the split feature is continuous or categorical
if (split.get.featureType == FeatureType.Continuous) {
if (features(split.get.feature) <= split.get.threshold) {
predictModify(node.leftNode.get, features)
} else {
predictModify(node.rightNode.get, features)
}
} else {
if (split.get.categories.contains(features(split.get.feature))) {
predictModify(node.leftNode.get, features)
} else {
predictModify(node.rightNode.get, features)
}
}
}
}
def gbtTrain(gbtTrainData: RDD[LabeledPoint], numTrees: Int): (GradientBoostedTreesModel, Array[Array[Int]]) = {
val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.setNumIterations(numTrees)
val gbdtModel = GradientBoostedTrees.train(gbtTrainData, boostingStrategy)
val treeLeafArray = new Array[Array[Int]](numTrees)
for (i <- 0 until numTrees) {
// collect the leaf node ids of every tree
treeLeafArray(i) = getLeafNodes(gbdtModel.trees(i).topNode)
}
(gbdtModel, treeLeafArray)
}
/**
* Encode every sample by the leaf it falls into in each tree of the GBT model.
* @param gbtTestData data to generate features for, as (id, (label, features))
* @param gbtModel trained GBT model
* @param treeLeafArray leaf node ids of every tree in the model
* @param numTrees number of trees
* @return RDD of (id, LabeledPoint) whose features are the concatenated one-hot leaf encodings
*/
def gbtFeaturePredict(gbtTestData: RDD[(String, (Double, DenseVector))], gbtModel: GradientBoostedTreesModel, treeLeafArray: Array[Array[Int]], numTrees: Int): RDD[(String, LabeledPoint)] = {
val newFeature = gbtTestData.map(line => {
var gbtFeatures = new Array[Double](0)
for (i <- 0 until numTrees) {
val treePredict = predictModify(gbtModel.trees(i).topNode, line._2._2)
val leafArray = new Array[Double]((gbtModel.trees(i).numNodes + 1) / 2) // number of leaves in a full binary tree
// set the entry of the leaf the sample falls into to 1
leafArray(treeLeafArray(i).indexOf(treePredict)) = 1 // position of the leaf the input sample lands in
gbtFeatures = gbtFeatures ++ leafArray
}
(line._1, line._2._1, gbtFeatures) // id, label, gbtFeatures
})
val gbtFeatureRDD = newFeature.map(
x => (x._1, LabeledPoint(x._2, Vectors.dense(x._3)))
)
gbtFeatureRDD
}
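// Worked example of the leaf encoding above (illustrative numbers): a full binary tree with
// numNodes = 7 has (7 + 1) / 2 = 4 leaves, e.g. ids [4, 5, 6, 7] in treeLeafArray(i). A sample
// that falls into leaf 6 gives indexOf(6) = 2, so its per-tree vector is [0.0, 0.0, 1.0, 0.0];
// concatenating the per-tree vectors of all numTrees trees yields the new GBDT feature vector.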
/**
* Construct new features with the GBDT: one-hot encode the leaf each sample falls into.
* @param data labeled samples
* @param model trained GBDT model
* @param spark SparkSession (currently unused)
* @param isAppend whether to append the original features to the new GBDT features
* @return samples whose features are the GBDT leaf encodings (optionally plus the original features)
*/
def getNodeListWithGBDT(data: RDD[LabeledPoint], model: GradientBoostedTreesModel, spark: SparkSession, isAppend: Boolean): Option[RDD[LabeledPoint]] = {
val numTrees = model.numTrees
// leaf node ids of every tree
val treeLeafArray = new Array[Array[Int]](numTrees)
for (i <- 0 until numTrees) {
treeLeafArray(i) = getLeafNodes(model.trees(i).topNode)
}
// construct the new features
val newData: RDD[LabeledPoint] = data.map(line => {
var newFeatures = new Array[Double](0)
for (i <- 0 until numTrees) {
// id of the leaf node the sample falls into
val treePredict = predictModify(model.trees(i).topNode, line.features)
val treeArray = new Array[Double]((model.trees(i).numNodes + 1) / 2)
treeArray(treeLeafArray(i).indexOf(treePredict)) = 1
newFeatures = newFeatures ++ treeArray
}
if (isAppend) {
new LabeledPoint(line.label, Vectors.dense(newFeatures ++ line.features.toArray))
} else {
new LabeledPoint(line.label, Vectors.dense(newFeatures))
}
})
Option(newData)
}
def loadModel(path: String): Option[GradientBoostedTreesModel] = {
try {
// read a Java-serialized GradientBoostedTreesModel from the local file system
val in = new ObjectInputStream(new FileInputStream(path))
val model = Option(in.readObject().asInstanceOf[GradientBoostedTreesModel])
in.close()
model
} catch {
case ex: ClassNotFoundException =>
ex.printStackTrace()
None
case ex: IOException =>
ex.printStackTrace()
None
}
}
}
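A minimal usage sketch of GBDTPreprocessor, assuming an existing SparkSession spark and a hypothetical RDD samples of (id, (label, features)) tuples; the model path is illustrative. Note that the mllib model can also be persisted with its native save/load instead of Java serialization:
val preprocessor = new GBDTPreprocessor
val numTrees = 10
// samples: RDD[(String, (Double, DenseVector))] -- id, (label, continuous features)
val trainPoints = samples.map { case (_, (label, vec)) => LabeledPoint(label, vec) }
val (gbtModel, treeLeafArray) = preprocessor.gbtTrain(trainPoints, numTrees)
// every sample becomes a 0/1 vector whose length is the total number of leaves over all trees
val encoded = preprocessor.gbtFeaturePredict(samples, gbtModel, treeLeafArray, numTrees)
// alternative to the ObjectInputStream-based loadModel: mllib's built-in persistence
gbtModel.save(spark.sparkContext, "/tmp/gbdt-model") // illustrative path
val reloaded = GradientBoostedTreesModel.load(spark.sparkContext, "/tmp/gbdt-model")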
package com.fiveonevv.app.Model
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature._
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
class GBDTLRModelProcess {
/**
* Preprocess data read from a local txt file into LabeledPoint and DenseVector form.
* @param rdd lines of a local txt file containing user info, features and label
* @return denseVectorRDD
*/
def localDataProcess(rdd:RDD[String]): RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))] = {
val denseVectorRDD = rdd.map{
line =>{
val arr = line.split("\t")
val userInfo = arr(0)
val nonFeatures = arr(1).split("#").map(_.toDouble)
val features = arr(2).split("#").map(_.toDouble)
val label = arr(3).toDouble
// build dense vectors: the LabeledPoint is consumed by the GBT model, the last pair is used for feature discretization
(userInfo,LabeledPoint(label, new DenseVector(features)), LabeledPoint(label, new DenseVector(nonFeatures)),
(label, new DenseVector(nonFeatures)))
}
}
denseVectorRDD
}
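// Expected input line format for localDataProcess (illustrative example, tab-separated):
//   userInfo \t nonFeature1#nonFeature2#... \t feature1#feature2#... \t label
// e.g. "user_001\t1.0#0.0#3.0\t0.5#12.0#70.35\t1"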
/**
* Preprocess data read from Hive on a YARN cluster into LabeledPoint and DenseVector form.
* @param rdd RDD converted from a Hive DataFrame
* @return denseVectorRDD
*/
def hiveDataProcess(rdd:RDD[(String, Array[Double], Array[Double], String)]): RDD[(String, LabeledPoint, LabeledPoint,
(Double, DenseVector))] = {
val denseVectorRDD = rdd.map{
line => {
val userInfo = line._1
val numFeatures = line._2 // numerical features
val cateFeatures = line._3 // categorical features
val label = line._4.toDouble
// build dense vectors: the LabeledPoint is consumed by the GBT model, the last pair is used for feature discretization
(userInfo,
LabeledPoint(label, new DenseVector(cateFeatures)),
LabeledPoint(label,new DenseVector(numFeatures)),
(label, new DenseVector(numFeatures)))
}
}
denseVectorRDD
}
/**
* Discretize the continuous features with a GBDT.
* @param train training data
* @param test test data
* @return discretized training and test sets as DataFrames
*/
def gbtFeatureProcess(train:RDD[(String,LabeledPoint,LabeledPoint,(Double,DenseVector))],
test:RDD[(String,LabeledPoint,LabeledPoint,(Double,DenseVector))],
spark:SparkSession): (DataFrame, DataFrame) = {
// categorical (already discrete) features
val trainRDD = train.map(x => (x._1,x._2)).map(x => ((x._1,x._2.label),x._2.features.asML))
val testRDD = test.map(x => (x._1,x._2)).map(x => ((x._1,x._2.label),x._2.features.asML))
// continuous features
val gbtTrain = train.map(x => x._3)
val gbtTrainData = train.map(x => (x._1,x._4))
val gbtTestData = test.map(x => (x._1,x._4))
// discretize the continuous features
val gbdtPreprocessor = new GBDTPreprocessor
val numTrees = 10
// treeLeafArray: leaf node ids of every tree
val (gbtModel, treeLeafArray) = gbdtPreprocessor.gbtTrain(gbtTrain,numTrees)
val gbtTrainRDD = gbdtPreprocessor.gbtFeaturePredict(gbtTrainData,gbtModel,treeLeafArray,numTrees)
.map(x => ((x._1,x._2.label),x._2.features.asML))
val allTrainRDD = trainRDD.join(gbtTrainRDD)
val trainDF = spark.createDataFrame(allTrainRDD.map(x => (
x._1._1,
x._1._2,
x._2._1,
x._2._2)))
.toDF("userInfo","label","feature1","feature2")
val gbtTestRDD = gbdtPreprocessor.gbtFeaturePredict(gbtTestData,gbtModel,treeLeafArray,numTrees)
.map(x => ((x._1,x._2.label),x._2.features.asML))
val allTestRDD = testRDD.join(gbtTestRDD)
val testDF = spark.createDataFrame(allTestRDD.map(x => (
x._1._1,
x._1._2,
x._2._1,
x._2._2
)))
.toDF("userInfo","label","feature1","feature2")
(trainDF,testDF)
}
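// Both returned DataFrames have four columns: userInfo (id), label,
// feature1 (the original discrete features) and feature2 (the GBDT leaf-encoded features).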
/**
* Build the pipeline training flow: scaling, feature selection and grid search.
* @param data training set
* @return pipelineModel
*/
def pipelineTrain(data:DataFrame): PipelineModel = {
data.persist()
val featureScaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
val featureSelector = new ChiSqSelector()
.setFeaturesCol("scaledFeatures")
.setLabelCol("label")
.setNumTopFeatures(80)
.setOutputCol("selectedFeatures")
val lr = new LogisticRegression()
.setMaxIter(200)
.setElasticNetParam(1.0)
.setRegParam(0.001)
.setThreshold(0.5)
.setLabelCol("label")
.setFeaturesCol("selectedFeatures")
// build pipeline
val pipeline = new Pipeline()
.setStages(Array(featureScaler,featureSelector,lr))
// grid search over number of selected features, regularization, elastic-net mixing and iterations
val paramGrid = new ParamGridBuilder()
.addGrid(featureSelector.numTopFeatures,Array(70))
.addGrid(lr.maxIter,Array(100))
.addGrid(lr.elasticNetParam,Array(1.0,0.0))
.addGrid(lr.regParam,Array(0.00075))
.build()
// cross-validation
val cv = new CrossValidator()
.setEstimator(pipeline)
.setEvaluator(new BinaryClassificationEvaluator())
.setEstimatorParamMaps(paramGrid)
.setNumFolds(5)
val cvModel = cv.fit(data)
val pipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel]
data.unpersist()
pipelineModel
}
/**
* Intermediate results of the pipeline stages.
* @param data test data
* @param pipelineModel fitted pipeline model
* @return scaled data, selected features and LR predictions
*/
def pipelinePredict(data: DataFrame,pipelineModel: PipelineModel): (DataFrame, DataFrame, DataFrame) = {
data.persist()
val featureScaleModel = pipelineModel.stages(0).asInstanceOf[MinMaxScalerModel]
val chiSqSelectorModel = pipelineModel.stages(1).asInstanceOf[ChiSqSelectorModel]
val lrModel = pipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
println("特征选择个数:",chiSqSelectorModel.explainParam(chiSqSelectorModel.numTopFeatures))
println("LR迭代次数:",lrModel.explainParam(lrModel.maxIter))
println("LR正则化系数:",lrModel.explainParam(lrModel.regParam))
println("LR分类阈值:",lrModel.explainParam(lrModel.threshold))
println("L1L2正则比例:",lrModel.explainParam(lrModel.elasticNetParam))
println("LR特征个数:",lrModel.numFeatures)
val scaledData = featureScaleModel.transform(data) //归一化
val selectedData = chiSqSelectorModel.transform(scaledData) //特征选择
val predictions = lrModel.transform(selectedData) //lr预测
data.unpersist()
(scaledData,selectedData,predictions)
}
/**
* Assemble features.
* @param data DataFrame containing the columns feature1 and feature2
* @return DataFrame with the assembled features column
*/
def featureAssembler(data:DataFrame):DataFrame ={
val assembler = new VectorAssembler()
.setInputCols(Array("feature1", "feature2"))
.setOutputCol("features")
val output = assembler.transform(data)
output
}
/**
* Evaluate the model.
* @param data RDD of (prediction, label) pairs
* @return accuracy, weighted precision, weighted recall and weighted F1
*/
def multiClassEvaluate(data: RDD[(Double,Double)]): (Double,Double,Double,Double) = {
val metrics = new MulticlassMetrics(data)
val accuracy = metrics.accuracy
val weightedPrecision = metrics.weightedPrecision
val weightedRecall = metrics.weightedRecall
val f1 = metrics.weightedFMeasure
(accuracy,weightedPrecision,weightedRecall,f1)
}
}
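A minimal sketch (hypothetical variable names) of feeding the "prediction" and "label" columns produced by pipelinePredict into multiClassEvaluate:
val modelProcess = new GBDTLRModelProcess
val predictionAndLabels = predictions.select("prediction", "label").rdd.map(row => (row.getDouble(0), row.getDouble(1)))
val (accuracy, weightedPrecision, weightedRecall, f1) = modelProcess.multiClassEvaluate(predictionAndLabels)
println(s"accuracy=$accuracy, weightedPrecision=$weightedPrecision, weightedRecall=$weightedRecall, f1=$f1")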
package com.fiveonevv.app.core
import com.fiveonevv.app.Model.GBDTLRModelProcess
import com.fiveonevv.app.util.SparkSqlUtil
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
object GBDTLrTrain {
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
def main(args: Array[String]): Unit = {
val spark = SparkSqlUtil.initSparkSession(SparkSqlUtil.initSparkBuilder(),"GBDTLRTrainDemo")
// read data from Hive
val rawDF = spark
.sql("""SELECT * FROM tmp.telco_churn""")
.na.fill(0.0,Seq("TotalCharges"))
// categorical and numerical columns
val cateCols = Array("gender","partner","dependents","phone_service","multiple_lines","internet_service","online_security",
"online_backup","device_protection","tech_support","streaming_tv","streaming_movies","paperless_billing","payment_method")
val numCols = Array("senior_citizen","tenure","monthly_charges","total_charges")
// index the categorical columns
val indexer = cateCols.map(colName => new StringIndexer().setInputCol(colName).setOutputCol(s"${colName}Index"))
//val encoder = new OneHotEncoderEstimator().setInputCols(indexCols).setOutputCols(cateCols map (name => s"${name}Vec"))
// assemble the categorical features
val cateAssembler = new VectorAssembler().setInputCols(cateCols.map(_ + "Index")).setOutputCol("cateFeatures")
// assemble the numerical features
val numAssembler = new VectorAssembler().setInputCols(numCols).setOutputCol("numFeatures").setHandleInvalid("skip")
val stagesArray = new ListBuffer[PipelineStage]()
for (stringIndexer <- indexer) {
stagesArray.append(stringIndexer)
}
stagesArray.append(cateAssembler,numAssembler)
val dataPrePipeline = new Pipeline().setStages(stagesArray.toArray)
// the pipeline output mixes sparse and dense vectors; convert everything to dense vectors
val toDense = udf((v: org.apache.spark.ml.linalg.Vector) => v.toDense)
val processedRDD = dataPrePipeline.fit(rawDF).transform(rawDF)
.selectExpr("customerid","numFeatures","cateFeatures","case when churn = 'Yes' then 1.0 else 0.0 end as label")
.withColumn("cateDenseFeatures",toDense(col("cateFeatures")))
.selectExpr("customerid","numFeatures","cateDenseFeatures cateFeatures","label")
.rdd.map(x => (
x(0).toString,
// an ml vector cannot be converted to an mllib vector directly; go through an Array and build an mllib dense vector
x(1).asInstanceOf[org.apache.spark.ml.linalg.Vector].toArray,
x(2).asInstanceOf[org.apache.spark.ml.linalg.DenseVector].toArray,
x(3).toString)
)
val Array(trainRDD, testRDD) = processedRDD.randomSplit(Array(0.7, 0.3), seed = 1234)
val modelProcess = new GBDTLRModelProcess
val denseVectorTrainRDD = modelProcess.hiveDataProcess(trainRDD)
val denseVectorTestRDD = modelProcess.hiveDataProcess(testRDD)
// GBT training: discretize the continuous features and combine them with the original discrete features
val (gbtFeatureTrainDF, gbtFeatureTestDF) = modelProcess.gbtFeatureProcess(denseVectorTrainRDD, denseVectorTestRDD, spark)
val unionTrainDF = modelProcess.featureAssembler(gbtFeatureTrainDF) // combine the GBT-discretized features with the original features
val unionTestDF = modelProcess.featureAssembler(gbtFeatureTestDF)
// up-sample the training data (duplicating the positive samples is currently disabled)
val positiveDF = unionTrainDF.filter("label=1")
val negativeDF = unionTrainDF.filter("label=0")
val upPositiveDF = positiveDF //.union(positiveDF).union(positiveDF)
val upSampleDF = negativeDF.union(upPositiveDF)
// pipeline training and prediction
val pipelineModel = modelProcess.pipelineTrain(upSampleDF)
val (scaledDF, selectedDF, predictions) = modelProcess.pipelinePredict(unionTestDF, pipelineModel)
// evaluate the model on the test set
predictions.select("userInfo", "label", "rawPrediction", "probability", "prediction").show(50)
val evaluator = new BinaryClassificationEvaluator().setLabelCol("label")
val areaUnderROC = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
val areaUnderPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)
// inspect the model's performance on the test set
val lp = predictions.select( "label", "prediction")
val countTotal = predictions.count()
val correct = lp.filter(lp("label") === lp("prediction")).count() // number of correctly predicted samples
lp.show(200)
val ratioCorrect = correct.toDouble / countTotal.toDouble
// label 1 = churn, 0 = retained
val truePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") === lp("prediction")).count() // true churners
val falsePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") =!= lp("prediction")).count() // falsely predicted churners
val trueNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") === lp("prediction")).count() // true retained users
val falseNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") =!= lp("prediction")).count() // falsely predicted retained users
// true positive rate and false positive rate
val tpr = truePositive.toDouble / (truePositive + falseNegative)
val fpr = falsePositive.toDouble / (falsePositive + trueNegative)
// churn precision
val positivePrecision = truePositive.toDouble / (truePositive + falsePositive)
// churn recall
val positiveRecall = truePositive.toDouble / (truePositive + falseNegative)
// retention precision
val negativePrecision = trueNegative.toDouble / (trueNegative + falseNegative)
// retention recall
val negativeRecall = trueNegative.toDouble / (trueNegative + falsePositive)
println(s"预测样本总数: $countTotal")
println(s"正确预测样本数量: $correct")
println(s"模型准确率: $ratioCorrect")
println(s"模型ROC值:$areaUnderROC")
println(s"模型PR值:$areaUnderPR")
println(s"预测结果中真流失用户个数:$truePositive")
println(s"预测结果中假流失用户个数:$falsePositive")
println(s"预测结果中真流失用户比例: $tpr")
println(s"预测结果中假流失用户比例: $fpr")
println(s"流失用户查准率:$positivePrecision")
println(s"流失用户召回率:$positiveRecall")
println(s"留存用户查准率:$negativePrecision")
println(s"留存用户召回率:$negativeRecall")
spark.stop()
}
}
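After training, the fitted PipelineModel can be persisted and reloaded with Spark ML's built-in writer; a minimal sketch, with an illustrative output path:
import org.apache.spark.ml.PipelineModel
// save the best pipeline (scaler + selector + LR) found by cross-validation
pipelineModel.write.overwrite().save("/tmp/gbdt_lr_pipeline")
// reload it later and score new data
val restoredModel = PipelineModel.load("/tmp/gbdt_lr_pipeline")
val newPredictions = restoredModel.transform(unionTestDF)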
scala> val evaluator = new BinaryClassificationEvaluator().setLabelCol("label")
evaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_f0b527f4e73d
scala> val areaUnderROC = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
areaUnderROC: Double = 0.8306899086101781
scala> val areaUnderPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)
areaUnderPR: Double = 0.6296575868466127
scala> val lp = predictions.select( "label", "prediction")
lp: org.apache.spark.sql.DataFrame = [label: double, prediction: double]
scala> val countTotal = predictions.count()
countTotal: Long = 2095
scala> val truePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") === lp("prediction")).count() // true churners
truePositive: Long = 270
scala> val falsePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") =!= lp("prediction")).count() // falsely predicted churners
falsePositive: Long = 146
scala> val trueNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") === lp("prediction")).count() // true retained users
trueNegative: Long = 1397
scala> val falseNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") =!= lp("prediction")).count() // falsely predicted retained users
falseNegative: Long = 282
scala> val tpr = truePositive.toDouble / (truePositive + falseNegative)
tpr: Double = 0.4891304347826087
scala> val fpr = falsePositive.toDouble / (falsePositive + trueNegative)
fpr: Double = 0.09462086843810759
scala> val positivePrecision = truePositive.toDouble / (truePositive + falsePositive)
positivePrecision: Double = 0.6490384615384616
scala> val positiveRecall = truePositive.toDouble / (truePositive + falseNegative)
positiveRecall: Double = 0.4891304347826087
scala> val negativePrecision = trueNegative.toDouble / (trueNegative + falseNegative)
negativePrecision: Double = 0.8320428826682549
scala> val negativeRecall = trueNegative.toDouble / (trueNegative + falsePositive)
negativeRecall: Double = 0.9053791315618924
scala> println(s"预测样本总数: $countTotal")
预测样本总数: 2095
scala> println(s"正确预测样本数量: $correct")
正确预测样本数量: 1667
scala> println(s"模型准确率: $ratioCorrect")
模型准确率: 0.7957040572792363
scala> println(s"模型ROC值:$areaUnderROC")
模型ROC值:0.8306899086101781
scala> println(s"模型PR值:$areaUnderPR")
模型PR值:0.6296575868466127
scala> println(s"预测结果中真流失用户个数:$truePositive")
预测结果中真流失用户个数:270
scala> println(s"预测结果中假流失用户个数:$falsePositive")
预测结果中假流失用户个数:146
scala> println(s"预测结果中真流失用户比例: $tpr")
预测结果中真流失用户比例: 0.4891304347826087
scala> println(s"预测结果中假流失用户比例: $fpr")
预测结果中假流失用户比例: 0.0946208