需要事先将IsolationForest算法源码利用mvn方式打成jar包,才可以使用import org.apache.spark.ml.iforest.IForest


import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.iforest.IForest
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Wisconsin Breast Cancer Dataset
// Starts a DataFrame read with schema inference turned on.
// NOTE(review): this statement is truncated in the visible source -- the opening
// parenthesis is never closed and the call that names the data source/path is
// missing. Restore the rest of the chain before running.
val dataset = (spark.read.option("inferSchema", "true")

// Index label values: 2 -> 0, 4 -> 1
// NOTE(review): truncated -- the parenthesis is unclosed and the input/output
// column setters are not shown; which column holds the raw label cannot be
// determined from here.
val indexer = (new StringIndexer()

// Intended to combine the feature columns into a single vector column.
// NOTE(review): truncated -- the parenthesis is unclosed and the input columns and
// output column name are not shown.
val assembler = (new VectorAssembler()

// Isolation Forest estimator from the separately built spark-iforest jar.
// NOTE(review): truncated -- the parenthesis is unclosed and no hyperparameter
// setters are shown, so the configuration actually used is unknown.
val iForest = (new IForest()

// Chain the three stages so they run in order: label indexing, feature assembly,
// then the Isolation Forest.
val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest))

// Split the dataset into training and test DataFrames using weights 0.8 / 0.2;
// a fixed seed is passed to the split.
val Array(trainDF, testDF) = dataset.randomSplit(Array(0.8, 0.2),seed = 123456L)

// Fit the whole pipeline on the training split, then apply the fitted model to
// the held-out test split.
val model = pipeline.fit(trainDF)
val predictions = model.transform(testDF)

// Evaluate the model on the test predictions; the result is reported as AUC below.
// NOTE(review): truncated -- the parenthesis is unclosed and the label / raw
// prediction column and metric setters are not shown. Presumably the metric is
// areaUnderROC given the variable name `auc` -- confirm.
val evaluator = (new BinaryClassificationEvaluator()

// Compute the metric over the test-set predictions and print it.
val auc = evaluator.evaluate(predictions)
println(s"The model's auc: $auc")


scala> val auc = evaluator.evaluate(predictions)
auc: Double = 0.9311653116531164

scala> println(s"The model's auc: $auc")
The model's auc: 0.9311653116531164



posted on 2020-04-07 16:04  脆皮软心  阅读(570)  评论(0)  编辑  收藏  举报