
/*
Notice:
The IsolationForest source code must first be built into a jar with Maven (mvn) and put on the
classpath; only then can `import org.apache.spark.ml.iforest.IForest` be used.
Scala source code: https://github.com/titicaca/spark-iforest
Official documentation of the Python counterpart, sklearn.ensemble.IsolationForest:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
*/
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.iforest.IForest
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
// Wisconsin Breast Cancer Dataset.
// The CSV is read without a header option, so the columns get Spark's generated
// names (_c0, _c1, ...); "inferSchema" asks Spark to infer the column types.
// Each multi-line builder chain in this script is wrapped in parentheses so that it
// forms a single expression when pasted into spark-shell.
val dataset = (spark.read.option("inferSchema", "true")
  .csv("/anomaly-detection/breastw.csv"))

// Raw class column of the CSV; it is the input of the label indexer below and must
// never be fed to the model as a feature.
val rawLabelCol = "_c10"

// Index label values: 2 -> 0, 4 -> 1.
// NOTE(review): StringIndexer orders labels by descending frequency by default, so
// this mapping presumably relies on class 2 being the more frequent one in the
// training split -- confirm against the data.
val indexer = (new StringIndexer()
  .setInputCol(rawLabelCol)
  .setOutputCol("label"))

// Feature columns: every dataset column except the label.
// The previous filter only dropped columns whose name contains "label"; since the
// dataset columns are named _c0.._c10, nothing was dropped and the raw class column
// _c10 ended up inside the feature vector (label leakage). It is excluded explicitly here.
// NOTE(review): if _c0 is a sample id rather than a measurement it should be
// excluded as well -- confirm against the CSV layout.
val featureCols = dataset.columns.filter(c => !c.contains("label") && c != rawLabelCol)

val assembler = (new VectorAssembler()
  .setInputCols(featureCols)
  .setOutputCol("features"))

// Isolation forest configuration; the seed is fixed so repeated runs use the same value.
val iForest = (new IForest()
  .setNumTrees(100)
  .setMaxSamples(256)
  .setContamination(0.35)
  .setBootstrap(false)
  .setMaxDepth(100)
  .setSeed(123456L))

// Stages run in order: label indexing -> feature assembly -> isolation forest.
val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest))

// Split the dataset into a training (80%) and a test (20%) DataFrame, with a fixed seed.
val Array(trainDF, testDF) = dataset.randomSplit(Array(0.8, 0.2), seed = 123456L)

// Fit the whole pipeline on the training split, then score the held-out split.
val model = pipeline.fit(trainDF)
val predictions = model.transform(testDF)

// Evaluate the model with the area under the ROC curve (AUC).
// NOTE(review): the "prediction" column is used as the raw score here; if the model
// also emits a continuous anomaly-score column, using that instead would presumably
// give a threshold-independent AUC -- confirm against the IForest output schema.
val evaluator = (new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("prediction")
  .setMetricName("areaUnderROC"))
val auc = evaluator.evaluate(predictions)
println(s"The model's auc: $auc")
/*
Sample spark-shell output, recorded with the earlier version of this script in which the
raw class column _c10 was still part of the feature vector. The value is therefore likely
affected by label leakage; expect a different number with the corrected feature set.
scala> val auc = evaluator.evaluate(predictions)
auc: Double = 0.9311653116531164
scala> println(s"The model's auc: $auc")
The model's auc: 0.9311653116531164
*/

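/*
Page URL captured together with this snippet:
https://www.liangzl.com/get-article-detail-36344.html
The lines below are advertisement text carried over from the web page; they are not part of the script.
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
*/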