case class Adult(features: org.apache.spark.ml.linalg.Vector, label: String)
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors

// Load the training and test sets, keeping the six continuous columns as features
// and the income column (index 14) as the label.
val df = sc.textFile("file:///opt/software/adult.data.txt").map(_.split(",")).map(p => Adult(Vectors.dense(p(0).toDouble, p(2).toDouble, p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble), p(14).toString())).toDF()
val test = sc.textFile("file:///opt/software/adult.test.txt").map(_.split(",")).map(p => Adult(Vectors.dense(p(0).toDouble, p(2).toDouble, p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble), p(14).toString())).toDF()

// Project the six features onto three principal components.
val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df)
val result = pca.transform(df)
val testdata = pca.transform(test)
result.show(false)
testdata.show(false)
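As a quick sanity check (not part of the original code), the fitted PCAModel also exposes the variance explained by each component, which can help justify the choice of k = 3. A minimal sketch using the pca model fitted above:

// Optional: print the fraction of variance captured by each principal component.
println("Explained variance: " + pca.explainedVariance)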
3. Training a classification model and predicting residents' income
Building on the principal component analysis, use logistic regression (or a decision tree model) to predict whether a resident's income exceeds 50K, and validate the model on the Test dataset.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Index the string label and the PCA features, and prepare a converter back to the original labels.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(result)
labelIndexer.labels.foreach(println)
val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures").fit(result)
println(featureIndexer.numFeatures)
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

// Logistic regression on the indexed PCA features, assembled into a single pipeline.
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))
val lrPipelineModel = lrPipeline.fit(result)
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
println("Coefficients: " + lrModel.coefficientMatrix + " Intercept: " + lrModel.interceptVector + " numClasses: " + lrModel.numClasses + " numFeatures: " + lrModel.numFeatures)

// Evaluate on the PCA-transformed test data.
val lrPredictions = lrPipelineModel.transform(testdata)
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println("Test Error = " + (1.0 - lrAccuracy))
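The task statement above also allows a decision tree instead of logistic regression. A minimal sketch of that variant, reusing the indexers, converter, and evaluator defined above (the dt* names are illustrative, not from the original):

import org.apache.spark.ml.classification.DecisionTreeClassifier

// Swap the classifier stage; everything else in the pipeline stays the same.
val dt = new DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
val dtPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
val dtPipelineModel = dtPipeline.fit(result)
val dtPredictions = dtPipelineModel.transform(testdata)
println("Decision tree test error = " + (1.0 - evaluator.evaluate(dtPredictions)))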
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Rebuild the pipeline with PCA as a stage so that k can be tuned together with the
// logistic regression hyper-parameters.
val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures")
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures")
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
val lrPipeline = new Pipeline().setStages(Array(pca, labelIndexer, featureIndexer, lr, labelConverter))

// Grid over the number of principal components and the regularization settings,
// evaluated with 3-fold cross-validation.
val paramGrid = new ParamGridBuilder().addGrid(pca.k, Array(1, 2, 3, 4, 5, 6)).addGrid(lr.elasticNetParam, Array(0.2, 0.8)).addGrid(lr.regParam, Array(0.01, 0.1, 0.5)).build()
val cv = new CrossValidator().setEstimator(lrPipeline).setEvaluator(new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")).setEstimatorParamMaps(paramGrid).setNumFolds(3)
val cvModel = cv.fit(df)

// Evaluate the best model on the raw test set (the pipeline applies PCA internally).
val lrPredictions = cvModel.transform(test)
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")
val lrAccuracy = evaluator.evaluate(lrPredictions)
println("Accuracy = " + lrAccuracy)

// Inspect the winning pipeline: the logistic regression stage and the PCA stage.
val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = bestModel.stages(3).asInstanceOf[LogisticRegressionModel]
println("Coefficients: " + lrModel.coefficientMatrix + " Intercept: " + lrModel.interceptVector + " numClasses: " + lrModel.numClasses + " numFeatures: " + lrModel.numFeatures)
val pcaModel = bestModel.stages(0).asInstanceOf[PCAModel]
println("Primary Component: " + pcaModel.pc)
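To see which parameter combination cross-validation preferred, the averaged metric for each grid point can be paired with its ParamMap. A small sketch, assuming the paramGrid and cvModel values defined above:

// Print the averaged cross-validation metric for every parameter combination in the grid.
paramGrid.zip(cvModel.avgMetrics).foreach { case (params, metric) =>
  println(f"metric = $metric%.4f for $params")
}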