2020.02.15
1.数据导入
从文件中导入数据,并转化为 DataFrame。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
// Spark ML imports used by the whole walkthrough (intended for spark-shell,
// where `spark` and `sc` are already defined).
import org.apache.spark.ml.feature.PCA
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, HashingTF, Tokenizer}
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.sql.functions
// Needed for the RDD -> DataFrame conversion (.toDF()) below.
import spark.implicits._

// One record of the adult data set: the numeric columns packed into a vector,
// plus the raw label string taken from column 14.
// The vector type is fully qualified so it cannot be confused with scala.Vector.
case class Adult(features: org.apache.spark.ml.linalg.Vector, label: String)
// REPL output: defined class Adult

// Training set: split each line on "," and keep columns 0, 2, 4, 10, 11, 12 as
// the feature vector and column 14 as the label.
val df = sc.textFile("adult.data.txt").map(_.split(",")).map { p =>
  Adult(
    Vectors.dense(p(0).toDouble, p(2).toDouble, p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble),
    p(14).toString())
}.toDF()
// REPL output: df: org.apache.spark.sql.DataFrame = [features: vector, label: string]

// Test set: parsed exactly the same way as the training set.
val test = sc.textFile("adult.test.txt").map(_.split(",")).map { p =>
  Adult(
    Vectors.dense(p(0).toDouble, p(2).toDouble, p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble),
    p(14).toString())
}.toDF()
// REPL output: test: org.apache.spark.sql.DataFrame = [features: vector, label: string]
2.进行主成分分析(PCA)
对 6 个连续型的数值型变量进行主成分分析。PCA(主成分分析)是通过正交变换把一组相关变量的观测值转化成一组线性无关的变量值,即主成分的一种方法。PCA 通过使用主成分把特征向量投影到低维空间,实现对特征向量的降维。请通过 setK()方法将主成分数量设置为 3,把连续型的特征向量转化成一个 3 维的主成分。
1
2
3
4
5
|
// Fit PCA on the training features and keep 3 principal components.
// Note: because of the trailing .fit(df), `pca` holds the fitted model,
// not the PCA estimator itself.
val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df)

// Project both data sets with the same fitted model so that the training and
// test rows end up in the same 3-dimensional space ("pcaFeatures" column).
val result = pca.transform(df)
val testdata = pca.transform(test)

// Print the transformed rows without truncating the vector columns.
result.show(truncate = false)
testdata.show(truncate = false)
3.训练分类模型并预测居民收入
在主成分分析的基础上,采用逻辑斯蒂回归,或者决策树模型预测居民收入是否超过 50K;对 Test 数据集进行验证。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
// Map the string labels to numeric indices ("indexedLabel"); fit on the PCA-transformed training set.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(result)
labelIndexer.labels.foreach(println)

// Index the PCA output vector into "indexedFeatures".
val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures").fit(result)
println(featureIndexer.numFeatures)

// Turn numeric predictions back into the original label strings.
val labelConverter = (new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels))

val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)

// Stage order matters: the logistic regression is stage index 2 (used below).
val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))
val lrPipelineModel = lrPipeline.fit(result)

val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
println("Coefficients: " + lrModel.coefficientMatrix + "Intercept: " + lrModel.interceptVector +
  "numClasses: " + lrModel.numClasses + "numFeatures: " + lrModel.numFeatures)

// NOTE(review): the label indexer was fit on the training labels only; confirm that
// adult.test.txt uses exactly the same label strings (the raw UCI test file is known
// to append a '.' to each label), otherwise this transform will meet unseen labels.
val lrPredictions = lrPipelineModel.transform(testdata)

// The evaluator's default metric is f1, not accuracy. The value below is reported as
// accuracy / test error, so the metric is selected explicitly.
val evaluator = (new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy"))
val lrAccuracy = evaluator.evaluate(lrPredictions)
println("Test Error = " + (1.0 - lrAccuracy))
4.超参数调优
利用 CrossValidator 确定最优的参数,包括最优主成分 PCA 的维数、分类器自身的参数等。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
// Names used in this section that the earlier import list does not bring in.
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Here `pca` is the unfitted estimator (no k set): it becomes a pipeline stage
// so that the number of components can be tuned by the cross-validator.
val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures")
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures")
val labelConverter = (new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels))
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)

// Stage order matters: pca is stage 0 and the logistic regression is stage 3 (used below).
val lrPipeline = new Pipeline().setStages(Array(pca, labelIndexer, featureIndexer, lr, labelConverter))

// Search grid: 6 PCA dimensions x 2 elastic-net mixes x 3 regularisation strengths.
val paramGrid = (new ParamGridBuilder()
  .addGrid(pca.k, Array(1, 2, 3, 4, 5, 6))
  .addGrid(lr.elasticNetParam, Array(0.2, 0.8))
  .addGrid(lr.regParam, Array(0.01, 0.1, 0.5))
  .build())

// 3-fold cross-validation over the whole pipeline. No metric name is set on this
// evaluator, so model selection uses the evaluator's default metric, as in the original.
val cv = (new CrossValidator()
  .setEstimator(lrPipeline)
  .setEvaluator(new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction"))
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3))
val cvModel = cv.fit(df)

// The raw `test` DataFrame is used here because PCA is part of the pipeline.
val lrPredictions = cvModel.transform(test)

// The value below is printed as accuracy, so the metric is selected explicitly
// (the evaluator's default metric is f1).
val evaluator = (new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy"))
val lrAccuracy = evaluator.evaluate(lrPredictions)
println("准确率为" + lrAccuracy)

// Inspect the winning pipeline: fitted logistic regression (stage 3) and fitted PCA (stage 0).
val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = bestModel.stages(3).asInstanceOf[LogisticRegressionModel]
println("Coefficients: " + lrModel.coefficientMatrix + "Intercept: " + lrModel.interceptVector +
  "numClasses: " + lrModel.numClasses + "numFeatures: " + lrModel.numFeatures)
val pcaModel = bestModel.stages(0).asInstanceOf[PCAModel]
println("Primary Component: " + pcaModel.pc)