特征变换--->索引到标签的转换(IndexToString)
package Spark_MLlib

import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
import org.apache.spark.sql.SparkSession

/**
 * Feature transformation demo: encodes a string label column as numeric
 * indices with [[StringIndexer]], then maps those indices back to the
 * original strings with [[IndexToString]].
 */
object 特征变换_IndexToString {

  /** Local-mode session used by the demo; it is stopped when `main` finishes. */
  val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("IndexToString")
    .getOrCreate()

  /**
   * Runs the demo and prints three tables to stdout: the indexed data set,
   * the distinct label/index pairs, and the data set with the indices
   * converted back to strings.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      // Small in-memory sample: (id, label).
      val df = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "text"),
        (2, "text"),
        (3, "soyo"),
        (4, "text"),
        (5, "log"),
        (6, "log"),
        (7, "log")
      )).toDF("id", "label")

      // Fit the indexer on the string labels and append the numeric index column.
      val model = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("label_index")
        .fit(df)
      val indexed = model.transform(df)

      // Register a temp view so the result can be inspected with SQL.
      indexed.createOrReplaceTempView("soyo")
      spark.sql("select * from soyo ").show()
      // De-duplicated view of the label -> index mapping.
      spark.sql("select distinct label,label_index from soyo ").show()

      // Map the label-index column back to the original string labels.
      // No labels are set on the converter; per the Spark ML docs it then
      // takes them from the input column's metadata written by StringIndexer.
      val converter = new IndexToString()
        .setInputCol("label_index")
        .setOutputCol("original_index")
      val converted = converter.transform(indexed)
      converted.show()
    } finally {
      // Always release the session, even if one of the steps above throws.
      spark.stop()
    }
  }
}
结果:
+---+-----+-----------+
| id|label|label_index|
+---+-----+-----------+
|  0|  log|        0.0|
|  1| text|        1.0|
|  2| text|        1.0|
|  3| soyo|        2.0|
|  4| text|        1.0|
|  5|  log|        0.0|
|  6|  log|        0.0|
|  7|  log|        0.0|
+---+-----+-----------+
+-----+-----------+
|label|label_index|
+-----+-----------+
| soyo|        2.0|
| text|        1.0|
|  log|        0.0|
+-----+-----------+
+---+-----+-----------+--------------+
| id|label|label_index|original_index|
+---+-----+-----------+--------------+
|  0|  log|        0.0|           log|
|  1| text|        1.0|          text|
|  2| text|        1.0|          text|
|  3| soyo|        2.0|          soyo|
|  4| text|        1.0|          text|
|  5|  log|        0.0|           log|
|  6|  log|        0.0|           log|
|  7|  log|        0.0|           log|
+---+-----+-----------+--------------+