特征变换--->索引到标签的转换(IndexToString)

package Spark_MLlib

import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
import org.apache.spark.sql.SparkSession

/**
 * Demo of round-tripping a categorical string column through Spark ML:
 * `StringIndexer` encodes each distinct label as a numeric index, and
 * `IndexToString` maps that index column back to the original label text.
 */
object 特征变换_IndexToString {
  // Local-mode session shared by the demo; released at the end of `main`.
  val spark: SparkSession =
    SparkSession.builder().master("local").appName("IndexToString").getOrCreate()

  /**
   * Builds a small (id, label) DataFrame, indexes the labels, prints the
   * indexed rows and the distinct label/index pairs, then converts the
   * indices back to labels and prints the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      val df = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "text"),
        (2, "text"),
        (3, "soyo"),
        (4, "text"),
        (5, "log"),
        (6, "log"),
        (7, "log")
      )).toDF("id", "label")

      // Fit on the data so the indexer learns the set of distinct labels,
      // then append the numeric "label_index" column.
      val model = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("label_index")
        .fit(df)
      val indexed = model.transform(df)

      // Expose the indexed frame to SQL so it can be inspected with queries.
      indexed.createOrReplaceTempView("soyo")
      spark.sql("select * from soyo ").show()
      // Distinct label -> index pairs (deduplicated).
      spark.sql("select distinct label,label_index from soyo ").show()

      // Map the index column back to the original string labels. No labels are
      // set explicitly here, so the converter relies on the label metadata that
      // StringIndexer attached to "label_index".
      val converter = new IndexToString()
        .setInputCol("label_index")
        .setOutputCol("original_index")
      val converted = converter.transform(indexed)
      converted.show()
    } finally {
      // Always release the Spark context, even if a step above fails;
      // otherwise the local session and its threads are leaked.
      spark.stop()
    }
  }
}

结果:

+---+-----+-----------+
| id|label|label_index|
+---+-----+-----------+
|  0|  log|        0.0|
|  1| text|        1.0|
|  2| text|        1.0|
|  3| soyo|        2.0|
|  4| text|        1.0|
|  5|  log|        0.0|
|  6|  log|        0.0|
|  7|  log|        0.0|
+---+-----+-----------+

+-----+-----------+
|label|label_index|
+-----+-----------+
| soyo|        2.0|
| text|        1.0|
|  log|        0.0|
+-----+-----------+

+---+-----+-----------+--------------+
| id|label|label_index|original_index|
+---+-----+-----------+--------------+
|  0|  log|        0.0|           log|
|  1| text|        1.0|          text|
|  2| text|        1.0|          text|
|  3| soyo|        2.0|          soyo|
|  4| text|        1.0|          text|
|  5|  log|        0.0|           log|
|  6|  log|        0.0|           log|
|  7|  log|        0.0|           log|
+---+-----+-----------+--------------+


posted @ 2017-10-31 19:11  soyosuyang  阅读(1058)  评论(0)  编辑  收藏  举报