特征变换--->标签到索引的转换(StringIndexer)

package Spark_MLlib

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession


/**
 * Demonstrates Spark ML's `StringIndexer`, which maps a string label column
 * to a numeric index column.
 *
 * Running `main` fits the indexer on a small in-memory DataFrame, prints the
 * learned labels, and shows the DataFrame with the added `type_index` column.
 */
object 特征变换_StringIndexer {
  // Local session using two worker threads; created when this object is initialised.
  val spark: SparkSession =
    SparkSession.builder().master("local[2]").appName("标签和索引的转换").getOrCreate()

  /**
   * Builds the sample data, fits a `StringIndexer` on the `type` column and
   * prints both the label ordering and the transformed DataFrame.
   *
   * The session is stopped when this method returns or throws, so the local
   * Spark context does not outlive the demo.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      val df = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "text"),
        (2, "text"),
        (3, "soyo"),
        (4, "text"),
        (5, "log"),
        (6, "log"),
        (7, "log")
      )).toDF("id", "type")

      val indexer = new StringIndexer().setInputCol("type").setOutputCol("type_index")
      val model = indexer.fit(df)

      // Labels are listed in descending order of frequency (most frequent first).
      model.labels.foreach(println)

      // The most frequent label gets the lowest index, so "log" maps to 0.0 here.
      val indexed = model.transform(df)
      indexed.show(false)
    } finally {
      // Release the Spark context even if fitting or transforming fails.
      spark.stop()
    }
  }
}

结果:

log
text
soyo
+---+----+----------+
|id |type|type_index|
+---+----+----------+
|0  |log |0.0       |
|1  |text|1.0       |
|2  |text|1.0       |
|3  |soyo|2.0       |
|4  |text|1.0       |
|5  |log |0.0       |
|6  |log |0.0       |
|7  |log |0.0       |
+---+----+----------+

posted @ 2017-10-31 11:03  soyosuyang  阅读(1763)  评论(0)  编辑  收藏  举报