Implementing TF-IDF in Spark
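Both the built-in and the hand-rolled variants below compute the standard TF-IDF weight. For a word w in document d, over a corpus of N documents:

    tf-idf(w, d) = tf(w, d) * idf(w),  where idf(w) = log(N / (df(w) + 1))

Here tf(w, d) is the number of times w occurs in d, and df(w) is the number of documents containing w. Spark's IDF uses the natural log of (N + 1) / (df(w) + 1); the hand-rolled version at the end of the listing uses log10(N / (df(w) + 1)), which differs only by a constant factor and the +1 smoothing on N.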
The Scala code:
package offline

import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object TfIdfTransform {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("tf-idf")
      .enableHiveSupport()
      .getOrCreate()

    // All documents have already been merged into one file (allfiles.txt)
    // and loaded into the Hive table badou.news_seg.
    val df = spark.sql("select sentence from badou.news_seg")
    // Each row is "tokens##@@##label"; keep only the space-separated tokens.
    val df_seg = df.selectExpr("split(split(sentence,'##@@##')[0],' ') as seg")
    val doc_size = df_seg.count()

    // Spark's built-in TF-IDF.
    // HashingTF maps a document [word1, word2, ...] to
    // SparseVector(2^18, [hash(word) as index], [word count]).
    // setBinary(true) would record 0/1 term presence instead of raw counts.
    val hashingtf = new HashingTF().setBinary(false)
      .setInputCol("seg").setOutputCol("feature_tf")
      .setNumFeatures(1 << 18) // 262,144 hash buckets
    val df_tf = hashingtf.transform(df_seg).select("feature_tf")

    // IDF re-weights each term by its inverse document frequency.
    val idf = new IDF().setInputCol("feature_tf").setOutputCol("feature_tfidf")
      .setMinDocFreq(2)
    val idfModel = idf.fit(df_tf)
    val df_tfIdf = idfModel.transform(df_tf).select("feature_tfidf")

    // Hand-rolled TF-IDF.
    // 1. Document frequency: take the distinct words of each document,
    //    then count how many documents each word appears in (the vocabulary).
    val setUDF = udf((str: String) => str.split(" ").distinct)
    val df_set = df.withColumn("words_set", setUDF(col("sentence")))
    val docFreq_map = df_set.select(explode(col("words_set")).as("word"))
      .groupBy("word").count().rdd.map(x => (x(0).toString, x(1).toString))
      .collectAsMap()
    val wordEncode = docFreq_map.keys.zipWithIndex.toMap // word -> index in [0, dictSize)
    val dictSize = docFreq_map.size // 42,363 distinct words in this corpus

    // 2. Term frequency: count word occurrences per document (one row = one
    //    article), then combine tf and idf into a sparse vector.
    val mapUDF = udf { (str: String) =>
      val tfMap = str.split("##@@##")(0).split(" ")
        .map((_, 1L)).groupBy(_._1).mapValues(_.length)
      val tfIDFMap = tfMap.map { x =>
        val idf_v = math.log10(doc_size.toDouble / (docFreq_map.getOrElse(x._1, "0.0").toDouble + 1.0))
        // Words missing from the vocabulary fall back to index 0.
        (wordEncode.getOrElse(x._1, 0), x._2.toDouble * idf_v)
      }
      Vectors.sparse(dictSize, tfIDFMap.toSeq)
    }
    val dfTF = df.withColumn("tf_idf", mapUDF(col("sentence")))
  }
}
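To sanity-check the built-in pipeline without the Hive table, it can be run against a small in-memory DataFrame. A minimal sketch, assuming local mode and made-up, already-tokenized documents (the object name TfIdfToyExample and the sample data are invented for illustration):

import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.sql.SparkSession

object TfIdfToyExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("tf-idf-toy")
      .master("local[*]") // local mode for a quick test
      .getOrCreate()
    import spark.implicits._

    // Three tiny pre-tokenized "documents" (made-up data).
    val docs = Seq(
      Seq("spark", "tf", "idf"),
      Seq("spark", "hashing", "trick"),
      Seq("spark", "idf", "weights", "rare", "words")
    ).toDF("seg")

    // Same two-stage pipeline as above, with a smaller hash space.
    val tf = new HashingTF()
      .setInputCol("seg").setOutputCol("feature_tf")
      .setNumFeatures(1 << 10)
    val tfDF = tf.transform(docs)

    val idfModel = new IDF()
      .setInputCol("feature_tf").setOutputCol("feature_tfidf")
      .fit(tfDF)
    idfModel.transform(tfDF).select("feature_tfidf").show(truncate = false)

    spark.stop()
  }
}

In the printed vectors, a word like "spark" that occurs in every document gets IDF log((3 + 1) / (3 + 1)) = 0 and drops out entirely, while rarer words keep positive weights, which is exactly the down-weighting TF-IDF is designed to provide.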