Scala余弦相似度

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable
import scala.collection.mutable.ListBuffer


/**
  * Character-level cosine similarity between two strings.
  *
  * Each string is turned into a vector of per-character occurrence counts over
  * the combined (sorted) set of characters of both strings; the similarity is
  * the cosine of the angle between those two vectors.
  *
  * Characters are compared as UTF-16 `Char` values, so a code point outside the
  * Basic Multilingual Plane is counted as its two surrogate halves.
  */
object test423_cosvec {

  /** Demo entry point: prints the similarity of two sample sentences. */
  def main(args: Array[String]): Unit = {
    val str1 = "听说菠萝就是凤梨"
    val str2 = "凤梨肯定不会是菠萝"

    val result = textCosine(str1, str2)
    println("两句话的余弦距离: " + result)
  }

  /**
    * Euclidean length (L2 norm) of a vector.
    *
    * @param vec the vector components
    * @return the square root of the sum of squared components; 0.0 for an empty vector
    */
  def module(vec: Vector[Double]): Double =
    math.sqrt(vec.map(x => x * x).sum)

  /**
    * Inner (dot) product of two vectors.
    *
    * If the vectors differ in length, only the leading components that exist in
    * both are multiplied; the surplus tail of the longer vector is ignored.
    *
    * @param v1 first vector
    * @param v2 second vector
    * @return the sum of the pairwise products; 0.0 when either vector is empty
    */
  def innerProduct(v1: Vector[Double], v2: Vector[Double]): Double =
    // Pair components by position in one linear pass instead of scanning the
    // whole i x j index space for the i == j diagonal.
    v1.iterator.zip(v2.iterator).map { case (a, b) => a * b }.sum

  /**
    * Cosine of the angle between two vectors.
    *
    * @param v1 first vector
    * @param v2 second vector
    * @return a value in [-1.0, 1.0]; 0.0 when either vector has zero length,
    *         because the angle is undefined there and "no similarity" is the
    *         safe answer
    */
  def cosvec(v1: Vector[Double], v2: Vector[Double]): Double = {
    val norms = module(v1) * module(v2)
    // A zero norm would make the division yield NaN, and NaN fails every
    // comparison, so guard it explicitly rather than relying on the clamp.
    if (norms == 0.0) 0.0
    // Floating-point rounding can push the ratio marginally outside [-1, 1].
    else math.max(-1.0, math.min(1.0, innerProduct(v1, v2) / norms))
  }

  /**
    * Cosine similarity of two strings based on character frequencies.
    *
    * Prints the combined character set and both count vectors to standard
    * output as diagnostics before returning.
    *
    * @param str1 first text
    * @param str2 second text
    * @return the cosine of the two character-count vectors; 0.0 if either
    *         string is empty
    */
  def textCosine(str1: String, str2: String): Double = {
    val set = mutable.Set[Char]() // every distinct character of both strings
    str1.foreach(set += _)
    str2.foreach(set += _)
    println(set)

    // Sort once so both count vectors share the same component order.
    val vocabulary = set.toVector.sorted

    val ints1: Vector[Double] = countVector(str1, vocabulary)
    println("===ints1: " + ints1)
    val ints2: Vector[Double] = countVector(str2, vocabulary)
    println("===ints2: " + ints2)

    cosvec(ints1, ints2)
  }

  /** Occurrence count of each vocabulary character in `text`, in vocabulary order. */
  private def countVector(text: String, vocabulary: Vector[Char]): Vector[Double] = {
    // Group once per string rather than rescanning the string for every character.
    val occurrences = text.groupBy(identity)
    vocabulary.map(ch => occurrences.get(ch).fold(0.0)(_.length.toDouble))
  }

}

 

posted @ 2019-06-28 16:46  等木鱼的猫  阅读(888)  评论(0)  编辑  收藏  举报