spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Meeraj Kunnumpurath <mee...@servicesymphony.com>
Subject Re: Nearest neighbour search
Date Sun, 13 Nov 2016 18:56:33 GMT
This is what I have done. Is there a better way of doing this?

  // Read the header-less CSV from "data"; the code below refers to its columns as _c0, _c1, _c2.
  val df = spark.read
    .option("header", "false")
    .csv("data")

  // Feature stages, chained by column name: _c2 -> words -> tf -> tf-idf.
  val tk = new Tokenizer()
    .setInputCol("_c2")
    .setOutputCol("words")

  val tf = new HashingTF()
    .setInputCol("words")
    .setOutputCol("tf")

  val idf = new IDF()
    .setInputCol("tf")
    .setOutputCol("tf-idf")

  // Tokenize first, then hash the tokens into term-frequency vectors.
  val df1 = {
    val tokenized = tk.transform(df)
    tf.transform(tokenized)
  }

  // Fit the IDF stage on the term frequencies, then apply it to the same data.
  val idfs = {
    val idfModel = idf.fit(df1)
    idfModel.transform(df1)
  }

  println(nearestNeighbour("http://dbpedia.org/resource/Barack_Obama", idfs))


  /**
   * Finds the row whose "tf-idf" vector scores highest against the row identified by `uri`
   * and returns that row's `_c1` value.
   *
   * The score is the dot product computed by `dorProduct`; only strictly positive scores
   * qualify, matching the original `dp > metric` test that started from 0.
   * NOTE(review): the thread asks for cosine similarity, but this function has always used
   * the raw dot product -- confirm which metric is intended before switching.
   *
   * The scoring runs as a distributed map and the best candidate is brought back to the
   * driver with `top(1)`. Mutating driver-side variables from inside `foreach` does not work:
   * each executor updates its own deserialized copy of the closure, so the driver never sees
   * the result.
   *
   * @param uri value of column `_c0` identifying the source row
   * @param ds  data frame with columns `_c0`, `_c1` and a "tf-idf" vector column
   * @return the `_c1` value of the best-scoring row other than the source row
   * @throws NoSuchElementException if no row has `_c0` equal to `uri`, or if no other row
   *                                has a positive score
   */
  def nearestNeighbour(uri: String, ds: DataFrame) : String = {
    // Column expressions instead of SQL strings, so quotes inside `uri` cannot break the filter.
    val tfIdfSrc = ds.filter(ds("_c0") === uri)
      .take(1)
      .headOption
      .map(_.getAs[Vector]("tf-idf"))
      .getOrElse(throw new NoSuchElementException(s"No row found with _c0 == '$uri'"))

    // Score every other row on the executors; keep only (score, _c1) so little data moves.
    val scored = ds.filter(ds("_c0") =!= uri).rdd
      .map(r => (dorProduct(tfIdfSrc, r.getAs[Vector]("tf-idf")), r.getAs[String]("_c1")))
      .filter { case (score, _) => score > 0 }

    scored.top(1)(Ordering.by[(Double, String), Double](_._1))
      .headOption
      .map { case (_, label) => label }
      .getOrElse(throw new NoSuchElementException(s"No neighbour with a positive score for '$uri'"))
  }


  /**
   * Computes the cosine similarity of two vectors: dot(a, b) / (|a| * |b|).
   *
   * @param vectorA first vector
   * @param vectorB second vector; must have the same size as `vectorA`
   * @return the cosine similarity, or 0.0 when either vector has zero magnitude
   *         (the plain formula would evaluate 0.0 / 0.0 and return NaN)
   * @throws IllegalArgumentException if the two vectors differ in size
   */
  def cosineSimilarity(vectorA: Vector, vectorB: Vector): Double = {
    require(
      vectorA.size == vectorB.size,
      s"Vector sizes differ: ${vectorA.size} vs ${vectorB.size}")

    // Local accumulators in a single pass: avoids allocating a tuple per component.
    var dotProduct = 0.0
    var normA = 0.0
    var normB = 0.0
    for (i <- 0 until vectorA.size) {
      val a = vectorA(i)
      val b = vectorB(i)
      dotProduct += a * b
      normA += a * a
      normB += b * b
    }

    val denominator = math.sqrt(normA) * math.sqrt(normB)
    // A zero-magnitude vector has no direction; report "no similarity" rather than NaN.
    if (denominator == 0.0) 0.0 else dotProduct / denominator
  }


  /**
   * Dot product of two vectors: the sum of `vectorA(i) * vectorB(i)` over every index of
   * `vectorA`.
   *
   * The name is spelled `dorProduct` (sic); `nearestNeighbour` calls it under that name.
   *
   * @param vectorA first vector; its size determines how many components are summed
   * @param vectorB second vector, indexed with the same positions as `vectorA`
   * @return the accumulated sum of component-wise products
   */
  def dorProduct(vectorA: Vector, vectorB: Vector) = {
    // Left fold keeps the same left-to-right summation order as an index loop.
    (0 until vectorA.size).foldLeft(0.0) { (sum, i) =>
      sum + vectorA(i) * vectorB(i)
    }
  }

On Sun, Nov 13, 2016 at 7:04 PM, Meeraj Kunnumpurath <
meeraj@servicesymphony.com> wrote:

> Hello,
>
> I have a dataset containing TF-IDF vectors for a corpus of documents. How
> do I perform a nearest neighbour search on the dataset, using cosine
> similarity?
>
>   val df = spark.read.option("header", "false").csv("data")
>
>   val tk = new Tokenizer().setInputCol("_c2").setOutputCol("words")
>
>   val tf = new HashingTF().setInputCol("words").setOutputCol("tf")
>
>   val idf = new IDF().setInputCol("tf").setOutputCol("tf-idf")
>
>   val df1 = tf.transform(tk.transform(df))
>
>   idf.fit(df1).transform(df1).select("tf-idf").show(10)
> Thank you
>
> --
> *Meeraj Kunnumpurath*
>
>
> *Director and Executive PrincipalService Symphony Ltd00 44 7702 693597*
>
> *00 971 50 409 0169meeraj@servicesymphony.com <meeraj@servicesymphony.com>*
>



-- 
*Meeraj Kunnumpurath*


*Director and Executive Principal, Service Symphony Ltd, 00 44 7702 693597*

*00 971 50 409 0169, meeraj@servicesymphony.com <meeraj@servicesymphony.com>*

Mime
View raw message