From cb12ad4af2c649d88646e4f4548fb208d20e88e0 Mon Sep 17 00:00:00 2001 From: lawlietAi <627766098@qq.com> Date: Tue, 20 Jun 2017 10:56:12 +0800 Subject: [PATCH 1/3] Update Word2Vec.scala the word2vec model needs an independent function to calculate the cosine similarity.we also desire a function to transform the single document to a vector.so i contribute the two function. --- .../apache/spark/ml/feature/Word2Vec.scala | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index b6909b3386b71..bf75292c35a2e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -273,6 +273,30 @@ class Word2VecModel private[ml] ( def findSynonymsArray(word: String, num: Int): Array[(String, Double)] = { wordVectors.findSynonyms(word, num) } + + + def doc2Vector(text: String, d: Int): SDV = { + val bVectors = wordVectors.getVectors.collect() + val textArray = text.split(" ") + var sum = Vectors.zeros(d) + textArray.foreach { word => + bVectors.value.filter(_.getAs[String]("word") == word).foreach { v => + val sv = v.getAs[SDV]("vector") + BLAS.axpy(1.0, sv, sum) + } + } + BLAS.scal(1.0 / textArray.size, sum) + sum.toDense + } + + def cosineSimilarity(v1: SDV, v2: SDV): Double = { + val bdv1 = new BDV[Double](v1.values) + val bdv2 = new BDV[Double](v2.values) + val modeV1 = sqrt(bdv1 dot bdv1) + val modeV2 = sqrt(bdv2 dot bdv2) + val v1DOTv2 = bdv1 dot bdv2 + v1DOTv2 / (modeV1 * modeV2) + } /** @group setParam */ @Since("1.4.0") From 3fc03d0239490e9d3db71256475974e998f9091f Mon Sep 17 00:00:00 2001 From: lawlietAi <627766098@qq.com> Date: Tue, 20 Jun 2017 11:16:07 +0800 Subject: [PATCH 2/3] Update Word2Vec.scala --- .../src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index bf75292c35a2e..b7892813192b7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT, DenseVector => SDV} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ @@ -32,7 +32,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.util.{Utils, VersionUtils} - +import breeze.linalg.{DenseVector => BDV} /** * Params for [[Word2Vec]] and [[Word2VecModel]]. */ From d204612a8159cd0672633c753e75335cc99da7ff Mon Sep 17 00:00:00 2001 From: lawlietAi <627766098@qq.com> Date: Tue, 20 Jun 2017 14:18:06 +0800 Subject: [PATCH 3/3] Update Word2Vec.scala --- .../scala/org/apache/spark/ml/feature/Word2Vec.scala | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index b7892813192b7..d5debea0c8284 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -274,13 +274,14 @@ class Word2VecModel private[ml] ( wordVectors.findSynonyms(word, num) } - - def doc2Vector(text: String, d: Int): SDV = { - val bVectors = wordVectors.getVectors.collect() - val textArray = text.split(" ") + /** + * using model.getVectors can get the wordVectors then you must convert the DataFrame + * to an Array. + */ + def doc2Vector(textArray: Array[String], d: Int, wordVectors: Array[Row]): SDV = { var sum = Vectors.zeros(d) textArray.foreach { word => - bVectors.value.filter(_.getAs[String]("word") == word).foreach { v => + wordVectors.value.filter(_.getAs[String]("word") == word).foreach { v => val sv = v.getAs[SDV]("vector") BLAS.axpy(1.0, sv, sum) }