From 5eb91000cd74ddd7704c79ca69259ee48c5840f9 Mon Sep 17 00:00:00 2001 From: ganonp Date: Sun, 14 Dec 2014 15:56:19 -0600 Subject: [PATCH 1/7] Added setMinCount to Word2Vec.scala Wanted to customize the minCount variable in the Word2Vec class. Added a method to do so. --- .../scala/org/apache/spark/mllib/feature/Word2Vec.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 7960f3cab576f..77b229355d7af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -122,8 +122,13 @@ class Word2Vec extends Serializable with Logging { /** context words from [-window, window] */ private val window = 5 - /** minimum frequency to consider a vocabulary word */ - private val minCount = 5 +/** minimum frequency to consider a vocabulary word */ +private var minCount = 5 + +def setMinCount(minCount: Int): this.type = { +this.minCount = minCount +this +} private var trainWordsCount = 0 private var vocabSize = 0 From ffb88bbcd389cd04bccc60ca0f655fe29e0ad464 Mon Sep 17 00:00:00 2001 From: ganonp Date: Mon, 15 Dec 2014 18:21:29 -0600 Subject: [PATCH 2/7] Update Word2Vec.scala Added javadoc --- .../apache/spark/mllib/feature/Word2Vec.scala | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 77b229355d7af..249c75b10ecf3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -121,14 +121,17 @@ class Word2Vec extends Serializable with Logging { /** context words from [-window, window] */ private val window = 5 - -/** minimum frequency to consider a vocabulary word */ -private var minCount = 5 - -def setMinCount(minCount: Int): this.type = { -this.minCount = minCount -this -} + + /** minimum frequency to consider a vocabulary word */ + private var minCount = 5 + + /** Sets the minimum frequency a token must appear to be included in the word2vec model's + * vocabulary (default: 5). + */ + def setMinCount(minCount: Int): this.type = { + this.minCount = minCount + this + } private var trainWordsCount = 0 private var vocabSize = 0 From 76bdf5a207f74144d2964b6a34fcf74547923912 Mon Sep 17 00:00:00 2001 From: ganonp Date: Mon, 15 Dec 2014 18:39:01 -0600 Subject: [PATCH 3/7] Update Word2Vec.scala --- .../scala/org/apache/spark/mllib/feature/Word2Vec.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 249c75b10ecf3..83975e9d57390 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -122,10 +122,12 @@ class Word2Vec extends Serializable with Logging { /** context words from [-window, window] */ private val window = 5 - /** minimum frequency to consider a vocabulary word */ + /** The minimum number of times a token must occur in the training corpus to be + * included in the word2vec model (default: 5). + */ private var minCount = 5 - /** Sets the minimum frequency a token must appear to be included in the word2vec model's + /** Sets minCount, the minimum number of times a token must appear to be included in the word2vec model's * vocabulary (default: 5). */ def setMinCount(minCount: Int): this.type = { From 12ed8f9e2376a55bf58a798c4d3f7e016892b72f Mon Sep 17 00:00:00 2001 From: ganonp Date: Wed, 17 Dec 2014 18:32:23 -0600 Subject: [PATCH 4/7] Update Word2Vec.scala --- .../org/apache/spark/mllib/feature/Word2Vec.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 83975e9d57390..fe5ff996d4447 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -122,14 +122,16 @@ class Word2Vec extends Serializable with Logging { /** context words from [-window, window] */ private val window = 5 - /** The minimum number of times a token must occur in the training corpus to be - * included in the word2vec model (default: 5). - */ + /** + * The minimum number of times a token must occur in the training corpus to be + * included in the word2vec model (default: 5). + */ private var minCount = 5 - /** Sets minCount, the minimum number of times a token must appear to be included in the word2vec model's - * vocabulary (default: 5). - */ + /** + * Sets minCount, the minimum number of times a token must appear to be included in the word2vec + * model's vocabulary (default: 5). + */ def setMinCount(minCount: Int): this.type = { this.minCount = minCount this From 854958bc92e76b9c5cc6a4cd2fda57403f05b65c Mon Sep 17 00:00:00 2001 From: ganonp Date: Thu, 18 Dec 2014 09:12:19 -0600 Subject: [PATCH 5/7] Fixed Indentation for setMinCount --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index fe5ff996d4447..c2b81e67740a3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -133,8 +133,8 @@ class Word2Vec extends Serializable with Logging { * model's vocabulary (default: 5). */ def setMinCount(minCount: Int): this.type = { - this.minCount = minCount - this + this.minCount = minCount + this } private var trainWordsCount = 0 From 5110a6f5641af2f9461b4fc8c043b700de3aa4f3 Mon Sep 17 00:00:00 2001 From: ganonp Date: Fri, 19 Dec 2014 17:46:47 -0600 Subject: [PATCH 6/7] Reorganized Moved mincount variable to top and removed its javadoc and moved setMinCount below other set methods. --- .../apache/spark/mllib/feature/Word2Vec.scala | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index c2b81e67740a3..d25a7cd5b439d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -71,7 +71,8 @@ class Word2Vec extends Serializable with Logging { private var numPartitions = 1 private var numIterations = 1 private var seed = Utils.random.nextLong() - + private var minCount = 5 + /** * Sets vector size (default: 100). */ @@ -114,20 +115,6 @@ class Word2Vec extends Serializable with Logging { this } - private val EXP_TABLE_SIZE = 1000 - private val MAX_EXP = 6 - private val MAX_CODE_LENGTH = 40 - private val MAX_SENTENCE_LENGTH = 1000 - - /** context words from [-window, window] */ - private val window = 5 - - /** - * The minimum number of times a token must occur in the training corpus to be - * included in the word2vec model (default: 5). - */ - private var minCount = 5 - /** * Sets minCount, the minimum number of times a token must appear to be included in the word2vec * model's vocabulary (default: 5). @@ -136,6 +123,14 @@ class Word2Vec extends Serializable with Logging { this.minCount = minCount this } + + private val EXP_TABLE_SIZE = 1000 + private val MAX_EXP = 6 + private val MAX_CODE_LENGTH = 40 + private val MAX_SENTENCE_LENGTH = 1000 + + /** context words from [-window, window] */ + private val window = 5 private var trainWordsCount = 0 private var vocabSize = 0 From ad534f26c44a7bdc8ee91f73d80a93bd13aa6805 Mon Sep 17 00:00:00 2001 From: ganonp Date: Sat, 20 Dec 2014 01:05:37 -0600 Subject: [PATCH 7/7] made norm method public --- .../src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 47d1a76fa361d..01f3f90577142 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -268,7 +268,7 @@ object Vectors { * @param p norm. * @return norm in L^p^ space. */ - private[spark] def norm(vector: Vector, p: Double): Double = { + def norm(vector: Vector, p: Double): Double = { require(p >= 1.0) val values = vector match { case dv: DenseVector => dv.values