Skip to content

Commit 343db39

Browse files
ganonp authored and mengxr committed
Added setMinCount to Word2Vec.scala
Wanted to customize the private minCount variable in the Word2Vec class. Added a method to do so.

Author: ganonp <[email protected]>

Closes #3693 from ganonp/my-custom-spark and squashes the following commits:

ad534f2 [ganonp] made norm method public
5110a6f [ganonp] Reorganized
854958b [ganonp] Fixed Indentation for setMinCount
12ed8f9 [ganonp] Update Word2Vec.scala
76bdf5a [ganonp] Update Word2Vec.scala
ffb88bb [ganonp] Update Word2Vec.scala
5eb9100 [ganonp] Added setMinCount to Word2Vec.scala
1 parent 6cf6fdf commit 343db39

File tree

2 files changed

+12
-5
lines changed

2 files changed

+12
-5
lines changed

mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ class Word2Vec extends Serializable with Logging {
7171
private var numPartitions = 1
7272
private var numIterations = 1
7373
private var seed = Utils.random.nextLong()
74-
74+
private var minCount = 5
75+
7576
/**
7677
* Sets vector size (default: 100).
7778
*/
@@ -114,6 +115,15 @@ class Word2Vec extends Serializable with Logging {
114115
this
115116
}
116117

118+
/**
119+
* Sets minCount, the minimum number of times a token must appear to be included in the word2vec
120+
* model's vocabulary (default: 5).
121+
*/
122+
def setMinCount(minCount: Int): this.type = {
123+
this.minCount = minCount
124+
this
125+
}
126+
117127
private val EXP_TABLE_SIZE = 1000
118128
private val MAX_EXP = 6
119129
private val MAX_CODE_LENGTH = 40
@@ -122,9 +132,6 @@ class Word2Vec extends Serializable with Logging {
122132
/** context words from [-window, window] */
123133
private val window = 5
124134

125-
/** minimum frequency to consider a vocabulary word */
126-
private val minCount = 5
127-
128135
private var trainWordsCount = 0
129136
private var vocabSize = 0
130137
private var vocab: Array[VocabWord] = null

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ object Vectors {
268268
* @param p norm.
269269
* @return norm in L^p^ space.
270270
*/
271-
private[spark] def norm(vector: Vector, p: Double): Double = {
271+
def norm(vector: Vector, p: Double): Double = {
272272
require(p >= 1.0)
273273
val values = vector match {
274274
case dv: DenseVector => dv.values

0 commit comments

Comments
 (0)