From 1d9ac29f7c5daeabb0abf4c605dc0671f389e91c Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Tue, 12 Apr 2016 12:58:48 -0700 Subject: [PATCH 1/8] added windowSize setter and getter for python word2Vec --- .../mllib/api/python/PythonMLLibAPI.scala | 4 +++- python/pyspark/ml/feature.py | 21 ++++++++++++++++--- python/pyspark/mllib/feature.py | 11 +++++++++- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 1a58779055f4..a5d17fee508e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -680,7 +680,8 @@ private[python] class PythonMLLibAPI extends Serializable { numPartitions: Int, numIterations: Int, seed: Long, - minCount: Int): Word2VecModelWrapper = { + minCount: Int, + windowSize: Int): Word2VecModelWrapper = { val word2vec = new Word2Vec() .setVectorSize(vectorSize) .setLearningRate(learningRate) @@ -688,6 +689,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setNumIterations(numIterations) .setSeed(seed) .setMinCount(minCount) + .setWindowSize(windowSize) try { val model = word2vec.fit(dataJRDD.rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)) new Word2VecModelWrapper(model) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 86b53285b5b0..99767c0d3882 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2176,7 +2176,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None): + seed=None, inputCol=None, outputCol=None, windowSize=5): """ __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \ seed=None, inputCol=None, outputCol=None) @@ -2184,14 +2184,14 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None) + seed=None, windowSize=5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.4.0") def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None): + seed=None, inputCol=None, outputCol=None, windowSize=5): """ setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \ inputCol=None, outputCol=None) @@ -2245,6 +2245,21 @@ def getMinCount(self): """ return self.getOrDefault(self.minCount) + @since("1.4.0") + def setWindowSize(self, value): + """ + Sets the value of :py:attr:`windowSize`. + """ + self._paramMap[self.windowSize] = value + return self + + @since("1.4.0") + def getWindowSize(self): + """ + Gets the value of windowSize or its default value. + """ + return self.getOrDefault(self.windowSize) + def _create_model(self, java_model): return Word2VecModel(java_model) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 612935352575..273e6aca03e8 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -606,6 +606,7 @@ def __init__(self): self.numIterations = 1 self.seed = random.randint(0, sys.maxsize) self.minCount = 5 + self.windowSize = 5 @since('1.2.0') def setVectorSize(self, vectorSize): @@ -658,6 +659,14 @@ def setMinCount(self, minCount): self.minCount = minCount return self + @since('2.0.0') + def setWindowSize(self, windowSize): + """ + Sets window size (default: 5). + """ + self.windowSize = windowSize + return self + @since('1.2.0') def fit(self, data): """ @@ -671,7 +680,7 @@ def fit(self, data): jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize), float(self.learningRate), int(self.numPartitions), int(self.numIterations), int(self.seed), - int(self.minCount)) + int(self.minCount), int(self.windowSize)) return Word2VecModel(jmodel) From 1e373aae7d7931ed24d61b9f3a0268f46adf76c2 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Thu, 14 Apr 2016 11:01:26 -0700 Subject: [PATCH 2/8] added windowSize test case --- python/pyspark/mllib/tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 5f515b666ce1..b7bad667a94b 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -21,6 +21,7 @@ import os import sys +sys.path = sys.path[1:] import tempfile import array as pyarray from time import time, sleep @@ -1026,7 +1027,8 @@ def test_word2vec_setters(self): .setNumPartitions(2) \ .setNumIterations(10) \ .setSeed(1024) \ - .setMinCount(3) + .setMinCount(3) \ + .setWindowSize(6) self.assertEqual(model.vectorSize, 2) self.assertTrue(model.learningRate < 0.02) self.assertEqual(model.numPartitions, 2) From 859dd1b2bb4d6a9c4339c7fd628eab4dc53ea980 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Thu, 14 Apr 2016 11:02:56 -0700 Subject: [PATCH 3/8] added windowSize test case --- python/pyspark/mllib/tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index b7bad667a94b..4582590a1396 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -21,7 +21,6 @@ import os import sys -sys.path = sys.path[1:] import tempfile import array as pyarray from time import time, sleep From 2f757c837377a0b24ad689ee139d20a6bfe92451 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Thu, 14 Apr 2016 11:35:16 -0700 Subject: [PATCH 4/8] added assertEqual for windowSize test case --- python/pyspark/mllib/tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 4582590a1396..2e83c809de65 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -1034,6 +1034,7 @@ def test_word2vec_setters(self): self.assertEqual(model.numIterations, 10) self.assertEqual(model.seed, 1024) self.assertEqual(model.minCount, 3) + self.assertEqual(model.windowSize, 6) def test_word2vec_get_vectors(self): data = [ From 2614fe89a6cf110f33a3cb3862be36e15742873d Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Thu, 14 Apr 2016 13:20:29 -0700 Subject: [PATCH 5/8] set correct version --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 99767c0d3882..0fafeec10f83 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2245,7 +2245,7 @@ def getMinCount(self): """ return self.getOrDefault(self.minCount) - @since("1.4.0") + @since("2.0.0") def setWindowSize(self, value): """ Sets the value of :py:attr:`windowSize`. @@ -2253,7 +2253,7 @@ def setWindowSize(self, value): self._paramMap[self.windowSize] = value return self - @since("1.4.0") + @since("2.0.0") def getWindowSize(self): """ Gets the value of windowSize or its default value. From 83bbdc10e073988b0664d29ff0f73fe7d2629e4b Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Thu, 14 Apr 2016 16:07:44 -0700 Subject: [PATCH 6/8] added a test case in ml --- .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 1 + python/pyspark/ml/feature.py | 7 +++++-- python/pyspark/ml/tests.py | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a5d17fee508e..32dc16de0846 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -671,6 +671,7 @@ private[python] class PythonMLLibAPI extends Serializable { * @param numPartitions number of partitions * @param numIterations number of iterations * @param seed initial seed for random generator + * @param windowSize size of window * @return A handle to java Word2VecModelWrapper instance at python side */ def trainWord2VecModel( diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 0fafeec10f83..0265692a4207 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2173,13 +2173,16 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has minCount = Param(Params._dummy(), "minCount", "the minimum number of times a token must appear to be included in the " + "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) + windowSize = Param(Params._dummy(), "windowSize", + "the window size of words", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5): """ __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \ - seed=None, inputCol=None, outputCol=None) + seed=None, inputCol=None, outputCol=None, windowSize=5) """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) @@ -2194,7 +2197,7 @@ def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, seed=None, inputCol=None, outputCol=None, windowSize=5): """ setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \ - inputCol=None, outputCol=None) + inputCol=None, outputCol=None, windowSize=5) Sets params for this Word2Vec. """ kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 2dcd5eeb52c2..dfaf8b2ef1b8 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -339,6 +339,12 @@ def test_param_property_error(self): params = param_store.params # should not invoke the property 'test_property' self.assertEqual(len(params), 1) + def test_word2vec_param(self): + model = Word2Vec() \ + .setWindowSize(6) + # Check windowSize is set properly + self.assertEqual(model.getWindowSize(), 6) + class FeatureTests(PySparkTestCase): From 417a7d48e3167bed319b8f2472971b61b4024590 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Fri, 15 Apr 2016 14:33:54 -0700 Subject: [PATCH 7/8] rephrase doc for windowSize --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 0265692a4207..7643dfcad7bd 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2174,7 +2174,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has "the minimum number of times a token must appear to be included in the " + "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) windowSize = Param(Params._dummy(), "windowSize", - "the window size of words", + "the window size (context words from [-window, window])", typeConverter=TypeConverters.toInt) @keyword_only From 5cdcf22cf0090e3f58b5f1b19c18c350cd6e2d5c Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Mon, 18 Apr 2016 09:59:23 -0700 Subject: [PATCH 8/8] changes made based on review comments --- python/pyspark/ml/feature.py | 4 ++-- python/pyspark/ml/tests.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 7643dfcad7bd..e8d565523ded 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2174,7 +2174,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has "the minimum number of times a token must appear to be included in the " + "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) windowSize = Param(Params._dummy(), "windowSize", - "the window size (context words from [-window, window])", + "the window size (context words from [-window, window]). Default value is 5", typeConverter=TypeConverters.toInt) @keyword_only @@ -2253,7 +2253,7 @@ def setWindowSize(self, value): """ Sets the value of :py:attr:`windowSize`. """ - self._paramMap[self.windowSize] = value + self._set(windowSize=value) return self @since("2.0.0") diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index dfaf8b2ef1b8..7ebb09ae8eed 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -340,8 +340,7 @@ def test_param_property_error(self): self.assertEqual(len(params), 1) def test_word2vec_param(self): - model = Word2Vec() \ - .setWindowSize(6) + model = Word2Vec().setWindowSize(6) # Check windowSize is set properly self.assertEqual(model.getWindowSize(), 6)