
Commit cdef9f4

add missing comments
1 parent b7447eb commit cdef9f4

3 files changed, +58 −22 lines changed


mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 13 additions & 9 deletions
@@ -289,24 +289,28 @@ class PythonMLLibAPI extends Serializable {
    * handle to the Java object instead of the content of the Java object.
    * Extra care needs to be taken in the Python code to ensure it gets freed on
    * exit; see the Py4J documentation.
-   * @param dataJRDD Input JavaRDD
+   * @param dataJRDD input JavaRDD
+   * @param vectorSize size of vector
+   * @param learningRate initial learning rate
+   * @param numPartitions number of partitions
+   * @param numIterations number of iterations
+   * @param seed initial seed for random generator
    * @return A handle to java Word2VecModelWrapper instance at python side
    */
   def trainWord2Vec(
       dataJRDD: JavaRDD[java.util.ArrayList[String]],
       vectorSize: Int,
-      startingAlpha: Double,
+      learningRate: Double,
       numPartitions: Int,
       numIterations: Int,
-      seed: Long
-    ): Word2VecModelWrapper = {
+      seed: Long): Word2VecModelWrapper = {
     val data = dataJRDD.rdd.cache()
     val word2vec = new Word2Vec()
-      .setVectorSize(vectorSize)
-      .setLearningRate(startingAlpha)
-      .setNumPartitions(numPartitions)
-      .setNumIterations(numIterations)
-      .setSeed(seed)
+      .setVectorSize(vectorSize)
+      .setLearningRate(learningRate)
+      .setNumPartitions(numPartitions)
+      .setNumIterations(numIterations)
+      .setSeed(seed)
     val model = word2vec.fit(data)
     new Word2VecModelWrapper(model)
   }

mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala

Lines changed: 5 additions & 5 deletions
@@ -67,7 +67,7 @@ private case class VocabWord(
 class Word2Vec extends Serializable with Logging {
 
   private var vectorSize = 100
-  private var startingAlpha = 0.025
+  private var learningRate = 0.025
   private var numPartitions = 1
   private var numIterations = 1
   private var seed = Utils.random.nextLong()
@@ -84,7 +84,7 @@ class Word2Vec extends Serializable with Logging {
    * Sets initial learning rate (default: 0.025).
    */
   def setLearningRate(learningRate: Double): this.type = {
-    this.startingAlpha = learningRate
+    this.learningRate = learningRate
     this
   }
 
@@ -286,7 +286,7 @@ class Word2Vec extends Serializable with Logging {
     val syn0Global =
       Array.fill[Float](vocabSize * vectorSize)((initRandom.nextFloat() - 0.5f) / vectorSize)
     val syn1Global = new Array[Float](vocabSize * vectorSize)
-    var alpha = startingAlpha
+    var alpha = learningRate
     for (k <- 1 to numIterations) {
       val partial = newSentences.mapPartitionsWithIndex { case (idx, iter) =>
         val random = new XORShiftRandom(seed ^ ((idx + 1) << 16) ^ ((-k - 1) << 8))
@@ -300,8 +300,8 @@ class Word2Vec extends Serializable with Logging {
             lwc = wordCount
             // TODO: discount by iteration?
             alpha =
-              startingAlpha * (1 - numPartitions * wordCount.toDouble / (trainWordsCount + 1))
-            if (alpha < startingAlpha * 0.0001) alpha = startingAlpha * 0.0001
+              learningRate * (1 - numPartitions * wordCount.toDouble / (trainWordsCount + 1))
+            if (alpha < learningRate * 0.0001) alpha = learningRate * 0.0001
             logInfo("wordCount = " + wordCount + ", alpha = " + alpha)
           }
           wc += sentence.size
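
For context, the rename keeps the fluent builder style of the Scala API intact. Below is a minimal, hedged usage sketch of the renamed setter; the SparkContext sc and the corpus path data/corpus.txt are illustrative assumptions, not part of this commit.

import org.apache.spark.mllib.feature.Word2Vec

// Hypothetical corpus: one whitespace-tokenized sentence per line.
val sentences = sc.textFile("data/corpus.txt").map(_.split(" ").toSeq)

val model = new Word2Vec()
  .setVectorSize(100)       // dimensionality of the word vectors
  .setLearningRate(0.025)   // renamed from startingAlpha in this commit
  .setNumPartitions(1)
  .setNumIterations(1)
  .setSeed(42L)
  .fit(sentences)           // returns a Word2VecModel

The same defaults (vectorSize 100, learningRate 0.025, one partition, one iteration) are mirrored by the Python wrapper below.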

python/pyspark/mllib/Word2Vec.py

Lines changed: 40 additions & 8 deletions
@@ -18,10 +18,10 @@
 """
 Python package for Word2Vec in MLlib.
 """
-from numpy import random
-
 from sys import maxint
 
+from numpy import random
+
 from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
 
 from pyspark.mllib.linalg import _convert_to_vector
@@ -46,15 +46,22 @@ def __del__(self):
 
     def transform(self, word):
         """
-        local use only
+        :param word: a word
+        :return: vector representation of word
+
+        Note: local use only
         TODO: make transform usable in RDD operations from python side
         """
         result = self._java_model.transform(word)
         return PickleSerializer().loads(str(self._sc._jvm.SerDe.dumps(result)))
 
     def findSynonyms(self, x, num):
         """
-        local use only
+        :param x: a word or a vector representation of word
+        :param num: number of synonyms to find
+        :return: array of (word, cosineSimilarity)
+
+        Note: local use only
         TODO: make findSynonyms usable in RDD operations from python side
         """
         jlist = self._java_model.findSynonyms(x, num)
@@ -95,45 +102,70 @@ class Word2Vec(object):
     10
     """
     def __init__(self):
+        """
+        Construct Word2Vec instance
+        """
         self.vectorSize = 100
-        self.startingAlpha = 0.025
+        self.learningRate = 0.025
         self.numPartitions = 1
         self.numIterations = 1
         self.seed = random.randint(0, high=maxint)
 
     def setVectorSize(self, vectorSize):
+        """
+        Sets vector size (default: 100).
+        """
         self.vectorSize = vectorSize
         return self
 
     def setLearningRate(self, learningRate):
-        self.startingAlpha = learningRate
+        """
+        Sets initial learning rate (default: 0.025).
+        """
+        self.learningRate = learningRate
         return self
 
     def setNumPartitions(self, numPartitions):
+        """
+        Sets number of partitions (default: 1). Use a small number for accuracy.
+        """
         self.numPartitions = numPartitions
         return self
 
     def setNumIterations(self, numIterations):
+        """
+        Sets number of iterations (default: 1), which should be smaller than or equal to number of
+        partitions.
+        """
         self.numIterations = numIterations
         return self
 
     def setSeed(self, seed):
+        """
+        Sets random seed (default: a random long integer).
+        """
         self.seed = seed
         return self
 
     def fit(self, data):
+        """
+        Computes the vector representation of each word in vocabulary.
+
+        :param data: training data.
+        :return: python Word2VecModel instance
+        """
         sc = data.context
         ser = PickleSerializer()
         vectorSize = self.vectorSize
-        startingAlpha = self.startingAlpha
+        learningRate = self.learningRate
         numPartitions = self.numPartitions
         numIterations = self.numIterations
         seed = self.seed
 
         # cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
         model = sc._jvm.PythonMLLibAPI().trainWord2Vec(
             data._to_java_object_rdd(), vectorSize,
-            startingAlpha, numPartitions, numIterations, seed)
+            learningRate, numPartitions, numIterations, seed)
         return Word2VecModel(sc, model)
 
 
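
The Python transform and findSynonyms wrappers above delegate through Py4J to the same methods on the underlying Scala Word2VecModel. A short, hedged sketch of the equivalent Scala calls, reusing the model fitted in the earlier sketch; the query word "spark" is illustrative and is assumed to occur in the fitted vocabulary.

// `model` is the Word2VecModel fitted in the earlier sketch.
val vector = model.transform("spark")            // vector representation of the word
val synonyms = model.findSynonyms("spark", 5)    // Array of (word, cosineSimilarity)
synonyms.foreach { case (word, similarity) =>
  println(word + " " + similarity)
}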
