Skip to content

Commit 48d5e72

Browse files
committed
Functionality improvement
1 parent 0ad3ac1 commit 48d5e72

File tree

4 files changed

+139
-37
lines changed

4 files changed

+139
-37
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,12 @@ class PythonMLLibAPI extends Serializable {
291291
}
292292

293293
/**
294-
* Java stub for Python mllib Word2Vec fit().
295-
* @param dataBytesJRDD Input
294+
* Java stub for Python mllib Word2Vec fit(). This stub returns a
295+
* handle to the Java object instead of the content of the Java object.
296+
* Extra care needs to be taken in the Python code to ensure it gets freed on
297+
* exit; see the Py4J documentation.
298+
* @param dataBytesJRDD Input JavaRDD
299+
 * @return A handle to the Java Word2VecModel instance on the Python side
296300
*/
297301
def trainWord2Vec(
298302
dataBytesJRDD: JavaRDD[Array[Byte]]
@@ -304,19 +308,60 @@ class PythonMLLibAPI extends Serializable {
304308
}
305309

306310
/**
307-
* Java stub for Python mllib Word2VecModel
311+
* Java stub for Python mllib Word2VecModel transform
312+
* @param model Word2VecModel instance
313+
* @param word a word
314+
* @return serialized vector representation of word
308315
*/
309-
def Word2VecSynonynms(
316+
def Word2VecModelTransform(
317+
model: Word2VecModel,
318+
word: String
319+
): Array[Byte] = {
320+
SerDe.serializeDoubleVector(model.transform(word))
321+
}
322+
323+
/**
324+
* Java stub for Python mllib Word2VecModel findSynonyms
325+
* @param model Word2VecModel instance
326+
* @param word a word
327+
* @param num number of synonyms to find
328+
* @return a java LinkedList containing serialized version of
329+
* synonyms and similarities
330+
*/
331+
def Word2VecModelSynonyms(
310332
model: Word2VecModel,
311333
word: String,
312334
num: Int
313-
) = {
335+
): java.util.List[java.lang.Object] = {
314336
val result = model.findSynonyms(word, num)
315-
val vec = Vectors.dense(result.map(_._2))
316-
val words = result.map(_._1).toArray
337+
val similarity = Vectors.dense(result.map(_._2))
338+
val words = result.map(_._1)
339+
val ret = new java.util.LinkedList[java.lang.Object]()
340+
ret.add(SerDe.serializeSeqString(words))
341+
ret.add(SerDe.serializeDoubleVector(similarity))
342+
ret
343+
}
344+
345+
/**
346+
* Java stub for Python mllib Word2VecModel findSynonyms
347+
* @param model Word2VecModel instance
348+
* @param vecBytes serialization of vector representation of words
349+
* @param num number of synonyms to find
350+
* @return a java LinkedList containing serialized version of
351+
* synonyms and similarities
352+
*/
353+
def Word2VecModelSynonyms(
354+
model: Word2VecModel,
355+
vecBytes: Array[Byte],
356+
num: Int
357+
): java.util.List[java.lang.Object] = {
358+
val vec = SerDe.deserializeDoubleVector(vecBytes)
359+
val result = model.findSynonyms(vec, num)
360+
val similarity = Vectors.dense(result.map(_._2))
361+
val words = result.map(_._1)
317362
val ret = new java.util.LinkedList[java.lang.Object]()
318363
ret.add(SerDe.serializeSeqString(words))
319-
ret.add(SerDe.serializeDoubleVector(vec))
364+
ret.add(SerDe.serializeDoubleVector(similarity))
320365
ret
321366
}
322367

@@ -713,7 +758,7 @@ private[spark] object SerDe extends Serializable {
713758
}
714759

715760
private[python] def deserializeSeqString(bytes:Array[Byte]):Seq[String] = {
716-
require(bytes.length >=0, "Byte array too short")
761+
require(bytes.length >= 8, "Byte array too short")
717762
val seqLengthBytes = ByteBuffer.wrap(bytes, 0, 8)
718763
seqLengthBytes.order(ByteOrder.nativeOrder())
719764
val ib = seqLengthBytes.asIntBuffer()

mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,11 @@ class PythonMLLibAPISuite extends FunSuite {
7979
val empty2D = SerDe.to2dArray(emptyMatrix)
8080
assert(empty2D === Array[Array[Double]]())
8181
}
82+
83+
test("string seq serialization") {
84+
val original = Array[String]("abc", "def", "ghi")
85+
val bytes = SerDe.serializeSeqString(original)
86+
val ss = SerDe.deserializeSeqString(bytes)
87+
assert(ss === original)
88+
}
8289
}

python/pyspark/mllib/Word2Vec.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,17 @@
2020
"""
2121

2222
from pyspark.mllib._common import \
23-
_get_unmangled_double_vector_rdd, _get_unmangled_rdd, \
24-
_serialize_double, _deserialize_double_matrix, _deserialize_double_vector, \
23+
_serialize_double_vector, \
24+
_deserialize_double_vector, \
2525
_deserialize_string_seq, \
2626
_get_unmangled_string_seq_rdd
2727

2828
__all__ = ['Word2Vec', 'Word2VecModel']
2929

3030
class Word2VecModel(object):
31-
31+
"""
32+
class for Word2Vec model
33+
"""
3234
def __init__(self, sc, java_model):
3335
"""
3436
:param sc: Spark context
@@ -40,23 +42,38 @@ def __init__(self, sc, java_model):
4042
def __del__(self):
4143
self._sc._gateway.detach(self._java_model)
4244

43-
#def transform(self, word):
45+
def transform(self, word):
46+
pythonAPI = self._sc._jvm.PythonMLLibAPI()
47+
result = pythonAPI.Word2VecModelTransform(self._java_model, word)
48+
return _deserialize_double_vector(result)
4449

45-
#def findSynonyms(self, vector, num):
46-
47-
def findSynonyms(self, word, num):
50+
def findSynonyms(self, x, num):
4851
pythonAPI = self._sc._jvm.PythonMLLibAPI()
49-
result = pythonAPI.Word2VecSynonynms(self._java_model, word, num)
50-
similarity = _deserialize_double_vector(result[1])
52+
if type(x) == str:
53+
result = pythonAPI.Word2VecModelSynonyms(self._java_model, x, num)
54+
else:
55+
xSer = _serialize_double_vector(x)
56+
result = pythonAPI.Word2VecModelSynonyms(self._java_model, xSer, num)
5157
words = _deserialize_string_seq(result[0])
52-
ret = []
53-
for w,s in zip(words, similarity):
54-
ret.append((w,s))
55-
return ret
58+
similarity = _deserialize_double_vector(result[1])
59+
return zip(words, similarity)
5660

5761
class Word2Vec(object):
5862
"""
59-
data:RDD[Array[String]]
63+
Word2Vec creates vector representation of words in a text corpus.
64+
The algorithm first constructs a vocabulary from the corpus
65+
and then learns vector representation of words in the vocabulary.
66+
The vector representation can be used as features in
67+
natural language processing and machine learning algorithms.
68+
69+
We used skip-gram model in our implementation and hierarchical softmax
70+
method to train the model. The variable names in the implementation
71+
 match the original C implementation.
72+
For original C implementation, see https://code.google.com/p/word2vec/
73+
For research papers, see
74+
Efficient Estimation of Word Representations in Vector Space
75+
and
76+
Distributed Representations of Words and Phrases and their Compositionality.
6077
"""
6178
def __init__(self):
6279
self.vectorSize = 100
@@ -81,8 +98,23 @@ def setNumIterations(self, numIterations):
8198
return self
8299

83100
def fit(self, data):
101+
"""
102+
:param data: Input RDD
103+
"""
84104
sc = data.context
85105
dataBytes = _get_unmangled_string_seq_rdd(data)
86106
model = sc._jvm.PythonMLLibAPI().trainWord2Vec(dataBytes._jrdd)
87107
return Word2VecModel(sc, model)
88108

109+
def _test():
110+
import doctest
111+
from pyspark import SparkContext
112+
globs = globals().copy()
113+
globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
114+
(failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
115+
globs['sc'].stop()
116+
if failure_count:
117+
exit(-1)
118+
119+
if __name__ == "__main__":
120+
_test()

python/pyspark/mllib/_common.py

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def _serialize_double_vector(v):
144144
"wanted ndarray or SparseVector" % type(v))
145145

146146
def _serialize_string_seq(ss):
147-
"""Serialize a sequence of string"""
147+
"""Serialize a sequence of string."""
148148
seqLength = len(ss)
149149
totalLength = 0
150150
lengthArray = ndarray(shape=[seqLength], dtype=int32)
@@ -200,6 +200,31 @@ def _serialize_sparse_vector(v):
200200
return ba
201201

202202

203+
def _deserialize_string_seq(ba, offset=0):
204+
"""Deserialize a string sequence from a mutually understood format.
205+
>>> import sys
206+
    >>> _deserialize_string_seq(_serialize_string_seq(['abc'])) == ['abc']
207+
True
208+
"""
209+
if type(ba) != bytearray:
210+
raise TypeError("__deserialize_string_seq called on a %s; "
211+
"wanted bytearray" % type(ba))
212+
nb = len(ba) - offset
213+
if nb < 8:
214+
raise TypeError("__deserialize_string_seq called on a %d-byte array, "
215+
"which is too short" % nb)
216+
headers = ndarray(shape=[2], buffer=ba, offset=offset, dtype=int32)
217+
seqLength = headers[0]
218+
totalLength = headers[1]
219+
lengthArray = ndarray(shape=[seqLength], buffer=ba, offset=offset + 8, dtype=int32)
220+
offset = offset + 8 + 4 * seqLength
221+
ret = []
222+
for i in range(0, seqLength):
223+
curLen = lengthArray[i]
224+
ret.append(str(ba[offset: offset + curLen]))
225+
offset = offset + curLen
226+
return ret
227+
203228
def _deserialize_double(ba, offset=0):
204229
"""Deserialize a double from a mutually understood format.
205230
@@ -226,19 +251,6 @@ def _deserialize_double(ba, offset=0):
226251
return _unpack("d", ba[offset:])[0]
227252

228253

229-
def _deserialize_string_seq(ba, offset=0):
230-
nb = len(ba) - offset
231-
headers = ndarray(shape=[2], buffer=ba, offset=offset, dtype=int32)
232-
seqLength = headers[0]
233-
totalLength = headers[1]
234-
lengthArray = ndarray(shape=[seqLength], buffer=ba, offset=offset + 8, dtype=int32)
235-
offset = offset + 8 + 4 * seqLength
236-
ret = []
237-
for i in range(0, seqLength):
238-
ret.append(str(ba[offset: offset + lengthArray[i]]))
239-
offset = offset + lengthArray[i]
240-
return ret
241-
242254
def _deserialize_double_vector(ba, offset=0):
243255
"""Deserialize a double vector from a mutually understood format.
244256
@@ -400,6 +412,12 @@ def _get_unmangled_rdd(data, serializer, cache=True):
400412
return dataBytes
401413

402414
def _get_unmangled_string_seq_rdd(data, cache=True):
415+
"""
416+
Map a pickled Python RDD of Python string sequence to a Java RDD of
417+
Array[Byte]
418+
:param cache: If True, the serialized RDD is cached. (default = True)
419+
WARNING: Users should unpersist() this later!
420+
"""
403421
return _get_unmangled_rdd(data, _serialize_string_seq, cache)
404422

405423
def _get_unmangled_double_vector_rdd(data, cache=True):

0 commit comments

Comments
 (0)