Commit c867fdf

add Word2Vec to pyspark
1 parent 7db5339 commit c867fdf

3 files changed: +204 -0 lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

78 additions & 0 deletions
@@ -36,6 +36,8 @@ import org.apache.spark.mllib.tree.impurity._
 import org.apache.spark.mllib.tree.model.DecisionTreeModel
 import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
+import org.apache.spark.mllib.feature.Word2Vec
+import org.apache.spark.mllib.feature.Word2VecModel
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
@@ -288,6 +290,37 @@ class PythonMLLibAPI extends Serializable {
     ALS.trainImplicit(ratings, rank, iterations, lambda, blocks, alpha)
   }

+  /**
+   * Java stub for Python mllib Word2Vec fit().
+   * @param dataBytesJRDD input RDD of serialized word sequences
+   */
+  def trainWord2Vec(dataBytesJRDD: JavaRDD[Array[Byte]]): Word2VecModel = {
+    val data = dataBytesJRDD.rdd.map(SerDe.deserializeSeqString)
+    val word2vec = new Word2Vec()
+    word2vec.fit(data)
+  }
+
+  /**
+   * Java stub for Python mllib Word2VecModel.findSynonyms().
+   * Returns a two-element list: the serialized synonym words, then the
+   * serialized vector of their similarity scores.
+   */
+  def Word2VecSynonyms(
+      model: Word2VecModel,
+      word: String,
+      num: Int): java.util.List[java.lang.Object] = {
+    val result = model.findSynonyms(word, num)
+    val words = result.map(_._1)
+    val similarity = Vectors.dense(result.map(_._2))
+    val ret = new java.util.LinkedList[java.lang.Object]()
+    ret.add(SerDe.serializeSeqString(words))
+    ret.add(SerDe.serializeDoubleVector(similarity))
+    ret
+  }
+
   /**
    * Java stub for Python mllib DecisionTree.train().
    * This stub returns a handle to the Java object instead of the content of the Java object.
@@ -659,6 +692,51 @@ private[spark] object SerDe extends Serializable {
     bytes
   }

+  /**
+   * Serialize a sequence of strings.
+   * Layout: [seqLength: Int][totalLength: Int][length of each string: Int * seqLength][string bytes].
+   * Note: uses String.length as the byte count, so single-byte characters are assumed.
+   */
+  private[python] def serializeSeqString(ss: Seq[String]): Array[Byte] = {
+    val seqLength = ss.length
+    var totalLength = 0
+    for (s <- ss) {
+      totalLength += s.length
+    }
+    val bytes = new Array[Byte](8 + 4 * seqLength + totalLength)
+    val bb = ByteBuffer.wrap(bytes)
+    bb.order(ByteOrder.nativeOrder())
+    bb.putInt(seqLength)
+    bb.putInt(totalLength)
+    for (i <- 0 until seqLength) {
+      bb.putInt(ss(i).length)
+    }
+    for (s <- ss) {
+      bb.put(s.getBytes())
+    }
+    bytes
+  }
+
+  private[python] def deserializeSeqString(bytes: Array[Byte]): Seq[String] = {
+    require(bytes.length >= 8, "Byte array too short")
+    val headerBytes = ByteBuffer.wrap(bytes, 0, 8)
+    headerBytes.order(ByteOrder.nativeOrder())
+    val ib = headerBytes.asIntBuffer()
+    val seqLength = ib.get()
+    val totalLength = ib.get()
+    val lengthBytes = ByteBuffer.wrap(bytes, 8, 4 * seqLength)
+    lengthBytes.order(ByteOrder.nativeOrder())
+    val stringBytes = ByteBuffer.wrap(bytes, 8 + 4 * seqLength, totalLength)
+    stringBytes.order(ByteOrder.nativeOrder())
+    val ss = new Array[String](seqLength)
+    val lengthBuffer = lengthBytes.asIntBuffer()
+    var index = 0
+    while (lengthBuffer.hasRemaining()) {
+      val curLen = lengthBuffer.get()
+      val content = new Array[Byte](curLen)
+      stringBytes.get(content, 0, curLen)
+      ss(index) = new String(content)
+      index += 1
+    }
+    ss.toSeq
+  }
+
   private[python] def serializeLabeledPoint(p: LabeledPoint): Array[Byte] = {
     val fb = serializeDoubleVector(p.features)
     val bytes = new Array[Byte](1 + 8 + fb.length)
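
To make the wire format concrete: serializeSeqString writes two int32 header fields (element count and total payload length), then one int32 length per string, then the raw string bytes, all in native byte order. A minimal Python sketch of the same layout (an illustration only; encode_seq_string is hypothetical and assumes ASCII strings, as the Scala code does):

import struct

def encode_seq_string(strings):
    # Mirrors SerDe.serializeSeqString: header ints, per-string lengths, payload.
    payload = b"".join(s.encode("ascii") for s in strings)
    header = struct.pack("=ii", len(strings), len(payload))
    lengths = struct.pack("=%di" % len(strings), *[len(s) for s in strings])
    return header + lengths + payload

# ["hi", "spark"] -> ints 2, 7, then 2, 5, then b"hispark": 23 bytes total.
print(repr(encode_seq_string(["hi", "spark"])))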

python/pyspark/mllib/Word2Vec.py

88 additions & 0 deletions
@@ -0,0 +1,88 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Python package for Word2Vec in MLlib.
+"""
+
+from pyspark.mllib._common import \
+    _deserialize_double_vector, _deserialize_string_seq, \
+    _get_unmangled_string_seq_rdd
+
+__all__ = ['Word2Vec', 'Word2VecModel']
+
+
+class Word2VecModel(object):
+
+    def __init__(self, sc, java_model):
+        """
+        :param sc: Spark context
+        :param java_model: Handle to Java model object
+        """
+        self._sc = sc
+        self._java_model = java_model
+
+    def __del__(self):
+        self._sc._gateway.detach(self._java_model)
+
+    # TODO: transform(self, word) and findSynonyms(self, vector, num)
+
+    def findSynonyms(self, word, num):
+        """Find num words closest in similarity to the given word."""
+        pythonAPI = self._sc._jvm.PythonMLLibAPI()
+        result = pythonAPI.Word2VecSynonyms(self._java_model, word, num)
+        words = _deserialize_string_seq(result[0])
+        similarity = _deserialize_double_vector(result[1])
+        return zip(words, similarity)
+
+
+class Word2Vec(object):
+    """
+    Word2Vec creates vector representations of words.
+    fit() expects an RDD whose elements are lists of words.
+    """
+    def __init__(self):
+        self.vectorSize = 100
+        self.startingAlpha = 0.025
+        self.numPartitions = 1
+        self.numIterations = 1
+
+    def setVectorSize(self, vectorSize):
+        self.vectorSize = vectorSize
+        return self
+
+    def setLearningRate(self, learningRate):
+        self.startingAlpha = learningRate
+        return self
+
+    def setNumPartitions(self, numPartitions):
+        self.numPartitions = numPartitions
+        return self
+
+    def setNumIterations(self, numIterations):
+        self.numIterations = numIterations
+        return self
+
+    def fit(self, data):
+        sc = data.context
+        dataBytes = _get_unmangled_string_seq_rdd(data)
+        model = sc._jvm.PythonMLLibAPI().trainWord2Vec(dataBytes._jrdd)
+        return Word2VecModel(sc, model)
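
A minimal usage sketch for the new API (assuming a live SparkContext sc, e.g. from the pyspark shell; the corpus is a toy example and the returned synonyms depend entirely on the training data). Note also that in this commit the setter values live on the Python side only; trainWord2Vec does not yet forward them, so training uses the Scala defaults.

from pyspark.mllib.Word2Vec import Word2Vec

# A toy corpus: an RDD whose elements are lists of words, repeated so
# every word occurs often enough to be kept in the vocabulary.
sentences = sc.parallelize(
    [["apache", "spark", "mllib"], ["spark", "word2vec", "embedding"]] * 100)

model = Word2Vec().setVectorSize(10).setNumIterations(5).fit(sentences)
# Up to two (word, similarity) pairs closest to the query word.
for word, similarity in model.findSynonyms("spark", 2):
    print word, similarity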

python/pyspark/mllib/_common.py

38 additions & 0 deletions
@@ -143,6 +143,29 @@ def _serialize_double_vector(v):
         raise TypeError("_serialize_double_vector called on a %s; "
                         "wanted ndarray or SparseVector" % type(v))

+
+def _serialize_string_seq(ss):
+    """Serialize a sequence of strings.
+
+    Layout: [seqLength: int32][totalLength: int32]
+            [per-string lengths: int32 * seqLength][string bytes].
+    """
+    seqLength = len(ss)
+    totalLength = 0
+    lengthArray = ndarray(shape=[seqLength], dtype=int32)
+    for i, s in enumerate(ss):
+        lengthArray[i] = len(s)
+        totalLength += len(s)
+    ba = bytearray(4 + 4 + 4 * seqLength + totalLength)
+    header_bytes = ndarray(shape=[2], buffer=ba, offset=0, dtype=int32)
+    header_bytes[0] = seqLength
+    header_bytes[1] = totalLength
+    _copyto(lengthArray, buffer=ba, offset=8, shape=[seqLength], dtype=int32)
+    offset = 4 + 4 + 4 * seqLength
+    for i, s in enumerate(ss):
+        ba[offset:offset + lengthArray[i]] = bytes(s)
+        offset += lengthArray[i]
+    return ba

 def _serialize_dense_vector(v):
     """Serialize a dense vector given as a NumPy array."""
@@ -203,6 +226,19 @@ def _deserialize_double(ba, offset=0):
     return _unpack("d", ba[offset:])[0]


+def _deserialize_string_seq(ba, offset=0):
+    """Deserialize a sequence of strings written by _serialize_string_seq."""
+    headers = ndarray(shape=[2], buffer=ba, offset=offset, dtype=int32)
+    seqLength = headers[0]
+    lengthArray = ndarray(shape=[seqLength], buffer=ba, offset=offset + 8, dtype=int32)
+    offset = offset + 8 + 4 * seqLength
+    ret = []
+    for i in range(seqLength):
+        ret.append(str(ba[offset:offset + lengthArray[i]]))
+        offset += lengthArray[i]
+    return ret
+
 def _deserialize_double_vector(ba, offset=0):
     """Deserialize a double vector from a mutually understood format.
@@ -363,6 +399,8 @@ def _get_unmangled_rdd(data, serializer, cache=True):
363399
dataBytes.cache()
364400
return dataBytes
365401

402+
def _get_unmangled_string_seq_rdd(data, cache=True):
403+
return _get_unmangled_rdd(data, _serialize_string_seq, cache)
366404

367405
def _get_unmangled_double_vector_rdd(data, cache=True):
368406
"""
