From fcbeee2d180130c0b91552edb9af5c8e6c9137d6 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Mon, 6 Oct 2014 11:15:40 -0400 Subject: [PATCH 01/25] SPARK-3789: initial commit --- python/pyspark/graphx/__init__.pyc | Bin 0 -> 329 bytes python/pyspark/graphx/edge.pyc | Bin 0 -> 1440 bytes python/pyspark/graphx/graph.pyc | Bin 0 -> 1359 bytes python/pyspark/graphx/vertex.pyc | Bin 0 -> 2768 bytes 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 python/pyspark/graphx/__init__.pyc create mode 100644 python/pyspark/graphx/edge.pyc create mode 100644 python/pyspark/graphx/graph.pyc create mode 100644 python/pyspark/graphx/vertex.pyc diff --git a/python/pyspark/graphx/__init__.pyc b/python/pyspark/graphx/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9db5b7a23975579bbf973e73b78e88795918fd86 GIT binary patch literal 329 zcmYL@!Ait15Qb;Ab=9JA%!HX#3#oM|puBW|(ZWFh$+oUF`Y_Ix+zKSoP z(<(S5-{hY^naStZaZ)}`o(lMji1Hn6e?^c0D!94vuEqr!)&Y9S)#uw=Dr^}%`W{J<-uW2y|&sZ z&*p8t=opz7X-sa|CkP>JAw=j$n6_@Zq57tr*VcqXs^0Gk)dW&03HXPh_e*eoPq0M|gri+FMSD literal 0 HcmV?d00001 diff --git a/python/pyspark/graphx/edge.pyc b/python/pyspark/graphx/edge.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25da5729cb81948838dfb4e5a0fd71822601c9be GIT binary patch literal 1440 zcmb`GU5nE|6oyZder)Rp3WDAUxezRZjeF&VBBB-1i?U>8(Xuea%%sgWO+scWEv#4T zAM~&I2lPEN*{wHT$TnoooH_IL%zI{j{<<~XoId@OQu}pKzedr|U?kBD)+6cD(xbws znNOY!Ncof@+Mypr7errX0m&{Mk9s`wAoXAc9$C-9^-JNPjO2% z!6L&DU%O417KzqjuEVufGBj1FN~4<4EL8YSHAZdP5j%1_J!LE|KFV7ZeHZ4OD7Q*o zco2vn1$R$l9_UWWC)CPJla+FTjtvyq2sK%wpTmn5)k;N6nHZBCM#pv1EF*1gZ6%HB zt!vI%%}u1B+8A92m@7nH=0*t4fMx*lQm%NbE8P2NoO^WY(|7zqi|(Kqck1xEaK%|8 z<%nm;oG8OUGpqHJt694#amU%L~p4Jk8C3243GCiBXu=W`GGL zi|+W*BCEfn=H6^wrhDQVV7XONpCrXv=|^Z>D$Sc;mVaXL0E@LJ8opMILN>H|{1*$$ z^B)p%vMb#DW=~YCQC^m+`B>%UlO5OS0DgN_OS}Xj%n%VFR+U^A+~o5JA*)n~m=hQC zQt=HoH`(xY+d*3II?u*F&<9vk^Xk2z=MREG2PK%gd(V&CLbcB>Mw{HTW^adb9SW<+ ROQ*rBdstTU(R=C52zrC^myNoEt(c50ZQ0b3?NY0Y?NETXe-D1Lee~~`Jb77zi$%<#@D9jxv2>^J z@t!T zu!Hl-`Gff@`2+3qj849Ma0-o8op&^|8qGfQq8Yz_b7dGm`~J5~mRAq=yIA!@5G%3| z8i)<$Fpya&`=JCj5*Nw@yFGa&awhWUek68HTu*oV=euiS*JXFKL3>|;3?HBPgR+R9 zD*m}j=Ldgm4fDTojRL6o2*f;i;0L(xxyrj^Naql#au2KC195-|B%mRI z0uQl-%Jc+KxdR*umCTy%^u;R7{dGd5&W6|xWd>q)M?6_WFpP1XV~Dd^TGw%2$BWw8 z*q5;@ysKh=;Ns^Bbg8j&Obui`*lX8)+Wr>Ai<|~>8p>%TXORS_A~}2s5&`JuqUbAY@*E5%36(L=8yeQpHz7|%U`%^1WAXYIFitvi zst?nXe1@DdK{drl2Elsc6u}U$Iz@PIc>t+NQiGdMCyOHM3>ho(D4p-6vxTc~K&XjE zf{cLL^-f?^P+(o*F(1JqjkPd{=0C)Txi0b9bXSj>k9;z@`QD(+CSL6-Kw7dhH}Sue zd9n2trU*ROzZMb|G+&Czu;M!d#!AYI!c|1J4WUipdmz*1QaQWG+`}%b{Zqh7r_;(! 
zQ}2GSF)SVx)*atkp&hZUEt^n*E|?^WB}tA-yO>dAi%b$*W=Uc;z?&;1tXuOj$tNT{ z$b1H}3a@gPYmVBoKZn(thxDUhC}tW|0oGS70uhX_!t- zRMuvTAKiVveGP>31YU!|Q~{%w+9pd)9S@9`&XVWsCTgzfT7)%U_Ee4*&yckn+Im)D zx@3fy5e}f`*ce>U$e6%pRJ0yw8~Xy=IxsqIXr8@(>inA#A<|S>;H%OD2jnaT`mAp%!wqGh(m|3tYdl<^Ko< jvS{gRFv}-TUFNmaJ4K`){g2IOe&;HVTq4^HuYL0m3id$9 literal 0 HcmV?d00001 From 0eefa444a3beec83e2a79ecf2ca620f9e32685e9 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Mon, 6 Oct 2014 14:40:45 -0400 Subject: [PATCH 02/25] SPARK-3789: added graph, vertex and edge python files --- .../graphx/api/python/PythonVertexRDD.scala | 67 +++++++++++++++++ python/pyspark/graphx/__init__.py | 26 +++++++ python/pyspark/graphx/edge.py | 43 +++++++++++ python/pyspark/graphx/graph.py | 39 ++++++++++ python/pyspark/graphx/graph.pyc | Bin 1359 -> 0 bytes python/pyspark/graphx/vertex.py | 68 ++++++++++++++++++ 6 files changed, 243 insertions(+) create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala create mode 100644 python/pyspark/graphx/__init__.py create mode 100644 python/pyspark/graphx/edge.py create mode 100644 python/pyspark/graphx/graph.py delete mode 100644 python/pyspark/graphx/graph.pyc create mode 100644 python/pyspark/graphx/vertex.py diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala new file mode 100644 index 000000000000..a1024c8cfc29 --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.graphx.api.python + +import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} + +import org.apache.spark.api.python.PythonRDD +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.{Accumulator, Partition, TaskContext} + +import scala.collection.JavaConversions._ +import scala.reflect.ClassTag + +private[spark] class PythonVertexRDD[VD: ClassTag]( + parent: RDD[_], + command: Array[Byte], + envVars: Map[String, String], + pythonIncludes: JList[String], + preservePartitoning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[Array[Byte]]], + accumulator: Accumulator[JList[Array[Byte]]]) + extends PythonRDD( + parent, + command, + envVars, + pythonIncludes, + preservePartitoning, + pythonExec, + broadcastVars, + accumulator) { + + /** + * :: DeveloperApi :: + * Implemented by subclasses to compute a given partition. 
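+ * Here the partition is computed by delegating to the inherited
+ * [[org.apache.spark.api.python.PythonRDD]] worker protocol; the commented-out
+ * variant below would instead iterate the shipped vertex partitions directly.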
+ */ + override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { + super.compute(split, context) +// override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { +// firstParent[ShippableVertexPartition[VD]].iterator(part, context).next.iterator +// } + } + + /** + * Implemented by subclasses to return the set of partitions in this RDD. This method will only + * be called once, so it is safe to implement a time-consuming computation in it. + */ + override protected def getPartitions: Array[Partition] = { + + } +} diff --git a/python/pyspark/graphx/__init__.py b/python/pyspark/graphx/__init__.py new file mode 100644 index 000000000000..4149f54931d1 --- /dev/null +++ b/python/pyspark/graphx/__init__.py @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Python bindings for MLlib. +""" + +# MLlib currently needs and NumPy 1.4+, so complain if lower + +import numpy +if numpy.version.version < '1.4': + raise Exception("MLlib requires NumPy 1.4+") diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py new file mode 100644 index 000000000000..5c21bc9316c2 --- /dev/null +++ b/python/pyspark/graphx/edge.py @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Python bindings for GraphX. +""" + +from pyspark import RDD + +class EdgeRDD(RDD): + """ + EdgeRDD class is used to enter the vertex class for GraphX + """ + + def __init__(self): + self.setName("EdgeRDD") + + def filter(self, (vertexId, VertexProperty)): + self._jrdd + return + + def mapValues(self, func): + return + + def reverse(self): + return + + def innerJoin(self, other): + return \ No newline at end of file diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py new file mode 100644 index 000000000000..26cdebb5d2dc --- /dev/null +++ b/python/pyspark/graphx/graph.py @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Python bindings for GraphX. +""" + +__all__ = ["_vertexRDD", "_edgeRDD"] + +class Graph(object): + def __init__(self, vertexRDD, edgeRDD): + self._vertexRDD = vertexRDD + self._edgeRDD = edgeRDD + + def persist(self, storageLevel): + return + + def cache(self): + return + + def partitionBy(self, partitionStrategy): + return + + def subgraph(self, condition): + return diff --git a/python/pyspark/graphx/graph.pyc b/python/pyspark/graphx/graph.pyc deleted file mode 100644 index d0a1228254337e72afcfec21ebc3d9779fb5f665..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1359 zcmb`H&u-H|5XQ&OADYk}kPw0cQY0kg1JvSx!~r2xDse#-T5|wjEXUqBrjBiSw^bwc z29Lyx@G3k3-?vT)7Y=|E&1A;jo%zkoZodCGntyAae#m6^8R7Re#5{q@MPg`Aa$gEx zVi+UQzN}!3<*UdSkx%hR@`*&jl)D~6&EIeBx~Uh-vd+u8FpE{wE#7u%yZ(4Qw{xsW zE_G-1l{!7O2Xu8_XgU)(Zs9K+4^K`Y<~fuG4l)Zf$3D4eAieL5G3oeZvZITqqN0F7 zGnPg1A#Wh&9@Mob*T@dTcZQ1{1C52zrC^myNoEt(c50ZQ0b3?NY0Y?NETXe-D1Lee~~`Jb77zi$%<#@D9jxv2>^J z@t!T Date: Tue, 7 Oct 2014 16:06:54 -0400 Subject: [PATCH 03/25] SPARK-3789: Added PythonGraphLoader --- .../src/main/python/graphx/simpleGraph.py | 48 ++++ .../graphx/api/python/PythonGraphLoader.scala | 22 ++ .../graphx/api/python/PythonVertexRDD.scala | 267 ++++++++++++++++-- python/pyspark/graphx/edge.py | 3 + python/pyspark/graphx/graph.py | 2 +- python/pyspark/graphx/graphloader.py | 16 ++ python/pyspark/graphx/vertex.py | 11 +- 7 files changed, 346 insertions(+), 23 deletions(-) create mode 100644 examples/src/main/python/graphx/simpleGraph.py create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala create mode 100644 python/pyspark/graphx/graphloader.py diff --git a/examples/src/main/python/graphx/simpleGraph.py b/examples/src/main/python/graphx/simpleGraph.py new file mode 100644 index 000000000000..6b403db28f50 --- /dev/null +++ b/examples/src/main/python/graphx/simpleGraph.py @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Correlations using MLlib. 
+""" + +import sys + +from pyspark import SparkContext +from pyspark.graphx import GraphLoader +from pyspark.graphx import Vertex +from pyspark.graphx import Edge + +if __name__ == "__main__": + + """ + Usage: simpleGraph filename [partitions]" + """ + + sc = SparkContext(appName="PythonSimpleGraphExample") + graphFile = int(sys.argv[1]) if len(sys.argv) > 1 else "simplegraph.edges" + partitions = int(sys.argv[2]) if len(sys.argv) > 2 else 2 + + print "Running SimpleGraph example with filename=%s partitions=%d\n" % (graphFile, partitions) + + graph = GraphLoader.edgeListFile(sc, graphFile, partitions) + vertices = graph.vertices() + edges = graph.edges + + + + + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala new file mode 100644 index 000000000000..8adc6f040c07 --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala @@ -0,0 +1,22 @@ +package org.apache.spark.graphx.api.python + +import org.apache.spark.SparkContext +import org.apache.spark.graphx.{Graph, GraphLoader} +import org.apache.spark.storage.StorageLevel + +/** + * Created by kdatta1 on 10/7/14. + */ +class PythonGraphLoader { + + def edgeListFile( + sc: SparkContext, + path: String, + partitions: Int, + edgeStorageLevel: StorageLevel, + vertexStorageLevel: StorageLevel) : Graph[Array[Byte], Array[Byte]] = { + + val graph: Graph[Int, Int] = GraphLoader.edgeListFile(sc, path, false, partitions, edgeStorageLevel, vertexStorageLevel) + graph.vertices.foreach(vertex => vertex.toByteArray()) + } +} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index a1024c8cfc29..7d91c8f57659 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -17,51 +17,282 @@ package org.apache.spark.graphx.api.python -import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} +import java.io._ +import java.net.Socket +import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Collections} +import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.{Accumulator, Partition, TaskContext} +import org.apache.spark.graphx.VertexRDD +import org.apache.spark.storage.StorageLevel +import org.apache.spark._ +import org.apache.spark.util.Utils import scala.collection.JavaConversions._ import scala.reflect.ClassTag -private[spark] class PythonVertexRDD[VD: ClassTag]( - parent: RDD[_], +private[python] class PythonVertexRDD[VD: ClassTag]( + parent: JavaRDD[Array[Byte]], command: Array[Byte], - envVars: Map[String, String], + envVars: JMap[String, String], pythonIncludes: JList[String], preservePartitoning: Boolean, pythonExec: String, broadcastVars: JList[Broadcast[Array[Byte]]], - accumulator: Accumulator[JList[Array[Byte]]]) - extends PythonRDD( - parent, - command, - envVars, - pythonIncludes, - preservePartitoning, - pythonExec, - broadcastVars, - accumulator) { + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) + extends VertexRDD[VD](parent.firstParent, targetStorageLevel) { + + import PythonVertexRDD._ + + val bufferSize = 
conf.getInt("spark.buffer.size", DEFAULT_SPARK_BUFFER_SIZE) + val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) /** * :: DeveloperApi :: * Implemented by subclasses to compute a given partition. */ override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { - super.compute(split, context) + val startTime = System.currentTimeMillis + val env = SparkEnv.get + val localdir = env.blockManager.diskBlockManager.localDirs.map( + f => f.getPath()).mkString(",") + envVars += ("SPARK_LOCAL_DIRS" -> localdir) // it's also used in monitor thread + if (reuse_worker) { + envVars += ("SPARK_REUSE_WORKER" -> "1") + } + val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap) + + // Start a thread to feed the process input from our parent's iterator + val writerThread = new WriterThread(env, worker, split, context) + + var complete_cleanly = false + context.addTaskCompletionListener { context => + writerThread.shutdownOnTaskCompletion() + if (reuse_worker && complete_cleanly) { + env.releasePythonWorker(pythonExec, envVars.toMap, worker) + } else { + try { + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + writerThread.start() + new MonitorThread(env, worker, context).start() + + // Return an iterator that read lines from the process's stdout + val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) + val stdoutIterator = new Iterator[Array[Byte]] { + def next(): Array[Byte] = { + val obj = _nextObj + if (hasNext) { + _nextObj = read() + } + obj + } + + private def read(): Array[Byte] = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + stream.readInt() match { + case length if length > 0 => + val obj = new Array[Byte](length) + stream.readFully(obj) + obj + case 0 => Array.empty[Byte] + case SpecialLengths.TIMING_DATA => + // Timing data from worker + val bootTime = stream.readLong() + val initTime = stream.readLong() + val finishTime = stream.readLong() + val boot = bootTime - startTime + val init = initTime - bootTime + val finish = finishTime - initTime + val total = finishTime - startTime + logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, + init, finish)) + val memoryBytesSpilled = stream.readLong() + val diskBytesSpilled = stream.readLong() + context.taskMetrics.memoryBytesSpilled += memoryBytesSpilled + context.taskMetrics.diskBytesSpilled += diskBytesSpilled + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + // Signals that an exception has been thrown in python + val exLength = stream.readInt() + val obj = new Array[Byte](exLength) + stream.readFully(obj) + throw new PythonException(new String(obj, "utf-8"), + writerThread.exception.getOrElse(null)) + case SpecialLengths.END_OF_DATA_SECTION => + // We've finished the data section of the output, but we can still + // read some accumulator updates: + val numAccumulatorUpdates = stream.readInt() + (1 to numAccumulatorUpdates).foreach { _ => + val updateLen = stream.readInt() + val update = new Array[Byte](updateLen) + stream.readFully(update) + accumulator += Collections.singletonList(update) + } + complete_cleanly = true + null + } + } catch { + + case e: Exception if context.isInterrupted => + logDebug("Exception thrown after task interruption", e) + throw new TaskKilledException + + case e: Exception if writerThread.exception.isDefined => + logError("Python worker exited unexpectedly (crashed)", e) + 
logError("This may have been caused by a prior exception:", writerThread.exception.get) + throw writerThread.exception.get + + case eof: EOFException => + throw new SparkException("Python worker exited unexpectedly (crashed)", eof) + } + } + + var _nextObj = read() + + def hasNext = _nextObj != null + } + new InterruptibleIterator(context, stdoutIterator) // override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { // firstParent[ShippableVertexPartition[VD]].iterator(part, context).next.iterator // } } + val asJavaRDD : JavaRDD[Array[Byte]] = this.parent + + /** + * The thread responsible for writing the data from the PythonRDD's parent iterator to the + * Python process. + */ + class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext) + extends Thread(s"stdout writer for $pythonExec") { + + @volatile private var _exception: Exception = null + + setDaemon(true) + + /** Contains the exception thrown while writing the parent iterator to the Python process. */ + def exception: Option[Exception] = Option(_exception) + + /** Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. */ + def shutdownOnTaskCompletion() { + assert(context.isCompleted) + this.interrupt() + } + + override def run(): Unit = Utils.logUncaughtExceptions { + try { + SparkEnv.set(env) + val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) + val dataOut = new DataOutputStream(stream) + // Partition index + dataOut.writeInt(split.index) + // sparkFilesDir + PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) + // Python includes (*.zip and *.egg files) + dataOut.writeInt(pythonIncludes.length) + for (include <- pythonIncludes) { + PythonRDD.writeUTF(include, dataOut) + } + // Broadcast variables + val oldBids = PythonRDD.getWorkerBroadcasts(worker) + val newBids = broadcastVars.map(_.id).toSet + // number of different broadcasts + val cnt = oldBids.diff(newBids).size + newBids.diff(oldBids).size + dataOut.writeInt(cnt) + for (bid <- oldBids) { + if (!newBids.contains(bid)) { + // remove the broadcast from worker + dataOut.writeLong(- bid - 1) // bid >= 0 + oldBids.remove(bid) + } + } + for (broadcast <- broadcastVars) { + if (!oldBids.contains(broadcast.id)) { + // send new broadcast + dataOut.writeLong(broadcast.id) + dataOut.writeInt(broadcast.value.length) + dataOut.write(broadcast.value) + oldBids.add(broadcast.id) + } + } + dataOut.flush() + // Serialized command: + dataOut.writeInt(command.length) + dataOut.write(command) + // Data values + PythonRDD.writeIteratorToStream(parent.iterator(split, context), dataOut) + dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) + dataOut.flush() + } catch { + case e: Exception if context.isCompleted || context.isInterrupted => + logDebug("Exception thrown after task completion (likely due to cleanup)", e) + worker.shutdownOutput() + + case e: Exception => + // We must avoid throwing exceptions here, because the thread uncaught exception handler + // will kill the whole executor (see org.apache.spark.executor.Executor). + _exception = e + worker.shutdownOutput() + } + } + } + + /** + * It is necessary to have a monitor thread for python workers if the user cancels with + * interrupts disabled. In that case we will need to explicitly kill the worker, otherwise the + * threads can block indefinitely. 
+ */ + class MonitorThread(env: SparkEnv, worker: Socket, context: TaskContext) + extends Thread(s"Worker Monitor for $pythonExec") { + + setDaemon(true) + + override def run() { + // Kill the worker if it is interrupted, checking until task completion. + // TODO: This has a race condition if interruption occurs, as completed may still become true. + while (!context.isInterrupted && !context.isCompleted) { + Thread.sleep(2000) + } + if (!context.isCompleted) { + try { + logWarning("Incomplete task interrupted: Attempting to kill Python Worker") + env.destroyPythonWorker(pythonExec, envVars.toMap, worker) + } catch { + case e: Exception => + logError("Exception when trying to kill worker", e) + } + } + } + } + /** * Implemented by subclasses to return the set of partitions in this RDD. This method will only * be called once, so it is safe to implement a time-consuming computation in it. */ - override protected def getPartitions: Array[Partition] = { + override protected def getPartitions: Array[Partition] = partitionsRDD.partitions + private object SpecialLengths { + val END_OF_DATA_SECTION = -1 + val PYTHON_EXCEPTION_THROWN = -2 + val TIMING_DATA = -3 } } + +/** Thrown for exceptions in user Python code. */ +private class PythonException(msg: String, cause: Exception) extends RuntimeException(msg, cause) + +object PythonVertexRDD { + val DEFAULT_SPARK_BUFFER_SIZE = 65536 +} diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py index 5c21bc9316c2..2cd6d8efff00 100644 --- a/python/pyspark/graphx/edge.py +++ b/python/pyspark/graphx/edge.py @@ -19,8 +19,11 @@ Python bindings for GraphX. """ +__all__ = ["VertexRDD"] + from pyspark import RDD + class EdgeRDD(RDD): """ EdgeRDD class is used to enter the vertex class for GraphX diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index 26cdebb5d2dc..e3b3eb7ed37c 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -19,7 +19,7 @@ Python bindings for GraphX. """ -__all__ = ["_vertexRDD", "_edgeRDD"] +__all__ = ["Graph", "vertexRDD", "edgeRDD"] class Graph(object): def __init__(self, vertexRDD, edgeRDD): diff --git a/python/pyspark/graphx/graphloader.py b/python/pyspark/graphx/graphloader.py new file mode 100644 index 000000000000..f5f3002ff088 --- /dev/null +++ b/python/pyspark/graphx/graphloader.py @@ -0,0 +1,16 @@ + + +from pyspark import SparkContext + + +class GraphLoader(object): + + @staticmethod + def edgeListFile(sc, filename, partitions): + + edgeStorageLevel = sc._jvm.JavaStorageLevel.MEMORY_ONLY + vertexStorageLevel = sc._jvm.JavaStorageLevel.MEMORY_ONLY + graphLoader = sc._jvm.org.apache.spark.PythonGraphLoader + graph = graphLoader.edgeListFile(sc, filename, partitions, edgeStorageLevel, vertexStorageLevel) + + return graph diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index a883db7a2393..e1c7e5845997 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -19,6 +19,8 @@ Python bindings for GraphX. 
""" +__all__ = ["VertexRDD"] + from pyspark import RDD @@ -39,19 +41,20 @@ def filter(self, (vertexId, VertexProperty)): return def mapValues(self, func): + self._jrdd._jvm.org.apache.spark.PythonVertexRDD.mapValues() return def diff(self, other): - return + return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.diff() def leftJoin(self, other): - return + return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.leftJoin() def innerJoin(self, other, func): - return + return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.innerJoin() def aggregateUsingIndex(self, other, reduceFunc): - return + return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() class VertexProperty(object): From 207d8ba9d7b39322885a6c9676c17b60cf9a51f6 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Mon, 13 Oct 2014 12:29:05 -0400 Subject: [PATCH 04/25] SPARK-3789: Removed PythonGraphLoader. Added java_import statement to java_gateway.py --- .../graphx/api/python/PythonGraphLoader.scala | 22 ------------------ .../graphx/api/python/PythonVertexRDD.scala | 23 +++++++++++++------ python/pyspark/graphx/graphloader.py | 2 +- python/pyspark/java_gateway.py | 1 + 4 files changed, 18 insertions(+), 30 deletions(-) delete mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala deleted file mode 100644 index 8adc6f040c07..000000000000 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraphLoader.scala +++ /dev/null @@ -1,22 +0,0 @@ -package org.apache.spark.graphx.api.python - -import org.apache.spark.SparkContext -import org.apache.spark.graphx.{Graph, GraphLoader} -import org.apache.spark.storage.StorageLevel - -/** - * Created by kdatta1 on 10/7/14. 
- */ -class PythonGraphLoader { - - def edgeListFile( - sc: SparkContext, - path: String, - partitions: Int, - edgeStorageLevel: StorageLevel, - vertexStorageLevel: StorageLevel) : Graph[Array[Byte], Array[Byte]] = { - - val graph: Graph[Int, Int] = GraphLoader.edgeListFile(sc, path, false, partitions, edgeStorageLevel, vertexStorageLevel) - graph.vertices.foreach(vertex => vertex.toByteArray()) - } -} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 7d91c8f57659..c2d63cd7c241 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -19,20 +19,22 @@ package org.apache.spark.graphx.api.python import java.io._ import java.net.Socket -import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Collections} +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} +import org.apache.spark._ import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast -import org.apache.spark.graphx.VertexRDD +import org.apache.spark.graphx.{VertexId, VertexRDD} +import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel -import org.apache.spark._ import org.apache.spark.util.Utils import scala.collection.JavaConversions._ +import scala.collection.mutable import scala.reflect.ClassTag -private[python] class PythonVertexRDD[VD: ClassTag]( +private[graphx] class PythonVertexRDD( parent: JavaRDD[Array[Byte]], command: Array[Byte], envVars: JMap[String, String], @@ -42,9 +44,9 @@ private[python] class PythonVertexRDD[VD: ClassTag]( broadcastVars: JList[Broadcast[Array[Byte]]], accumulator: Accumulator[JList[Array[Byte]]], targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends VertexRDD[VD](parent.firstParent, targetStorageLevel) { + extends RDD[(VertexId, _)](parent.firstParent, targetStorageLevel) { - import PythonVertexRDD._ + import org.apache.spark.graphx.api.python.PythonVertexRDD._ val bufferSize = conf.getInt("spark.buffer.size", DEFAULT_SPARK_BUFFER_SIZE) val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) @@ -206,7 +208,7 @@ private[python] class PythonVertexRDD[VD: ClassTag]( PythonRDD.writeUTF(include, dataOut) } // Broadcast variables - val oldBids = PythonRDD.getWorkerBroadcasts(worker) + val oldBids = getWorkerBroadcasts(new Socket()) val newBids = broadcastVars.map(_.id).toSet // number of different broadcasts val cnt = oldBids.diff(newBids).size + newBids.diff(oldBids).size @@ -247,6 +249,13 @@ private[python] class PythonVertexRDD[VD: ClassTag]( worker.shutdownOutput() } } + + private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]() + private def getWorkerBroadcasts(worker: Socket) = { + synchronized { + workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]()) + } + } } /** diff --git a/python/pyspark/graphx/graphloader.py b/python/pyspark/graphx/graphloader.py index f5f3002ff088..c88111be164d 100644 --- a/python/pyspark/graphx/graphloader.py +++ b/python/pyspark/graphx/graphloader.py @@ -6,7 +6,7 @@ class GraphLoader(object): @staticmethod - def edgeListFile(sc, filename, partitions): + def edgeListFile(sc, filename, partitions, edgeStorageLevel, vertexStorageLevel): edgeStorageLevel = sc._jvm.JavaStorageLevel.MEMORY_ONLY vertexStorageLevel = 
sc._jvm.JavaStorageLevel.MEMORY_ONLY diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 9c70fa5c16d0..f3b6e3aec68d 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -108,6 +108,7 @@ def run(self): java_import(gateway.jvm, "org.apache.spark.SparkConf") java_import(gateway.jvm, "org.apache.spark.api.java.*") java_import(gateway.jvm, "org.apache.spark.api.python.*") + java_import(gateway.jvm, "org.apache.spark.graphx.api.python.*") java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") java_import(gateway.jvm, "org.apache.spark.sql.SQLContext") java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext") From 2c2cef75cfcfe0958eb20e9ebe9a39a9c6e85685 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Thu, 30 Oct 2014 16:59:22 -0400 Subject: [PATCH 05/25] SPARK-3789: WIP - Added JavaVertexRDD, JavaEdgeRDD and the first few functions --- .../spark/graphx/PartitionStrategy.scala | 4 +- .../spark/graphx/api/java/JavaEdgeRDD.scala | 8 + .../spark/graphx/api/java/JavaGraph.scala | 8 + .../spark/graphx/api/java/JavaVertexRDD.scala | 91 +++++ .../graphx/api/python/PythonEdgeRDD.scala | 37 ++ .../spark/graphx/api/python/PythonGraph.scala | 26 ++ .../graphx/api/python/PythonVertexRDD.scala | 280 ++------------- python/pyspark/__init__.py | 4 +- python/pyspark/graphx/__init__.py | 10 +- python/pyspark/graphx/edge.py | 93 ++++- python/pyspark/graphx/graph.py | 52 ++- python/pyspark/graphx/graphloader.py | 6 +- python/pyspark/graphx/partitionstrategy.py | 9 + python/pyspark/graphx/vertex.py | 325 ++++++++++++++++-- python/pyspark/rdd.py | 3 + 15 files changed, 651 insertions(+), 305 deletions(-) create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala create mode 100644 python/pyspark/graphx/partitionstrategy.py diff --git a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala index 13033fee0e6b..d6ac8d941468 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala @@ -85,7 +85,7 @@ object PartitionStrategy { } /** - * Assigns edges to partitions using only the source vertex ID, colocating edges with the same + * Assigns edges to partitions using only the source vertex ID, collocating edges with the same * source. */ case object EdgePartition1D extends PartitionStrategy { @@ -98,7 +98,7 @@ object PartitionStrategy { /** * Assigns edges to partitions by hashing the source and destination vertex IDs, resulting in a - * random vertex cut that colocates all same-direction edges between two vertices. + * random vertex cut that collocates all same-direction edges between two vertices. 
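+ * For example, every copy of an edge (u, v) hashes to the same partition, while the
+ * reverse edge (v, u) may be assigned to a different one.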
*/ case object RandomVertexCut extends PartitionStrategy { override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala new file mode 100644 index 000000000000..8dcef3ffd7c7 --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -0,0 +1,8 @@ +package org.apache.spark.graphx.api.java + +/** + * Created by kdatta1 on 10/30/14. + */ +class JavaEdgeRDD { + +} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala new file mode 100644 index 000000000000..5d2c9e4700cb --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -0,0 +1,8 @@ +package org.apache.spark.graphx.api.java + +/** + * Created by kdatta1 on 10/30/14. + */ +class JavaGraph { + +} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala new file mode 100644 index 000000000000..d83dd7d6702a --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.graphx.api.java + +import org.apache.spark.Partitioner +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.graphx.impl.{DoubleAggMsgSerializer, LongAggMsgSerializer, IntAggMsgSerializer, ShippableVertexPartition} +import org.apache.spark.graphx.{VertexId, VertexRDD} +import org.apache.spark.rdd.{ShuffledRDD, RDD} +import org.apache.spark.storage.StorageLevel + +import scala.reflect._ + +/** + * A Java-friendly interface to [[org.apache.spark.graphx.VertexRDD]], the vertex + * RDD abstraction in Spark GraphX that represents a vertex class in a graph. + * Vertices can be created from existing RDDs or it can be generated from transforming + * existing VertexRDDs using operations such as `mapValues`, `pagerank`, etc. 
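+ *
+ * A minimal usage sketch (illustrative only; `sc` is an assumed SparkContext and the
+ * constructor shown is the one declared below, not a finalized public API):
+ * {{{
+ *   val pairs: RDD[(VertexId, String)] = sc.parallelize(Seq((1L, "a"), (2L, "b")))
+ *   val vertices = new JavaVertexRDD(pairs)
+ *   val upper = vertices.mapValues(_.toUpperCase) // JavaVertexRDD[String]
+ * }}}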
+ * For operations applicable to vertices in a graph in GraphX, please refer to + * [[org.apache.spark.graphx.VertexRDD]] + */ + +class JavaVertexRDD[VD](val rdd: RDD[(VertexId, VD)], + override val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) + extends VertexRDD[VD](rdd.firstParent) { + + + override def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) + (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { + other match { + case other: VertexRDD[_] => + innerZipJoin(other)(f) + case _ => + this.withPartitionsRDD( + partitionsRDD.zipPartitions( + other.copartitionWithVertices(this.partitioner.get), preservesPartitioning = true) { + (partIter, msgs) => partIter.map(_.innerJoin(msgs)(f)) + } + ) + } + } + + override def diff(other: VertexRDD[VD]): JavaRDD[(VertexId, VD)] = { + JavaRDD.fromRDD(super[VertexRDD].diff(other)) + } + + override def mapValues[VD2: ClassTag](f: VD => VD2): JavaVertexRDD[VD2] = + this.mapVertexPartitions(_.map((vid, attr) => f(attr))) + + override def mapVertexPartitions[VD2: ClassTag] + (f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]): JavaVertexRDD[VD2] = { + val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) + this.withPartitionsRDD(newPartitionsRDD) + } + + override def withPartitionsRDD[VD2: ClassTag] + (partitionsRDD: RDD[ShippableVertexPartition[VD2]]): JavaVertexRDD[VD2] = { + new JavaVertexRDD[VD2](partitionsRDD.firstParent, this.targetStorageLevel) + } + + def copartitionWithVertices(partitioner: Partitioner): JavaRDD[(VertexId, VD)] = { + + val rdd = new ShuffledRDD[VertexId, VD, VD](rdd, partitioner) + + // Set a custom serializer if the data is of int or double type. + if (classTag[VD] == ClassTag.Int) { + rdd.setSerializer(new IntAggMsgSerializer) + } else if (classTag[VD] == ClassTag.Long) { + rdd.setSerializer(new LongAggMsgSerializer) + } else if (classTag[VD] == ClassTag.Double) { + rdd.setSerializer(new DoubleAggMsgSerializer) + } + rdd + } + +} + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala new file mode 100644 index 000000000000..457e3b71196a --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.graphx.api.python + +import java.util.{List => JList, Map => JMap} + +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.graphx.EdgeRDD +import org.apache.spark.storage.StorageLevel + +private[graphx] class PythonEdgeRDD ( + parent: JavaRDD[Array[Byte]], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitioning: Boolean, + pythonExec: String, + partitionStrategy: String, + targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) + extends EdgeRDD[Array[Byte], Array[Byte]](parent.firstParent, targetStorageLevel) { + + } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala new file mode 100644 index 000000000000..e76193fd8559 --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.graphx.api.python + +abstract private[graphx] class PythonGraph ( + @transient val vertexRDD: PythonVertexRDD, + @transient val edgeRDD: PythonEdgeRDD) +// extends Graph[Array[Byte], Array[Byte]] with Serializable { + extends Serializable { +} + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index c2d63cd7c241..136004305b27 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -17,25 +17,28 @@ package org.apache.spark.graphx.api.python -import java.io._ -import java.net.Socket -import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} +import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} -import org.apache.spark._ import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast -import org.apache.spark.graphx.{VertexId, VertexRDD} import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.Utils +import org.apache.spark.{Accumulator, Partition, TaskContext} -import scala.collection.JavaConversions._ -import scala.collection.mutable -import scala.reflect.ClassTag - -private[graphx] class PythonVertexRDD( - parent: JavaRDD[Array[Byte]], +/** + * + * @param parent + * @param command + * @param envVars + * @param pythonIncludes + * @param preservePartitoning + * @param pythonExec + * @param broadcastVars + * @param accumulator + * @param targetStorageLevel + */ +private[graphx] class PythonVertexRDD ( + parent: PythonRDD, 
command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], @@ -43,8 +46,8 @@ private[graphx] class PythonVertexRDD( pythonExec: String, broadcastVars: JList[Broadcast[Array[Byte]]], accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends RDD[(VertexId, _)](parent.firstParent, targetStorageLevel) { + targetStorageLevel: String = "MEMORY_ONLY") + extends RDD[Array[Byte]](parent) { import org.apache.spark.graphx.api.python.PythonVertexRDD._ @@ -56,252 +59,27 @@ private[graphx] class PythonVertexRDD( * Implemented by subclasses to compute a given partition. */ override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { - val startTime = System.currentTimeMillis - val env = SparkEnv.get - val localdir = env.blockManager.diskBlockManager.localDirs.map( - f => f.getPath()).mkString(",") - envVars += ("SPARK_LOCAL_DIRS" -> localdir) // it's also used in monitor thread - if (reuse_worker) { - envVars += ("SPARK_REUSE_WORKER" -> "1") - } - val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap) - - // Start a thread to feed the process input from our parent's iterator - val writerThread = new WriterThread(env, worker, split, context) - - var complete_cleanly = false - context.addTaskCompletionListener { context => - writerThread.shutdownOnTaskCompletion() - if (reuse_worker && complete_cleanly) { - env.releasePythonWorker(pythonExec, envVars.toMap, worker) - } else { - try { - worker.close() - } catch { - case e: Exception => - logWarning("Failed to close worker socket", e) - } - } - } - - writerThread.start() - new MonitorThread(env, worker, context).start() - - // Return an iterator that read lines from the process's stdout - val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) - val stdoutIterator = new Iterator[Array[Byte]] { - def next(): Array[Byte] = { - val obj = _nextObj - if (hasNext) { - _nextObj = read() - } - obj - } - - private def read(): Array[Byte] = { - if (writerThread.exception.isDefined) { - throw writerThread.exception.get - } - try { - stream.readInt() match { - case length if length > 0 => - val obj = new Array[Byte](length) - stream.readFully(obj) - obj - case 0 => Array.empty[Byte] - case SpecialLengths.TIMING_DATA => - // Timing data from worker - val bootTime = stream.readLong() - val initTime = stream.readLong() - val finishTime = stream.readLong() - val boot = bootTime - startTime - val init = initTime - bootTime - val finish = finishTime - initTime - val total = finishTime - startTime - logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, - init, finish)) - val memoryBytesSpilled = stream.readLong() - val diskBytesSpilled = stream.readLong() - context.taskMetrics.memoryBytesSpilled += memoryBytesSpilled - context.taskMetrics.diskBytesSpilled += diskBytesSpilled - read() - case SpecialLengths.PYTHON_EXCEPTION_THROWN => - // Signals that an exception has been thrown in python - val exLength = stream.readInt() - val obj = new Array[Byte](exLength) - stream.readFully(obj) - throw new PythonException(new String(obj, "utf-8"), - writerThread.exception.getOrElse(null)) - case SpecialLengths.END_OF_DATA_SECTION => - // We've finished the data section of the output, but we can still - // read some accumulator updates: - val numAccumulatorUpdates = stream.readInt() - (1 to numAccumulatorUpdates).foreach { _ => - val updateLen = stream.readInt() - val update = new 
Array[Byte](updateLen) - stream.readFully(update) - accumulator += Collections.singletonList(update) - } - complete_cleanly = true - null - } - } catch { - - case e: Exception if context.isInterrupted => - logDebug("Exception thrown after task interruption", e) - throw new TaskKilledException - - case e: Exception if writerThread.exception.isDefined => - logError("Python worker exited unexpectedly (crashed)", e) - logError("This may have been caused by a prior exception:", writerThread.exception.get) - throw writerThread.exception.get - - case eof: EOFException => - throw new SparkException("Python worker exited unexpectedly (crashed)", eof) - } - } - - var _nextObj = read() - - def hasNext = _nextObj != null - } - new InterruptibleIterator(context, stdoutIterator) -// override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { -// firstParent[ShippableVertexPartition[VD]].iterator(part, context).next.iterator -// } - } - - val asJavaRDD : JavaRDD[Array[Byte]] = this.parent - - /** - * The thread responsible for writing the data from the PythonRDD's parent iterator to the - * Python process. - */ - class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext) - extends Thread(s"stdout writer for $pythonExec") { - - @volatile private var _exception: Exception = null - - setDaemon(true) - - /** Contains the exception thrown while writing the parent iterator to the Python process. */ - def exception: Option[Exception] = Option(_exception) - - /** Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. */ - def shutdownOnTaskCompletion() { - assert(context.isCompleted) - this.interrupt() - } - - override def run(): Unit = Utils.logUncaughtExceptions { - try { - SparkEnv.set(env) - val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) - val dataOut = new DataOutputStream(stream) - // Partition index - dataOut.writeInt(split.index) - // sparkFilesDir - PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) - // Python includes (*.zip and *.egg files) - dataOut.writeInt(pythonIncludes.length) - for (include <- pythonIncludes) { - PythonRDD.writeUTF(include, dataOut) - } - // Broadcast variables - val oldBids = getWorkerBroadcasts(new Socket()) - val newBids = broadcastVars.map(_.id).toSet - // number of different broadcasts - val cnt = oldBids.diff(newBids).size + newBids.diff(oldBids).size - dataOut.writeInt(cnt) - for (bid <- oldBids) { - if (!newBids.contains(bid)) { - // remove the broadcast from worker - dataOut.writeLong(- bid - 1) // bid >= 0 - oldBids.remove(bid) - } - } - for (broadcast <- broadcastVars) { - if (!oldBids.contains(broadcast.id)) { - // send new broadcast - dataOut.writeLong(broadcast.id) - dataOut.writeInt(broadcast.value.length) - dataOut.write(broadcast.value) - oldBids.add(broadcast.id) - } - } - dataOut.flush() - // Serialized command: - dataOut.writeInt(command.length) - dataOut.write(command) - // Data values - PythonRDD.writeIteratorToStream(parent.iterator(split, context), dataOut) - dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) - dataOut.flush() - } catch { - case e: Exception if context.isCompleted || context.isInterrupted => - logDebug("Exception thrown after task completion (likely due to cleanup)", e) - worker.shutdownOutput() - - case e: Exception => - // We must avoid throwing exceptions here, because the thread uncaught exception handler - // will kill the whole executor (see org.apache.spark.executor.Executor). 
- _exception = e - worker.shutdownOutput() - } - } - - private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]() - private def getWorkerBroadcasts(worker: Socket) = { - synchronized { - workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]()) - } - } - } - - /** - * It is necessary to have a monitor thread for python workers if the user cancels with - * interrupts disabled. In that case we will need to explicitly kill the worker, otherwise the - * threads can block indefinitely. - */ - class MonitorThread(env: SparkEnv, worker: Socket, context: TaskContext) - extends Thread(s"Worker Monitor for $pythonExec") { - - setDaemon(true) - - override def run() { - // Kill the worker if it is interrupted, checking until task completion. - // TODO: This has a race condition if interruption occurs, as completed may still become true. - while (!context.isInterrupted && !context.isCompleted) { - Thread.sleep(2000) - } - if (!context.isCompleted) { - try { - logWarning("Incomplete task interrupted: Attempting to kill Python Worker") - env.destroyPythonWorker(pythonExec, envVars.toMap, worker) - } catch { - case e: Exception => - logError("Exception when trying to kill worker", e) - } - } - } + null } /** * Implemented by subclasses to return the set of partitions in this RDD. This method will only * be called once, so it is safe to implement a time-consuming computation in it. */ - override protected def getPartitions: Array[Partition] = partitionsRDD.partitions + override def getPartitions: Array[Partition] = ??? + +// def this(parent: JavaRDD[Array[Byte]], command: String, preservePartitioning: Boolean) { +// this(parent, null, null, preservePartitioning, "MEMORY_ONLY") +// System.out.println("PythonVertexRDD constructor") +// } - private object SpecialLengths { - val END_OF_DATA_SECTION = -1 - val PYTHON_EXCEPTION_THROWN = -2 - val TIMING_DATA = -3 + val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) + + def countToString(): String = { + this.count().toString } } -/** Thrown for exceptions in user Python code. */ -private class PythonException(msg: String, cause: Exception) extends RuntimeException(msg, cause) - object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 } diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 1a2e774738fe..1f4d40353b67 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -68,10 +68,8 @@ from pyspark.broadcast import Broadcast from pyspark.serializers import MarshalSerializer, PickleSerializer -# for back compatibility -from pyspark.sql import SQLContext, HiveContext, SchemaRDD, Row - __all__ = [ "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", + "Vertex", "Edge", "Graph", ] diff --git a/python/pyspark/graphx/__init__.py b/python/pyspark/graphx/__init__.py index 4149f54931d1..7e025a50d3c7 100644 --- a/python/pyspark/graphx/__init__.py +++ b/python/pyspark/graphx/__init__.py @@ -16,11 +16,11 @@ # """ -Python bindings for MLlib. +Python bindings for GraphX. 
""" -# MLlib currently needs and NumPy 1.4+, so complain if lower +from pyspark.graphx.vertex import VertexRDD, Vertex, VertexId +from pyspark.graphx.edge import Edge, EdgeRDD +from pyspark.graphx.graph import Graph -import numpy -if numpy.version.version < '1.4': - raise Exception("MLlib requires NumPy 1.4+") +__all__ = ["PartitioningStrategy", "VertexRDD", "EdgeRDD", "Graph", "Vertex", "Edge"] diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py index 2cd6d8efff00..f6b8f2fb4155 100644 --- a/python/pyspark/graphx/edge.py +++ b/python/pyspark/graphx/edge.py @@ -18,10 +18,38 @@ """ Python bindings for GraphX. """ +import operator +import itertools +from pyspark.graphx.partitionstrategy import PartitionStrategy +from pyspark import RDD, StorageLevel +from pyspark.rdd import PipelinedRDD -__all__ = ["VertexRDD"] +__all__ = ["EdgeRDD", "Edge"] -from pyspark import RDD + +class Edge(object): + """ + Edge object contains a source vertex id, target vertex id and edge properties + """ + + def __init__(self, src_id, tgt_id, edge_property): + self._src_id = src_id + self._tgt_id = tgt_id + self._property = edge_property + + @property + def srcId(self): + return self._src_id + + @property + def tgtId(self): + return self._tgt_id + + def asTuple(self): + return (self._src_id, self._tgt_id, self._property) + + def __str__(self): + return self._src_id + self._tgt_id + self._property class EdgeRDD(RDD): @@ -29,18 +57,57 @@ class EdgeRDD(RDD): EdgeRDD class is used to enter the vertex class for GraphX """ - def __init__(self): - self.setName("EdgeRDD") + def __init__(self, jrdd, ctx, jrdd_deserializer): + self._jrdd = jrdd + self._ctx = ctx + self._jrdd_deserializer = jrdd_deserializer + self._name = "VertexRDD" + + # TODO: Does not work + def __repr__(self): + return RDD(self._jrdd, self._ctx, self._jrdd_deserializer).take(1).__repr__() + + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + return self._jrdd.persist(storageLevel) + + def cache(self): + self._jrdd.cache() + + def count(self): + return self._jrdd.count() + + def collect(self): + return self._jrdd.collect() + + def take(self, num=10): + return self._jrdd.take(num) + + def sum(self): + """ + Add up the elements in this RDD. + + >>> sc.parallelize([1.0, 2.0, 3.0]).sum() + 6.0 + """ + return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) + + def mapValues(self, f, preservesPartitioning=False): + """ + Return a new RDD by applying a function to each element of this RDD. - def filter(self, (vertexId, VertexProperty)): - self._jrdd - return + >>> rdd = sc.parallelize(["b", "a", "c"]) + >>> sorted(rdd.map(lambda x: (x, 1)).collect()) + [('a', 1), ('b', 1), ('c', 1)] + """ + def func(_, iterator): + return itertools.imap(f, iterator) + return self.mapVertexPartitions(func, preservesPartitioning) - def mapValues(self, func): - return + def filter(self, f): + return self._jrdd.filter(f) - def reverse(self): - return + def innerJoin(self, f): + return self._jrdd.innerJoin(f) - def innerJoin(self, other): - return \ No newline at end of file + def leftOuterJoin(self, other, numPartitions=None): + return self._jrdd.leftOuterJoin(other, numPartitions) \ No newline at end of file diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index e3b3eb7ed37c..d75680b1d299 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -18,22 +18,66 @@ """ Python bindings for GraphX. 
""" +from pyspark import PickleSerializer, RDD, StorageLevel +from pyspark.graphx import VertexRDD, EdgeRDD -__all__ = ["Graph", "vertexRDD", "edgeRDD"] +from pyspark.graphx.partitionstrategy import PartitionStrategy +from pyspark.rdd import PipelinedRDD +from pyspark.serializers import BatchedSerializer + +__all__ = ["Graph"] class Graph(object): - def __init__(self, vertexRDD, edgeRDD): - self._vertexRDD = vertexRDD - self._edgeRDD = edgeRDD + def __init__(self, vertex_jrdd, edge_jrdd, partition_strategy=PartitionStrategy.EdgePartition1D): + self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context, BatchedSerializer(PickleSerializer())) + self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context, BatchedSerializer(PickleSerializer())) + self._partition_strategy = partition_strategy def persist(self, storageLevel): + self._vertex_jrdd.persist(storageLevel) + self._edge_jrdd.persist(storageLevel) return def cache(self): + self._vertex_jrdd.cache() + self._edge_jrdd.cache() return + def vertices(self): + return self._vertex_jrdd + + def edges(self): + return self._edge_jrdd + def partitionBy(self, partitionStrategy): + return def subgraph(self, condition): return + + def pagerank(self, num_iterations, reset_probability = 0.15): + """ + Pagerank on the graph depends on valid vertex and edge RDDs + Users can specify terminating conditions as number of + iterations or the Random reset probability or alpha + + :param num_iterations: Number of iterations for the + algorithm to terminate + :param reset_probability: Random reset probability + :return: + """ + + return + + def connected_components(self): + return + + def reverse(self): + return + + def apply(self, f): + + return + + diff --git a/python/pyspark/graphx/graphloader.py b/python/pyspark/graphx/graphloader.py index c88111be164d..bfcb65acf1ae 100644 --- a/python/pyspark/graphx/graphloader.py +++ b/python/pyspark/graphx/graphloader.py @@ -1,6 +1,7 @@ from pyspark import SparkContext +from pyspark.graphx import Graph, EdgeRDD, VertexRDD class GraphLoader(object): @@ -8,9 +9,8 @@ class GraphLoader(object): @staticmethod def edgeListFile(sc, filename, partitions, edgeStorageLevel, vertexStorageLevel): - edgeStorageLevel = sc._jvm.JavaStorageLevel.MEMORY_ONLY - vertexStorageLevel = sc._jvm.JavaStorageLevel.MEMORY_ONLY + jrdd = sc.textFile(filename) + graphLoader = sc._jvm.org.apache.spark.PythonGraphLoader graph = graphLoader.edgeListFile(sc, filename, partitions, edgeStorageLevel, vertexStorageLevel) - return graph diff --git a/python/pyspark/graphx/partitionstrategy.py b/python/pyspark/graphx/partitionstrategy.py new file mode 100644 index 000000000000..6071d5218b44 --- /dev/null +++ b/python/pyspark/graphx/partitionstrategy.py @@ -0,0 +1,9 @@ +__author__ = 'kdatta1' + +class PartitionStrategy(object): + EdgePartition1D = 1 + EdgePartition2D = 2 + RandomVertexCut = 3 + + + diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index e1c7e5845997..3d7339dd9b37 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -19,33 +19,146 @@ Python bindings for GraphX. 
""" -__all__ = ["VertexRDD"] +import itertools +import os +from py4j.java_collections import MapConverter, ListConverter +from tempfile import NamedTemporaryFile +from types import TupleType, IntType +import operator +from numpy.numarray.numerictypes import Long +from pyspark.accumulators import PStatsParam +from pyspark.rdd import PipelinedRDD +from pyspark.serializers import CloudPickleSerializer, NoOpSerializer +from pyspark import RDD, PickleSerializer, StorageLevel +from pyspark.graphx.partitionstrategy import PartitionStrategy +from pyspark.sql import StringType, LongType +from pyspark.traceback_utils import SCCallSiteSync -from pyspark import RDD +__all__ = ["VertexRDD", "VertexId", "Vertex"] -class VertexRDD(RDD): +""" +Vertex id type is long by default. +Defining a type for that enables +us to override it in future if +need be +""" +VertexId = Long + + +class Vertex(object): + """ + Vertex class is a tuple of (VertexId and VertexProperty) + """ + def __init__(self, vertex_id, vertex_property): + self._id = VertexId(vertex_id) + self._property = vertex_property + + @property + def property(self): + return self._property + + def asTuple(self): + return (self._id, self._property) + + def __str__(self): + return self._id + self._property + + +class VertexRDD(object): """ - VertexRDD class is used to enter the vertex class for GraphX + VertexRDD class defines vertex operations/transformation and vertex properties + The schema of the vertex properties are specified as a tuple to the vertex + The vertex operations are mapValues, filter, diff, innerJoin, leftOuterJoin + and aggergateUsingIndex. These operations are mapped to Scala functions defined + in PythonVertexRDD class in [[org.apache.spark.graphx.api.python package]] """ - def __init__(self, otherRDD): - self.__init__(otherRDD._jrdd, otherRDD._ctx, otherRDD._serializer) - self.setName("VertexRDD") + def __init__(self, jrdd, ctx, jrdd_deserializer): + self._jrdd = jrdd + self._ctx = ctx + self._jrdd_deserializer = jrdd_deserializer + self._preserve_partitioning = False + self._name = "VertexRDD" + self.is_cached = False + self.is_checkpointed = False + self._id = jrdd.id() + self._partitionFunc = None + self._jrdd_val = None + self._bypass_serializer = False + self._jrdd_val = self.toVertexRDD(jrdd, ctx, jrdd_deserializer) + + + # TODO: Does not work + def __repr__(self): + return self._jrdd.toString() + + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + return self._jrdd.persist(storageLevel) + + def cache(self): + self._jrdd.cache() + + def count(self): + return self._jrdd.count() + + # def collect(self): + # return self._jrdd.collect() + + def collect(self): + print "in collect() of vertex.py" + """ + Return a list that contains all of the elements in this RDD. + """ + # with SCCallSiteSync(self._ctx) as css: + bytesInJava = self._jrdd.collect().iterator() + return list(self._collect_iterator_through_file(bytesInJava)) + + def _collect_iterator_through_file(self, iterator): + # Transferring lots of data through Py4J can be slow because + # socket.readline() is inefficient. Instead, we'll dump the data to a + # file and read it back. 
+ tempFile = NamedTemporaryFile(delete=False, dir=self._ctx._temp_dir) + tempFile.close() + self._ctx._writeToFile(iterator, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + for item in self._jrdd_deserializer.load_stream(tempFile): + yield item + os.unlink(tempFile.name) - def __init__(self, jrdd, ctx, serializer): - super(jrdd, ctx, serializer) - self.setName("VertexRDD") + def take(self, num=10): + return self._jrdd.take(num) - def filter(self, (vertexId, VertexProperty)): - return + def sum(self): + self._jrdd.sum() - def mapValues(self, func): - self._jrdd._jvm.org.apache.spark.PythonVertexRDD.mapValues() - return + def mapValues(self, f, preservesPartitioning=False): + """ + Return a new RDD by applying a function to each element of this RDD. + + >>> rdd = sc.parallelize(["b", "a", "c"]) + >>> sorted(rdd.map(lambda x: (x, 1)).collect()) + [('a', 1), ('b', 1), ('c', 1)] + """ + def func(_, iterator): + return itertools.imap(f, iterator) + return self.mapVertexPartitions(func, preservesPartitioning) + + def filter(self, f): + return self._jrdd.filter(f) def diff(self, other): - return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.diff() + """ + Return a new RDD containing only the elements that satisfy a predicate. + + >>> rdd1 = sc.parallelize([1, 2, 3, 4, 5]) + >>> rdd2 = sc.parallelize([2, 3, 4]) + >>> rdd1.diff(rdd2).collect() + [1, 5] + """ + self._jrdd = self._jrdd._jvm.org.apache.spark.PythonVertexRDD() + return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.diff(other) def leftJoin(self, other): return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.leftJoin() @@ -56,16 +169,180 @@ def innerJoin(self, other, func): def aggregateUsingIndex(self, other, reduceFunc): return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() + def mapVertexPartitions(self, f, preservesPartitioning=False): + """ + Return a new RDD by applying a function to each partition of this RDD. + + >>> rdd = sc.parallelize([1, 2, 3, 4], 2) + >>> def f(iterator): yield sum(iterator) + >>> rdd.mapPartitions(f).collect() + [3, 7] + """ + def func(s, iterator): + return f(iterator) + return self._jrdd.mapPartitionsWithIndex(func, preservesPartitioning) + + def reduce(self, f): + """ + Reduces the elements of this RDD using the specified commutative and + associative binary operator. Currently reduces partitions locally. + + >>> from operator import add + >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add) + 15 + >>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add) + 10 + >>> sc.parallelize([]).reduce(add) + Traceback (most recent call last): + ... 
+ ValueError: Can not reduce() empty RDD + """ + def func(iterator): + iterator = iter(iterator) + try: + initial = next(iterator) + except StopIteration: + return + yield reduce(f, iterator, initial) + + vals = self.mapVertexPartitions(func).collect() + if vals: + return reduce(f, vals) + raise ValueError("Can not reduce() empty RDD") + + def toVertexRDD(self, jrdd, ctx, jrdd_deserializer): + if self._jrdd_val: + return self._jrdd_val + if self._bypass_serializer: + self._jrdd_deserializer = NoOpSerializer() + enable_profile = self._ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self._ctx.accumulator(None, PStatsParam) if enable_profile else None + command = (self._jrdd_deserializer) + # the serialized command will be compressed by broadcast + ser = CloudPickleSerializer() + pickled_command = ser.dumps(command) + if len(pickled_command) > (1 << 20): # 1M + self._broadcast = self._ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self._broadcast) + broadcast_vars = ListConverter().convert( + [x._jbroadcast for x in self._ctx._pickled_broadcast_vars], + self._ctx._gateway._gateway_client) + self._ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self._ctx.environment, + self._ctx._gateway._gateway_client) + includes = ListConverter().convert(self._ctx._python_includes, + self._ctx._gateway._gateway_client) + python_rdd = self._ctx._jvm.PythonVertexRDD(jrdd._jrdd, + bytearray(pickled_command), + env, includes, self._preserve_partitioning, + self._ctx.pythonExec, + broadcast_vars, self._ctx._javaAccumulator) + if enable_profile: + self._id = self._jrdd_val.id() + self._ctx._add_profile(self._id, profileStats) + + return python_rdd.asJavaRDD() + + def id(self): + """ + A unique ID for this RDD (within its SparkContext). 
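The serialization plumbing in toVertexRDD above follows the standard PySpark pattern for shipping a pickled command to the JVM; the detail worth keeping in mind is the 1 MiB threshold. The same logic in isolation (a sketch, with ser the CloudPickleSerializer and ctx the SparkContext from the surrounding method):

    pickled_command = ser.dumps(command)
    if len(pickled_command) > (1 << 20):        # larger than 1 MiB
        # Ship the payload through the block manager as a broadcast variable
        # and send only the small broadcast handle over Py4J.
        broadcast = ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(broadcast)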
+ """ + return self._id -class VertexProperty(object): - def __init__(self, property_name, propertyValue): - self.name = property_name - self.value = propertyValue +class PipelinedVertexRDD(VertexRDD): - def getKey(self): - return self.name + """ + Pipelined maps: + + >>> rdd = sc.parallelize([1, 2, 3, 4]) + >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect() + [4, 8, 12, 16] + >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect() + [4, 8, 12, 16] + + Pipelined reduces: + >>> from operator import add + >>> rdd.map(lambda x: 2 * x).reduce(add) + 20 + >>> rdd.flatMap(lambda x: [x, x]).reduce(add) + 20 + """ + + def __init__(self, prev, func, preservesPartitioning=False): + if not isinstance(prev, PipelinedVertexRDD) or not prev._is_pipelinable(): + # This transformation is the first in its stage: + self.func = func + self.preservesPartitioning = preservesPartitioning + self._prev_jrdd = prev._jrdd + self._prev_jrdd_deserializer = prev._jrdd_deserializer + else: + prev_func = prev.func + + def pipeline_func(split, iterator): + return func(split, prev_func(split, iterator)) + self.func = pipeline_func + self.preservesPartitioning = \ + prev.preservesPartitioning and preservesPartitioning + self._prev_jrdd = prev._prev_jrdd # maintain the pipeline + self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer + self.is_cached = False + self.is_checkpointed = False + self._ctx = prev._ctx + self.prev = prev + self._jrdd_val = None + self._id = None + self._jrdd_deserializer = self._ctx.serializer + self._bypass_serializer = False + self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None + self._broadcast = None + + def __del__(self): + if self._broadcast: + self._broadcast.unpersist() + self._broadcast = None @property - def getValue(self): - return self.value \ No newline at end of file + def _jrdd(self): + print "in _jrdd of vertex.py" + if self._jrdd_val: + return self._jrdd_val + if self._bypass_serializer: + self._jrdd_deserializer = NoOpSerializer() + enable_profile = self._ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self._ctx.accumulator(None, PStatsParam) if enable_profile else None + command = (self.func, profileStats, self._prev_jrdd_deserializer, + self._jrdd_deserializer) + # the serialized command will be compressed by broadcast + ser = CloudPickleSerializer() + pickled_command = ser.dumps(command) + if len(pickled_command) > (1 << 20): # 1M + self._broadcast = self._ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self._broadcast) + broadcast_vars = ListConverter().convert( + [x._jbroadcast for x in self._ctx._pickled_broadcast_vars], + self._ctx._gateway._gateway_client) + self._ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self._ctx.environment, + self._ctx._gateway._gateway_client) + includes = ListConverter().convert(self._ctx._python_includes, + self._ctx._gateway._gateway_client) + python_rdd = self._ctx._jvm.PythonVertexRDD(self._prev_jrdd.rdd(), + bytearray(pickled_command), + env, includes, self.preservesPartitioning, + self._ctx.pythonExec, + broadcast_vars, self._ctx._javaAccumulator) + self._jrdd_val = python_rdd.asJavaRDD() + + if enable_profile: + self._id = self._jrdd_val.id() + self._ctx._add_profile(self._id, profileStats) + return self._jrdd_val + + def id(self): + if self._id is None: + self._id = self._jrdd.id() + return self._id + + def _is_pipelinable(self): + return not (self.is_cached or self.is_checkpointed) diff --git a/python/pyspark/rdd.py 
b/python/pyspark/rdd.py index dc6497772e50..d6b5a8fcab5c 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -649,6 +649,7 @@ def func(it): self.mapPartitions(func).count() # Force evaluation def collect(self): + print "in collect() of rdd.py" """ Return a list that contains all of the elements in this RDD. """ @@ -657,6 +658,7 @@ def collect(self): return list(self._collect_iterator_through_file(bytesInJava)) def _collect_iterator_through_file(self, iterator): + print "in _collect_iterator_through_file() of rdd.py" # Transferring lots of data through Py4J can be slow because # socket.readline() is inefficient. Instead, we'll dump the data to a # file and read it back. @@ -2082,6 +2084,7 @@ def __del__(self): @property def _jrdd(self): + print "in _jrdd of rdd.py" if self._jrdd_val: return self._jrdd_val if self._bypass_serializer: From cf1df50057a9fdea35cbf34a40945697a77766d5 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Thu, 30 Oct 2014 17:11:48 -0400 Subject: [PATCH 06/25] SPARK-3789: Removed .pyc files --- python/pyspark/graphx/__init__.pyc | Bin 329 -> 0 bytes python/pyspark/graphx/edge.pyc | Bin 1440 -> 0 bytes python/pyspark/graphx/vertex.pyc | Bin 2768 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/pyspark/graphx/__init__.pyc delete mode 100644 python/pyspark/graphx/edge.pyc delete mode 100644 python/pyspark/graphx/vertex.pyc diff --git a/python/pyspark/graphx/__init__.pyc b/python/pyspark/graphx/__init__.pyc deleted file mode 100644 index 9db5b7a23975579bbf973e73b78e88795918fd86..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 329 zcmYL@!Ait15Qb;Ab=9JA%!HX#3#oM|puBW|(ZWFh$+oUF`Y_Ix+zKSoP z(<(S5-{hY^naStZaZ)}`o(lMji1Hn6e?^c0D!94vuEqr!)&Y9S)#uw=Dr^}%`W{J<-uW2y|&sZ z&*p8t=opz7X-sa|CkP>JAw=j$n6_@Zq57tr*VcqXs^0Gk)dW&03HXPh_e*eoPq0M|gri+FMSD diff --git a/python/pyspark/graphx/edge.pyc b/python/pyspark/graphx/edge.pyc deleted file mode 100644 index 25da5729cb81948838dfb4e5a0fd71822601c9be..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1440 zcmb`GU5nE|6oyZder)Rp3WDAUxezRZjeF&VBBB-1i?U>8(Xuea%%sgWO+scWEv#4T zAM~&I2lPEN*{wHT$TnoooH_IL%zI{j{<<~XoId@OQu}pKzedr|U?kBD)+6cD(xbws znNOY!Ncof@+Mypr7errX0m&{Mk9s`wAoXAc9$C-9^-JNPjO2% z!6L&DU%O417KzqjuEVufGBj1FN~4<4EL8YSHAZdP5j%1_J!LE|KFV7ZeHZ4OD7Q*o zco2vn1$R$l9_UWWC)CPJla+FTjtvyq2sK%wpTmn5)k;N6nHZBCM#pv1EF*1gZ6%HB zt!vI%%}u1B+8A92m@7nH=0*t4fMx*lQm%NbE8P2NoO^WY(|7zqi|(Kqck1xEaK%|8 z<%nm;oG8OUGpqHJt694#amU%L~p4Jk8C3243GCiBXu=W`GGL zi|+W*BCEfn=H6^wrhDQVV7XONpCrXv=|^Z>D$Sc;mVaXL0E@LJ8opMILN>H|{1*$$ z^B)p%vMb#DW=~YCQC^m+`B>%UlO5OS0DgN_OS}Xj%n%VFR+U^A+~o5JA*)n~m=hQC zQt=HoH`(xY+d*3II?u*F&<9vk^Xk2z=MREG2PK%gd(V&CLbcB>Mw{HTW^adb9SW<+ ROQ*rBdstTU(R= zu!Hl-`Gff@`2+3qj849Ma0-o8op&^|8qGfQq8Yz_b7dGm`~J5~mRAq=yIA!@5G%3| z8i)<$Fpya&`=JCj5*Nw@yFGa&awhWUek68HTu*oV=euiS*JXFKL3>|;3?HBPgR+R9 zD*m}j=Ldgm4fDTojRL6o2*f;i;0L(xxyrj^Naql#au2KC195-|B%mRI z0uQl-%Jc+KxdR*umCTy%^u;R7{dGd5&W6|xWd>q)M?6_WFpP1XV~Dd^TGw%2$BWw8 z*q5;@ysKh=;Ns^Bbg8j&Obui`*lX8)+Wr>Ai<|~>8p>%TXORS_A~}2s5&`JuqUbAY@*E5%36(L=8yeQpHz7|%U`%^1WAXYIFitvi zst?nXe1@DdK{drl2Elsc6u}U$Iz@PIc>t+NQiGdMCyOHM3>ho(D4p-6vxTc~K&XjE zf{cLL^-f?^P+(o*F(1JqjkPd{=0C)Txi0b9bXSj>k9;z@`QD(+CSL6-Kw7dhH}Sue zd9n2trU*ROzZMb|G+&Czu;M!d#!AYI!c|1J4WUipdmz*1QaQWG+`}%b{Zqh7r_;(! 
zQ}2GSF)SVx)*atkp&hZUEt^n*E|?^WB}tA-yO>dAi%b$*W=Uc;z?&;1tXuOj$tNT{ z$b1H}3a@gPYmVBoKZn(thxDUhC}tW|0oGS70uhX_!t- zRMuvTAKiVveGP>31YU!|Q~{%w+9pd)9S@9`&XVWsCTgzfT7)%U_Ee4*&yckn+Im)D zx@3fy5e}f`*ce>U$e6%pRJ0yw8~Xy=IxsqIXr8@(>inA#A<|S>;H%OD2jnaT`mAp%!wqGh(m|3tYdl<^Ko< jvS{gRFv}-TUFNmaJ4K`){g2IOe&;HVTq4^HuYL0m3id$9 From 15805134892a5ea78f4825cb328199b5168e7d7d Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Tue, 4 Nov 2014 09:54:39 -0500 Subject: [PATCH 07/25] SPARK-3789: WIP - PythonVertexRDD works --- .../spark/graphx/api/java/JavaEdgeRDD.scala | 20 ++- .../spark/graphx/api/java/JavaGraph.scala | 20 ++- .../spark/graphx/api/java/JavaVertexRDD.scala | 125 ++++++++++-------- .../spark/graphx/api/python/PythonGraph.scala | 3 + .../graphx/api/python/PythonVertexRDD.scala | 92 +++++++------ python/pyspark/graphx/graph.py | 16 ++- python/pyspark/graphx/vertex.py | 64 ++++----- python/pyspark/rdd.py | 3 +- 8 files changed, 198 insertions(+), 145 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index 8dcef3ffd7c7..f1877dab40a3 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -1,8 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.spark.graphx.api.java -/** - * Created by kdatta1 on 10/30/14. - */ class JavaEdgeRDD { } + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala index 5d2c9e4700cb..0f22530dd5ab 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -1,8 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.spark.graphx.api.java -/** - * Created by kdatta1 on 10/30/14. 
- */ class JavaGraph { } + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index d83dd7d6702a..e029ed40b287 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -16,11 +16,8 @@ */ package org.apache.spark.graphx.api.java -import org.apache.spark.Partitioner -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.graphx.impl.{DoubleAggMsgSerializer, LongAggMsgSerializer, IntAggMsgSerializer, ShippableVertexPartition} -import org.apache.spark.graphx.{VertexId, VertexRDD} -import org.apache.spark.rdd.{ShuffledRDD, RDD} +import org.apache.spark.graphx.VertexId +import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.reflect._ @@ -34,58 +31,76 @@ import scala.reflect._ * [[org.apache.spark.graphx.VertexRDD]] */ -class JavaVertexRDD[VD](val rdd: RDD[(VertexId, VD)], - override val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends VertexRDD[VD](rdd.firstParent) { +class JavaVertexRDD[@specialized VD: ClassTag]( + val parent: RDD[(VertexId, VD)], + val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) + extends Serializable { +// val rdd = new VertexRDD(parent, targetStorageLevel) - override def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) - (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { - other match { - case other: VertexRDD[_] => - innerZipJoin(other)(f) - case _ => - this.withPartitionsRDD( - partitionsRDD.zipPartitions( - other.copartitionWithVertices(this.partitioner.get), preservesPartitioning = true) { - (partIter, msgs) => partIter.map(_.innerJoin(msgs)(f)) - } - ) - } - } - - override def diff(other: VertexRDD[VD]): JavaRDD[(VertexId, VD)] = { - JavaRDD.fromRDD(super[VertexRDD].diff(other)) - } - - override def mapValues[VD2: ClassTag](f: VD => VD2): JavaVertexRDD[VD2] = - this.mapVertexPartitions(_.map((vid, attr) => f(attr))) - - override def mapVertexPartitions[VD2: ClassTag] - (f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]): JavaVertexRDD[VD2] = { - val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) - this.withPartitionsRDD(newPartitionsRDD) - } - - override def withPartitionsRDD[VD2: ClassTag] - (partitionsRDD: RDD[ShippableVertexPartition[VD2]]): JavaVertexRDD[VD2] = { - new JavaVertexRDD[VD2](partitionsRDD.firstParent, this.targetStorageLevel) - } - - def copartitionWithVertices(partitioner: Partitioner): JavaRDD[(VertexId, VD)] = { - - val rdd = new ShuffledRDD[VertexId, VD, VD](rdd, partitioner) - - // Set a custom serializer if the data is of int or double type. 
- if (classTag[VD] == ClassTag.Int) { - rdd.setSerializer(new IntAggMsgSerializer) - } else if (classTag[VD] == ClassTag.Long) { - rdd.setSerializer(new LongAggMsgSerializer) - } else if (classTag[VD] == ClassTag.Double) { - rdd.setSerializer(new DoubleAggMsgSerializer) - } - rdd - } +// val wrapVertexRDD(rdd: RDD[(VertexId, VD)]): This +// +// override def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) +// (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { +// other match { +// case other: JavaVertexRDD[_] => +// innerZipJoin(other)(f) +// case _ => +// this.withPartitionsRDD( +// partitionsRDD.zipPartitions( +// other.copartitionWithVertices(this.partitioner.get), preservesPartitioning = true) { +// (partIter, msgs) => partIter.map(_.innerJoin(msgs)(f)) +// } +// ) +// } +// } +// +// override def diff(other: VertexRDD[VD]): JavaRDD[(VertexId, VD)] = { +// JavaRDD.fromRDD(super[VertexRDD].diff(other)) +// } +// +// override def mapValues[VD2: ClassTag](f: VD => VD2): JavaVertexRDD[VD2] = +// this.mapVertexPartitions(_.map((vid, attr) => f(attr))) +// +// override def mapVertexPartitions[VD2: ClassTag] +// (f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]): JavaVertexRDD[VD2] = { +// val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) +// this.withPartitionsRDD(newPartitionsRDD) +// } +// +// override def withPartitionsRDD[VD2: ClassTag] +// (partitionsRDD: RDD[ShippableVertexPartition[VD2]]): JavaVertexRDD[VD2] = { +// new JavaVertexRDD[VD2](partitionsRDD.firstParent, this.targetStorageLevel) +// } +// +// def copartitionWithVertices(partitioner: Partitioner): JavaRDD[(VertexId, VD)] = { +// +// val rdd = new ShuffledRDD[VertexId, VD, VD](rdd, partitioner) +// +// // Set a custom serializer if the data is of int or double type. 
+// if (classTag[VD] == ClassTag.Int) { +// rdd.setSerializer(new IntAggMsgSerializer) +// } else if (classTag[VD] == ClassTag.Long) { +// rdd.setSerializer(new LongAggMsgSerializer) +// } else if (classTag[VD] == ClassTag.Double) { +// rdd.setSerializer(new DoubleAggMsgSerializer) +// } +// rdd +// } +// +// def innerZipJoin[U: ClassTag, VD2: ClassTag] +// (other: JavaVertexRDD[U]) +// (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { +// val newPartitionsRDD = partitionsRDD.zipPartitions( +// other.partitionsRDD, preservesPartitioning = true +// ) { (thisIter, otherIter) => +// val thisPart = thisIter.next() +// val otherPart = otherIter.next() +// Iterator(thisPart.innerJoin(otherPart)(f)) +// } +// this.withPartitionsRDD(newPartitionsRDD) +// } } + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala index e76193fd8559..2adad9837adc 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -17,6 +17,9 @@ package org.apache.spark.graphx.api.python +import org.apache.spark.annotation.DeveloperApi + +@DeveloperApi abstract private[graphx] class PythonGraph ( @transient val vertexRDD: PythonVertexRDD, @transient val edgeRDD: PythonEdgeRDD) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 136004305b27..9505f0ecca8e 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -20,66 +20,74 @@ package org.apache.spark.graphx.api.python import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import org.apache.spark.api.java.JavaRDD -import org.apache.spark.api.python.PythonRDD -import org.apache.spark.broadcast.Broadcast +import org.apache.spark.graphx.VertexRDD import org.apache.spark.rdd.RDD -import org.apache.spark.{Accumulator, Partition, TaskContext} -/** - * - * @param parent - * @param command - * @param envVars - * @param pythonIncludes - * @param preservePartitoning - * @param pythonExec - * @param broadcastVars - * @param accumulator - * @param targetStorageLevel - */ -private[graphx] class PythonVertexRDD ( - parent: PythonRDD, - command: Array[Byte], - envVars: JMap[String, String], - pythonIncludes: JList[String], - preservePartitoning: Boolean, - pythonExec: String, - broadcastVars: JList[Broadcast[Array[Byte]]], - accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel: String = "MEMORY_ONLY") - extends RDD[Array[Byte]](parent) { - - import org.apache.spark.graphx.api.python.PythonVertexRDD._ - - val bufferSize = conf.getInt("spark.buffer.size", DEFAULT_SPARK_BUFFER_SIZE) - val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) +//class PythonVertexRDD ( +// parent: JavaRDD[Array[Byte]], +// command: Array[Byte], +// envVars: JMap[String, String], +// pythonIncludes: JList[String], +// preservePartitoning: Boolean, +// pythonExec: String, +// broadcastVars: JList[Broadcast[Array[Byte]]], +// accumulator: Accumulator[JList[Array[Byte]]], +// targetStorageLevel: String = "MEMORY_ONLY") +// extends RDD[Array[Byte]](parent) { - /** - * :: DeveloperApi :: - * Implemented by subclasses to compute a given partition. 
- */ - override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { - null - } +class PythonVertexRDD(parent: RDD[_], schema: String) extends Serializable { + + +// val bufferSize = conf.getInt("spark.buffer.size", DEFAULT_SPARK_BUFFER_SIZE) +// val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) + +// /** +// * :: DeveloperApi :: +// * Implemented by subclasses to compute a given partition. +// */ +// override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { +// null +// } /** * Implemented by subclasses to return the set of partitions in this RDD. This method will only * be called once, so it is safe to implement a time-consuming computation in it. */ - override def getPartitions: Array[Partition] = ??? +// override def getPartitions: Array[Partition] = ??? // def this(parent: JavaRDD[Array[Byte]], command: String, preservePartitioning: Boolean) { // this(parent, null, null, preservePartitioning, "MEMORY_ONLY") // System.out.println("PythonVertexRDD constructor") // } - val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) +// val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) - def countToString(): String = { - this.count().toString + def toVertexRDD[VD](pyRDD: RDD[_], schema: String): JavaRDD[Array[Byte]] = { +// new VertexRDD[VD](PythonRDD.pythonToJava(pyRDD, true), StorageLevel.MEMORY_ONLY) + System.out.println("In PythonVertexRDD.toVertexRDD()") + val propertySchema = new VertexSchema(schema) + vertices = new VertexRDD[VertexSchema](pyRDD.mapPartitions()) + null } } object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 + + def toVertexRDD(parent: RDD[_], schema: String) : JavaRDD[Array[Byte]] = { + val pyRDD = new PythonVertexRDD(parent, schema) + pyRDD.toVertexRDD(parent, schema) + } +} + +class VertexSchema(val schemaString: String) { + + /** + * The vertex property schema is + * @param schemaString + * @return + */ + def fromString(schemaString: String) : List[String] = + schemaString.split(" ").toList + } diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index d75680b1d299..fbc94d4fcb14 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -32,6 +32,7 @@ def __init__(self, vertex_jrdd, edge_jrdd, partition_strategy=PartitionStrategy. 
self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context, BatchedSerializer(PickleSerializer())) self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context, BatchedSerializer(PickleSerializer())) self._partition_strategy = partition_strategy + self._sc = vertex_jrdd.context def persist(self, storageLevel): self._vertex_jrdd.persist(storageLevel) @@ -68,16 +69,21 @@ def pagerank(self, num_iterations, reset_probability = 0.15): :return: """ - return + py_graph = self._sc._jvm.org.apache.PythonGraph.pagerank(num_iterations, reset_probability) + return py_graph.asJavaRDD() def connected_components(self): - return + py_graph = self._sc._jvm.org.apache.PythonGraph.connectedComponents() + return py_graph.asJavaRDD() def reverse(self): - return + py_graph = self._sc._jvm.org.apache.PythonGraph.reverse() + return py_graph.asJavaRDD() def apply(self, f): - - return + def func(iterator): + return itertools.imap(f, iterator) + py_graph = self._sc._jvm.org.apache.PythonGraph.apply(func) + return py_graph.asJavaRDD() diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 3d7339dd9b37..f2029cbbf0b0 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -28,7 +28,7 @@ from numpy.numarray.numerictypes import Long from pyspark.accumulators import PStatsParam from pyspark.rdd import PipelinedRDD -from pyspark.serializers import CloudPickleSerializer, NoOpSerializer +from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, BatchedSerializer from pyspark import RDD, PickleSerializer, StorageLevel from pyspark.graphx.partitionstrategy import PartitionStrategy from pyspark.sql import StringType, LongType @@ -65,6 +65,11 @@ def __str__(self): return self._id + self._property +class VertexPropertySchema(object): + def __init__(self, tuple): + self.schema = list(tuple) + + class VertexRDD(object): """ VertexRDD class defines vertex operations/transformation and vertex properties @@ -74,19 +79,31 @@ class VertexRDD(object): in PythonVertexRDD class in [[org.apache.spark.graphx.api.python package]] """ - def __init__(self, jrdd, ctx, jrdd_deserializer): + def __init__(self, vertex_property, jrdd, + jrdd_deserializer = BatchedSerializer(PickleSerializer())): + """ + Constructor + :param vertex_property: A tuple of the vertex properties, e.g. 
+ vd=sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices=VertexRDD(vd,("String", "String")) + :param jrdd: + :param jrdd_deserializer: + + """ + self._jrdd = jrdd - self._ctx = ctx + self._ctx = jrdd._jrdd.context self._jrdd_deserializer = jrdd_deserializer self._preserve_partitioning = False self._name = "VertexRDD" - self.is_cached = False - self.is_checkpointed = False + self._is_cached = False + self._is_checkpointed = False self._id = jrdd.id() self._partitionFunc = None self._jrdd_val = None self._bypass_serializer = False - self._jrdd_val = self.toVertexRDD(jrdd, ctx, jrdd_deserializer) + self._schema = VertexPropertySchema(vertex_property) + self._jrdd_val = self.toVertexRDD(self._jrdd, self._ctx, self._jrdd_deserializer, self._schema) # TODO: Does not work @@ -210,36 +227,11 @@ def func(iterator): return reduce(f, vals) raise ValueError("Can not reduce() empty RDD") - def toVertexRDD(self, jrdd, ctx, jrdd_deserializer): - if self._jrdd_val: - return self._jrdd_val - if self._bypass_serializer: - self._jrdd_deserializer = NoOpSerializer() - enable_profile = self._ctx._conf.get("spark.python.profile", "false") == "true" - profileStats = self._ctx.accumulator(None, PStatsParam) if enable_profile else None - command = (self._jrdd_deserializer) - # the serialized command will be compressed by broadcast - ser = CloudPickleSerializer() - pickled_command = ser.dumps(command) - if len(pickled_command) > (1 << 20): # 1M - self._broadcast = self._ctx.broadcast(pickled_command) - pickled_command = ser.dumps(self._broadcast) - broadcast_vars = ListConverter().convert( - [x._jbroadcast for x in self._ctx._pickled_broadcast_vars], - self._ctx._gateway._gateway_client) - self._ctx._pickled_broadcast_vars.clear() - env = MapConverter().convert(self._ctx.environment, - self._ctx._gateway._gateway_client) - includes = ListConverter().convert(self._ctx._python_includes, - self._ctx._gateway._gateway_client) - python_rdd = self._ctx._jvm.PythonVertexRDD(jrdd._jrdd, - bytearray(pickled_command), - env, includes, self._preserve_partitioning, - self._ctx.pythonExec, - broadcast_vars, self._ctx._javaAccumulator) - if enable_profile: - self._id = self._jrdd_val.id() - self._ctx._add_profile(self._id, profileStats) + def toVertexRDD(self, jrdd, ctx, jrdd_deserializer, schema): + + sc = jrdd.context + python_rdd = sc._jvm.PythonVertexRDD(bytearray(" ".join(x for x in schema.schema))) + print "in toVertexRDD" return python_rdd.asJavaRDD() diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index cd2b2d8c2246..9dde8a16492a 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2088,7 +2088,8 @@ def __del__(self): @property def _jrdd(self): - print "in _jrdd of rdd.py" + import traceback + print traceback.print_stack() if self._jrdd_val: return self._jrdd_val if self._bypass_serializer: From 08140bf7f190061cafe58a8d973ea280bff6168f Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Thu, 6 Nov 2014 10:47:19 -0500 Subject: [PATCH 08/25] SPARK-3789: WIP - 11/6/2014 --- .../scala/org/apache/spark/rdd/PipedRDD.scala | 2 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 27 +++---- .../spark/graphx/api/java/JavaVertexRDD.scala | 7 +- .../graphx/api/java/JavaVertexRDDLike.scala | 72 +++++++++++++++++++ .../graphx/api/python/PythonVertexRDD.scala | 19 ++--- .../impl/ShippableVertexPartition.scala | 2 +- pom.xml | 2 +- 7 files changed, 95 insertions(+), 36 deletions(-) create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala diff 
--git a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala index 56ac7a69be0d..ed79032893d3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala @@ -63,7 +63,7 @@ private[spark] class PipedRDD[T: ClassTag]( /** * A FilenameFilter that accepts anything that isn't equal to the name passed in. - * @param name of file or directory to leave out + * @param filterName of file or directory to leave out */ class NotEqualsFileNameFilter(filterName: String) extends FilenameFilter { def accept(dir: File, name: String): Boolean = { diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index b7f125d01dfa..1bcb7551a18b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -17,33 +17,26 @@ package org.apache.spark.rdd -import java.util.{Properties, Random} - -import scala.collection.{mutable, Map} -import scala.collection.mutable.ArrayBuffer -import scala.reflect.{classTag, ClassTag} +import java.util.Random import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus -import org.apache.hadoop.io.BytesWritable +import org.apache.hadoop.io.{BytesWritable, NullWritable, Text} import org.apache.hadoop.io.compress.CompressionCodec -import org.apache.hadoop.io.NullWritable -import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextOutputFormat - -import org.apache.spark._ import org.apache.spark.Partitioner._ import org.apache.spark.SparkContext._ +import org.apache.spark._ import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.JavaRDD -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.partial.BoundedDouble -import org.apache.spark.partial.CountEvaluator -import org.apache.spark.partial.GroupedCountEvaluator -import org.apache.spark.partial.PartialResult +import org.apache.spark.partial.{BoundedDouble, CountEvaluator, GroupedCountEvaluator, PartialResult} import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.{BoundedPriorityQueue, Utils, CallSite} import org.apache.spark.util.collection.OpenHashMap import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, SamplingUtils} +import org.apache.spark.util.{BoundedPriorityQueue, Utils} + +import scala.collection.mutable.ArrayBuffer +import scala.collection.{Map, mutable} +import scala.reflect.{ClassTag, classTag} /** * A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. 
Represents an immutable, @@ -1305,7 +1298,7 @@ abstract class RDD[T: ClassTag]( def toDebugString: String = { // Get a debug description of an rdd without its children def debugSelf (rdd: RDD[_]): Seq[String] = { - import Utils.bytesToString + import org.apache.spark.util.Utils.bytesToString val persistence = storageLevel.description val storageInfo = rdd.context.getRDDStorageInfo.filter(_.id == rdd.id).map(info => diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index e029ed40b287..06eaf0fe71b7 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.graphx.api.java +import org.apache.spark.api.java.{JavaRDD, JavaRDDLike} import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -34,7 +35,7 @@ import scala.reflect._ class JavaVertexRDD[@specialized VD: ClassTag]( val parent: RDD[(VertexId, VD)], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends Serializable { + extends JavaVertexRDDLike[(VertexId, VD), JavaVertexRDD[VD]] { // val rdd = new VertexRDD(parent, targetStorageLevel) @@ -100,7 +101,11 @@ class JavaVertexRDD[@specialized VD: ClassTag]( // } // this.withPartitionsRDD(newPartitionsRDD) // } + override def wrapRDD(rdd: RDD[(VertexId, VD)]): JavaRDD[(VertexId, VD)] = ??? + override def rdd: RDD[(VertexId, VD)] = ??? + + override implicit val classTag: ClassTag[(VertexId, VD)] = _ } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala new file mode 100644 index 000000000000..16793e28692c --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.graphx.api.java + +import java.lang.{Long => JLong} +import java.util.{List => JList} + +import org.apache.spark.api.java.JavaRDDLike +import org.apache.spark.api.java.function.{Function => JFunction, Function2 => JFunction2, Function3 => JFunction3} +import org.apache.spark.graphx._ +import org.apache.spark.graphx.impl.ShippableVertexPartition +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel + +import scala.language.implicitConversions +import scala.reflect.ClassTag + +trait JavaVertexRDDLike[T, This <: JavaVertexRDDLike[T, This, R], R <: JavaRDDLike[T, R]] + extends Serializable { + implicit val classTag: ClassTag[T] + + def vertexRDD: VertexRDD[T] + + def wrapRDD(in: RDD[T]): R + + /** + * Return a new single long element generated by counting all elements in the vertex RDD + */ + def count(): JLong = vertexRDD.count() + + /** + * Construct a new VertexRDD that is indexed by only the visible vertices. The resulting + * VertexRDD will be based on a different index and can no longer be quickly joined with this + * RDD. + */ + def reindex(): JavaVertexRDD[T] = new JavaVertexRDD(vertexRDD.reindex()) + + def setName(name: String) = vertexRDD.setName(name) + + def persist(newLevel: StorageLevel) : JavaVertexRDD[T] = { + new JavaVertexRDD(vertexRDD.persist(newLevel)) + } + + def cache() : JavaVertexRDD[T] = new JavaVertexRDD(vertexRDD.cache()) + + + def mapVertexPartitions[VD2: ClassTag]( + f: ShippableVertexPartition[T] => ShippableVertexPartition[VD2]) : JavaVertexRDD[VD2] = { + new JavaVertexRDD(vertexRDD.mapVertexPartitions[VD2](f)) + } + + def mapValues[VD2: ClassTag](f: T => VD2) : JavaVertexRDD[VD2] = + new JavaVertexRDD(vertexRDD.mapValues(f)) + + def filter(pred: Tuple2[VertexId, T] => Boolean): JavaVertexRDD[T] = + this.mapVertexPartitions(_.filter(Function.untupled(pred))) +} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 9505f0ecca8e..c4eca49f17cd 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -35,19 +35,7 @@ import org.apache.spark.rdd.RDD // targetStorageLevel: String = "MEMORY_ONLY") // extends RDD[Array[Byte]](parent) { -class PythonVertexRDD(parent: RDD[_], schema: String) extends Serializable { - - -// val bufferSize = conf.getInt("spark.buffer.size", DEFAULT_SPARK_BUFFER_SIZE) -// val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) - -// /** -// * :: DeveloperApi :: -// * Implemented by subclasses to compute a given partition. -// */ -// override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { -// null -// } +class PythonVertexRDD(parent: RDD[_], schema: String) extends { /** * Implemented by subclasses to return the set of partitions in this RDD. This method will only @@ -55,18 +43,19 @@ class PythonVertexRDD(parent: RDD[_], schema: String) extends Serializable { */ // override def getPartitions: Array[Partition] = ??? 
+// def this(parent: JavaRDD[Array[Byte]], command: String, preservePartitioning: Boolean) { // def this(parent: JavaRDD[Array[Byte]], command: String, preservePartitioning: Boolean) { // this(parent, null, null, preservePartitioning, "MEMORY_ONLY") // System.out.println("PythonVertexRDD constructor") // } -// val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) + val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(this) def toVertexRDD[VD](pyRDD: RDD[_], schema: String): JavaRDD[Array[Byte]] = { // new VertexRDD[VD](PythonRDD.pythonToJava(pyRDD, true), StorageLevel.MEMORY_ONLY) System.out.println("In PythonVertexRDD.toVertexRDD()") val propertySchema = new VertexSchema(schema) - vertices = new VertexRDD[VertexSchema](pyRDD.mapPartitions()) + val vertices = new VertexRDD[VertexSchema](pyRDD.mapPartitions()) null } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala index 5412d720475d..ed9639134262 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala @@ -50,7 +50,7 @@ object ShippableVertexPartition { /** * Construct a `ShippableVertexPartition` from the given vertices with the specified routing * table, filling in missing vertices mentioned in the routing table using `defaultVal`, - * and merging duplicate vertex atrribute with mergeFunc. + * and merging duplicate vertex attribute with mergeFunc. */ def apply[VD: ClassTag]( iter: Iterator[(VertexId, VD)], routingTable: RoutingTablePartition, defaultVal: VD, diff --git a/pom.xml b/pom.xml index e4c92470fc03..c38b5e129dc4 100644 --- a/pom.xml +++ b/pom.xml @@ -87,8 +87,8 @@ core - bagel graphx + bagel mllib tools network/common From a2faa642a84aff3175b0919758bbefa0483e0a85 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Wed, 12 Nov 2014 14:29:38 -0500 Subject: [PATCH 09/25] SPARK-3789: WIP --- graphx/pom.xml | 6 + .../spark/graphx/api/java/JavaEdgeRDD.scala | 2 + .../graphx/api/java/JavaEdgeRDDLike.scala | 32 +++++ .../spark/graphx/api/java/JavaVertexRDD.scala | 125 +++++++----------- .../graphx/api/java/JavaVertexRDDLike.scala | 46 ++++--- .../graphx/api/python/PythonVertexRDD.scala | 17 ++- .../org/apache/spark/graphx/JavaAPISuite.java | 67 ++++++++++ pom.xml | 2 +- python/pyspark/graphx/vertex.py | 2 +- .../streaming/LocalJavaStreamingContext.java | 2 +- 10 files changed, 197 insertions(+), 104 deletions(-) create mode 100644 graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala create mode 100644 graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java diff --git a/graphx/pom.xml b/graphx/pom.xml index 3f49b1d63b6e..15ee54198062 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -59,6 +59,12 @@ scalacheck_${scala.binary.version} test + + junit + junit + 4.11 + test + target/scala-${scala.binary.version}/classes diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index f1877dab40a3..66f21ccc9233 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -18,5 +18,7 @@ package org.apache.spark.graphx.api.java class JavaEdgeRDD { + + } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala 
b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala new file mode 100644 index 000000000000..15c23db4fc36 --- /dev/null +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.graphx.api.java + +import org.apache.spark.api.java.JavaRDDLike +import org.apache.spark.graphx._ + +trait JavaEdgeRDDLike [ED, This <: JavaEdgeRDDLike[ED, This, R], +R <: JavaRDDLike[Edge[ED], R]] + extends Serializable { + + def edgeRDD: EdgeRDD[ED, VD] + + def setName(name: String) = + + + +} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index 06eaf0fe71b7..b88ac32daca6 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -16,11 +16,14 @@ */ package org.apache.spark.graphx.api.java -import org.apache.spark.api.java.{JavaRDD, JavaRDDLike} -import org.apache.spark.graphx.VertexId +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.java.function.{Function => JFunction} +import org.apache.spark.graphx.{VertexId, VertexRDD} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel +import org.apache.spark.{Partition, TaskContext} +import scala.language.implicitConversions import scala.reflect._ /** @@ -32,80 +35,52 @@ import scala.reflect._ * [[org.apache.spark.graphx.VertexRDD]] */ -class JavaVertexRDD[@specialized VD: ClassTag]( - val parent: RDD[(VertexId, VD)], +class JavaVertexRDD[VD]( + val vertices: RDD[(VertexId, VD)], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends JavaVertexRDDLike[(VertexId, VD), JavaVertexRDD[VD]] { - -// val rdd = new VertexRDD(parent, targetStorageLevel) - -// val wrapVertexRDD(rdd: RDD[(VertexId, VD)]): This -// -// override def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) -// (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { -// other match { -// case other: JavaVertexRDD[_] => -// innerZipJoin(other)(f) -// case _ => -// this.withPartitionsRDD( -// partitionsRDD.zipPartitions( -// other.copartitionWithVertices(this.partitioner.get), preservesPartitioning = true) { -// (partIter, msgs) => partIter.map(_.innerJoin(msgs)(f)) -// } -// ) -// } -// } -// -// override def diff(other: VertexRDD[VD]): JavaRDD[(VertexId, VD)] = { -// JavaRDD.fromRDD(super[VertexRDD].diff(other)) -// } -// -// override def mapValues[VD2: ClassTag](f: VD => VD2): JavaVertexRDD[VD2] = -// this.mapVertexPartitions(_.map((vid, attr) => f(attr))) -// -// override def mapVertexPartitions[VD2: ClassTag] -// 
(f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]): JavaVertexRDD[VD2] = { -// val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) -// this.withPartitionsRDD(newPartitionsRDD) -// } -// -// override def withPartitionsRDD[VD2: ClassTag] -// (partitionsRDD: RDD[ShippableVertexPartition[VD2]]): JavaVertexRDD[VD2] = { -// new JavaVertexRDD[VD2](partitionsRDD.firstParent, this.targetStorageLevel) -// } -// -// def copartitionWithVertices(partitioner: Partitioner): JavaRDD[(VertexId, VD)] = { -// -// val rdd = new ShuffledRDD[VertexId, VD, VD](rdd, partitioner) -// -// // Set a custom serializer if the data is of int or double type. -// if (classTag[VD] == ClassTag.Int) { -// rdd.setSerializer(new IntAggMsgSerializer) -// } else if (classTag[VD] == ClassTag.Long) { -// rdd.setSerializer(new LongAggMsgSerializer) -// } else if (classTag[VD] == ClassTag.Double) { -// rdd.setSerializer(new DoubleAggMsgSerializer) -// } -// rdd -// } -// -// def innerZipJoin[U: ClassTag, VD2: ClassTag] -// (other: JavaVertexRDD[U]) -// (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { -// val newPartitionsRDD = partitionsRDD.zipPartitions( -// other.partitionsRDD, preservesPartitioning = true -// ) { (thisIter, otherIter) => -// val thisPart = thisIter.next() -// val otherPart = otherIter.next() -// Iterator(thisPart.innerJoin(otherPart)(f)) -// } -// this.withPartitionsRDD(newPartitionsRDD) -// } - override def wrapRDD(rdd: RDD[(VertexId, VD)]): JavaRDD[(VertexId, VD)] = ??? - - override def rdd: RDD[(VertexId, VD)] = ??? - - override implicit val classTag: ClassTag[(VertexId, VD)] = _ + (implicit val classTag: ClassTag[VD]) + extends JavaVertexRDDLike[VD, JavaVertexRDD[VD], JavaRDD[(VertexId, VD)]] { + + override def vertexRDD = VertexRDD(vertices) + + override def wrapRDD(in: RDD[(VertexId, VD)]): JavaRDD[(VertexId, VD)] = { + JavaRDD.fromRDD(in) + } + + /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ + def cache(): JavaVertexRDD[VD] = vertices.cache().asInstanceOf[JavaVertexRDD[VD]] + + /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ + def persist(): JavaVertexRDD[VD] = vertices.persist().asInstanceOf[JavaVertexRDD[VD]] + + /** Persist the RDDs of this DStream with the given storage level */ + def persist(storageLevel: StorageLevel): JavaVertexRDD[VD] = + vertices.persist(storageLevel).asInstanceOf[JavaVertexRDD[VD]] + + def unpersist(blocking: Boolean = true) : this.type = + JavaVertexRDD(vertices.unpersist(blocking)) + + override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = + vertexRDD.compute(part, context) + + + def asJavaVertexRDD = JavaRDD.fromRDD(this.vertexRDD) + + + + + + +} + +object JavaVertexRDD { + + implicit def fromVertexRDD[VD: ClassTag](vertices: JavaRDD[(VertexId, VD)]): JavaVertexRDD[VD] = + new JavaVertexRDD[VD](vertices) + + implicit def apply[VD: ClassTag](vertices: JavaRDD[(Long, VD)]): JavaVertexRDD[VD] = { + new JavaVertexRDD[VD](vertices) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala index 16793e28692c..a35707ad5e20 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala @@ -25,18 +25,21 @@ import org.apache.spark.api.java.function.{Function => JFunction, Function2 => J 
import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.ShippableVertexPartition import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel +import org.apache.spark.{Partition, TaskContext} import scala.language.implicitConversions import scala.reflect.ClassTag -trait JavaVertexRDDLike[T, This <: JavaVertexRDDLike[T, This, R], R <: JavaRDDLike[T, R]] +trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], + R <: JavaRDDLike[(VertexId, VD), R]] extends Serializable { - implicit val classTag: ClassTag[T] - def vertexRDD: VertexRDD[T] + implicit val classTag: ClassTag[VD] - def wrapRDD(in: RDD[T]): R + // The type of the RDD is (VertexId, VD) + def vertexRDD: VertexRDD[VD] + + def wrapRDD(in: RDD[(VertexId, VD)]): R /** * Return a new single long element generated by counting all elements in the vertex RDD @@ -48,25 +51,34 @@ trait JavaVertexRDDLike[T, This <: JavaVertexRDDLike[T, This, R], R <: JavaRDDLi * VertexRDD will be based on a different index and can no longer be quickly joined with this * RDD. */ - def reindex(): JavaVertexRDD[T] = new JavaVertexRDD(vertexRDD.reindex()) +// def reindex(): JavaVertexRDD[VD] = new JavaVertexRDD(vertexRDD.reindex()) def setName(name: String) = vertexRDD.setName(name) - def persist(newLevel: StorageLevel) : JavaVertexRDD[T] = { - new JavaVertexRDD(vertexRDD.persist(newLevel)) + def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { + vertexRDD.compute(part, context) } - def cache() : JavaVertexRDD[T] = new JavaVertexRDD(vertexRDD.cache()) - - def mapVertexPartitions[VD2: ClassTag]( - f: ShippableVertexPartition[T] => ShippableVertexPartition[VD2]) : JavaVertexRDD[VD2] = { - new JavaVertexRDD(vertexRDD.mapVertexPartitions[VD2](f)) + f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]): JavaVertexRDD[VD] = { + vertexRDD.mapVertexPartitions(f).asInstanceOf[JavaVertexRDD[VD]] } - def mapValues[VD2: ClassTag](f: T => VD2) : JavaVertexRDD[VD2] = - new JavaVertexRDD(vertexRDD.mapValues(f)) + def reindex() : JavaVertexRDD[VD] = + JavaVertexRDD(vertexRDD.reindex()) + + /** Return a new DStream containing only the elements that satisfy a predicate. 
*/ + def filter(pred: Tuple2[VertexId, VD] => Boolean): JavaVertexRDD[VD] = + JavaVertexRDD(vertexRDD.filter(pred)) + + def mapVertexPartitions() + def mapValues() + def diff() + def leftZipJoin() + def leftJoin() + def innerZipJoin() + def innerJoin() + def aggregateUsingIndex() - def filter(pred: Tuple2[VertexId, T] => Boolean): JavaVertexRDD[T] = - this.mapVertexPartitions(_.filter(Function.untupled(pred))) + def fromEdges() } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index c4eca49f17cd..94ab1bfa8434 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -19,8 +19,7 @@ package org.apache.spark.graphx.api.python import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.graphx.VertexRDD +import org.apache.spark.graphx.api.java.JavaVertexRDD import org.apache.spark.rdd.RDD //class PythonVertexRDD ( @@ -49,13 +48,13 @@ class PythonVertexRDD(parent: RDD[_], schema: String) extends { // System.out.println("PythonVertexRDD constructor") // } - val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(this) + val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(parent.asInstanceOf) - def toVertexRDD[VD](pyRDD: RDD[_], schema: String): JavaRDD[Array[Byte]] = { + def toVertexRDD[VD](pyRDD: RDD[_], schema: String): JavaVertexRDD[Array[Byte]] = { // new VertexRDD[VD](PythonRDD.pythonToJava(pyRDD, true), StorageLevel.MEMORY_ONLY) System.out.println("In PythonVertexRDD.toVertexRDD()") - val propertySchema = new VertexSchema(schema) - val vertices = new VertexRDD[VertexSchema](pyRDD.mapPartitions()) + val propertySchema = new VertexProperty(schema) + val vertices = new JavaVertexRDD[VertexProperty](pyRDD.asInstanceOf) null } } @@ -63,13 +62,14 @@ class PythonVertexRDD(parent: RDD[_], schema: String) extends { object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 - def toVertexRDD(parent: RDD[_], schema: String) : JavaRDD[Array[Byte]] = { + def toVertexRDD(parent: RDD[_], schema: String) : JavaVertexRDD[Array[Byte]] = { val pyRDD = new PythonVertexRDD(parent, schema) pyRDD.toVertexRDD(parent, schema) } } -class VertexSchema(val schemaString: String) { +class VertexProperty(val schemaString: String) { + val schema : List[Any] = fromString(schemaString) /** * The vertex property schema is @@ -78,5 +78,4 @@ class VertexSchema(val schemaString: String) { */ def fromString(schemaString: String) : List[String] = schemaString.split(" ").toList - } diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java new file mode 100644 index 000000000000..82262c1a31a9 --- /dev/null +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.graphx; + + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.graphx.api.java.JavaVertexRDD; +import org.apache.spark.rdd.RDD; +import org.apache.spark.storage.StorageLevel; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class JavaAPISuite implements Serializable { + + private transient JavaSparkContext ssc; + + @Before + public void initialize() { + this.ssc = new JavaSparkContext("local", "GraphX JavaAPISuite"); + } + + @After + public void finalize() { + ssc.stop(); + ssc = null; + } + + @Test + public void testCount() { + List>> myList = + new ArrayList>>(); + myList.add(new Tuple2(1L, new Tuple2("abc", "XYZ"))); + myList.add(new Tuple2(2L, new Tuple2("def", "SFN"))); + myList.add(new Tuple2(3L, new Tuple2("xyz", "XYZ"))); + JavaRDD>> javaRDD = ssc.parallelize(myList); + JavaVertexRDD javaVertexRDD = new JavaVertexRDD(javaRDD.rdd()); + assertEquals(javaVertexRDD.count(), 3); + } +} diff --git a/pom.xml b/pom.xml index a23986a7e7fe..ac4d8dfd0f16 100644 --- a/pom.xml +++ b/pom.xml @@ -86,8 +86,8 @@ - core graphx + core bagel mllib tools diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index f2029cbbf0b0..35d4b1f9b3b1 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -233,7 +233,7 @@ def toVertexRDD(self, jrdd, ctx, jrdd_deserializer, schema): python_rdd = sc._jvm.PythonVertexRDD(bytearray(" ".join(x for x in schema.schema))) print "in toVertexRDD" - return python_rdd.asJavaRDD() + return python_rdd.asJavaVertexRDD() def id(self): """ diff --git a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 6e1f01900071..9bddbbcb4911 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -23,7 +23,7 @@ public abstract class LocalJavaStreamingContext { - protected transient JavaStreamingContext ssc; + protected transient JavaStreamingContext ssc; @Before public void setUp() { From 28be23e39b1d25f7ef331d27ef09c626bd176418 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Tue, 18 Nov 2014 20:10:48 -0500 Subject: [PATCH 10/25] SPARK-3789: WIP --- .../apache/spark/api/python/PythonRDD.scala | 4 +- .../org/apache/spark/graphx/VertexRDD.scala | 4 +- .../spark/graphx/api/java/JavaEdgeRDD.scala | 35 ++- .../graphx/api/java/JavaEdgeRDDLike.scala | 32 ++- .../spark/graphx/api/java/JavaGraph.scala | 89 +++++++- .../spark/graphx/api/java/JavaVertexRDD.scala | 29 +-- .../graphx/api/java/JavaVertexRDDLike.scala | 88 ++++++-- 
.../graphx/api/python/PythonEdgeRDD.scala | 24 +- .../graphx/api/python/PythonVertexRDD.scala | 57 ++--- .../org/apache/spark/graphx/JavaAPISuite.java | 69 ++++-- .../apache/spark/graphx/JavaTestUtils.scala | 41 ++++ python/pyspark/graphx/graph.py | 93 +++++++- python/pyspark/graphx/tests.py | 213 ++++++++++++++++++ python/pyspark/graphx/vertex.py | 6 +- 14 files changed, 671 insertions(+), 113 deletions(-) create mode 100644 graphx/src/test/java/org/apache/spark/graphx/JavaTestUtils.scala create mode 100644 python/pyspark/graphx/tests.py diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 45beb8fc8c92..e58e51b21176 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -45,7 +45,7 @@ private[spark] class PythonRDD( command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], - preservePartitoning: Boolean, + preservePartitioning: Boolean, pythonExec: String, broadcastVars: JList[Broadcast[Array[Byte]]], accumulator: Accumulator[JList[Array[Byte]]]) @@ -56,7 +56,7 @@ private[spark] class PythonRDD( override def getPartitions = firstParent.partitions - override val partitioner = if (preservePartitoning) firstParent.partitioner else None + override val partitioner = if (preservePartitioning) firstParent.partitioner else None override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { val startTime = System.currentTimeMillis diff --git a/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala index 2c8b245955d1..8deb1637d6b4 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala @@ -435,8 +435,8 @@ object VertexRDD { * @param numPartitions the desired number of partitions for the resulting `VertexRDD` * @param defaultVal the vertex attribute to use when creating missing vertices */ - def fromEdges[VD: ClassTag]( - edges: EdgeRDD[_, _], numPartitions: Int, defaultVal: VD): VertexRDD[VD] = { + def fromEdges[VD: ClassTag]( + edges: EdgeRDD[_, _], numPartitions: Int, defaultVal: VD): VertexRDD[VD] = { val routingTables = createRoutingTables(edges, new HashPartitioner(numPartitions)) val vertexPartitions = routingTables.mapPartitions({ routingTableIter => val routingTable = diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index 66f21ccc9233..7164834350d8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -16,9 +16,42 @@ */ package org.apache.spark.graphx.api.java -class JavaEdgeRDD { +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel +import scala.language.implicitConversions +import scala.reflect.ClassTag +class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] + (edges: RDD[Edge[ED]]) + extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], JavaRDD[Edge[ED]]] { + override def edgeRDD: EdgeRDD[ED, VD] = EdgeRDD.fromEdges(edges) + + /** Persist RDDs of this EdgeRDD with the default storage level (MEMORY_ONLY_SER) */ + def cache(): JavaEdgeRDD[ED, VD] = 
edges.cache().asInstanceOf[JavaEdgeRDD[ED, VD]] + + /** Persist RDDs of this EdgeRDD with the default storage level (MEMORY_ONLY_SER) */ + def persist(): JavaEdgeRDD[ED, VD] = edges.persist().asInstanceOf[JavaEdgeRDD[ED, VD]] + + /** Persist the RDDs of this DStream with the given storage level */ + def persist(storageLevel: StorageLevel): JavaEdgeRDD[ED, VD] = + edges.persist(storageLevel).asInstanceOf[JavaEdgeRDD[ED, VD]] + + def unpersist(blocking: Boolean = true) : JavaEdgeRDD[ED, VD] = + JavaEdgeRDD(edgeRDD.unpersist(blocking)) +} + +object JavaEdgeRDD { + + implicit def fromEdgeRDD[ED: ClassTag, VD: ClassTag] + (edges: JavaRDD[Edge[ED]]): JavaEdgeRDD[ED, VD] = + new JavaEdgeRDD(edges) + + implicit def apply[ED: ClassTag, VD: ClassTag](edges: JavaRDD[Edge[ED]]): JavaEdgeRDD[ED, VD] = { + new JavaEdgeRDD(edges) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala index 15c23db4fc36..b067fc414335 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala @@ -16,17 +16,45 @@ */ package org.apache.spark.graphx.api.java +import javax.swing.JList + +import java.lang.{Long => JLong} import org.apache.spark.api.java.JavaRDDLike import org.apache.spark.graphx._ +import org.apache.spark.graphx.impl.EdgePartition + +import scala.reflect.ClassTag -trait JavaEdgeRDDLike [ED, This <: JavaEdgeRDDLike[ED, This, R], +trait JavaEdgeRDDLike [ED, VD, This <: JavaEdgeRDDLike[ED, VD, This, R], R <: JavaRDDLike[Edge[ED], R]] extends Serializable { def edgeRDD: EdgeRDD[ED, VD] - def setName(name: String) = + def setName(name: String) = edgeRDD.setName(name) + + def collect(): JList[Edge[ED]] = edgeRDD.collect().toList.asInstanceOf + + def count(): Long = edgeRDD.count() + + def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] + (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] = { + JavaEdgeRDD(edgeRDD.mapEdgePartitions(f)) + } + def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { + JavaEdgeRDD(edgeRDD.mapValues(f)) + } + def filter + (epred: EdgeTriplet[VD, ED] => Boolean, + vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { + JavaEdgeRDD(edgeRDD.filter(epred, vpred)) + } + def innerJoin[ED2: ClassTag, ED3: ClassTag] + (other: EdgeRDD[ED2, _]) + (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { + JavaEdgeRDD[ED3, VD](edgeRDD.innerJoin(other)(f)) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala index 0f22530dd5ab..f603afbf643a 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -16,7 +16,94 @@ */ package org.apache.spark.graphx.api.java -class JavaGraph { +import java.lang.{Double => JDouble, Long => JLong} +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.graphx._ +import org.apache.spark.graphx.lib.PageRank +import org.apache.spark.rdd.RDD + +import scala.language.implicitConversions +import scala.reflect.ClassTag + +class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] + (vertexRDD : RDD[(VertexId, VD)], edgeRDD: RDD[Edge[ED]]) { + + def vertices: JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD) + def edges: JavaEdgeRDD[ED, VD] = 
JavaEdgeRDD(edgeRDD) + @transient lazy val graph : Graph[VD, ED] = Graph(vertexRDD, edgeRDD) + + def partitionBy(partitionStrategy: PartitionStrategy, numPartitions: Int): JavaGraph[VD, ED] = { + val graph = Graph(vertexRDD, edgeRDD) + JavaGraph(graph.partitionBy(partitionStrategy, numPartitions)) + } + + /** The number of edges in the graph. */ + def numEdges: JLong = edges.count() + + /** The number of vertices in the graph. */ + def numVertices: JLong = vertices.count() + + def inDegrees: JavaVertexRDD[Int] = JavaVertexRDD[Int](graph.inDegrees) + + def outDegrees: JavaVertexRDD[Int] = JavaVertexRDD[Int](graph.outDegrees) + + def mapVertices[VD2: ClassTag](map: (VertexId, VD) => VD2) : JavaGraph[VD2, ED] = { + JavaGraph(graph.mapVertices(map)) + } + + def mapEdges[ED2: ClassTag](map: Edge[ED] => ED2): JavaGraph[VD, ED2] = { + JavaGraph(graph.mapEdges(map)) + } + + def mapTriplets[ED2: ClassTag](map: EdgeTriplet[VD, ED] => ED2): JavaGraph[VD, ED2] = { + JavaGraph(graph.mapTriplets(map)) + } + + def reverse : JavaGraph[VD, ED] = JavaGraph(graph.reverse) + + def subgraph( + epred: EdgeTriplet[VD,ED] => Boolean = (x => true), + vpred: (VertexId, VD) => Boolean = ((v, d) => true)) : JavaGraph[VD, ED] = { + JavaGraph(graph.subgraph(epred, vpred)) + } + + def groupEdges(merge: (ED, ED) => ED): JavaGraph[VD, ED] = { + JavaGraph(graph.groupEdges(merge)) + } + + def mapReduceTriplets[A: ClassTag]( + mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], + reduceFunc: (A, A) => A, + activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) + : JavaVertexRDD[A] = { + JavaVertexRDD(graph.mapReduceTriplets(mapFunc, reduceFunc, activeSetOpt)) + } + + def outerJoinVertices[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) + (mapFunc: (VertexId, VD, Option[U]) => VD2) : JavaGraph[VD2, ED] = { + JavaGraph(graph.outerJoinVertices(other)(mapFunc)) + } + + def pagerank(tol: Double, resetProb: Double = 0.15) : JavaGraph[Double, Double] = + JavaGraph(PageRank.runUntilConvergence(graph, tol, resetProb)) +} + +object JavaGraph { + + implicit def fromRDD[VD: ClassTag, ED: ClassTag] + (javaVertexRDD: JavaRDD[(VertexId, VD)], javaEdgeRDD: JavaRDD[Edge[ED]]): JavaGraph[VD, ED] = { + new JavaGraph[VD, ED](javaVertexRDD, javaEdgeRDD) + } + + implicit def apply[VD: ClassTag, ED: ClassTag] + (vertexRDD: RDD[(VertexId, VD)], edgeRDD: RDD[Edge[ED]]): JavaGraph[VD, ED] = { + new JavaGraph[VD, ED](vertexRDD, edgeRDD) + } + + implicit def apply[VD: ClassTag, ED: ClassTag] + (graph: Graph[VD, ED]): JavaGraph[VD, ED] = { + new JavaGraph[VD, ED](graph.vertices, graph.edges) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index b88ac32daca6..5ca3cf0011fd 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -43,8 +43,8 @@ class JavaVertexRDD[VD]( override def vertexRDD = VertexRDD(vertices) - override def wrapRDD(in: RDD[(VertexId, VD)]): JavaRDD[(VertexId, VD)] = { - JavaRDD.fromRDD(in) + override def wrapRDD(rdd: RDD[(VertexId, VD)]): JavaRDD[(VertexId, VD)] = { + JavaRDD.fromRDD(rdd) } /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ @@ -57,28 +57,29 @@ class JavaVertexRDD[VD]( def persist(storageLevel: StorageLevel): JavaVertexRDD[VD] = vertices.persist(storageLevel).asInstanceOf[JavaVertexRDD[VD]] - def unpersist(blocking: Boolean = true) : 
this.type = - JavaVertexRDD(vertices.unpersist(blocking)) - + /** Generate a VertexRDD for the given duration */ override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = vertexRDD.compute(part, context) - + /** Convert [[org.apache.spark.api.java.JavaRDD]] to + * [[org.apache.spark.graphx.api.java.JavaVertexRDD]] instance */ def asJavaVertexRDD = JavaRDD.fromRDD(this.vertexRDD) - - - - - + /** Return a new VertexRDD containing only the elements that satisfy a predicate. */ + def filter(f: JFunction[(VertexId, VD), Boolean]): JavaVertexRDD[VD] = + JavaVertexRDD(vertexRDD.filter(x => f.call(x).booleanValue())) } object JavaVertexRDD { - implicit def fromVertexRDD[VD: ClassTag](vertices: JavaRDD[(VertexId, VD)]): JavaVertexRDD[VD] = - new JavaVertexRDD[VD](vertices) + /** + * Convert a scala [[org.apache.spark.graphx.VertexRDD]] to a Java-friendly + * [[org.apache.spark.graphx.api.java.JavaVertexRDD]]. + */ + implicit def fromVertexRDD[VD: ClassTag](vertexRDD: VertexRDD[VD]): JavaVertexRDD[VD] = + new JavaVertexRDD[VD](vertexRDD) - implicit def apply[VD: ClassTag](vertices: JavaRDD[(Long, VD)]): JavaVertexRDD[VD] = { + implicit def apply[VD: ClassTag](vertices: JavaRDD[(VertexId, VD)]): JavaVertexRDD[VD] = { new JavaVertexRDD[VD](vertices) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala index a35707ad5e20..389b71f74d5c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala @@ -41,17 +41,12 @@ trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], def wrapRDD(in: RDD[(VertexId, VD)]): R - /** - * Return a new single long element generated by counting all elements in the vertex RDD - */ - def count(): JLong = vertexRDD.count() + def collect(): List[(VertexId, VD)] = vertexRDD.collect().toList /** - * Construct a new VertexRDD that is indexed by only the visible vertices. The resulting - * VertexRDD will be based on a different index and can no longer be quickly joined with this - * RDD. + * Return a new single long element generated by counting all elements in the vertex RDD */ -// def reindex(): JavaVertexRDD[VD] = new JavaVertexRDD(vertexRDD.reindex()) + def count(): Long = vertexRDD.count() def setName(name: String) = vertexRDD.setName(name) @@ -59,26 +54,71 @@ trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], vertexRDD.compute(part, context) } + /** + * To construct a new Java interface of VertexRDD that is indexed by only the visible vertices. + * The resulting vertex RDD will be based on a different index and can no longer be quickly + * joined with this RDD. 
+ */ + def reindex() : JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD.reindex()) + + /** + * Applies a function to each `VertexPartition` of this RDD and returns a new + * [[org.apache.spark.graphx.api.java.JavaVertexRDD]] + */ def mapVertexPartitions[VD2: ClassTag]( - f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]): JavaVertexRDD[VD] = { - vertexRDD.mapVertexPartitions(f).asInstanceOf[JavaVertexRDD[VD]] + f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]) : JavaVertexRDD[VD2] = { + JavaVertexRDD(vertexRDD.mapVertexPartitions(f)) } - def reindex() : JavaVertexRDD[VD] = - JavaVertexRDD(vertexRDD.reindex()) + def mapValues[VD2: ClassTag](f: VD => VD2): JavaVertexRDD[VD2] = { + JavaVertexRDD(vertexRDD.mapValues(f)) + } - /** Return a new DStream containing only the elements that satisfy a predicate. */ - def filter(pred: Tuple2[VertexId, VD] => Boolean): JavaVertexRDD[VD] = - JavaVertexRDD(vertexRDD.filter(pred)) + /** Hides vertices that are the same between `this` and `other`; for vertices that are different, + * keeps the values from `other`. + */ + def diff(other: VertexRDD[VD]): JavaVertexRDD[VD] = { + JavaVertexRDD(vertexRDD.diff(other)) + } - def mapVertexPartitions() - def mapValues() - def diff() - def leftZipJoin() - def leftJoin() - def innerZipJoin() - def innerJoin() - def aggregateUsingIndex() + /** Takes a [[org.apache.spark.graphx.api.java.JavaVertexRDD]] instead of a + * [[org.apache.spark.graphx.VertexRDD]] as argument. + */ + def diff(other: JavaVertexRDD[VD]): JavaVertexRDD[VD] = { + JavaVertexRDD(vertexRDD.diff(other.vertexRDD)) + } - def fromEdges() + def leftZipJoin[VD2: ClassTag, VD3: ClassTag] + (other: VertexRDD[VD2])(f: (VertexId, VD, Option[VD2]) => VD3): JavaVertexRDD[VD3] = { + JavaVertexRDD(vertexRDD.leftZipJoin[VD2, VD3](other)(f)) + } + + def leftJoin[VD2: ClassTag, VD3: ClassTag] + (other: RDD[(VertexId, VD2)]) + (f: (VertexId, VD, Option[VD2]) => VD3) + : JavaVertexRDD[VD3] = { + JavaVertexRDD(vertexRDD.leftJoin(other)(f)) + } + + def innerZipJoin[U: ClassTag, VD2: ClassTag] + (other: VertexRDD[U]) + (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { + JavaVertexRDD(vertexRDD.innerZipJoin(other)(f)) + } + + def innerJoin[U: ClassTag, VD2: ClassTag] + (other: RDD[(VertexId, U)]) + (f: (VertexId, VD, U) => VD2): JavaVertexRDD[VD2] = { + JavaVertexRDD(vertexRDD.innerJoin(other)(f)) + } + + def aggregateUsingIndex[VD2: ClassTag] + (messages: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): JavaVertexRDD[VD2] = { + JavaVertexRDD(vertexRDD.aggregateUsingIndex(messages, reduceFunc)) + } + + def fromEdges[VD: ClassTag] + (edges: EdgeRDD[_, _], numPartitions: Int, defaultVal: VD): JavaVertexRDD[VD] = { + JavaVertexRDD(VertexRDD.fromEdges(edges, numPartitions, defaultVal)) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index 457e3b71196a..d222ff69db04 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -19,19 +19,23 @@ package org.apache.spark.graphx.api.python import java.util.{List => JList, Map => JMap} -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.graphx.EdgeRDD -import org.apache.spark.storage.StorageLevel +import org.apache.spark.Accumulator +import org.apache.spark.api.python.PythonRDD +import org.apache.spark.broadcast.Broadcast +import 
org.apache.spark.rdd.RDD -private[graphx] class PythonEdgeRDD ( - parent: JavaRDD[Array[Byte]], +private[graphx] class PythonEdgeRDD( + @transient parent: RDD[_], command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], - preservePartitioning: Boolean, + preservePartitoning: Boolean, pythonExec: String, - partitionStrategy: String, - targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends EdgeRDD[Array[Byte], Array[Byte]](parent.firstParent, targetStorageLevel) { + broadcastVars: JList[Broadcast[Array[Byte]]], + accumulator: Accumulator[JList[Array[Byte]]]) + extends PythonRDD (parent, command, envVars, + pythonIncludes, preservePartitoning, + pythonExec, broadcastVars, accumulator) { - } + def +} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 94ab1bfa8434..08ba61db50bf 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -19,53 +19,38 @@ package org.apache.spark.graphx.api.python import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} +import org.apache.spark.Accumulator +import org.apache.spark.api.python.PythonRDD +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.graphx.VertexRDD import org.apache.spark.graphx.api.java.JavaVertexRDD import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel -//class PythonVertexRDD ( -// parent: JavaRDD[Array[Byte]], -// command: Array[Byte], -// envVars: JMap[String, String], -// pythonIncludes: JList[String], -// preservePartitoning: Boolean, -// pythonExec: String, -// broadcastVars: JList[Broadcast[Array[Byte]]], -// accumulator: Accumulator[JList[Array[Byte]]], -// targetStorageLevel: String = "MEMORY_ONLY") -// extends RDD[Array[Byte]](parent) { +private[graphx] class PythonVertexRDD( + @transient parent: RDD[_], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitioning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[Array[Byte]]], + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel : String = StorageLevel.MEMORY_ONLY) + extends PythonRDD (parent, command, envVars, + pythonIncludes, preservePartitioning, + pythonExec, broadcastVars, accumulator) { -class PythonVertexRDD(parent: RDD[_], schema: String) extends { + val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(VertexRDD(parent.asInstanceOf)) - /** - * Implemented by subclasses to return the set of partitions in this RDD. This method will only - * be called once, so it is safe to implement a time-consuming computation in it. - */ -// override def getPartitions: Array[Partition] = ??? 
- -// def this(parent: JavaRDD[Array[Byte]], command: String, preservePartitioning: Boolean) { -// def this(parent: JavaRDD[Array[Byte]], command: String, preservePartitioning: Boolean) { -// this(parent, null, null, preservePartitioning, "MEMORY_ONLY") -// System.out.println("PythonVertexRDD constructor") -// } - - val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(parent.asInstanceOf) + def writeToFile(): = { - def toVertexRDD[VD](pyRDD: RDD[_], schema: String): JavaVertexRDD[Array[Byte]] = { -// new VertexRDD[VD](PythonRDD.pythonToJava(pyRDD, true), StorageLevel.MEMORY_ONLY) - System.out.println("In PythonVertexRDD.toVertexRDD()") - val propertySchema = new VertexProperty(schema) - val vertices = new JavaVertexRDD[VertexProperty](pyRDD.asInstanceOf) - null } + } object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 - - def toVertexRDD(parent: RDD[_], schema: String) : JavaVertexRDD[Array[Byte]] = { - val pyRDD = new PythonVertexRDD(parent, schema) - pyRDD.toVertexRDD(parent, schema) - } } class VertexProperty(val schemaString: String) { diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java index 82262c1a31a9..715ea9b4e980 100644 --- a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.spark.graphx; @@ -23,6 +22,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; +import org.apache.spark.graphx.api.java.JavaEdgeRDD; import org.apache.spark.graphx.api.java.JavaVertexRDD; import org.apache.spark.rdd.RDD; import org.apache.spark.storage.StorageLevel; @@ -30,21 +30,31 @@ import org.junit.Before; import org.junit.Test; import scala.Tuple2; +import scala.reflect.ClassTag; +import scala.reflect.ClassTag$; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; import static org.junit.Assert.assertEquals; public class JavaAPISuite implements Serializable { private transient JavaSparkContext ssc; + private List>> myList; + private ClassTag> classTag; + @Before public void initialize() { this.ssc = new JavaSparkContext("local", "GraphX JavaAPISuite"); + + this.myList = new ArrayList>>(); + this.myList.add(new Tuple2(1L, new VertexProperty("abc", "ABC"))); + this.myList.add(new Tuple2(2L, new VertexProperty("def", "DEF"))); + this.myList.add(new Tuple2(3L, new VertexProperty("xyz", "XYZ"))); + + this.classTag = ClassTag$.MODULE$.apply(VertexProperty.class); } @After @@ -53,15 +63,48 @@ public void finalize() { ssc = null; } + private class VertexProperty implements Serializable { + T1 field1; + T2 field2; + + VertexProperty(T1 field1, T2 field2) { + this.field1 = field1; + this.field2 = field2; + } + + T1 getField1() { return field1; } + T2 getField2() { return field2; } + void setField1(T1 value) { this.field1 = value; } + void setField2(T2 value) { this.field2 = value; } + } + @Test - public void testCount() { - List>> myList = - new ArrayList>>(); - myList.add(new Tuple2(1L, new Tuple2("abc", "XYZ"))); - myList.add(new Tuple2(2L, new Tuple2("def", "SFN"))); - myList.add(new Tuple2(3L, new Tuple2("xyz", "XYZ"))); - JavaRDD>> javaRDD = ssc.parallelize(myList); - JavaVertexRDD javaVertexRDD = new 
JavaVertexRDD(javaRDD.rdd()); - assertEquals(javaVertexRDD.count(), 3); + public void testVertexRDDCount() { + + JavaRDD>> + javaRDD = ssc.parallelize(this.myList); + + JavaVertexRDD> javaVertexRDD = + JavaVertexRDD.apply(javaRDD, this.classTag); + + assertEquals(javaVertexRDD.count(), 3L); + } + + @Test + public void testEdgeRDDMapValues() { + + List> edgeList = new ArrayList>(); + edgeList.add(new Edge(0, 1, "abcd")); + edgeList.add(new Edge(1, 2, "defg")); + edgeList.add(new Edge(2, 3, "hijk")); + edgeList.add(new Edge(1, 3, "lmno")); + + JavaRDD> javaRDD = ssc.parallelize(edgeList); + + ClassTag classTag = ClassTag$.MODULE$.apply(String.class); + + JavaEdgeRDD javaEdgeRDD = JavaEdgeRDD.apply(javaRDD, classTag); + + assertEquals(javaEdgeRDD.count(), 4L); } } diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaTestUtils.scala b/graphx/src/test/java/org/apache/spark/graphx/JavaTestUtils.scala new file mode 100644 index 000000000000..88dea1c41a54 --- /dev/null +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaTestUtils.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.graphx + +import org.apache.spark.api.java.JavaSparkContext +import java.util.{List => JList} + + +import org.apache.spark.graphx.api.java.JavaVertexRDD + +import scala.reflect.ClassTag + +object JavaTestUtils { + + def attachVertexRDD[VD]( + ssc: JavaSparkContext, + data: JList[Tuple2[Long, VD]], + numPartitions: Int) = { + + implicit val cm: ClassTag[VD] = + implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[VD]] + + val vertices = ssc.parallelize(data) + new JavaVertexRDD(vertices) + } + +} diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index fbc94d4fcb14..a42f7e4cbe40 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -18,7 +18,8 @@ """ Python bindings for GraphX. 
""" -from pyspark import PickleSerializer, RDD, StorageLevel +import itertools +from pyspark import PickleSerializer, RDD, StorageLevel, SparkContext from pyspark.graphx import VertexRDD, EdgeRDD from pyspark.graphx.partitionstrategy import PartitionStrategy @@ -28,11 +29,14 @@ __all__ = ["Graph"] class Graph(object): - def __init__(self, vertex_jrdd, edge_jrdd, partition_strategy=PartitionStrategy.EdgePartition1D): - self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context, BatchedSerializer(PickleSerializer())) - self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context, BatchedSerializer(PickleSerializer())) + def __init__(self, vertex_jrdd, edge_jrdd, + partition_strategy=PartitionStrategy.EdgePartition1D): + self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context, + BatchedSerializer(PickleSerializer())) + self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context, + BatchedSerializer(PickleSerializer())) self._partition_strategy = partition_strategy - self._sc = vertex_jrdd.context + self._jsc = vertex_jrdd.context def persist(self, storageLevel): self._vertex_jrdd.persist(storageLevel) @@ -50,11 +54,80 @@ def vertices(self): def edges(self): return self._edge_jrdd + # TODO def partitionBy(self, partitionStrategy): + return + + def numEdges(self): + return self._edge_jrdd.count() + + def numVertices(self): + return self._vertex_jrdd.count() + + # TODO + def inDegrees(self): + return + + # TODO + def outDegrees(self): + return + + # TODO + def degrees(self): + return + + def triplets(self): + if (isinstance(self._jsc, SparkContext)): + pyGraph = self._jsc.jvm.org.apache.spark.PythonGraph() + return pyGraph.triplets() + + # TODO + def unpersistVertices(self, blocking = True): + return + + def mapVertices(self, f): + def func(f): + return itertools.imap(f) + return self._vertex_jrdd.mapValues(func) + + # TODO + def mapEdges(self, f): + return + + # TODO + def mapTriplets(self, f): + return + # TODO + def reverse(self): + return + + # TODO + def subgraph(self, epred, pred): + return + + # TODO + def groupEdges(self, mergeFunc): + return + + # TODO + def joinVertices(self, mapFunc): + return + + # TODO + def outerJoinVertices(self, mapFunc): return - def subgraph(self, condition): + # TODO + def collectNeighborIds(self, edgeDirection): + return + + # TODO + def collectNeighbors(self, edgeDirection): + return + + # TODO + def mapReduceTriplets(self, mapFunc, reduceFunc): return def pagerank(self, num_iterations, reset_probability = 0.15): @@ -86,4 +159,12 @@ def func(iterator): py_graph = self._sc._jvm.org.apache.PythonGraph.apply(func) return py_graph.asJavaRDD() + # TODO + def triangleCount(self): + return + + # TODO + def stronglyConnectedComponents(self, iterations): + return + diff --git a/python/pyspark/graphx/tests.py b/python/pyspark/graphx/tests.py new file mode 100644 index 000000000000..1dded0aff5ae --- /dev/null +++ b/python/pyspark/graphx/tests.py @@ -0,0 +1,213 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.context import SparkConf, SparkContext, RDD +from pyspark.graphx.vertex import VertexRDD + + +class PyVertexRDDTestCase(unittest.TestCase): + """ + Test collect, take, count, mapValues, diff, + filter, mapVertexPartitions, innerJoin and leftJoin + for VertexRDD + """ + + def setUp(self): + class_name = self.__class__.__name__ + conf = SparkConf().set("spark.default.parallelism", 1) + self.sc = SparkContext(appName=class_name, conf=conf) + self.sc.setCheckpointDir("/tmp") + + def tearDown(self): + self.sc.stop() + + def collect(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.take(1) + self.assertEqual(results, [(3, ("rxin", "student"))]) + + def take(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + + def count(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.count() + self.assertEqual(results, 2) + + def mapValues(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.mapValues(lambda x: x + ":" + x) + self.assertEqual(results, [(3, ("rxin:rxin", "student:student")), + (7, ("jgonzal:jgonzal", "postdoc:postdoc"))]) + + def diff(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.diff(vertices1) + self.assertEqual(results, 2) + + # TODO + def innerJoin(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.innerJoin(vertices1) + self.assertEqual(results, 2) + + # TODO + def leftJoin(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.diff(vertices1) + self.assertEqual(results, 2) + + +class PyEdgeRDDTestCase(unittest.TestCase): + """ + Test collect, take, count, mapValues, + filter and innerJoin for EdgeRDD + """ + + def setUp(self): + class_name = self.__class__.__name__ + conf = SparkConf().set("spark.default.parallelism", 1) + self.sc = SparkContext(appName=class_name, conf=conf) + self.sc.setCheckpointDir("/tmp") + + def tearDown(self): + self.sc.stop() + + # TODO + def collect(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), 
(7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + + # TODO + def take(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + + # TODO + def count(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, 2) + + # TODO + def mapValues(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, 2) + + # TODO + def filter(self): + return + + # TODO + def innerJoin(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.diff(vertices1) + self.assertEqual(results, 2) + + +class PyGraphXTestCase(unittest.TestCase): + """ + Test vertices, edges, partitionBy, numEdges, numVertices, + inDegrees, outDegrees, degrees, triplets, mapVertices, + mapEdges, mapTriplets, reverse, subgraph, groupEdges, + joinVertices, outerJoinVertices, collectNeighborIds, + collectNeighbors, mapReduceTriplets, triangleCount for Graph + """ + + def setUp(self): + class_name = self.__class__.__name__ + conf = SparkConf().set("spark.default.parallelism", 1) + self.sc = SparkContext(appName=class_name, conf=conf) + self.sc.setCheckpointDir("/tmp") + + def tearDown(self): + self.sc.stop() + + def collect(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + + def take(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + + def count(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, 2) + + def mapValues(self): + vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertices = VertexRDD(vertexData) + results = vertices.collect() + self.assertEqual(results, 2) + + def diff(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.diff(vertices1) + self.assertEqual(results, 2) + + def innerJoin(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.diff(vertices1) + self.assertEqual(results, 2) 
+ + def leftJoin(self): + vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) + vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) + vertices0 = VertexRDD(vertexData0) + vertices1 = VertexRDD(vertexData1) + results = vertices0.diff(vertices1) + self.assertEqual(results, 2) diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 35d4b1f9b3b1..d546bae95855 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -319,12 +319,14 @@ def _jrdd(self): self._ctx._gateway._gateway_client) includes = ListConverter().convert(self._ctx._python_includes, self._ctx._gateway._gateway_client) + targetStorageLevel = StorageLevel.MEMORY_ONLY python_rdd = self._ctx._jvm.PythonVertexRDD(self._prev_jrdd.rdd(), bytearray(pickled_command), env, includes, self.preservesPartitioning, self._ctx.pythonExec, - broadcast_vars, self._ctx._javaAccumulator) - self._jrdd_val = python_rdd.asJavaRDD() + broadcast_vars, self._ctx._javaAccumulator, + targetStorageLevel) + self._jrdd_val = python_rdd.asJavaVertexRDD() if enable_profile: self._id = self._jrdd_val.id() From a23d4183bbb65f647d54cbbe533d7d705cc8ae47 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Tue, 18 Nov 2014 22:12:48 -0500 Subject: [PATCH 11/25] SPARK-3789: WIP --- .../graphx/api/python/PythonVertexRDD.scala | 25 ++++++++++++++++++- python/pyspark/graphx/vertex.py | 23 +++-------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 08ba61db50bf..8615a8c1f617 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -17,6 +17,7 @@ package org.apache.spark.graphx.api.python +import java.io.{DataOutputStream, FileOutputStream} import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import org.apache.spark.Accumulator @@ -43,8 +44,30 @@ private[graphx] class PythonVertexRDD( val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(VertexRDD(parent.asInstanceOf)) - def writeToFile(): = { + def writeToFile[T](items: java.util.Iterator[T], filename: String) { + import scala.collection.JavaConverters._ + writeToFile(items.asScala, filename) + } + + def writeToFile[T](items: Iterator[T], filename: String) { + val file = new DataOutputStream(new FileOutputStream(filename)) + writeIteratorToStream(items, file) + file.close() + } + /** A data stream is written to a given file so that the collect() method + * of class VertexRDD in Python can read it back in the client and + * display the contents of the VertexRDD as a list + */ + def writeIteratorToStream[T](items: Iterator[T], stream: DataOutputStream) = { + if (items.hasNext) { + val first = items.next() + val newIter = Seq(first).iterator ++ items + // Assuming the type of this RDD will always be Array[Byte] + newIter.asInstanceOf[Iterator[Array[Byte]]].foreach { bytes => + stream.writeInt(bytes.length) + stream.write(bytes) + } } } diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index d546bae95855..612623a1c8aa 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -79,13 +79,10 @@ class VertexRDD(object): in PythonVertexRDD class in [[org.apache.spark.graphx.api.python package]] """ - def __init__(self, 
vertex_property, jrdd, + def __init__(self, jrdd, jrdd_deserializer = BatchedSerializer(PickleSerializer())): """ Constructor - :param vertex_property: A tuple of the vertex properties, e.g. - vd=sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) - vertices=VertexRDD(vd,("String", "String")) :param jrdd: :param jrdd_deserializer: @@ -102,8 +99,6 @@ def __init__(self, vertex_property, jrdd, self._partitionFunc = None self._jrdd_val = None self._bypass_serializer = False - self._schema = VertexPropertySchema(vertex_property) - self._jrdd_val = self.toVertexRDD(self._jrdd, self._ctx, self._jrdd_deserializer, self._schema) # TODO: Does not work @@ -119,16 +114,12 @@ def cache(self): def count(self): return self._jrdd.count() - # def collect(self): - # return self._jrdd.collect() - def collect(self): - print "in collect() of vertex.py" """ Return a list that contains all of the elements in this RDD. """ - # with SCCallSiteSync(self._ctx) as css: - bytesInJava = self._jrdd.collect().iterator() + with SCCallSiteSync(self._ctx) as css: + bytesInJava = self._jrdd.collect().iterator() return list(self._collect_iterator_through_file(bytesInJava)) def _collect_iterator_through_file(self, iterator): @@ -227,14 +218,6 @@ def func(iterator): return reduce(f, vals) raise ValueError("Can not reduce() empty RDD") - def toVertexRDD(self, jrdd, ctx, jrdd_deserializer, schema): - - sc = jrdd.context - python_rdd = sc._jvm.PythonVertexRDD(bytearray(" ".join(x for x in schema.schema))) - print "in toVertexRDD" - - return python_rdd.asJavaVertexRDD() - def id(self): """ A unique ID for this RDD (within its SparkContext). From 19b280d043982c628ff1ca90f5f38db527ddd4b9 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Wed, 19 Nov 2014 03:46:21 -0500 Subject: [PATCH 12/25] SPARK-3789: Updated vertex.py, edge.py and graph.py --- .../graphx/api/python/PythonEdgeRDD.scala | 42 ++- .../spark/graphx/api/python/PythonGraph.scala | 8 +- python/pyspark/graphx/edge.py | 254 ++++++++++++++++-- python/pyspark/graphx/graph.py | 15 +- python/pyspark/graphx/vertex.py | 173 +++++++----- 5 files changed, 375 insertions(+), 117 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index d222ff69db04..7e9241b63733 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -17,25 +17,57 @@ package org.apache.spark.graphx.api.python +import java.io.{DataOutputStream, FileOutputStream} import java.util.{List => JList, Map => JMap} import org.apache.spark.Accumulator import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast +import org.apache.spark.graphx.api.java.JavaEdgeRDD import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel private[graphx] class PythonEdgeRDD( @transient parent: RDD[_], command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], - preservePartitoning: Boolean, + preservePartitioning: Boolean, pythonExec: String, broadcastVars: JList[Broadcast[Array[Byte]]], - accumulator: Accumulator[JList[Array[Byte]]]) + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel : String = StorageLevel.MEMORY_ONLY) extends PythonRDD (parent, command, envVars, - pythonIncludes, preservePartitoning, - pythonExec, broadcastVars, accumulator) { + pythonIncludes, preservePartitioning, + 
pythonExec, broadcastVars, accumulator) { - def + val asJavaEdgeRDD = JavaEdgeRDD.fromEdgeRDD(parent.asInstanceOf) + + def writeToFile[T](items: java.util.Iterator[T], filename: String) { + import scala.collection.JavaConverters._ + writeToFile(items.asScala, filename) + } + + def writeToFile[T](items: Iterator[T], filename: String) { + val file = new DataOutputStream(new FileOutputStream(filename)) + writeIteratorToStream(items, file) + file.close() + } + + /** A data stream is written to a given file so that the collect() method + * of class VertexRDD in Python can read it back in the client and + * display the contents of the VertexRDD as a list + */ + def writeIteratorToStream[T](items: Iterator[T], stream: DataOutputStream) = { + if (items.hasNext) { + val first = items.next() + val newIter = Seq(first).iterator ++ items + // Assuming the type of this RDD will always be Array[Byte] + newIter.asInstanceOf[Iterator[Array[Byte]]].foreach { bytes => + stream.writeInt(bytes.length) + stream.write(bytes) + } + } + } } + diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala index 2adad9837adc..3fd3b6710cbe 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -18,12 +18,18 @@ package org.apache.spark.graphx.api.python import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.graphx.api.java.{JavaEdgeRDD, JavaVertexRDD, JavaGraph} @DeveloperApi -abstract private[graphx] class PythonGraph ( +private[graphx] class PythonGraph ( @transient val vertexRDD: PythonVertexRDD, @transient val edgeRDD: PythonEdgeRDD) // extends Graph[Array[Byte], Array[Byte]] with Serializable { extends Serializable { + + val asJavaGraph = JavaGraph.fromRDD(vertexRDD.asJavaRDD, edgeRDD.asJavaEdgeRDD) + + } diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py index f6b8f2fb4155..5222429f066d 100644 --- a/python/pyspark/graphx/edge.py +++ b/python/pyspark/graphx/edge.py @@ -20,9 +20,15 @@ """ import operator import itertools +from tempfile import NamedTemporaryFile +from build.py4j.java_collections import MapConverter, ListConverter +from pyspark.accumulators import PStatsParam from pyspark.graphx.partitionstrategy import PartitionStrategy from pyspark import RDD, StorageLevel from pyspark.rdd import PipelinedRDD +from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer, \ + NoOpSerializer +from pyspark.traceback_utils import SCCallSiteSync __all__ = ["EdgeRDD", "Edge"] @@ -52,62 +58,254 @@ def __str__(self): return self._src_id + self._tgt_id + self._property -class EdgeRDD(RDD): - """ - EdgeRDD class is used to enter the vertex class for GraphX - """ +class EdgeRDD(object): + def __init__(self, jrdd, + jrdd_deserializer = BatchedSerializer(PickleSerializer())): + """ + Constructor + :param jrdd: A JavaRDD reference passed from the parent + RDD object + :param jrdd_deserializer: The deserializer used in Python workers + created from PythonRDD to execute a + serialized Python function and RDD + + """ - def __init__(self, jrdd, ctx, jrdd_deserializer): self._jrdd = jrdd - self._ctx = ctx + self._ctx = jrdd._jrdd.context self._jrdd_deserializer = jrdd_deserializer + self._preserve_partitioning = False self._name = "VertexRDD" + self._is_cached = False + self._is_checkpointed = False + 
self._id = jrdd.id() + self._partitionFunc = None + self._jrdd_val = None + self._bypass_serializer = False + + + def id(self): + """ + VertexRDD has a unique id + """ + return self._id # TODO: Does not work def __repr__(self): - return RDD(self._jrdd, self._ctx, self._jrdd_deserializer).take(1).__repr__() + return self._jrdd.toString() - def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): - return self._jrdd.persist(storageLevel) + def context(self): + return self._ctx def cache(self): - self._jrdd.cache() + """ + Persist this vertex RDD with the default storage level (C{MEMORY_ONLY_SER}). + """ + self.is_cached = True + self.persist(StorageLevel.MEMORY_ONLY_SER) + return self + + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + self._is_cached = True + javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) + self._jrdd.persist(javaStorageLevel) + return self + + def unpersist(self): + self._is_cached = False + self._jrdd.unpersist() + return self + + def checkpoint(self): + self.is_checkpointed = True + self._jrdd.rdd().checkpoint() def count(self): return self._jrdd.count() def collect(self): - return self._jrdd.collect() + """ + Return all of the elements in this vertex RDD as a list + """ + with SCCallSiteSync(self._ctx) as css: + bytesInJava = self._jrdd.collect().iterator() + return list(self._collect_iterator_through_file(bytesInJava)) + + def _collect_iterator_through_file(self, iterator): + # Transferring lots of data through Py4J can be slow because + # socket.readline() is inefficient. Instead, we'll dump the data to a + # file and read it back. + tempFile = NamedTemporaryFile(delete=False, dir=self._ctx._temp_dir) + tempFile.close() + self._ctx._writeToFile(iterator, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + for item in self._jrdd_deserializer.load_stream(tempFile): + yield item + os.unlink(tempFile.name) def take(self, num=10): return self._jrdd.take(num) def sum(self): + self._jrdd.sum() + + def mapValues(self, f, preserves_partitioning=False): """ - Add up the elements in this RDD. + Return a new vertex RDD by applying a function to each vertex attributes, + preserving the index - >>> sc.parallelize([1.0, 2.0, 3.0]).sum() - 6.0 + >>> rdd = sc.parallelize([Edge(1, 2, "b"), (2, 3, "a"), (3, 2, "c")]) + >>> vertices = EdgeRDD(rdd) + >>> sorted(edges.mapValues(lambda x: (x + ":" + x)).collect()) + [(1, 2, 'a:a'), (2, 3, 'b:b'), (3, 2, 'c:c')] """ - return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) + map_func = lambda (k, v): (k, f(v)) + def func(_, iterator): + return itertools.imap(map_func, iterator) + return PipelinedEdgeRDD(self, func, preserves_partitioning) - def mapValues(self, f, preservesPartitioning=False): + def mapEdgePartitions(self, f, preserve_partitioning=False): + def func(s, iterator): + return f(iterator) + return PipelinedEdgeRDD(self, func, preserve_partitioning) + + def filter(self, f): """ - Return a new RDD by applying a function to each element of this RDD. + Return a new vertex RDD containing only the elements that satisfy a predicate. 
- >>> rdd = sc.parallelize(["b", "a", "c"]) - >>> sorted(rdd.map(lambda x: (x, 1)).collect()) - [('a', 1), ('b', 1), ('c', 1)] + >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) + >>> vertices = VertexRDD(rdd) + >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() + [2] """ - def func(_, iterator): - return itertools.imap(f, iterator) - return self.mapVertexPartitions(func, preservesPartitioning) + def func(iterator): + return itertools.ifilter(f, iterator) + return self.mapEdgePartitions(func, True) def filter(self, f): - return self._jrdd.filter(f) + """ + Return a new vertex RDD containing only the elements that satisfy a predicate. + + >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) + >>> vertices = VertexRDD(rdd) + >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() + [2] + """ + def func(iterator): + return itertools.ifilter(f, iterator) + return self.maEdgePartitions(func, True) + + def innerJoin(self, other, numPartitions=None): + def dispatch(seq): + vbuf, wbuf = [], [] + for (n, v) in seq: + if n == 1: + vbuf.append(v) + elif n == 2: + wbuf.append(v) + return [(v, w) for v in vbuf for w in wbuf] + vs = self.map(lambda (k, v): (k, (1, v))) + ws = other.map(lambda (k, v): (k, (2, v))) + return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) + + + +class PipelinedEdgeRDD(EdgeRDD): + + """ + Pipelined maps: + + >>> rdd = sc.parallelize([1, 2, 3, 4]) + >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect() + [4, 8, 12, 16] + >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect() + [4, 8, 12, 16] + + Pipelined reduces: + >>> from operator import add + >>> rdd.map(lambda x: 2 * x).reduce(add) + 20 + >>> rdd.flatMap(lambda x: [x, x]).reduce(add) + 20 + """ + + def __init__(self, prev, func, preservesPartitioning=False): + if not isinstance(prev, PipelinedEdgeRDD) or not prev._is_pipelinable(): + # This transformation is the first in its stage: + self.func = func + self.preservesPartitioning = preservesPartitioning + self._prev_jrdd = prev._jrdd + self._prev_jrdd_deserializer = prev._jrdd_deserializer + else: + prev_func = prev.func + + def pipeline_func(split, iterator): + return func(split, prev_func(split, iterator)) + self.func = pipeline_func + self.preservesPartitioning = \ + prev.preservesPartitioning and preservesPartitioning + self._prev_jrdd = prev._prev_jrdd # maintain the pipeline + self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer + self.is_cached = False + self.is_checkpointed = False + self._ctx = prev._ctx + self.prev = prev + self._jrdd_val = None + self._id = None + self._jrdd_deserializer = self._ctx.serializer + self._bypass_serializer = False + self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None + self._broadcast = None + + def __del__(self): + if self._broadcast: + self._broadcast.unpersist() + self._broadcast = None + + @property + def _jrdd(self): + print "in _jrdd of edge.py" + if self._jrdd_val: + return self._jrdd_val + if self._bypass_serializer: + self._jrdd_deserializer = NoOpSerializer() + enable_profile = self._ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self._ctx.accumulator(None, PStatsParam) if enable_profile else None + command = (self.func, profileStats, self._prev_jrdd_deserializer, + self._jrdd_deserializer) + # the serialized command will be compressed by broadcast + ser = CloudPickleSerializer() + pickled_command = ser.dumps(command) + if len(pickled_command) > (1 << 20): # 1M + self._broadcast = 
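PipelinedEdgeRDD above chains transformations without materialising intermediate RDDs: each stage wraps the previous stage's per-partition function, so a chain of mapValues/filter calls collapses into a single function that is shipped to the workers once. The composition itself is just nested closures, as in this standalone sketch (names are illustrative):

    def compose_stages(prev_func, func):
        # Same shape as pipeline_func: feed the previous stage's output
        # for a partition straight into the next stage.
        def pipelined(split, iterator):
            return func(split, prev_func(split, iterator))
        return pipelined

    double = lambda split, it: (x * 2 for x in it)                    # stage 1
    keep_mult_of_4 = lambda split, it: (x for x in it if x % 4 == 0)  # stage 2

    pipeline = compose_stages(double, keep_mult_of_4)
    assert list(pipeline(0, iter([1, 2, 3, 4]))) == [4, 8]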
self._ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self._broadcast) + broadcast_vars = ListConverter().convert( + [x._jbroadcast for x in self._ctx._pickled_broadcast_vars], + self._ctx._gateway._gateway_client) + self._ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self._ctx.environment, + self._ctx._gateway._gateway_client) + includes = ListConverter().convert(self._ctx._python_includes, + self._ctx._gateway._gateway_client) + targetStorageLevel = StorageLevel.MEMORY_ONLY + python_rdd = self._ctx._jvm.PythonEdgeRDD(self._prev_jrdd.rdd(), + bytearray(pickled_command), + env, includes, self.preservesPartitioning, + self._ctx.pythonExec, + broadcast_vars, self._ctx._javaAccumulator, + targetStorageLevel) + self._jrdd_val = python_rdd.asJavaEdgeRDD() + + if enable_profile: + self._id = self._jrdd_val.id() + self._ctx._add_profile(self._id, profileStats) + return self._jrdd_val - def innerJoin(self, f): - return self._jrdd.innerJoin(f) + def id(self): + if self._id is None: + self._id = self._jrdd.id() + return self._id - def leftOuterJoin(self, other, numPartitions=None): - return self._jrdd.leftOuterJoin(other, numPartitions) \ No newline at end of file + def _is_pipelinable(self): + return not (self.is_cached or self.is_checkpointed) diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index a42f7e4cbe40..4c6e38e9f5cd 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -54,16 +54,16 @@ def vertices(self): def edges(self): return self._edge_jrdd - # TODO - def partitionBy(self, partitionStrategy): - return - def numEdges(self): return self._edge_jrdd.count() def numVertices(self): return self._vertex_jrdd.count() + # TODO + def partitionBy(self, partitionStrategy): + return + # TODO def inDegrees(self): return @@ -86,13 +86,10 @@ def unpersistVertices(self, blocking = True): return def mapVertices(self, f): - def func(f): - return itertools.imap(f) - return self._vertex_jrdd.mapValues(func) + return self._vertex_jrdd.mapValues(f) - # TODO def mapEdges(self, f): - return + return self._vertex_jrdd.mapValues(f) # TODO def mapTriplets(self, f): diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 612623a1c8aa..d44d55409dbe 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -83,8 +83,11 @@ def __init__(self, jrdd, jrdd_deserializer = BatchedSerializer(PickleSerializer())): """ Constructor - :param jrdd: - :param jrdd_deserializer: + :param jrdd: A JavaRDD reference passed from the parent + RDD object + :param jrdd_deserializer: The deserializer used in Python workers + created from PythonRDD to execute a + serialized Python function and RDD """ @@ -101,22 +104,48 @@ def __init__(self, jrdd, self._bypass_serializer = False + def id(self): + """ + VertexRDD has a unique id + """ + return self._id + # TODO: Does not work def __repr__(self): return self._jrdd.toString() - def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): - return self._jrdd.persist(storageLevel) + def context(self): + return self._ctx def cache(self): - self._jrdd.cache() + """ + Persist this vertex RDD with the default storage level (C{MEMORY_ONLY_SER}). 
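mapVertices and mapEdges in graph.py above only transform the attribute, never the id, which is why the underlying mapValues rebuilds each (id, value) pair around the user's function and can tell Spark that partitioning is preserved. The per-partition wrapper looks roughly like this (standalone sketch, not the patch's code):

    def map_values_partition(f):
        # Run a value-only function over an iterator of (key, value)
        # pairs without touching the keys.
        def run(iterator):
            for key, value in iterator:
                yield key, f(value)
        return run

    part = iter([(1, "a"), (2, "b"), (3, "c")])
    assert list(map_values_partition(lambda v: v + ":" + v)(part)) == \
        [(1, "a:a"), (2, "b:b"), (3, "c:c")]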
+ """ + self.is_cached = True + self.persist(StorageLevel.MEMORY_ONLY_SER) + return self + + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + self._is_cached = True + javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) + self._jrdd.persist(javaStorageLevel) + return self + + def unpersist(self): + self._is_cached = False + self._jrdd.unpersist() + return self + + def checkpoint(self): + self.is_checkpointed = True + self._jrdd.rdd().checkpoint() def count(self): return self._jrdd.count() def collect(self): """ - Return a list that contains all of the elements in this RDD. + Return all of the elements in this vertex RDD as a list """ with SCCallSiteSync(self._ctx) as css: bytesInJava = self._jrdd.collect().iterator() @@ -141,88 +170,84 @@ def take(self, num=10): def sum(self): self._jrdd.sum() - def mapValues(self, f, preservesPartitioning=False): + def mapValues(self, f, preserves_partitioning=False): """ - Return a new RDD by applying a function to each element of this RDD. + Return a new vertex RDD by applying a function to each vertex attributes, + preserving the index - >>> rdd = sc.parallelize(["b", "a", "c"]) - >>> sorted(rdd.map(lambda x: (x, 1)).collect()) - [('a', 1), ('b', 1), ('c', 1)] + >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) + >>> vertices = VertexRDD(rdd) + >>> sorted(vertices.mapValues(lambda x: (x + ":" + x)).collect()) + [(1, 'a:a'), (2, 'b:b'), (3, 'c:c')] """ + map_func = lambda (k, v): (k, f(v)) def func(_, iterator): - return itertools.imap(f, iterator) - return self.mapVertexPartitions(func, preservesPartitioning) - - def filter(self, f): - return self._jrdd.filter(f) + return itertools.imap(map_func, iterator) + return PipelinedVertexRDD(self, func, preserves_partitioning) - def diff(self, other): - """ - Return a new RDD containing only the elements that satisfy a predicate. - - >>> rdd1 = sc.parallelize([1, 2, 3, 4, 5]) - >>> rdd2 = sc.parallelize([2, 3, 4]) - >>> rdd1.diff(rdd2).collect() - [1, 5] - """ - self._jrdd = self._jrdd._jvm.org.apache.spark.PythonVertexRDD() - return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.diff(other) - - def leftJoin(self, other): - return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.leftJoin() - - def innerJoin(self, other, func): - return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.innerJoin() - - def aggregateUsingIndex(self, other, reduceFunc): - return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() - - def mapVertexPartitions(self, f, preservesPartitioning=False): - """ - Return a new RDD by applying a function to each partition of this RDD. - - >>> rdd = sc.parallelize([1, 2, 3, 4], 2) - >>> def f(iterator): yield sum(iterator) - >>> rdd.mapPartitions(f).collect() - [3, 7] - """ + def mapVertexPartitions(self, f, preserve_partitioning=False): def func(s, iterator): return f(iterator) - return self._jrdd.mapPartitionsWithIndex(func, preservesPartitioning) + return PipelinedVertexRDD(self, func, preserve_partitioning) - def reduce(self, f): + def filter(self, f): """ - Reduces the elements of this RDD using the specified commutative and - associative binary operator. Currently reduces partitions locally. - - >>> from operator import add - >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add) - 15 - >>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add) - 10 - >>> sc.parallelize([]).reduce(add) - Traceback (most recent call last): - ... 
- ValueError: Can not reduce() empty RDD + Return a new vertex RDD containing only the elements that satisfy a predicate. + + >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) + >>> vertices = VertexRDD(rdd) + >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() + [2] """ def func(iterator): - iterator = iter(iterator) - try: - initial = next(iterator) - except StopIteration: - return - yield reduce(f, iterator, initial) - - vals = self.mapVertexPartitions(func).collect() - if vals: - return reduce(f, vals) - raise ValueError("Can not reduce() empty RDD") + return itertools.ifilter(f, iterator) + return self.mapVertexPartitions(func, True) - def id(self): + def diff(self, other, numPartitions=2): """ - A unique ID for this RDD (within its SparkContext). + Hides vertices that are the same between `this` and `other`. + For vertices that are different, keeps the values from `other`. + + TODO: give an example """ - return self._id + if (isinstance(other, RDD)): + vs = self.map(lambda (k, v): (k, (1, v))) + ws = other.map(lambda (k, v): (k, (2, v))) + return vs.union(ws).groupByKey(numPartitions).mapValues(lambda x: x.diff(x.__iter__())) + + def leftJoin(self, other, numPartitions=None): + def dispatch(seq): + vbuf, wbuf = [], [] + for (n, v) in seq: + if n == 1: + vbuf.append(v) + elif n == 2: + wbuf.append(v) + if not wbuf: + wbuf.append(None) + return [(v, w) for v in vbuf for w in wbuf] + vs = self.map(lambda (k, v): (k, (1, v))) + ws = other.map(lambda (k, v): (k, (2, v))) + return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) + + + def innerJoin(self, other, numPartitions=None): + def dispatch(seq): + vbuf, wbuf = [], [] + for (n, v) in seq: + if n == 1: + vbuf.append(v) + elif n == 2: + wbuf.append(v) + return [(v, w) for v in vbuf for w in wbuf] + vs = self.map(lambda (k, v): (k, (1, v))) + ws = other.map(lambda (k, v): (k, (2, v))) + return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) + + + + # def aggregateUsingIndex(self, other, reduceFunc): + # return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() class PipelinedVertexRDD(VertexRDD): From d07ae43bf32c6ffc1ec62f938febd404f41b41b7 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Wed, 10 Dec 2014 10:48:15 -0500 Subject: [PATCH 13/25] SPARK-3789: JavaEdgeRDDLike compiler errors. 
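The leftJoin and innerJoin implementations a few hunks above rely on the usual tag-and-dispatch trick: tag each side's pairs with 1 or 2, union and group by key, then expand the per-key buffers, padding the right side with None for the left join. The dispatch step on plain Python pairs (illustrative only):

    from collections import defaultdict

    def left_join(left, right):
        # left, right: iterables of (key, value) pairs.
        groups = defaultdict(lambda: ([], []))      # key -> (left buffer, right buffer)
        for k, v in left:
            groups[k][0].append(v)
        for k, w in right:
            groups[k][1].append(w)
        for k, (vbuf, wbuf) in groups.items():
            if not vbuf:
                continue                            # key only present on the right side
            for v in vbuf:
                for w in (wbuf or [None]):          # pad missing right values
                    yield k, (v, w)

    pairs = left_join([(1, "a"), (2, "b")], [(2, "x")])
    assert sorted(pairs) == [(1, ("a", None)), (2, ("b", "x"))]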
--- .../spark/graphx/api/java/JavaEdgeRDD.scala | 10 +++--- .../graphx/api/java/JavaEdgeRDDLike.scala | 33 ++++++++++--------- .../spark/graphx/api/java/JavaGraph.scala | 10 ++++-- .../graphx/api/python/PythonEdgeRDD.scala | 4 +-- .../spark/graphx/api/python/PythonGraph.scala | 5 +-- .../graphx/api/python/PythonVertexRDD.scala | 10 +++--- 6 files changed, 38 insertions(+), 34 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index 7164834350d8..0e4a56f02134 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -28,6 +28,8 @@ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] (edges: RDD[Edge[ED]]) extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], JavaRDD[Edge[ED]]] { + override def wrapRDD(edgeRDD: RDD[Edge[ED]]): JavaRDD[Edge[ED]] = JavaRDD.fromRDD(edgeRDD) + override def edgeRDD: EdgeRDD[ED, VD] = EdgeRDD.fromEdges(edges) /** Persist RDDs of this EdgeRDD with the default storage level (MEMORY_ONLY_SER) */ @@ -41,17 +43,13 @@ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] edges.persist(storageLevel).asInstanceOf[JavaEdgeRDD[ED, VD]] def unpersist(blocking: Boolean = true) : JavaEdgeRDD[ED, VD] = - JavaEdgeRDD(edgeRDD.unpersist(blocking)) + JavaEdgeRDD.fromEdgeRDD(edgeRDD.unpersist(blocking)) } object JavaEdgeRDD { implicit def fromEdgeRDD[ED: ClassTag, VD: ClassTag] - (edges: JavaRDD[Edge[ED]]): JavaEdgeRDD[ED, VD] = + (edges: EdgeRDD[ED, VD]): JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edges) - - implicit def apply[ED: ClassTag, VD: ClassTag](edges: JavaRDD[Edge[ED]]): JavaEdgeRDD[ED, VD] = { - new JavaEdgeRDD(edges) - } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala index b067fc414335..1692ed662b6c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala @@ -16,12 +16,13 @@ */ package org.apache.spark.graphx.api.java +import java.lang.{Long => JLong} import javax.swing.JList -import java.lang.{Long => JLong} import org.apache.spark.api.java.JavaRDDLike import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.EdgePartition +import org.apache.spark.rdd.RDD import scala.reflect.ClassTag @@ -29,9 +30,11 @@ trait JavaEdgeRDDLike [ED, VD, This <: JavaEdgeRDDLike[ED, VD, This, R], R <: JavaRDDLike[Edge[ED], R]] extends Serializable { + def wrapRDD(edgeRDD: RDD[Edge[ED]]): This + def edgeRDD: EdgeRDD[ED, VD] - def setName(name: String) = edgeRDD.setName(name) + def setName() = edgeRDD.setName("JavaEdgeRDD") def collect(): JList[Edge[ED]] = edgeRDD.collect().toList.asInstanceOf @@ -39,22 +42,22 @@ R <: JavaRDDLike[Edge[ED], R]] def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] = { - JavaEdgeRDD(edgeRDD.mapEdgePartitions(f)) + JavaEdgeRDD.fromEdgeRDD(edgeRDD.mapEdgePartitions(f)) } def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { - JavaEdgeRDD(edgeRDD.mapValues(f)) + JavaEdgeRDD.fromEdgeRDD[ED2, VD](edgeRDD.mapValues(f)) } - def filter - (epred: EdgeTriplet[VD, ED] => Boolean, - vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { - JavaEdgeRDD(edgeRDD.filter(epred, vpred)) - } - - def innerJoin[ED2: 
ClassTag, ED3: ClassTag] - (other: EdgeRDD[ED2, _]) - (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { - JavaEdgeRDD[ED3, VD](edgeRDD.innerJoin(other)(f)) - } +// def filter +// (epred: EdgeTriplet[VD, ED] => Boolean, +// vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { +// JavaEdgeRDD.fromEdgeRDD(edgeRDD.filter(epred, vpred)) +// } +// +// def innerJoin[ED2: ClassTag, ED3: ClassTag] +// (other: EdgeRDD[ED2, _]) +// (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { +// JavaEdgeRDD.fromEdgeRDD(edgeRDD.innerJoin(other)(f)) +// } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala index f603afbf643a..830018293560 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -30,7 +30,7 @@ class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] (vertexRDD : RDD[(VertexId, VD)], edgeRDD: RDD[Edge[ED]]) { def vertices: JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD) - def edges: JavaEdgeRDD[ED, VD] = JavaEdgeRDD(edgeRDD) + def edges: JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edgeRDD) @transient lazy val graph : Graph[VD, ED] = Graph(vertexRDD, edgeRDD) def partitionBy(partitionStrategy: PartitionStrategy, numPartitions: Int): JavaGraph[VD, ED] = { @@ -102,8 +102,14 @@ object JavaGraph { } implicit def apply[VD: ClassTag, ED: ClassTag] - (graph: Graph[VD, ED]): JavaGraph[VD, ED] = { + (graph: Graph[VD, ED]): JavaGraph[VD, ED] = { new JavaGraph[VD, ED](graph.vertices, graph.edges) } + + implicit def apply [VD: ClassTag, ED: ClassTag] + (vertices: JavaVertexRDD[VD], edges: JavaEdgeRDD[ED, VD]): JavaGraph[VD, ED] = { + + new JavaGraph(vertices.toRDD, edges.toRDD) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index 7e9241b63733..cadb93f925bb 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -25,7 +25,6 @@ import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.api.java.JavaEdgeRDD import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel private[graphx] class PythonEdgeRDD( @transient parent: RDD[_], @@ -35,8 +34,7 @@ private[graphx] class PythonEdgeRDD( preservePartitioning: Boolean, pythonExec: String, broadcastVars: JList[Broadcast[Array[Byte]]], - accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel : String = StorageLevel.MEMORY_ONLY) + accumulator: Accumulator[JList[Array[Byte]]]) extends PythonRDD (parent, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala index 3fd3b6710cbe..067ac206d512 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -28,8 +28,9 @@ private[graphx] class PythonGraph ( // extends Graph[Array[Byte], Array[Byte]] with Serializable { extends Serializable { - val asJavaGraph = JavaGraph.fromRDD(vertexRDD.asJavaRDD, edgeRDD.asJavaEdgeRDD) - + val 
vertices = vertexRDD.asJavaVertexRDD + val edges = edgeRDD.asJavaEdgeRDD + val asJavaGraph = JavaGraph(vertices, edges) } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 8615a8c1f617..64db1edd98c5 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -26,7 +26,6 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.VertexRDD import org.apache.spark.graphx.api.java.JavaVertexRDD import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel private[graphx] class PythonVertexRDD( @transient parent: RDD[_], @@ -36,8 +35,7 @@ private[graphx] class PythonVertexRDD( preservePartitioning: Boolean, pythonExec: String, broadcastVars: JList[Broadcast[Array[Byte]]], - accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel : String = StorageLevel.MEMORY_ONLY) + accumulator: Accumulator[JList[Array[Byte]]]) extends PythonRDD (parent, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { @@ -65,11 +63,11 @@ private[graphx] class PythonVertexRDD( val newIter = Seq(first).iterator ++ items // Assuming the type of this RDD will always be Array[Byte] newIter.asInstanceOf[Iterator[Array[Byte]]].foreach { bytes => - stream.writeInt(bytes.length) - stream.write(bytes) + stream.writeInt(bytes.length) + stream.write(bytes) + } } } - } object PythonVertexRDD { From e02a8ee087a178530f0cf60f11fcf7a9b1387448 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Tue, 16 Dec 2014 14:47:37 -0500 Subject: [PATCH 14/25] SPARK-3789: JavaEdgeRDDLike compiler errors. --- .../spark/graphx/api/java/JavaEdgeRDD.scala | 68 +++++++++++++++++-- .../graphx/api/java/JavaEdgeRDDLike.scala | 38 ++--------- .../spark/graphx/api/java/JavaGraph.scala | 13 +--- .../spark/graphx/api/java/JavaVertexRDD.scala | 2 + .../graphx/api/python/PythonEdgeRDD.scala | 7 +- .../spark/graphx/api/python/PythonGraph.scala | 4 +- .../org/apache/spark/graphx/JavaAPISuite.java | 13 ++-- python/pyspark/graphx/graph.py | 9 +++ 8 files changed, 95 insertions(+), 59 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index 0e4a56f02134..755094fd606f 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -16,21 +16,40 @@ */ package org.apache.spark.graphx.api.java +import java.lang.{Long => JLong} + import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ +import org.apache.spark.graphx.impl.EdgePartition import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel -import scala.language.implicitConversions +import scala.language.implicitConversions import scala.reflect.ClassTag +/** + * EdgeRDD['ED', 'VD'] is a column-oriented edge partition RDD created from RDD[Edge[ED]]. 
+ * JavaEdgeRDD class provides a Java API to access implementations of the EdgeRDD class + * + * @param edges + * @tparam ED + * @tparam VD + */ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] - (edges: RDD[Edge[ED]]) - extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], JavaRDD[Edge[ED]]] { + (edges: EdgeRDD[ED, VD]) + extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], Edge[ED]] { + + /** + * Java Wrapper for RDD of Edges + * + * @param edgeRDD + * @return + */ + def wrapRDD(edgeRDD: EdgeRDD[ED, VD]): JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edgeRDD) - override def wrapRDD(edgeRDD: RDD[Edge[ED]]): JavaRDD[Edge[ED]] = JavaRDD.fromRDD(edgeRDD) + def edgeRDD = edges - override def edgeRDD: EdgeRDD[ED, VD] = EdgeRDD.fromEdges(edges) + def count(): Long = edgeRDD.count() /** Persist RDDs of this EdgeRDD with the default storage level (MEMORY_ONLY_SER) */ def cache(): JavaEdgeRDD[ED, VD] = edges.cache().asInstanceOf[JavaEdgeRDD[ED, VD]] @@ -43,13 +62,48 @@ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] edges.persist(storageLevel).asInstanceOf[JavaEdgeRDD[ED, VD]] def unpersist(blocking: Boolean = true) : JavaEdgeRDD[ED, VD] = - JavaEdgeRDD.fromEdgeRDD(edgeRDD.unpersist(blocking)) + JavaEdgeRDD(edgeRDD.unpersist(blocking)) + + def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { + JavaEdgeRDD[ED2, VD](edgeRDD.mapValues(f)) + } + + def reverse: JavaEdgeRDD[ED, VD] = edges.reverse.asInstanceOf[JavaEdgeRDD[ED, VD]] + + def filter + (epred: EdgeTriplet[VD, ED] => Boolean, + vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { + JavaEdgeRDD(edgeRDD.filter(epred, vpred)) + } + + def innerJoin[ED2: ClassTag, ED3: ClassTag] + (other: EdgeRDD[ED2, _]) + (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { + JavaEdgeRDD(edgeRDD.innerJoin(other)(f)) + } + + def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] + (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] = { + edges.mapEdgePartitions(f).asInstanceOf[JavaEdgeRDD[ED2, VD2]] + } } object JavaEdgeRDD { - implicit def fromEdgeRDD[ED: ClassTag, VD: ClassTag] + implicit def apply[ED: ClassTag, VD: ClassTag] (edges: EdgeRDD[ED, VD]): JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edges) + + implicit def apply[ED: ClassTag, VD: ClassTag](edges: JavaRDD[Edge[ED]]) : JavaEdgeRDD[ED, VD] = { + new JavaEdgeRDD[ED, VD](EdgeRDD.fromEdges(edges.rdd)) + } + + def toEdgeRDD[ED: ClassTag, VD: ClassTag](edges: JavaEdgeRDD[ED, VD]): EdgeRDD[ED, VD] = { + edges.edgeRDD + } + + def fromRDDOfEdges[ED: ClassTag, VD: ClassTag](edges: RDD[Edge[ED]]) : JavaEdgeRDD[ED, VD] = { + new JavaEdgeRDD[ED, VD](EdgeRDD.fromEdges(edges)) + } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala index 1692ed662b6c..c7eb8d5c6bcd 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala @@ -17,47 +17,19 @@ package org.apache.spark.graphx.api.java import java.lang.{Long => JLong} -import javax.swing.JList import org.apache.spark.api.java.JavaRDDLike import org.apache.spark.graphx._ -import org.apache.spark.graphx.impl.EdgePartition -import org.apache.spark.rdd.RDD - -import scala.reflect.ClassTag trait JavaEdgeRDDLike [ED, VD, This <: JavaEdgeRDDLike[ED, VD, This, R], R <: JavaRDDLike[Edge[ED], R]] extends Serializable { - def wrapRDD(edgeRDD: RDD[Edge[ED]]): This - - def edgeRDD: EdgeRDD[ED, 
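filter(epred, vpred) above follows GraphX's contract: an edge survives only if both endpoints satisfy the vertex predicate and the assembled triplet satisfies the edge predicate. Outside of Spark that amounts to the following (toy sketch, illustrative names):

    def filter_edges(edges, vertex_attrs, epred, vpred):
        # edges: iterable of (src, dst, attr); vertex_attrs: dict of id -> attr.
        for src, dst, attr in edges:
            triplet = (src, vertex_attrs[src], dst, vertex_attrs[dst], attr)
            if vpred(src, vertex_attrs[src]) and vpred(dst, vertex_attrs[dst]) \
                    and epred(triplet):
                yield src, dst, attr

    verts = {1: 5, 2: 0, 3: 7}
    edges = [(1, 2, "a"), (1, 3, "b")]
    kept = filter_edges(edges, verts, lambda t: True, lambda vid, attr: attr > 0)
    assert list(kept) == [(1, 3, "b")]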
VD] - - def setName() = edgeRDD.setName("JavaEdgeRDD") - - def collect(): JList[Edge[ED]] = edgeRDD.collect().toList.asInstanceOf +// def wrapRDD(edgeRDD: RDD[Edge[ED]]): This - def count(): Long = edgeRDD.count() - - def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] - (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] = { - JavaEdgeRDD.fromEdgeRDD(edgeRDD.mapEdgePartitions(f)) - } - - def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { - JavaEdgeRDD.fromEdgeRDD[ED2, VD](edgeRDD.mapValues(f)) - } - -// def filter -// (epred: EdgeTriplet[VD, ED] => Boolean, -// vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { -// JavaEdgeRDD.fromEdgeRDD(edgeRDD.filter(epred, vpred)) -// } +// def setName() = toRDDEdges.setName("JavaEdgeRDD") +// +// def collect(): JList[Edge[ED]] = toRDDEdges.collect().toList.asInstanceOf // -// def innerJoin[ED2: ClassTag, ED3: ClassTag] -// (other: EdgeRDD[ED2, _]) -// (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { -// JavaEdgeRDD.fromEdgeRDD(edgeRDD.innerJoin(other)(f)) -// } +// def count(): Long = toRDDEdges.count() } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala index 830018293560..e9aab331c318 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -18,7 +18,6 @@ package org.apache.spark.graphx.api.java import java.lang.{Double => JDouble, Long => JLong} -import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ import org.apache.spark.graphx.lib.PageRank import org.apache.spark.rdd.RDD @@ -27,7 +26,7 @@ import scala.language.implicitConversions import scala.reflect.ClassTag class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] - (vertexRDD : RDD[(VertexId, VD)], edgeRDD: RDD[Edge[ED]]) { + (vertexRDD : VertexRDD[VD], edgeRDD: EdgeRDD[ED, VD]) { def vertices: JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD) def edges: JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edgeRDD) @@ -91,14 +90,9 @@ class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] object JavaGraph { - implicit def fromRDD[VD: ClassTag, ED: ClassTag] - (javaVertexRDD: JavaRDD[(VertexId, VD)], javaEdgeRDD: JavaRDD[Edge[ED]]): JavaGraph[VD, ED] = { - new JavaGraph[VD, ED](javaVertexRDD, javaEdgeRDD) - } - implicit def apply[VD: ClassTag, ED: ClassTag] (vertexRDD: RDD[(VertexId, VD)], edgeRDD: RDD[Edge[ED]]): JavaGraph[VD, ED] = { - new JavaGraph[VD, ED](vertexRDD, edgeRDD) + new JavaGraph[VD, ED](VertexRDD(vertexRDD), EdgeRDD.fromEdges(edgeRDD)) } implicit def apply[VD: ClassTag, ED: ClassTag] @@ -108,8 +102,7 @@ object JavaGraph { implicit def apply [VD: ClassTag, ED: ClassTag] (vertices: JavaVertexRDD[VD], edges: JavaEdgeRDD[ED, VD]): JavaGraph[VD, ED] = { - - new JavaGraph(vertices.toRDD, edges.toRDD) + new JavaGraph(VertexRDD(vertices.toRDD), edges.edgeRDD) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index 5ca3cf0011fd..151df01dfe7a 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -68,6 +68,8 @@ class JavaVertexRDD[VD]( /** Return a new VertexRDD containing only the elements that satisfy a predicate. 
*/ def filter(f: JFunction[(VertexId, VD), Boolean]): JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD.filter(x => f.call(x).booleanValue())) + + def toRDD : RDD[(VertexId, VD)] = vertices } object JavaVertexRDD { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index cadb93f925bb..c12a3f60cd1a 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -23,9 +23,12 @@ import java.util.{List => JList, Map => JMap} import org.apache.spark.Accumulator import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast +import org.apache.spark.graphx.EdgeRDD import org.apache.spark.graphx.api.java.JavaEdgeRDD import org.apache.spark.rdd.RDD +import scala.reflect.ClassTag + private[graphx] class PythonEdgeRDD( @transient parent: RDD[_], command: Array[Byte], @@ -39,7 +42,9 @@ private[graphx] class PythonEdgeRDD( pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { - val asJavaEdgeRDD = JavaEdgeRDD.fromEdgeRDD(parent.asInstanceOf) + def asJavaEdgeRDD[ED, VD: ClassTag]() : JavaEdgeRDD[ED, VD] = { + JavaEdgeRDD.fromRDDOfEdges[ED, VD](EdgeRDD.fromEdges[ED, VD](parent.map(x => x.asInstanceOf))) + } def writeToFile[T](items: java.util.Iterator[T], filename: String) { import scala.collection.JavaConverters._ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala index 067ac206d512..23a5ed5f93c8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -18,8 +18,6 @@ package org.apache.spark.graphx.api.python import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.graphx.api.java.{JavaEdgeRDD, JavaVertexRDD, JavaGraph} @DeveloperApi private[graphx] class PythonGraph ( @@ -30,7 +28,7 @@ private[graphx] class PythonGraph ( val vertices = vertexRDD.asJavaVertexRDD val edges = edgeRDD.asJavaEdgeRDD - val asJavaGraph = JavaGraph(vertices, edges) + val asJavaGraph = (vertices, edges) } diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java index 715ea9b4e980..a47476454bc8 100644 --- a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -29,6 +29,8 @@ import org.junit.After; import org.junit.Before; import org.junit.Test; +import scala.Array; +import scala.Function1; import scala.Tuple2; import scala.reflect.ClassTag; import scala.reflect.ClassTag$; @@ -50,9 +52,9 @@ public void initialize() { this.ssc = new JavaSparkContext("local", "GraphX JavaAPISuite"); this.myList = new ArrayList>>(); - this.myList.add(new Tuple2(1L, new VertexProperty("abc", "ABC"))); - this.myList.add(new Tuple2(2L, new VertexProperty("def", "DEF"))); - this.myList.add(new Tuple2(3L, new VertexProperty("xyz", "XYZ"))); + this.myList.add(new Tuple2(1L, new VertexProperty("001", "kushal"))); + this.myList.add(new Tuple2(2L, new VertexProperty("002", "xia"))); + this.myList.add(new Tuple2(3L, new VertexProperty("003", "briton"))); this.classTag = ClassTag$.MODULE$.apply(VertexProperty.class); } @@ -91,7 +93,7 @@ public 
void testVertexRDDCount() { } @Test - public void testEdgeRDDMapValues() { + public void testEdgeRDDCount() { List> edgeList = new ArrayList>(); edgeList.add(new Edge(0, 1, "abcd")); @@ -103,7 +105,8 @@ public void testEdgeRDDMapValues() { ClassTag classTag = ClassTag$.MODULE$.apply(String.class); - JavaEdgeRDD javaEdgeRDD = JavaEdgeRDD.apply(javaRDD, classTag); + JavaEdgeRDD javaEdgeRDD = + JavaEdgeRDD.apply(javaRDD, classTag, classTag); assertEquals(javaEdgeRDD.count(), 4L); } diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index 4c6e38e9f5cd..76fefc4786b0 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -164,4 +164,13 @@ def triangleCount(self): def stronglyConnectedComponents(self, iterations): return + def vertexProgram(self): + return + + def initialMessage(self): + return + + def sendMessage(self): + return + From 49e3845a344738076acb625303477bf80942ca3d Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Thu, 18 Dec 2014 11:48:00 -0500 Subject: [PATCH 15/25] SPARK-3789: Updated JavaEdgeRDD class according to EdgeRDD and EdgeRDDImpl class hierarchy. Fixed compile issues. --- .../spark/graphx/api/java/JavaEdgeRDD.scala | 80 +++++++++++-------- .../graphx/api/java/JavaEdgeRDDLike.scala | 41 ++++++++-- .../spark/graphx/api/java/JavaGraph.scala | 18 +++-- .../graphx/api/java/JavaVertexRDDLike.scala | 8 +- .../graphx/api/python/PythonEdgeRDD.scala | 14 ++-- .../spark/graphx/api/python/PythonGraph.scala | 6 +- .../graphx/api/python/PythonVertexRDD.scala | 15 +--- 7 files changed, 107 insertions(+), 75 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index 755094fd606f..4ee4bd032885 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -20,7 +20,7 @@ import java.lang.{Long => JLong} import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ -import org.apache.spark.graphx.impl.EdgePartition +import org.apache.spark.graphx.impl.{EdgePartition, EdgeRDDImpl} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -31,13 +31,21 @@ import scala.reflect.ClassTag * EdgeRDD['ED', 'VD'] is a column-oriented edge partition RDD created from RDD[Edge[ED]]. 
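The vertexProgram, initialMessage and sendMessage stubs added to graph.py above are the three hooks a Pregel-style loop needs. Ignoring partitioning and fault tolerance, one way to picture the superstep loop in plain Python (single-source shortest paths is used purely as an example; none of these names come from the patch):

    def toy_pregel(vertices, edges, initial_msg, vprog, send_msg, combine, max_iters=20):
        # vertices: dict id -> attr, edges: list of (src, dst, edge_attr).
        state = dict((v, vprog(v, attr, initial_msg)) for v, attr in vertices.items())
        for _ in range(max_iters):
            inbox = {}
            for src, dst, e_attr in edges:
                for target, msg in send_msg(src, state[src], dst, state[dst], e_attr):
                    inbox[target] = combine(inbox[target], msg) if target in inbox else msg
            if not inbox:
                break                  # no messages sent: the computation has converged
            for v, msg in inbox.items():
                state[v] = vprog(v, state[v], msg)
        return state

    INF = float("inf")
    dist = toy_pregel(
        {1: 0.0, 2: INF, 3: INF},
        [(1, 2, 1.0), (2, 3, 2.0), (1, 3, 5.0)],
        initial_msg=INF,
        vprog=lambda vid, attr, msg: min(attr, msg),
        send_msg=lambda s, sa, d, da, w: [(d, sa + w)] if sa + w < da else [],
        combine=min)
    assert dist == {1: 0.0, 2: 1.0, 3: 3.0}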
* JavaEdgeRDD class provides a Java API to access implementations of the EdgeRDD class * - * @param edges + * @param partitionsRDD + * @param targetStorageLevel * @tparam ED * @tparam VD */ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] - (edges: EdgeRDD[ED, VD]) - extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], Edge[ED]] { + (val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], + val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) + extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], + JavaRDD[(PartitionID, EdgePartition[ED, VD])]] { + + /* Convert RDD[(PartitionID, EdgePartition[ED, VD])] to EdgeRDD[ED, VD] */ + override def edgeRDD: EdgeRDDImpl[ED, VD] = { + new EdgeRDDImpl(partitionsRDD, targetStorageLevel) + } /** * Java Wrapper for RDD of Edges @@ -45,65 +53,69 @@ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] * @param edgeRDD * @return */ - def wrapRDD(edgeRDD: EdgeRDD[ED, VD]): JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edgeRDD) - - def edgeRDD = edges - - def count(): Long = edgeRDD.count() - - /** Persist RDDs of this EdgeRDD with the default storage level (MEMORY_ONLY_SER) */ - def cache(): JavaEdgeRDD[ED, VD] = edges.cache().asInstanceOf[JavaEdgeRDD[ED, VD]] + override def wrapRDD(edgeRDD: RDD[(PartitionID, EdgePartition[ED, VD])]) : + JavaRDD[(PartitionID, EdgePartition[ED, VD])] = { + JavaRDD.fromRDD(edgeRDD) + } - /** Persist RDDs of this EdgeRDD with the default storage level (MEMORY_ONLY_SER) */ - def persist(): JavaEdgeRDD[ED, VD] = edges.persist().asInstanceOf[JavaEdgeRDD[ED, VD]] + /** Persist RDDs of this JavaEdgeRDD with the default storage level (MEMORY_ONLY_SER) */ + def cache(): this.type = { + partitionsRDD.persist(StorageLevel.MEMORY_ONLY) + this + } - /** Persist the RDDs of this DStream with the given storage level */ - def persist(storageLevel: StorageLevel): JavaEdgeRDD[ED, VD] = - edges.persist(storageLevel).asInstanceOf[JavaEdgeRDD[ED, VD]] + /** Persist the RDDs of this JavaEdgeRDD with the given storage level */ + def persist(newLevel: StorageLevel): this.type = { + partitionsRDD.persist(newLevel) + this + } - def unpersist(blocking: Boolean = true) : JavaEdgeRDD[ED, VD] = - JavaEdgeRDD(edgeRDD.unpersist(blocking)) + def unpersist(blocking: Boolean = true) : this.type = { + edgeRDD.unpersist(blocking) + this + } - def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { - JavaEdgeRDD[ED2, VD](edgeRDD.mapValues(f)) + override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { + edgeRDD.mapValues(f) } - def reverse: JavaEdgeRDD[ED, VD] = edges.reverse.asInstanceOf[JavaEdgeRDD[ED, VD]] + override def reverse: JavaEdgeRDD[ED, VD] = edgeRDD.reverse - def filter + override def filter (epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { - JavaEdgeRDD(edgeRDD.filter(epred, vpred)) + edgeRDD.filter(epred, vpred) } - def innerJoin[ED2: ClassTag, ED3: ClassTag] - (other: EdgeRDD[ED2, _]) + override def innerJoin[ED2: ClassTag, ED3: ClassTag] + (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { - JavaEdgeRDD(edgeRDD.innerJoin(other)(f)) + edgeRDD.innerJoin(other)(f) } - def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] + override def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] = { - edges.mapEdgePartitions(f).asInstanceOf[JavaEdgeRDD[ED2, VD2]] + edgeRDD.mapEdgePartitions(f) } } object JavaEdgeRDD { implicit def apply[ED: 
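partitionsRDD above keys every EdgePartition by its PartitionID, and inside a partition GraphX keeps the edges column-wise (parallel arrays of source ids, destination ids and attributes) rather than as an array of edge objects, which is what lets mapValues rebuild only the attribute column. A rough picture of that layout (purely illustrative, not the actual EdgePartition):

    class ToyEdgePartition(object):
        """Columnar edge storage: three parallel lists instead of edge objects."""

        def __init__(self, edges):
            self.src_ids = [s for s, _, _ in edges]
            self.dst_ids = [d for _, d, _ in edges]
            self.attrs = [a for _, _, a in edges]

        def map_values(self, f):
            # Only the attribute column changes; the structure columns are reused.
            mapped = ToyEdgePartition([])
            mapped.src_ids, mapped.dst_ids = self.src_ids, self.dst_ids
            mapped.attrs = [f(a) for a in self.attrs]
            return mapped

        def iter_edges(self):
            return zip(self.src_ids, self.dst_ids, self.attrs)

    part = ToyEdgePartition([(1, 2, "a"), (2, 3, "b")])
    assert list(part.map_values(str.upper).iter_edges()) == [(1, 2, "A"), (2, 3, "B")]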
ClassTag, VD: ClassTag] - (edges: EdgeRDD[ED, VD]): JavaEdgeRDD[ED, VD] = - new JavaEdgeRDD(edges) + (edges: EdgeRDDImpl[ED, VD]): JavaEdgeRDD[ED, VD] = { + new JavaEdgeRDD(edges.partitionsRDD) + } implicit def apply[ED: ClassTag, VD: ClassTag](edges: JavaRDD[Edge[ED]]) : JavaEdgeRDD[ED, VD] = { - new JavaEdgeRDD[ED, VD](EdgeRDD.fromEdges(edges.rdd)) + JavaEdgeRDD(EdgeRDD.fromEdges[ED, VD](edges.rdd)) } - def toEdgeRDD[ED: ClassTag, VD: ClassTag](edges: JavaEdgeRDD[ED, VD]): EdgeRDD[ED, VD] = { + def toEdgeRDD[ED: ClassTag, VD: ClassTag](edges: JavaEdgeRDD[ED, VD]): EdgeRDDImpl[ED, VD] = { edges.edgeRDD } def fromRDDOfEdges[ED: ClassTag, VD: ClassTag](edges: RDD[Edge[ED]]) : JavaEdgeRDD[ED, VD] = { - new JavaEdgeRDD[ED, VD](EdgeRDD.fromEdges(edges)) + JavaEdgeRDD[ED, VD](EdgeRDD.fromEdges[ED, VD](edges)) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala index c7eb8d5c6bcd..5e4bb21c0a22 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala @@ -17,19 +17,46 @@ package org.apache.spark.graphx.api.java import java.lang.{Long => JLong} +import java.util.{List => JList} import org.apache.spark.api.java.JavaRDDLike import org.apache.spark.graphx._ +import org.apache.spark.graphx.impl.{EdgePartition, EdgeRDDImpl} +import org.apache.spark.rdd.RDD +import org.apache.spark.{Partition, TaskContext} + +import scala.reflect.ClassTag trait JavaEdgeRDDLike [ED, VD, This <: JavaEdgeRDDLike[ED, VD, This, R], -R <: JavaRDDLike[Edge[ED], R]] +R <: JavaRDDLike[(PartitionID, EdgePartition[ED, VD]), R]] extends Serializable { -// def wrapRDD(edgeRDD: RDD[Edge[ED]]): This + def edgeRDD: EdgeRDDImpl[ED, VD] + + def wrapRDD(edgeRDD: RDD[(PartitionID, EdgePartition[ED, VD])]) : R + + def setName() = edgeRDD.setName("JavaEdgeRDD") + + def collect(): Array[Edge[ED]] = edgeRDD.map(_.copy()).collect().asInstanceOf[Array[Edge[ED]]] + + def count() : JLong = edgeRDD.count() + + def compute(part: Partition, context: TaskContext): Iterator[Edge[ED]] = { + edgeRDD.compute(part, context) + } + + def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] + + def reverse: JavaEdgeRDD[ED, VD] + + def filter + (epred: EdgeTriplet[VD, ED] => Boolean, + vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] + + def innerJoin[ED2: ClassTag, ED3: ClassTag] + (other: EdgeRDD[ED2]) + (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] -// def setName() = toRDDEdges.setName("JavaEdgeRDD") -// -// def collect(): JList[Edge[ED]] = toRDDEdges.collect().toList.asInstanceOf -// -// def count(): Long = toRDDEdges.count() + def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] + (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala index e9aab331c318..4190083f10ea 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -19,6 +19,7 @@ package org.apache.spark.graphx.api.java import java.lang.{Double => JDouble, Long => JLong} import org.apache.spark.graphx._ +import org.apache.spark.graphx.impl.EdgeRDDImpl import org.apache.spark.graphx.lib.PageRank import org.apache.spark.rdd.RDD @@ 
-26,10 +27,10 @@ import scala.language.implicitConversions import scala.reflect.ClassTag class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] - (vertexRDD : VertexRDD[VD], edgeRDD: EdgeRDD[ED, VD]) { + (vertexRDD : VertexRDD[VD], edgeRDD: EdgeRDDImpl[ED, VD]) { def vertices: JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD) - def edges: JavaEdgeRDD[ED, VD] = new JavaEdgeRDD(edgeRDD) + def edges: JavaEdgeRDD[ED, VD] = JavaEdgeRDD(edgeRDD) @transient lazy val graph : Graph[VD, ED] = Graph(vertexRDD, edgeRDD) def partitionBy(partitionStrategy: PartitionStrategy, numPartitions: Int): JavaGraph[VD, ED] = { @@ -71,14 +72,21 @@ class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] JavaGraph(graph.groupEdges(merge)) } + @deprecated("use aggregateMessages", "1.2.0") def mapReduceTriplets[A: ClassTag]( mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], reduceFunc: (A, A) => A, - activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) - : JavaVertexRDD[A] = { + activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) : JavaVertexRDD[A] = { JavaVertexRDD(graph.mapReduceTriplets(mapFunc, reduceFunc, activeSetOpt)) } + def aggregateMessages[A: ClassTag]( + sendMsg: EdgeContext[VD, ED, A] => Unit, + mergeMsg: (A, A) => A, + tripletFields: TripletFields = TripletFields.All) : JavaVertexRDD[A] = { + JavaVertexRDD(graph.aggregateMessages(sendMsg, mergeMsg, tripletFields)) + } + def outerJoinVertices[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) (mapFunc: (VertexId, VD, Option[U]) => VD2) : JavaGraph[VD2, ED] = { JavaGraph(graph.outerJoinVertices(other)(mapFunc)) @@ -97,7 +105,7 @@ object JavaGraph { implicit def apply[VD: ClassTag, ED: ClassTag] (graph: Graph[VD, ED]): JavaGraph[VD, ED] = { - new JavaGraph[VD, ED](graph.vertices, graph.edges) + new JavaGraph[VD, ED](graph.vertices, EdgeRDD.fromEdges(graph.edges)) } implicit def apply [VD: ClassTag, ED: ClassTag] diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala index 389b71f74d5c..7a79fc90b64f 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala @@ -23,7 +23,7 @@ import java.util.{List => JList} import org.apache.spark.api.java.JavaRDDLike import org.apache.spark.api.java.function.{Function => JFunction, Function2 => JFunction2, Function3 => JFunction3} import org.apache.spark.graphx._ -import org.apache.spark.graphx.impl.ShippableVertexPartition +import org.apache.spark.graphx.impl.{EdgeRDDImpl, ShippableVertexPartition} import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, TaskContext} @@ -117,8 +117,8 @@ trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], JavaVertexRDD(vertexRDD.aggregateUsingIndex(messages, reduceFunc)) } - def fromEdges[VD: ClassTag] - (edges: EdgeRDD[_, _], numPartitions: Int, defaultVal: VD): JavaVertexRDD[VD] = { - JavaVertexRDD(VertexRDD.fromEdges(edges, numPartitions, defaultVal)) + def fromEdges[ED: ClassTag, VD: ClassTag] + (edges: EdgeRDDImpl[ED, VD], numPartitions: Int, defaultVal: VD): JavaVertexRDD[VD] = { + JavaVertexRDD(VertexRDD.fromEdges[VD](EdgeRDD.fromEdges[ED, VD](edges), numPartitions, defaultVal)) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index 
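aggregateMessages above is the Spark 1.2 replacement for mapReduceTriplets: sendMsg emits zero or more messages along each triplet and mergeMsg reduces everything arriving at the same vertex. Stripped of RDDs and TripletFields, the send/merge contract is just this (toy in-degree count, illustrative names):

    def aggregate_messages(edges, send_msg, merge_msg):
        # edges: iterable of (src, dst, attr) triplets.
        # Returns a dict of vertex id -> aggregated message.
        acc = {}
        for src, dst, attr in edges:
            for target, msg in send_msg(src, dst, attr):
                acc[target] = merge_msg(acc[target], msg) if target in acc else msg
        return acc

    edges = [(1, 2, "a"), (3, 2, "b"), (2, 1, "c")]
    in_degrees = aggregate_messages(
        edges,
        send_msg=lambda src, dst, attr: [(dst, 1)],   # one message to each destination
        merge_msg=lambda a, b: a + b)
    assert sorted(in_degrees.items()) == [(1, 1), (2, 2)]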
c12a3f60cd1a..13d426e82897 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -21,14 +21,10 @@ import java.io.{DataOutputStream, FileOutputStream} import java.util.{List => JList, Map => JMap} import org.apache.spark.Accumulator -import org.apache.spark.api.python.PythonRDD +import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.graphx.EdgeRDD -import org.apache.spark.graphx.api.java.JavaEdgeRDD import org.apache.spark.rdd.RDD -import scala.reflect.ClassTag - private[graphx] class PythonEdgeRDD( @transient parent: RDD[_], command: Array[Byte], @@ -36,15 +32,15 @@ private[graphx] class PythonEdgeRDD( pythonIncludes: JList[String], preservePartitioning: Boolean, pythonExec: String, - broadcastVars: JList[Broadcast[Array[Byte]]], + broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: Accumulator[JList[Array[Byte]]]) extends PythonRDD (parent, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { - def asJavaEdgeRDD[ED, VD: ClassTag]() : JavaEdgeRDD[ED, VD] = { - JavaEdgeRDD.fromRDDOfEdges[ED, VD](EdgeRDD.fromEdges[ED, VD](parent.map(x => x.asInstanceOf))) - } +// def asJavaEdgeRDD[ED, VD: ClassTag]() : JavaEdgeRDD[ED, VD] = { +// JavaEdgeRDD.fromRDDOfEdges[ED, VD](EdgeRDD.fromEdges[ED, VD](parent.map(x => x.asInstanceOf))) +// } def writeToFile[T](items: java.util.Iterator[T], filename: String) { import scala.collection.JavaConverters._ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala index 23a5ed5f93c8..5c26f83e16b8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -26,9 +26,9 @@ private[graphx] class PythonGraph ( // extends Graph[Array[Byte], Array[Byte]] with Serializable { extends Serializable { - val vertices = vertexRDD.asJavaVertexRDD - val edges = edgeRDD.asJavaEdgeRDD - val asJavaGraph = (vertices, edges) +// val vertices = vertexRDD.asJavaVertexRDD +// val edges = edgeRDD.asJavaEdgeRDD +// val asJavaGraph = (vertices, edges) } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 64db1edd98c5..07717c47dfe9 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -21,7 +21,7 @@ import java.io.{DataOutputStream, FileOutputStream} import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import org.apache.spark.Accumulator -import org.apache.spark.api.python.PythonRDD +import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.VertexRDD import org.apache.spark.graphx.api.java.JavaVertexRDD @@ -34,7 +34,7 @@ private[graphx] class PythonVertexRDD( pythonIncludes: JList[String], preservePartitioning: Boolean, pythonExec: String, - broadcastVars: JList[Broadcast[Array[Byte]]], + broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: Accumulator[JList[Array[Byte]]]) extends PythonRDD (parent, command, envVars, pythonIncludes, preservePartitioning, @@ -74,14 
+74,3 @@ object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 } -class VertexProperty(val schemaString: String) { - val schema : List[Any] = fromString(schemaString) - - /** - * The vertex property schema is - * @param schemaString - * @return - */ - def fromString(schemaString: String) : List[String] = - schemaString.split(" ").toList -} From db8cff0b5ee00bf2058e0876624ff623e4c2540d Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Mon, 22 Dec 2014 09:39:02 -0500 Subject: [PATCH 16/25] SPARK-3789: temp commit before merging master on 12/22 --- .../graphx/api/java/JavaVertexRDDLike.scala | 3 +- .../graphx/api/python/PythonEdgeRDD.scala | 5 +- .../spark/graphx/api/python/PythonGraph.scala | 6 +- .../graphx/api/python/PythonVertexRDD.scala | 26 ++- .../org/apache/spark/graphx/JavaAPISuite.java | 2 +- python/pyspark/__init__.py | 8 +- python/pyspark/graphx/__init__.py | 4 +- python/pyspark/graphx/edge.py | 6 +- python/pyspark/graphx/graph.py | 10 +- python/pyspark/graphx/vertex.py | 189 ++++++++---------- python/pyspark/rdd.py | 2 - 11 files changed, 125 insertions(+), 136 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala index 7a79fc90b64f..1e40bde22573 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala @@ -119,6 +119,7 @@ trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], def fromEdges[ED: ClassTag, VD: ClassTag] (edges: EdgeRDDImpl[ED, VD], numPartitions: Int, defaultVal: VD): JavaVertexRDD[VD] = { - JavaVertexRDD(VertexRDD.fromEdges[VD](EdgeRDD.fromEdges[ED, VD](edges), numPartitions, defaultVal)) + JavaVertexRDD(VertexRDD.fromEdges[VD] + (EdgeRDD.fromEdges[ED, VD](edges), numPartitions, defaultVal)) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index 13d426e82897..259e17ea00ee 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -21,6 +21,7 @@ import java.io.{DataOutputStream, FileOutputStream} import java.util.{List => JList, Map => JMap} import org.apache.spark.Accumulator +import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD @@ -38,9 +39,7 @@ private[graphx] class PythonEdgeRDD( pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { -// def asJavaEdgeRDD[ED, VD: ClassTag]() : JavaEdgeRDD[ED, VD] = { -// JavaEdgeRDD.fromRDDOfEdges[ED, VD](EdgeRDD.fromEdges[ED, VD](parent.map(x => x.asInstanceOf))) -// } + val asJavaEdgeRDD = JavaRDD.fromRDD(parent) def writeToFile[T](items: java.util.Iterator[T], filename: String) { import scala.collection.JavaConverters._ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala index 5c26f83e16b8..23a5ed5f93c8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonGraph.scala @@ -26,9 +26,9 @@ private[graphx] class PythonGraph ( // extends Graph[Array[Byte], 
Array[Byte]] with Serializable { extends Serializable { -// val vertices = vertexRDD.asJavaVertexRDD -// val edges = edgeRDD.asJavaEdgeRDD -// val asJavaGraph = (vertices, edges) + val vertices = vertexRDD.asJavaVertexRDD + val edges = edgeRDD.asJavaEdgeRDD + val asJavaGraph = (vertices, edges) } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 07717c47dfe9..b298c5fc786a 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -21,26 +21,27 @@ import java.io.{DataOutputStream, FileOutputStream} import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import org.apache.spark.Accumulator +import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.graphx.VertexRDD -import org.apache.spark.graphx.api.java.JavaVertexRDD import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel private[graphx] class PythonVertexRDD( - @transient parent: RDD[_], + @transient parent: JavaRDD[_], command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], preservePartitioning: Boolean, pythonExec: String, broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]]) - extends PythonRDD (parent, command, envVars, + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel : StorageLevel = StorageLevel.MEMORY_ONLY) + extends PythonRDD (parent.rdd, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { - val asJavaVertexRDD = JavaVertexRDD.fromVertexRDD(VertexRDD(parent.asInstanceOf)) + val asJavaVertexRDD = JavaRDD.fromRDD(parent) def writeToFile[T](items: java.util.Iterator[T], filename: String) { import scala.collection.JavaConverters._ @@ -72,5 +73,18 @@ private[graphx] class PythonVertexRDD( object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 + + implicit def apply(@transient parent: RDD[_], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitioning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel : StorageLevel = StorageLevel.MEMORY_ONLY) = { + new PythonVertexRDD(JavaRDD.fromRDD(parent), command, envVars, pythonIncludes, preservePartitioning, + pythonExec, broadcastVars, accumulator, targetStorageLevel) + } } diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java index a47476454bc8..18dbe6642271 100644 --- a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -108,6 +108,6 @@ public void testEdgeRDDCount() { JavaEdgeRDD javaEdgeRDD = JavaEdgeRDD.apply(javaRDD, classTag, classTag); - assertEquals(javaEdgeRDD.count(), 4L); + assertEquals(javaEdgeRDD.count().longValue(), 4L); } } diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 095ba35b7a09..07c88475b109 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -46,8 +46,12 @@ from pyspark.broadcast import Broadcast from pyspark.serializers import MarshalSerializer, PickleSerializer 
+from pyspark.graphx.vertex import VertexRDD +from pyspark.graphx.edge import EdgeRDD, Edge +from pyspark.graphx.graph import Graph + __all__ = [ "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", - "Vertex", "Edge", "Graph", -] + "VertexRDD", "EdgeRDD", "Edge", "Graph"] + diff --git a/python/pyspark/graphx/__init__.py b/python/pyspark/graphx/__init__.py index 7e025a50d3c7..b196a4b9777a 100644 --- a/python/pyspark/graphx/__init__.py +++ b/python/pyspark/graphx/__init__.py @@ -19,8 +19,8 @@ Python bindings for GraphX. """ -from pyspark.graphx.vertex import VertexRDD, Vertex, VertexId +from pyspark.graphx.vertex import VertexRDD, VertexId from pyspark.graphx.edge import Edge, EdgeRDD from pyspark.graphx.graph import Graph -__all__ = ["PartitioningStrategy", "VertexRDD", "EdgeRDD", "Graph", "Vertex", "Edge"] +__all__ = ["PartitioningStrategy", "VertexRDD", "EdgeRDD", "Graph", "Edge"] diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py index 5222429f066d..bebc9f500921 100644 --- a/python/pyspark/graphx/edge.py +++ b/python/pyspark/graphx/edge.py @@ -18,14 +18,12 @@ """ Python bindings for GraphX. """ -import operator +import os import itertools from tempfile import NamedTemporaryFile -from build.py4j.java_collections import MapConverter, ListConverter +# from build.py4j.java_collections import MapConverter, ListConverter from pyspark.accumulators import PStatsParam -from pyspark.graphx.partitionstrategy import PartitionStrategy from pyspark import RDD, StorageLevel -from pyspark.rdd import PipelinedRDD from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer, \ NoOpSerializer from pyspark.traceback_utils import SCCallSiteSync diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index 76fefc4786b0..2d9bdbef56dc 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -164,13 +164,5 @@ def triangleCount(self): def stronglyConnectedComponents(self, iterations): return - def vertexProgram(self): + def pregel(self, initial_message, vertex_program, send_message, combine_message): return - - def initialMessage(self): - return - - def sendMessage(self): - return - - diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index d44d55409dbe..866fd03d6db3 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -21,20 +21,19 @@ import itertools import os -from py4j.java_collections import MapConverter, ListConverter from tempfile import NamedTemporaryFile -from types import TupleType, IntType -import operator + from numpy.numarray.numerictypes import Long + +from py4j.java_collections import MapConverter, ListConverter from pyspark.accumulators import PStatsParam -from pyspark.rdd import PipelinedRDD -from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, BatchedSerializer -from pyspark import RDD, PickleSerializer, StorageLevel -from pyspark.graphx.partitionstrategy import PartitionStrategy -from pyspark.sql import StringType, LongType +from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, AutoBatchedSerializer, \ + BatchedSerializer +from pyspark import RDD, PickleSerializer, StorageLevel, SparkContext from pyspark.traceback_utils import SCCallSiteSync -__all__ = ["VertexRDD", "VertexId", "Vertex"] + +__all__ = ["VertexRDD", "VertexId"] """ @@ -46,30 +45,6 @@ VertexId = Long -class Vertex(object): - """ - 
Vertex class is a tuple of (VertexId and VertexProperty) - """ - def __init__(self, vertex_id, vertex_property): - self._id = VertexId(vertex_id) - self._property = vertex_property - - @property - def property(self): - return self._property - - def asTuple(self): - return (self._id, self._property) - - def __str__(self): - return self._id + self._property - - -class VertexPropertySchema(object): - def __init__(self, tuple): - self.schema = list(tuple) - - class VertexRDD(object): """ VertexRDD class defines vertex operations/transformation and vertex properties @@ -79,8 +54,7 @@ class VertexRDD(object): in PythonVertexRDD class in [[org.apache.spark.graphx.api.python package]] """ - def __init__(self, jrdd, - jrdd_deserializer = BatchedSerializer(PickleSerializer())): + def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())): """ Constructor :param jrdd: A JavaRDD reference passed from the parent @@ -91,31 +65,25 @@ def __init__(self, jrdd, """ - self._jrdd = jrdd - self._ctx = jrdd._jrdd.context - self._jrdd_deserializer = jrdd_deserializer - self._preserve_partitioning = False - self._name = "VertexRDD" - self._is_cached = False - self._is_checkpointed = False - self._id = jrdd.id() - self._partitionFunc = None - self._jrdd_val = None - self._bypass_serializer = False + self.name = "VertexRDD" + # self.jvertex_rdd = jrdd + self.is_cached = False + self.is_checkpointed = False + self.ctx = SparkContext._active_spark_context + self.jvertex_rdd_deserializer = jrdd_deserializer + self.id = jrdd.id() + self.partitionFunc = None + self.bypass_serializer = False + self.preserve_partitioning = False + self.jvertex_rdd = self.getJavaVertexRDD(jrdd, jrdd_deserializer) - def id(self): - """ - VertexRDD has a unique id - """ - return self._id - - # TODO: Does not work def __repr__(self): - return self._jrdd.toString() + return self.jvertex_rdd.toString() + @property def context(self): - return self._ctx + return self.ctx def cache(self): """ @@ -126,49 +94,28 @@ def cache(self): return self def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): - self._is_cached = True - javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) - self._jrdd.persist(javaStorageLevel) + self.is_cached = True + java_storage_level = self.context._getJavaStorageLevel(storageLevel) + self.jvertex_rdd.persist(java_storage_level) return self def unpersist(self): - self._is_cached = False - self._jrdd.unpersist() + self.is_cached = False + self.jvertex_rdd.unpersist() return self def checkpoint(self): self.is_checkpointed = True - self._jrdd.rdd().checkpoint() + self.jvertex_rdd.rdd().checkpoint() def count(self): - return self._jrdd.count() - - def collect(self): - """ - Return all of the elements in this vertex RDD as a list - """ - with SCCallSiteSync(self._ctx) as css: - bytesInJava = self._jrdd.collect().iterator() - return list(self._collect_iterator_through_file(bytesInJava)) - - def _collect_iterator_through_file(self, iterator): - # Transferring lots of data through Py4J can be slow because - # socket.readline() is inefficient. Instead, we'll dump the data to a - # file and read it back. 
- tempFile = NamedTemporaryFile(delete=False, dir=self._ctx._temp_dir) - tempFile.close() - self._ctx._writeToFile(iterator, tempFile.name) - # Read the data into Python and deserialize it: - with open(tempFile.name, 'rb') as tempFile: - for item in self._jrdd_deserializer.load_stream(tempFile): - yield item - os.unlink(tempFile.name) + return self.jvertex_rdd.count() def take(self, num=10): - return self._jrdd.take(num) + return self.jvertex_rdd.take(num) def sum(self): - self._jrdd.sum() + self.jvertex_rdd.sum() def mapValues(self, f, preserves_partitioning=False): """ @@ -250,6 +197,42 @@ def dispatch(seq): # return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() + def collect(self): + """ + Return a list that contains all of the elements in this RDD. + """ + + + def getJavaVertexRDD(self, rdd, rdd_deserializer): + if self.bypass_serializer: + self.jvertex_rdd_deserializer = NoOpSerializer() + enable_profile = self.context._conf.get("spark.python.profile", "false") == "true" + profileStats = self.context.accumulator(None, PStatsParam) if enable_profile else None + + # the serialized command will be compressed by broadcast + broadcast_vars = ListConverter().convert( + [x._jbroadcast for x in self.context._pickled_broadcast_vars], + self.context._gateway._gateway_client) + self.context._pickled_broadcast_vars.clear() + env = MapConverter().convert(self.context.environment, + self.context._gateway._gateway_client) + includes = ListConverter().convert(self.context._python_includes, + self.context._gateway._gateway_client) + target_storage_level = StorageLevel.MEMORY_ONLY + java_storage_level = self.context._getJavaStorageLevel(target_storage_level) + jvertex_rdd = self.context._jvm.PythonVertexRDD(rdd._jrdd, + bytearray(" "), + env, includes, self.preserve_partitioning, + self.context.pythonExec, + broadcast_vars, self.context._javaAccumulator, + java_storage_level) + + if enable_profile: + self.id = self.jvertex_rdd.id() + self.context._add_profile(self.id, profileStats) + return jvertex_rdd + + class PipelinedVertexRDD(VertexRDD): """ @@ -288,11 +271,11 @@ def pipeline_func(split, iterator): self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer self.is_cached = False self.is_checkpointed = False - self._ctx = prev._ctx + self.ctx = prev.ctx self.prev = prev self._jrdd_val = None self._id = None - self._jrdd_deserializer = self._ctx.serializer + self._jrdd_deserializer = self.ctx.serializer self._bypass_serializer = False self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None self._broadcast = None @@ -309,36 +292,36 @@ def _jrdd(self): return self._jrdd_val if self._bypass_serializer: self._jrdd_deserializer = NoOpSerializer() - enable_profile = self._ctx._conf.get("spark.python.profile", "false") == "true" - profileStats = self._ctx.accumulator(None, PStatsParam) if enable_profile else None + enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None command = (self.func, profileStats, self._prev_jrdd_deserializer, self._jrdd_deserializer) # the serialized command will be compressed by broadcast ser = CloudPickleSerializer() pickled_command = ser.dumps(command) if len(pickled_command) > (1 << 20): # 1M - self._broadcast = self._ctx.broadcast(pickled_command) + self._broadcast = self.ctx.broadcast(pickled_command) pickled_command = ser.dumps(self._broadcast) broadcast_vars = ListConverter().convert( - [x._jbroadcast for x in 
self._ctx._pickled_broadcast_vars], - self._ctx._gateway._gateway_client) - self._ctx._pickled_broadcast_vars.clear() - env = MapConverter().convert(self._ctx.environment, - self._ctx._gateway._gateway_client) - includes = ListConverter().convert(self._ctx._python_includes, - self._ctx._gateway._gateway_client) - targetStorageLevel = StorageLevel.MEMORY_ONLY - python_rdd = self._ctx._jvm.PythonVertexRDD(self._prev_jrdd.rdd(), + [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], + self.ctx._gateway._gateway_client) + self.ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self.ctx.environment, + self.ctx._gateway._gateway_client) + includes = ListConverter().convert(self.ctx._python_includes, + self.ctx._gateway._gateway_client) + target_storage_level = StorageLevel.MEMORY_ONLY + python_rdd = self.ctx._jvm.PythonVertexRDD(self._prev_jrdd.rdd(), bytearray(pickled_command), env, includes, self.preservesPartitioning, - self._ctx.pythonExec, - broadcast_vars, self._ctx._javaAccumulator, - targetStorageLevel) + self.ctx.pythonExec, + broadcast_vars, self.ctx._javaAccumulator, + target_storage_level) self._jrdd_val = python_rdd.asJavaVertexRDD() if enable_profile: self._id = self._jrdd_val.id() - self._ctx._add_profile(self._id, profileStats) + self.ctx._add_profile(self._id, profileStats) return self._jrdd_val def id(self): diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index c12e1aff70bd..a5d5fb787deb 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -668,7 +668,6 @@ def func(it): self.mapPartitions(func).count() # Force evaluation def collect(self): - print "in collect() of rdd.py" """ Return a list that contains all of the elements in this RDD. """ @@ -677,7 +676,6 @@ def collect(self): return list(self._collect_iterator_through_file(bytesInJava)) def _collect_iterator_through_file(self, iterator): - print "in _collect_iterator_through_file() of rdd.py" # Transferring lots of data through Py4J can be slow because # socket.readline() is inefficient. Instead, we'll dump the data to a # file and read it back. 
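
A minimal sketch of the temp-file collect pattern that rdd.py and vertex.py rely on in the hunks above, kept here for reference: rather than streaming each element over the Py4J socket (socket.readline() is slow for large results), the JVM writes the serialized iterator to a temporary file and Python deserializes it back. The names ctx._temp_dir, ctx._writeToFile and deserializer.load_stream follow the surrounding diffs and should be treated as internal, unstable helpers rather than a public API.

import os
from tempfile import NamedTemporaryFile

def collect_via_temp_file(ctx, java_iterator, deserializer):
    # Dump the JVM-side iterator to a local temp file instead of reading it
    # element by element over the Py4J socket.
    temp_file = NamedTemporaryFile(delete=False, dir=ctx._temp_dir)
    temp_file.close()
    ctx._writeToFile(java_iterator, temp_file.name)
    # Read the file back in Python and deserialize each batch.
    try:
        with open(temp_file.name, 'rb') as stream:
            for item in deserializer.load_stream(stream):
                yield item
    finally:
        os.unlink(temp_file.name)
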
From 375411764e36be082483020662a33aca5c5be861 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Sun, 4 Jan 2015 22:54:07 -0500 Subject: [PATCH 17/25] SPARK-3789: temp commit before merging master on 1/5 --- .../org/apache/spark/graphx/api/python/PythonVertexRDD.scala | 3 ++- python/pyspark/graphx/vertex.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index b298c5fc786a..590af8de83e8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -83,7 +83,8 @@ object PythonVertexRDD { broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: Accumulator[JList[Array[Byte]]], targetStorageLevel : StorageLevel = StorageLevel.MEMORY_ONLY) = { - new PythonVertexRDD(JavaRDD.fromRDD(parent), command, envVars, pythonIncludes, preservePartitioning, + new PythonVertexRDD(JavaRDD.fromRDD(parent), command, envVars, pythonIncludes, + preservePartitioning, pythonExec, broadcastVars, accumulator, targetStorageLevel) } } diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 866fd03d6db3..730ca9b81ec2 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -201,6 +201,9 @@ def collect(self): """ Return a list that contains all of the elements in this RDD. """ + pyrdd = self.getJavaVertexRDD(self.jvertex_rdd, self.jvertex_rdd_deserializer) + pyrdd.collect() + def getJavaVertexRDD(self, rdd, rdd_deserializer): From 5717578d52ad745f52a5f0cd7bec2211bc05c968 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Wed, 7 Jan 2015 13:03:37 -0500 Subject: [PATCH 18/25] SPARK-3789: temp commit before merging master on 1/7 --- python/pyspark/graphx/vertex.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 730ca9b81ec2..566f42cd5e98 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -27,8 +27,7 @@ from py4j.java_collections import MapConverter, ListConverter from pyspark.accumulators import PStatsParam -from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, AutoBatchedSerializer, \ - BatchedSerializer +from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, AutoBatchedSerializer from pyspark import RDD, PickleSerializer, StorageLevel, SparkContext from pyspark.traceback_utils import SCCallSiteSync @@ -109,7 +108,8 @@ def checkpoint(self): self.jvertex_rdd.rdd().checkpoint() def count(self): - return self.jvertex_rdd.count() + return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() + # return self.jvertex_rdd.count() def take(self, num=10): return self.jvertex_rdd.take(num) @@ -175,7 +175,8 @@ def dispatch(seq): return [(v, w) for v in vbuf for w in wbuf] vs = self.map(lambda (k, v): (k, (1, v))) ws = other.map(lambda (k, v): (k, (2, v))) - return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) + return vs.union(ws).groupByKey(numPartitions)\ + .flatMapValues(lambda x: dispatch(x.__iter__())) def innerJoin(self, other, numPartitions=None): From 1bbfffa892c7a1f3859186469602a9a0b90640ca Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Fri, 16 Jan 2015 10:33:00 -0500 Subject: [PATCH 19/25] SPARK-3789: temp commit before merging master on 1/16 --- 
.../apache/spark/api/python/PythonRDD.scala | 18 +++---- .../graphx/api/python/PythonVertexRDD.scala | 8 ++- python/pyspark/graphx/vertex.py | 52 ++++++++++++------- 3 files changed, 48 insertions(+), 30 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index f3d569fa971a..751917bcc0ea 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -19,26 +19,24 @@ package org.apache.spark.api.python import java.io._ import java.net._ -import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, UUID, Collections} - -import org.apache.spark.input.PortableDataStream - -import scala.collection.JavaConversions._ -import scala.collection.mutable -import scala.language.existentials +import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} import com.google.common.base.Charsets.UTF_8 - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.compress.CompressionCodec -import org.apache.hadoop.mapred.{InputFormat, OutputFormat, JobConf} +import org.apache.hadoop.mapred.{InputFormat, JobConf, OutputFormat} import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, OutputFormat => NewOutputFormat} import org.apache.spark._ -import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} +import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.broadcast.Broadcast +import org.apache.spark.input.PortableDataStream import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.language.existentials + private[spark] class PythonRDD( @transient parent: RDD[_], command: Array[Byte], diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 590af8de83e8..adf1d77e94be 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -21,7 +21,7 @@ import java.io.{DataOutputStream, FileOutputStream} import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import org.apache.spark.Accumulator -import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD @@ -69,6 +69,11 @@ private[graphx] class PythonVertexRDD( } } } + + def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): + JavaRDD[Array[Byte]] = { + readRDDFromFile(sc, filename, parallelism) + } } object PythonVertexRDD { @@ -83,6 +88,7 @@ object PythonVertexRDD { broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: Accumulator[JList[Array[Byte]]], targetStorageLevel : StorageLevel = StorageLevel.MEMORY_ONLY) = { + System.out.println("DEBUG: in PythonVertexRDD:apply") new PythonVertexRDD(JavaRDD.fromRDD(parent), command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator, targetStorageLevel) diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 566f42cd5e98..7a0922bc5793 100644 --- a/python/pyspark/graphx/vertex.py +++ 
b/python/pyspark/graphx/vertex.py @@ -22,11 +22,11 @@ import itertools import os from tempfile import NamedTemporaryFile - from numpy.numarray.numerictypes import Long from py4j.java_collections import MapConverter, ListConverter from pyspark.accumulators import PStatsParam +from pyspark.rdd import PipelinedRDD from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, AutoBatchedSerializer from pyspark import RDD, PickleSerializer, StorageLevel, SparkContext from pyspark.traceback_utils import SCCallSiteSync @@ -46,11 +46,11 @@ class VertexRDD(object): """ - VertexRDD class defines vertex operations/transformation and vertex properties - The schema of the vertex properties are specified as a tuple to the vertex - The vertex operations are mapValues, filter, diff, innerJoin, leftOuterJoin - and aggergateUsingIndex. These operations are mapped to Scala functions defined - in PythonVertexRDD class in [[org.apache.spark.graphx.api.python package]] + VertexRDD class defines the vertex operations/transformation. The list of + vertex transformations and actions are available at + `http://spark.apache.org/docs/latest/graphx-programming-guide.html` + These operations are mapped to Scala functions defined + in `org.apache.spark.graphx.api.python.PythonVertexRDD` """ def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())): @@ -65,7 +65,6 @@ def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerialize """ self.name = "VertexRDD" - # self.jvertex_rdd = jrdd self.is_cached = False self.is_checkpointed = False self.ctx = SparkContext._active_spark_context @@ -185,6 +184,7 @@ def dispatch(seq): for (n, v) in seq: if n == 1: vbuf.append(v) + vbuf.append(v) elif n == 2: wbuf.append(v) return [(v, w) for v in vbuf for w in wbuf] @@ -202,10 +202,22 @@ def collect(self): """ Return a list that contains all of the elements in this RDD. """ - pyrdd = self.getJavaVertexRDD(self.jvertex_rdd, self.jvertex_rdd_deserializer) - pyrdd.collect() - - + with SCCallSiteSync(self.context) as css: + bytesInJava = self.jvertex_rdd.collect().iterator() + return list(self._collect_iterator_through_file(bytesInJava)) + + def _collect_iterator_through_file(self, iterator): + # Transferring lots of data through Py4J can be slow because + # socket.readline() is inefficient. Instead, we'll dump the data to a + # file and read it back. 
+ tempFile = NamedTemporaryFile(delete=False, dir=self.ctx._temp_dir) + tempFile.close() + self.ctx._writeToFile(iterator, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + for item in self.jvertex_rdd_deserializer.load_stream(tempFile): + yield item + os.unlink(tempFile.name) def getJavaVertexRDD(self, rdd, rdd_deserializer): if self.bypass_serializer: @@ -257,13 +269,13 @@ class PipelinedVertexRDD(VertexRDD): """ def __init__(self, prev, func, preservesPartitioning=False): - if not isinstance(prev, PipelinedVertexRDD) or not prev._is_pipelinable(): + if isinstance(prev, PipelinedRDD) or not prev._is_pipelinable(): # This transformation is the first in its stage: self.func = func self.preservesPartitioning = preservesPartitioning - self._prev_jrdd = prev._jrdd - self._prev_jrdd_deserializer = prev._jrdd_deserializer - else: + self.prev_jvertex_rdd = prev._prev_jrdd + self.prev_jvertex_rdd_deserializer = prev._prev_jrdd_deserializer + elif isinstance(prev, PipelinedVertexRDD) or isinstance(prev, RDD): prev_func = prev.func def pipeline_func(split, iterator): @@ -271,8 +283,8 @@ def pipeline_func(split, iterator): self.func = pipeline_func self.preservesPartitioning = \ prev.preservesPartitioning and preservesPartitioning - self._prev_jrdd = prev._prev_jrdd # maintain the pipeline - self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer + self.prev_jvertex_rdd = prev.prev_jvertex_rdd + self.prev_jvertex_rdd_deserializer = prev.prev_jvertex_rdd_deserializer self.is_cached = False self.is_checkpointed = False self.ctx = prev.ctx @@ -290,8 +302,10 @@ def __del__(self): self._broadcast = None @property - def _jrdd(self): - print "in _jrdd of vertex.py" + def jvertex_rdd(self): + print "**********************************" + print "in jvertex_rdd of vertex.py" + print "**********************************" if self._jrdd_val: return self._jrdd_val if self._bypass_serializer: From 36d15df0c79f8e1431ae971d311461175f47c3a4 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Fri, 16 Jan 2015 17:35:51 -0500 Subject: [PATCH 20/25] SPARK-3789: collect(), take() fixed --- python/pyspark/graphx/vertex.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 7a0922bc5793..5aa42c1405c0 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -65,6 +65,7 @@ def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerialize """ self.name = "VertexRDD" + self.jrdd = jrdd self.is_cached = False self.is_checkpointed = False self.ctx = SparkContext._active_spark_context @@ -111,7 +112,7 @@ def count(self): # return self.jvertex_rdd.count() def take(self, num=10): - return self.jvertex_rdd.take(num) + return self.jrdd.take(num) def sum(self): self.jvertex_rdd.sum() @@ -197,27 +198,13 @@ def dispatch(seq): # def aggregateUsingIndex(self, other, reduceFunc): # return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() - def collect(self): """ Return a list that contains all of the elements in this RDD. """ with SCCallSiteSync(self.context) as css: - bytesInJava = self.jvertex_rdd.collect().iterator() - return list(self._collect_iterator_through_file(bytesInJava)) - - def _collect_iterator_through_file(self, iterator): - # Transferring lots of data through Py4J can be slow because - # socket.readline() is inefficient. Instead, we'll dump the data to a - # file and read it back. 
- tempFile = NamedTemporaryFile(delete=False, dir=self.ctx._temp_dir) - tempFile.close() - self.ctx._writeToFile(iterator, tempFile.name) - # Read the data into Python and deserialize it: - with open(tempFile.name, 'rb') as tempFile: - for item in self.jvertex_rdd_deserializer.load_stream(tempFile): - yield item - os.unlink(tempFile.name) + bytesInJava = self.jrdd.collect() + return list(bytesInJava) def getJavaVertexRDD(self, rdd, rdd_deserializer): if self.bypass_serializer: @@ -347,5 +334,5 @@ def id(self): self._id = self._jrdd.id() return self._id - def _is_pipelinable(self): + def is_pipelinable(self): return not (self.is_cached or self.is_checkpointed) From 08d420924d3c3a50c89a0cbd1109c16f84f0447e Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Mon, 19 Jan 2015 09:46:09 -0500 Subject: [PATCH 21/25] SPARK-3789: temp commit before merging master on 1/18 --- python/pyspark/rdd.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index a5d5fb787deb..c1120cf781e5 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2091,8 +2091,6 @@ def __del__(self): @property def _jrdd(self): - import traceback - print traceback.print_stack() if self._jrdd_val: return self._jrdd_val if self._bypass_serializer: From 6a6b7ec1822599d5d0cb4d618c8057e798f0436f Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Wed, 21 Jan 2015 14:46:30 -0500 Subject: [PATCH 22/25] SPARK-3789: filter in VertexRDD fixed --- .../graphx/api/python/PythonVertexRDD.scala | 7 +- python/pyspark/graphx/edge.py | 3 +- python/pyspark/graphx/graph.py | 3 +- python/pyspark/graphx/vertex.py | 176 +++++++++--------- 4 files changed, 93 insertions(+), 96 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index adf1d77e94be..33734f03323f 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -36,7 +36,7 @@ private[graphx] class PythonVertexRDD( pythonExec: String, broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel : StorageLevel = StorageLevel.MEMORY_ONLY) + targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends PythonRDD (parent.rdd, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { @@ -87,11 +87,10 @@ object PythonVertexRDD { pythonExec: String, broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel : StorageLevel = StorageLevel.MEMORY_ONLY) = { + targetStorageLevel : StorageLevel) = { System.out.println("DEBUG: in PythonVertexRDD:apply") new PythonVertexRDD(JavaRDD.fromRDD(parent), command, envVars, pythonIncludes, - preservePartitioning, - pythonExec, broadcastVars, accumulator, targetStorageLevel) + preservePartitioning, pythonExec, broadcastVars, accumulator, targetStorageLevel) } } diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py index bebc9f500921..79c7d74ef295 100644 --- a/python/pyspark/graphx/edge.py +++ b/python/pyspark/graphx/edge.py @@ -16,8 +16,9 @@ # """ -Python bindings for GraphX. 
+Python bindings for EdgeRDD in GraphX """ + import os import itertools from tempfile import NamedTemporaryFile diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index 2d9bdbef56dc..5a3251d2d4c8 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -16,8 +16,9 @@ # """ -Python bindings for GraphX. +Python bindings for Graph[VertexRDD, EdgeRDD] in GraphX """ + import itertools from pyspark import PickleSerializer, RDD, StorageLevel, SparkContext from pyspark.graphx import VertexRDD, EdgeRDD diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index 5aa42c1405c0..bfaddcf90fe0 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -16,7 +16,7 @@ # """ -Python bindings for GraphX. +Python bindings for VertexRDD in GraphX """ import itertools @@ -25,6 +25,7 @@ from numpy.numarray.numerictypes import Long from py4j.java_collections import MapConverter, ListConverter +import operator from pyspark.accumulators import PStatsParam from pyspark.rdd import PipelinedRDD from pyspark.serializers import CloudPickleSerializer, NoOpSerializer, AutoBatchedSerializer @@ -36,21 +37,21 @@ """ -Vertex id type is long by default. -Defining a type for that enables -us to override it in future if -need be +The default type of vertex id is Long +A separate VertexId type is defined +here so that other types can be used +for vertex ids in future """ VertexId = Long class VertexRDD(object): """ - VertexRDD class defines the vertex operations/transformation. The list of - vertex transformations and actions are available at + VertexRDD class defines the vertex actions and transformations. The complete list of + transformations and actions for vertices are available at `http://spark.apache.org/docs/latest/graphx-programming-guide.html` These operations are mapped to Scala functions defined - in `org.apache.spark.graphx.api.python.PythonVertexRDD` + in `org.apache.spark.graphx.impl.VertexRDDImpl` """ def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())): @@ -61,7 +62,6 @@ def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerialize :param jrdd_deserializer: The deserializer used in Python workers created from PythonRDD to execute a serialized Python function and RDD - """ self.name = "VertexRDD" @@ -80,10 +80,6 @@ def __init__(self, jrdd, jrdd_deserializer=AutoBatchedSerializer(PickleSerialize def __repr__(self): return self.jvertex_rdd.toString() - @property - def context(self): - return self.ctx - def cache(self): """ Persist this vertex RDD with the default storage level (C{MEMORY_ONLY_SER}). 
@@ -94,29 +90,31 @@ def cache(self): def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): self.is_cached = True - java_storage_level = self.context._getJavaStorageLevel(storageLevel) + java_storage_level = self.ctx._getJavaStorageLevel(storageLevel) self.jvertex_rdd.persist(java_storage_level) return self - def unpersist(self): + def unpersist(self, blocking = False): self.is_cached = False - self.jvertex_rdd.unpersist() + self.jvertex_rdd.unpersist(blocking) return self def checkpoint(self): self.is_checkpointed = True - self.jvertex_rdd.rdd().checkpoint() + self.jvertex_rdd.checkpoint() + + def isCheckpointed(self): + """ + Return whether this RDD has been checkpointed or not + """ + return self.jvertex_rdd.isCheckpointed() def count(self): - return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() - # return self.jvertex_rdd.count() + return self.jrdd.count() def take(self, num=10): return self.jrdd.take(num) - def sum(self): - self.jvertex_rdd.sum() - def mapValues(self, f, preserves_partitioning=False): """ Return a new vertex RDD by applying a function to each vertex attributes, @@ -127,9 +125,10 @@ def mapValues(self, f, preserves_partitioning=False): >>> sorted(vertices.mapValues(lambda x: (x + ":" + x)).collect()) [(1, 'a:a'), (2, 'b:b'), (3, 'c:c')] """ - map_func = lambda (k, v): (k, f(v)) def func(_, iterator): - return itertools.imap(map_func, iterator) + print "in func" + return itertools.imap(lambda (k, v): (k, f(v)), iterator) + print "in mapValues" return PipelinedVertexRDD(self, func, preserves_partitioning) def mapVertexPartitions(self, f, preserve_partitioning=False): @@ -146,9 +145,7 @@ def filter(self, f): >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() [2] """ - def func(iterator): - return itertools.ifilter(f, iterator) - return self.mapVertexPartitions(func, True) + return self.jrdd.filter(f) def diff(self, other, numPartitions=2): """ @@ -178,7 +175,6 @@ def dispatch(seq): return vs.union(ws).groupByKey(numPartitions)\ .flatMapValues(lambda x: dispatch(x.__iter__())) - def innerJoin(self, other, numPartitions=None): def dispatch(seq): vbuf, wbuf = [], [] @@ -191,63 +187,60 @@ def dispatch(seq): return [(v, w) for v in vbuf for w in wbuf] vs = self.map(lambda (k, v): (k, (1, v))) ws = other.map(lambda (k, v): (k, (2, v))) - return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) - - - - # def aggregateUsingIndex(self, other, reduceFunc): - # return self._jrdd._jvm.org.apache.spark.PythonVertexRDD.aggregateUsingIndex() + return vs.union(ws).groupByKey(numPartitions).\ + flatMapValues(lambda x: dispatch(x.__iter__())) def collect(self): """ Return a list that contains all of the elements in this RDD. 
""" - with SCCallSiteSync(self.context) as css: + with SCCallSiteSync(self.ctx) as css: bytesInJava = self.jrdd.collect() return list(bytesInJava) def getJavaVertexRDD(self, rdd, rdd_deserializer): if self.bypass_serializer: self.jvertex_rdd_deserializer = NoOpSerializer() - enable_profile = self.context._conf.get("spark.python.profile", "false") == "true" - profileStats = self.context.accumulator(None, PStatsParam) if enable_profile else None + enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None # the serialized command will be compressed by broadcast broadcast_vars = ListConverter().convert( - [x._jbroadcast for x in self.context._pickled_broadcast_vars], - self.context._gateway._gateway_client) - self.context._pickled_broadcast_vars.clear() - env = MapConverter().convert(self.context.environment, - self.context._gateway._gateway_client) - includes = ListConverter().convert(self.context._python_includes, - self.context._gateway._gateway_client) + [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], + self.ctx._gateway._gateway_client) + self.ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self.ctx.environment, + self.ctx._gateway._gateway_client) + includes = ListConverter().convert(self.ctx._python_includes, + self.ctx._gateway._gateway_client) target_storage_level = StorageLevel.MEMORY_ONLY - java_storage_level = self.context._getJavaStorageLevel(target_storage_level) - jvertex_rdd = self.context._jvm.PythonVertexRDD(rdd._jrdd, + java_storage_level = self.ctx._getJavaStorageLevel(target_storage_level) + jvertex_rdd = self.ctx._jvm.PythonVertexRDD(rdd._jrdd, bytearray(" "), env, includes, self.preserve_partitioning, - self.context.pythonExec, - broadcast_vars, self.context._javaAccumulator, + self.ctx.pythonExec, + broadcast_vars, self.ctx._javaAccumulator, java_storage_level) if enable_profile: self.id = self.jvertex_rdd.id() - self.context._add_profile(self.id, profileStats) + self.ctx._add_profile(self.id, profileStats) return jvertex_rdd class PipelinedVertexRDD(VertexRDD): """ - Pipelined maps: + Pipelined mapValues in VertexRDD: - >>> rdd = sc.parallelize([1, 2, 3, 4]) - >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect() - [4, 8, 12, 16] - >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect() - [4, 8, 12, 16] + >>> rdd = sc.parallelize([(1, ("Alice", 29)), (2, ("Bob", 30)), \ + (3, ("Charlie", 31)), (4, ("Dwayne", 32))]) + >>> vertices = VertexRDD(rdd) + >>> vertices.mapValues(lambda x: x[1] * 2).cache().collect() + [(1, ("Alice", 58)), (2, ("Bob", 60)), \ + (3, ("Charlie", 62)), (4, ("Dwayne", 64))] - Pipelined reduces: + Pipelined reduces in VertexRDD: >>> from operator import add >>> rdd.map(lambda x: 2 * x).reduce(add) 20 @@ -256,13 +249,13 @@ class PipelinedVertexRDD(VertexRDD): """ def __init__(self, prev, func, preservesPartitioning=False): - if isinstance(prev, PipelinedRDD) or not prev._is_pipelinable(): + if not isinstance(prev, PipelinedVertexRDD) or not prev.is_pipelinable(): # This transformation is the first in its stage: self.func = func self.preservesPartitioning = preservesPartitioning - self.prev_jvertex_rdd = prev._prev_jrdd - self.prev_jvertex_rdd_deserializer = prev._prev_jrdd_deserializer - elif isinstance(prev, PipelinedVertexRDD) or isinstance(prev, RDD): + self.prev_jvertex_rdd = prev.jvertex_rdd + self.prev_jvertex_rdd_deserializer = prev.jvertex_rdd_deserializer + else: prev_func = prev.func def 
pipeline_func(split, iterator): @@ -270,43 +263,46 @@ def pipeline_func(split, iterator): self.func = pipeline_func self.preservesPartitioning = \ prev.preservesPartitioning and preservesPartitioning - self.prev_jvertex_rdd = prev.prev_jvertex_rdd + self.prev_jvertex_rdd = prev.jvrdd_val self.prev_jvertex_rdd_deserializer = prev.prev_jvertex_rdd_deserializer + self.is_cached = False self.is_checkpointed = False self.ctx = prev.ctx self.prev = prev - self._jrdd_val = None - self._id = None - self._jrdd_deserializer = self.ctx.serializer - self._bypass_serializer = False - self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None - self._broadcast = None + self.jvrdd_val = None + self.id = None + self.jvertex_rdd_deserializer = self.ctx.serializer + self.bypass_serializer = False + self.partitionFunc = prev._partitionFunc if self.preservesPartitioning else None + self.broadcast = None def __del__(self): - if self._broadcast: - self._broadcast.unpersist() - self._broadcast = None + if self.broadcast: + self.broadcast.unpersist() + self.broadcast = None @property def jvertex_rdd(self): - print "**********************************" + print "\n**********************************" print "in jvertex_rdd of vertex.py" - print "**********************************" - if self._jrdd_val: - return self._jrdd_val - if self._bypass_serializer: - self._jrdd_deserializer = NoOpSerializer() + print "**********************************\n" + import traceback + traceback.print_stack() + if self.jvrdd_val: + return self.jvrdd_val + if self.bypass_serializer: + self.jvertex_rdd_deserializer = NoOpSerializer() enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None - command = (self.func, profileStats, self._prev_jrdd_deserializer, - self._jrdd_deserializer) + command = (self.func, profileStats, self.prev_jvertex_rdd_deserializer, + self.jvertex_rdd_deserializer) # the serialized command will be compressed by broadcast ser = CloudPickleSerializer() pickled_command = ser.dumps(command) if len(pickled_command) > (1 << 20): # 1M - self._broadcast = self.ctx.broadcast(pickled_command) - pickled_command = ser.dumps(self._broadcast) + self.broadcast = self.ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self.broadcast) broadcast_vars = ListConverter().convert( [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], self.ctx._gateway._gateway_client) @@ -315,24 +311,24 @@ def jvertex_rdd(self): self.ctx._gateway._gateway_client) includes = ListConverter().convert(self.ctx._python_includes, self.ctx._gateway._gateway_client) - target_storage_level = StorageLevel.MEMORY_ONLY - python_rdd = self.ctx._jvm.PythonVertexRDD(self._prev_jrdd.rdd(), + java_storage_level = self.ctx._getJavaStorageLevel(StorageLevel.MEMORY_ONLY) + python_rdd = self.ctx._jvm.PythonVertexRDD(self.prev_jvertex_rdd.jrdd, bytearray(pickled_command), env, includes, self.preservesPartitioning, self.ctx.pythonExec, broadcast_vars, self.ctx._javaAccumulator, - target_storage_level) - self._jrdd_val = python_rdd.asJavaVertexRDD() + java_storage_level) + self.jvrdd_val = python_rdd.asJavaVertexRDD() if enable_profile: - self._id = self._jrdd_val.id() - self.ctx._add_profile(self._id, profileStats) - return self._jrdd_val + self.id = self.jvrdd_val.id() + self.ctx._add_profile(self.id, profileStats) + return self.jvrdd_val def id(self): - if self._id is None: - self._id = self._jrdd.id() - return self._id + if self.id is 
None: + self.id = self.jvertex_rdd.id() + return self.id def is_pipelinable(self): return not (self.is_cached or self.is_checkpointed) From b9e98770c85e6cae6c572a01a1c4d9a0c8ca7b3c Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Thu, 22 Jan 2015 17:01:33 -0500 Subject: [PATCH 23/25] SPARK-3789: Following methods are complete in VertexRDD(vertex.py), PythonVertexRDD and JavaVertexRDD - toString, count, take, collect, mapValues --- bin/spark-class | 3 ++ .../apache/spark/api/python/PythonRDD.scala | 3 ++ .../spark/graphx/api/java/JavaVertexRDD.scala | 30 +++++++++-- .../graphx/api/java/JavaVertexRDDLike.scala | 14 ++--- .../graphx/api/python/PythonVertexRDD.scala | 38 +++++++------ .../org/apache/spark/graphx/JavaAPISuite.java | 2 +- python/pyspark/graphx/vertex.py | 54 ++++++++++++------- 7 files changed, 98 insertions(+), 46 deletions(-) diff --git a/bin/spark-class b/bin/spark-class index 1b945461fabc..c1a2c41c12fb 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -169,6 +169,9 @@ export CLASSPATH # the driver JVM itself. Instead of handling this complexity in Bash, we launch a separate JVM # to prepare the launch environment of this driver JVM. +export JAVA_OPTS+=" -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005" +echo $JAVA_OPTS + if [ -n "$SPARK_SUBMIT_BOOTSTRAP_DRIVER" ]; then # This is used only if the properties file actually contains these special configs # Export the environment variables needed by SparkSubmitDriverBootstrapper diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 9b61404b5e29..bfdce624e7e2 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -56,6 +56,9 @@ private[spark] class PythonRDD( override val partitioner = if (preservePartitioning) firstParent.partitioner else None override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { + + logError("Inside PythonRDD.compute()") + val startTime = System.currentTimeMillis val env = SparkEnv.get val localdir = env.blockManager.diskBlockManager.localDirs.map( diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index 151df01dfe7a..694d060cfa5c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -16,6 +16,9 @@ */ package org.apache.spark.graphx.api.java +import java.util.{List => JList} +import java.lang.{Long => JLong} + import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.graphx.{VertexId, VertexRDD} @@ -48,14 +51,22 @@ class JavaVertexRDD[VD]( } /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ - def cache(): JavaVertexRDD[VD] = vertices.cache().asInstanceOf[JavaVertexRDD[VD]] + def cache(): this.type = { + vertices.cache() + this + } /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ - def persist(): JavaVertexRDD[VD] = vertices.persist().asInstanceOf[JavaVertexRDD[VD]] + def persist(): this.type = { + vertices.persist() + this + } /** Persist the RDDs of this DStream with the given storage level */ - def persist(storageLevel: StorageLevel): JavaVertexRDD[VD] = - 
vertices.persist(storageLevel).asInstanceOf[JavaVertexRDD[VD]] + def persist(storageLevel: StorageLevel): this.type = { + vertices.persist(storageLevel) + this + } /** Generate a VertexRDD for the given duration */ override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = @@ -70,6 +81,17 @@ class JavaVertexRDD[VD]( JavaVertexRDD(vertexRDD.filter(x => f.call(x).booleanValue())) def toRDD : RDD[(VertexId, VD)] = vertices + + def collect(): JList[(VertexId, VD)] = { + import scala.collection.JavaConversions._ + val arr: java.util.Collection[(VertexId, VD)] = vertices.collect().toSeq + new java.util.ArrayList(arr) + } + + /** + * Return a new single long element generated by counting all elements in the vertex RDD + */ + def count(): JLong = vertices.count() } object JavaVertexRDD { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala index 1e40bde22573..29f61cb295cc 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDDLike.scala @@ -17,7 +17,6 @@ package org.apache.spark.graphx.api.java -import java.lang.{Long => JLong} import java.util.{List => JList} import org.apache.spark.api.java.JavaRDDLike @@ -25,14 +24,14 @@ import org.apache.spark.api.java.function.{Function => JFunction, Function2 => J import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.{EdgeRDDImpl, ShippableVertexPartition} import org.apache.spark.rdd.RDD -import org.apache.spark.{Partition, TaskContext} +import org.apache.spark.{Logging, Partition, TaskContext} import scala.language.implicitConversions import scala.reflect.ClassTag trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], R <: JavaRDDLike[(VertexId, VD), R]] - extends Serializable { + extends Serializable with Logging { implicit val classTag: ClassTag[VD] @@ -41,12 +40,15 @@ trait JavaVertexRDDLike[VD, This <: JavaVertexRDDLike[VD, This, R], def wrapRDD(in: RDD[(VertexId, VD)]): R - def collect(): List[(VertexId, VD)] = vertexRDD.collect().toList + override def toString: String = vertexRDD.toDebugString /** - * Return a new single long element generated by counting all elements in the vertex RDD + * Return an array of the first num values + * + * @param num + * @return */ - def count(): Long = vertexRDD.count() + def take(num: Int) : Array[(VertexId, VD)] = vertexRDD.take(num) def setName(name: String) = vertexRDD.setName(name) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index 33734f03323f..e701cd1c1328 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -24,7 +24,8 @@ import org.apache.spark.Accumulator import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD +import org.apache.spark.graphx.VertexId +import org.apache.spark.graphx.api.java.JavaVertexRDD import org.apache.spark.storage.StorageLevel private[graphx] class PythonVertexRDD( @@ -37,11 +38,28 @@ private[graphx] class PythonVertexRDD( broadcastVars: JList[Broadcast[PythonBroadcast]], accumulator: 
Accumulator[JList[Array[Byte]]], targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends PythonRDD (parent.rdd, command, envVars, + extends PythonRDD (parent, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { - val asJavaVertexRDD = JavaRDD.fromRDD(parent) + def this(@transient parent: JavaVertexRDD[_], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitioning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel : StorageLevel) = { + this(parent.toRDD, command, envVars, pythonIncludes, + preservePartitioning, pythonExec, broadcastVars, accumulator, targetStorageLevel) + } + + val asJavaVertexRDD = { +// new JavaVertexRDD[Array[Byte]](parent.asInstanceOf[JavaRDD[(VertexId, Array[Byte])]]) + JavaVertexRDD(JavaRDD.fromRDD(this).asInstanceOf[JavaRDD[(VertexId, Array[Byte])]]) + } + def writeToFile[T](items: java.util.Iterator[T], filename: String) { import scala.collection.JavaConverters._ @@ -78,19 +96,5 @@ private[graphx] class PythonVertexRDD( object PythonVertexRDD { val DEFAULT_SPARK_BUFFER_SIZE = 65536 - - implicit def apply(@transient parent: RDD[_], - command: Array[Byte], - envVars: JMap[String, String], - pythonIncludes: JList[String], - preservePartitioning: Boolean, - pythonExec: String, - broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]], - targetStorageLevel : StorageLevel) = { - System.out.println("DEBUG: in PythonVertexRDD:apply") - new PythonVertexRDD(JavaRDD.fromRDD(parent), command, envVars, pythonIncludes, - preservePartitioning, pythonExec, broadcastVars, accumulator, targetStorageLevel) - } } diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java index 18dbe6642271..94360cb7362d 100644 --- a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -89,7 +89,7 @@ public void testVertexRDDCount() { JavaVertexRDD> javaVertexRDD = JavaVertexRDD.apply(javaRDD, this.classTag); - assertEquals(javaVertexRDD.count(), 3L); + assertEquals(3L, javaVertexRDD.count().intValue()); } @Test diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index bfaddcf90fe0..f865ad49ed7b 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -110,8 +110,9 @@ def isCheckpointed(self): return self.jvertex_rdd.isCheckpointed() def count(self): - return self.jrdd.count() + return self.jvertex_rdd.count() + # TODO: This is a hack. 
take() must call JavaVertexRDD.take() def take(self, num=10): return self.jrdd.take(num) @@ -126,9 +127,7 @@ def mapValues(self, f, preserves_partitioning=False): [(1, 'a:a'), (2, 'b:b'), (3, 'c:c')] """ def func(_, iterator): - print "in func" return itertools.imap(lambda (k, v): (k, f(v)), iterator) - print "in mapValues" return PipelinedVertexRDD(self, func, preserves_partitioning) def mapVertexPartitions(self, f, preserve_partitioning=False): @@ -185,8 +184,8 @@ def dispatch(seq): elif n == 2: wbuf.append(v) return [(v, w) for v in vbuf for w in wbuf] - vs = self.map(lambda (k, v): (k, (1, v))) - ws = other.map(lambda (k, v): (k, (2, v))) + vs = self.mapValues(lambda (k, v): (k, (1, v))) + ws = other.mapValues(lambda (k, v): (k, (2, v))) return vs.union(ws).groupByKey(numPartitions).\ flatMapValues(lambda x: dispatch(x.__iter__())) @@ -195,14 +194,38 @@ def collect(self): Return a list that contains all of the elements in this RDD. """ with SCCallSiteSync(self.ctx) as css: - bytesInJava = self.jrdd.collect() - return list(bytesInJava) + bytesInJava = self.jvertex_rdd.collect().iterator() + return list(self._collect_iterator_through_file(bytesInJava)) + + def _collect_iterator_through_file(self, iterator): + # Transferring lots of data through Py4J can be slow because + # socket.readline() is inefficient. Instead, we'll dump the data to a + # file and read it back. + tempFile = NamedTemporaryFile(delete=False, dir=self.ctx._temp_dir) + tempFile.close() + self.ctx._writeToFile(iterator, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + for item in self.jvertex_rdd_deserializer.load_stream(tempFile): + yield item + os.unlink(tempFile.name) def getJavaVertexRDD(self, rdd, rdd_deserializer): if self.bypass_serializer: self.jvertex_rdd_deserializer = NoOpSerializer() + rdd_deserializer = NoOpSerializer() enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None + def f(index, iterator): + return iterator + command = (f, profileStats, rdd_deserializer, + rdd_deserializer) + # the serialized command will be compressed by broadcast + ser = CloudPickleSerializer() + pickled_command = ser.dumps(command) + if len(pickled_command) > (1 << 20): # 1M + self.broadcast = self.ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self.broadcast) # the serialized command will be compressed by broadcast broadcast_vars = ListConverter().convert( @@ -215,17 +238,17 @@ def getJavaVertexRDD(self, rdd, rdd_deserializer): self.ctx._gateway._gateway_client) target_storage_level = StorageLevel.MEMORY_ONLY java_storage_level = self.ctx._getJavaStorageLevel(target_storage_level) - jvertex_rdd = self.ctx._jvm.PythonVertexRDD(rdd._jrdd, - bytearray(" "), + prdd = self.ctx._jvm.PythonVertexRDD(rdd._jrdd, + bytearray(pickled_command), env, includes, self.preserve_partitioning, self.ctx.pythonExec, broadcast_vars, self.ctx._javaAccumulator, java_storage_level) - + self.jvertex_rdd = prdd.asJavaVertexRDD() if enable_profile: self.id = self.jvertex_rdd.id() self.ctx._add_profile(self.id, profileStats) - return jvertex_rdd + return self.jvertex_rdd class PipelinedVertexRDD(VertexRDD): @@ -263,7 +286,7 @@ def pipeline_func(split, iterator): self.func = pipeline_func self.preservesPartitioning = \ prev.preservesPartitioning and preservesPartitioning - self.prev_jvertex_rdd = prev.jvrdd_val + self.prev_jvertex_rdd = prev.jvertex_rdd 
self.prev_jvertex_rdd_deserializer = prev.prev_jvertex_rdd_deserializer self.is_cached = False @@ -284,11 +307,6 @@ def __del__(self): @property def jvertex_rdd(self): - print "\n**********************************" - print "in jvertex_rdd of vertex.py" - print "**********************************\n" - import traceback - traceback.print_stack() if self.jvrdd_val: return self.jvrdd_val if self.bypass_serializer: @@ -312,7 +330,7 @@ def jvertex_rdd(self): includes = ListConverter().convert(self.ctx._python_includes, self.ctx._gateway._gateway_client) java_storage_level = self.ctx._getJavaStorageLevel(StorageLevel.MEMORY_ONLY) - python_rdd = self.ctx._jvm.PythonVertexRDD(self.prev_jvertex_rdd.jrdd, + python_rdd = self.ctx._jvm.PythonVertexRDD(self.prev_jvertex_rdd, bytearray(pickled_command), env, includes, self.preservesPartitioning, self.ctx.pythonExec, From 9e8f7dbd760bff951a5864b25c2819e33566f5f1 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Fri, 23 Jan 2015 14:56:23 -0500 Subject: [PATCH 24/25] SPARK-3789: innerJoin fixed in vertexrdd --- .../apache/spark/api/python/PythonRDD.scala | 2 - .../spark/graphx/api/java/JavaVertexRDD.scala | 59 +++++++------ .../org/apache/spark/graphx/JavaAPISuite.java | 10 +-- python/pyspark/graphx/vertex.py | 86 +++++++++---------- 4 files changed, 77 insertions(+), 80 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index bfdce624e7e2..5bf78a82e8e1 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -57,8 +57,6 @@ private[spark] class PythonRDD( override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { - logError("Inside PythonRDD.compute()") - val startTime = System.currentTimeMillis val env = SparkEnv.get val localdir = env.blockManager.diskBlockManager.localDirs.map( diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index 694d060cfa5c..f029d0f9fac6 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -16,15 +16,15 @@ */ package org.apache.spark.graphx.api.java -import java.util.{List => JList} import java.lang.{Long => JLong} +import java.util.{List => JList} +import org.apache.spark.{TaskContext, Partition} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.graphx.{VertexId, VertexRDD} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel -import org.apache.spark.{Partition, TaskContext} import scala.language.implicitConversions import scala.reflect._ @@ -50,48 +50,59 @@ class JavaVertexRDD[VD]( JavaRDD.fromRDD(rdd) } + /** Convert [[org.apache.spark.api.java.JavaRDD]] to + * [[org.apache.spark.graphx.api.java.JavaVertexRDD]] instance */ + def asJavaVertexRDD = JavaRDD.fromRDD(this.vertexRDD) + /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ def cache(): this.type = { vertices.cache() this } - /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ - def persist(): this.type = { - vertices.persist() - this - } - - /** Persist the RDDs of this DStream with the given storage level */ - def persist(storageLevel: StorageLevel): 
this.type = { - vertices.persist(storageLevel) - this + def collect(): JList[(VertexId, VD)] = { + import scala.collection.JavaConversions._ + val arr: java.util.Collection[(VertexId, VD)] = vertices.collect().toSeq + new java.util.ArrayList(arr) } /** Generate a VertexRDD for the given duration */ override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = vertexRDD.compute(part, context) - /** Convert [[org.apache.spark.api.java.JavaRDD]] to - * [[org.apache.spark.graphx.api.java.JavaVertexRDD]] instance */ - def asJavaVertexRDD = JavaRDD.fromRDD(this.vertexRDD) + /** + * Return a new single long element generated by counting all elements in the vertex RDD + */ + def count(): JLong = vertices.count() /** Return a new VertexRDD containing only the elements that satisfy a predicate. */ def filter(f: JFunction[(VertexId, VD), Boolean]): JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD.filter(x => f.call(x).booleanValue())) - def toRDD : RDD[(VertexId, VD)] = vertices + def id: JLong = vertices.id.toLong - def collect(): JList[(VertexId, VD)] = { - import scala.collection.JavaConversions._ - val arr: java.util.Collection[(VertexId, VD)] = vertices.collect().toSeq - new java.util.ArrayList(arr) + def innerJoin[U: ClassTag, VD2: ClassTag](other: JavaVertexRDD[U]): JavaVertexRDD[VD2] = { + def attribute_combiner(vid: VertexId, vd: VD, u: U): VD2 = { + (vd, u).asInstanceOf[VD2] + } + val t = vertexRDD.innerJoin(other.vertexRDD)(attribute_combiner) + JavaVertexRDD[VD2](t).asJavaVertexRDD } - /** - * Return a new single long element generated by counting all elements in the vertex RDD - */ - def count(): JLong = vertices.count() + /** Persist RDDs of this JavaVertexRDD with the default storage level (MEMORY_ONLY_SER) */ + def persist(): this.type = { + vertices.persist() + this + } + + /** Persist the RDDs of this DStream with the given storage level */ + def persist(storageLevel: StorageLevel): this.type = { + vertices.persist(storageLevel) + this + } + + def toRDD : RDD[(VertexId, VD)] = vertices + } object JavaVertexRDD { diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java index 94360cb7362d..82c0b1372640 100644 --- a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -17,26 +17,20 @@ package org.apache.spark.graphx; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; import org.apache.spark.graphx.api.java.JavaEdgeRDD; import org.apache.spark.graphx.api.java.JavaVertexRDD; -import org.apache.spark.rdd.RDD; -import org.apache.spark.storage.StorageLevel; import org.junit.After; import org.junit.Before; import org.junit.Test; -import scala.Array; -import scala.Function1; import scala.Tuple2; import scala.reflect.ClassTag; import scala.reflect.ClassTag$; import java.io.Serializable; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import static org.junit.Assert.assertEquals; diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index f865ad49ed7b..dd0e5ff02c85 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -88,33 +88,30 @@ def cache(self): self.persist(StorageLevel.MEMORY_ONLY_SER) return self - def persist(self, 
storageLevel=StorageLevel.MEMORY_ONLY_SER): - self.is_cached = True - java_storage_level = self.ctx._getJavaStorageLevel(storageLevel) - self.jvertex_rdd.persist(java_storage_level) - return self - - def unpersist(self, blocking = False): - self.is_cached = False - self.jvertex_rdd.unpersist(blocking) - return self - def checkpoint(self): self.is_checkpointed = True self.jvertex_rdd.checkpoint() + def count(self): + return self.jvertex_rdd.count() + + def diff(self, other, numPartitions=2): + """ + Hides vertices that are the same between `this` and `other`. + For vertices that are different, keeps the values from `other`. + + TODO: give an example + """ + if (isinstance(other, RDD)): + vs = self.map(lambda (k, v): (k, (1, v))) + ws = other.map(lambda (k, v): (k, (2, v))) + return vs.union(ws).groupByKey(numPartitions).mapValues(lambda x: x.diff(x.__iter__())) + def isCheckpointed(self): """ Return whether this RDD has been checkpointed or not """ - return self.jvertex_rdd.isCheckpointed() - - def count(self): - return self.jvertex_rdd.count() - - # TODO: This is a hack. take() must call JavaVertexRDD.take() - def take(self, num=10): - return self.jrdd.take(num) + return self.is_checkpointed def mapValues(self, f, preserves_partitioning=False): """ @@ -130,11 +127,27 @@ def func(_, iterator): return itertools.imap(lambda (k, v): (k, f(v)), iterator) return PipelinedVertexRDD(self, func, preserves_partitioning) + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + self.is_cached = True + java_storage_level = self.ctx._getJavaStorageLevel(storageLevel) + self.jvertex_rdd.persist(java_storage_level) + return self + + # TODO: This is a hack. take() must call JavaVertexRDD.take() + def take(self, num=10): + return self.jrdd.take(num) + + def unpersist(self, blocking = False): + self.is_cached = False + self.jvertex_rdd.unpersist(blocking) + return self + def mapVertexPartitions(self, f, preserve_partitioning=False): def func(s, iterator): return f(iterator) return PipelinedVertexRDD(self, func, preserve_partitioning) + # TODO def filter(self, f): """ Return a new vertex RDD containing only the elements that satisfy a predicate. @@ -144,19 +157,7 @@ def filter(self, f): >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() [2] """ - return self.jrdd.filter(f) - - def diff(self, other, numPartitions=2): - """ - Hides vertices that are the same between `this` and `other`. - For vertices that are different, keeps the values from `other`. - - TODO: give an example - """ - if (isinstance(other, RDD)): - vs = self.map(lambda (k, v): (k, (1, v))) - ws = other.map(lambda (k, v): (k, (2, v))) - return vs.union(ws).groupByKey(numPartitions).mapValues(lambda x: x.diff(x.__iter__())) + return self.jvertex_rdd.filter(f) def leftJoin(self, other, numPartitions=None): def dispatch(seq): @@ -174,20 +175,13 @@ def dispatch(seq): return vs.union(ws).groupByKey(numPartitions)\ .flatMapValues(lambda x: dispatch(x.__iter__())) - def innerJoin(self, other, numPartitions=None): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - vbuf.append(v) - elif n == 2: - wbuf.append(v) - return [(v, w) for v in vbuf for w in wbuf] - vs = self.mapValues(lambda (k, v): (k, (1, v))) - ws = other.mapValues(lambda (k, v): (k, (2, v))) - return vs.union(ws).groupByKey(numPartitions).\ - flatMapValues(lambda x: dispatch(x.__iter__())) + # TODO: The best way to do an innerJoin on vertex RDDs is to use the optimized inner + # TODO: technique defined in VertexRDDImpl. 
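leftJoin above reuses the generic pair-RDD recipe: tag values from each side with 1 or 2, union, groupByKey, then dispatch the grouped buffers. A pure-Python sketch of that tagging and dispatch logic on plain lists (no Spark objects; the names are illustrative only):

    from collections import defaultdict

    def left_join(left, right):
        # left, right: lists of (vertex id, attr) pairs standing in for two vertex RDDs.
        tagged = [(k, (1, v)) for k, v in left] + [(k, (2, v)) for k, v in right]
        grouped = defaultdict(list)          # plays the role of groupByKey
        for k, tagged_value in tagged:
            grouped[k].append(tagged_value)

        def dispatch(seq):
            vbuf, wbuf = [], []
            for n, v in seq:
                if n == 1:
                    vbuf.append(v)
                elif n == 2:
                    wbuf.append(v)
            if not wbuf:
                wbuf.append(None)            # keep unmatched left vertices
            return [(v, w) for v in vbuf for w in wbuf]

        return [(k, pair) for k, seq in grouped.items() for pair in dispatch(seq)]

    print(sorted(left_join([(1, "a"), (2, "b")], [(2, "x")])))
    # [(1, ('a', None)), (2, ('b', 'x'))]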
This solution does not scale + def innerJoin(self, other): + return self.jrdd.join(other.jrdd) + + def leftJoin(self, other, numPartitions=None): + return self.jrdd.leftOuterJoin(other.jrdd, numPartitions) def collect(self): """ From 51e72908de59becd241530232c354fd0d627e857 Mon Sep 17 00:00:00 2001 From: Kushal Datta Date: Mon, 26 Jan 2015 03:09:11 -0500 Subject: [PATCH 25/25] SPARK-3789: EdgeRDD bugs fixed --- .../spark/graphx/api/java/JavaEdgeRDD.scala | 107 +++--- .../graphx/api/java/JavaEdgeRDDLike.scala | 27 +- .../spark/graphx/api/java/JavaGraph.scala | 17 +- .../spark/graphx/api/java/JavaVertexRDD.scala | 4 +- .../graphx/api/python/PythonEdgeRDD.scala | 39 ++- .../graphx/api/python/PythonVertexRDD.scala | 1 - .../org/apache/spark/graphx/JavaAPISuite.java | 4 +- python/pyspark/graphx/edge.py | 321 +++++++++--------- python/pyspark/graphx/graph.py | 2 +- python/pyspark/graphx/tests.py | 14 +- python/pyspark/graphx/vertex.py | 16 - 11 files changed, 276 insertions(+), 276 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala index 4ee4bd032885..443aa7109bb8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDD.scala @@ -17,10 +17,11 @@ package org.apache.spark.graphx.api.java import java.lang.{Long => JLong} +import java.util.{List => JList} import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.graphx._ -import org.apache.spark.graphx.impl.{EdgePartition, EdgeRDDImpl} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -28,24 +29,30 @@ import scala.language.implicitConversions import scala.reflect.ClassTag /** - * EdgeRDD['ED', 'VD'] is a column-oriented edge partition RDD created from RDD[Edge[ED]]. + * EdgeRDD['ED'] is a column-oriented edge partition RDD created from RDD[Edge[ED]]. * JavaEdgeRDD class provides a Java API to access implementations of the EdgeRDD class * - * @param partitionsRDD * @param targetStorageLevel * @tparam ED - * @tparam VD */ -class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] - (val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], +class JavaEdgeRDD[ED]( + val edges: RDD[Edge[ED]], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) - extends JavaEdgeRDDLike[ED, VD, JavaEdgeRDD[ED, VD], - JavaRDD[(PartitionID, EdgePartition[ED, VD])]] { + (implicit val classTag: ClassTag[ED]) + extends JavaEdgeRDDLike[ED, JavaEdgeRDD[ED], JavaRDD[(VertexId, VertexId, ED)]] { + +// /** +// * To create JavaEdgeRDD from JavaRDDs of tuples +// * (source vertex id, destination vertex id and edge property class). 
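The TODO above notes that falling back to jrdd.join ignores the hash-partitioned index that the Scala VertexRDDImpl exploits. As a rough illustration of that idea only (not the actual VertexRDDImpl code), an inner join restricted to one pair of co-partitioned blocks needs just a local dictionary lookup rather than a shuffle:

    def inner_join_partition(self_part, other_part):
        # self_part, other_part: lists of (vertex id, attr) assumed to share the
        # same hash partition, standing in for co-partitioned vertex partitions.
        index = dict(other_part)             # local lookup table, no shuffle needed
        return [(vid, (attr, index[vid]))
                for vid, attr in self_part if vid in index]

    left = [(1, "a"), (4, "d"), (7, "g")]
    right = [(4, "x"), (7, "y")]
    print(inner_join_partition(left, right))
    # [(4, ('d', 'x')), (7, ('g', 'y'))]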
+// * The edge property class can be Array[Byte] +// * @param jEdges +// */ +// def this(jEdges: JavaRDD[(VertexId, VertexId, ED)]) = { +// this(jEdges.rdd.map(x => Edge[ED](x._1, x._2, x._3))) +// } /* Convert RDD[(PartitionID, EdgePartition[ED, VD])] to EdgeRDD[ED, VD] */ - override def edgeRDD: EdgeRDDImpl[ED, VD] = { - new EdgeRDDImpl(partitionsRDD, targetStorageLevel) - } + override def edgeRDD = EdgeRDD.fromEdges(edges) /** * Java Wrapper for RDD of Edges @@ -53,20 +60,42 @@ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] * @param edgeRDD * @return */ - override def wrapRDD(edgeRDD: RDD[(PartitionID, EdgePartition[ED, VD])]) : - JavaRDD[(PartitionID, EdgePartition[ED, VD])] = { + def wrapRDD(edgeRDD: RDD[Edge[ED]]): JavaRDD[Edge[ED]] = { JavaRDD.fromRDD(edgeRDD) } /** Persist RDDs of this JavaEdgeRDD with the default storage level (MEMORY_ONLY_SER) */ def cache(): this.type = { - partitionsRDD.persist(StorageLevel.MEMORY_ONLY) + edges.cache() + this + } + + def collect(): JList[Edge[ED]] = { + import scala.collection.JavaConversions._ + val arr: java.util.Collection[Edge[ED]] = edges.collect().toSeq + new java.util.ArrayList(arr) + } + + /** + * Return a new single long element generated by counting all elements in the vertex RDD + */ + override def count(): JLong = edges.count() + + /** Return a new VertexRDD containing only the elements that satisfy a predicate. */ + def filter(f: JFunction[Edge[ED], Boolean]): JavaEdgeRDD[ED] = + JavaEdgeRDD(edgeRDD.filter(x => f.call(x).booleanValue())) + + def id: JLong = edges.id.toLong + + /** Persist RDDs of this JavaEdgeRDD with the default storage level (MEMORY_ONLY_SER) */ + def persist(): this.type = { + edges.persist() this } - /** Persist the RDDs of this JavaEdgeRDD with the given storage level */ - def persist(newLevel: StorageLevel): this.type = { - partitionsRDD.persist(newLevel) + /** Persist the RDDs of this EdgeRDD with the given storage level */ + def persist(storageLevel: StorageLevel): this.type = { + edges.persist(storageLevel) this } @@ -75,47 +104,35 @@ class JavaEdgeRDD[ED: ClassTag, VD: ClassTag] this } - override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] = { - edgeRDD.mapValues(f) + override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2] = { + JavaEdgeRDD(edgeRDD.mapValues(f)) } - override def reverse: JavaEdgeRDD[ED, VD] = edgeRDD.reverse - - override def filter - (epred: EdgeTriplet[VD, ED] => Boolean, - vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] = { - edgeRDD.filter(epred, vpred) - } + override def reverse: JavaEdgeRDD[ED] = JavaEdgeRDD(edgeRDD.reverse) - override def innerJoin[ED2: ClassTag, ED3: ClassTag] + def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) - (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] = { - edgeRDD.innerJoin(other)(f) + (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3] = { + JavaEdgeRDD(edgeRDD.innerJoin(other)(f)) } - override def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] - (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] = { - edgeRDD.mapEdgePartitions(f) - } + def toRDD : RDD[Edge[ED]] = edges } object JavaEdgeRDD { - implicit def apply[ED: ClassTag, VD: ClassTag] - (edges: EdgeRDDImpl[ED, VD]): JavaEdgeRDD[ED, VD] = { - new JavaEdgeRDD(edges.partitionsRDD) - } - - implicit def apply[ED: ClassTag, VD: ClassTag](edges: JavaRDD[Edge[ED]]) : JavaEdgeRDD[ED, VD] = { - JavaEdgeRDD(EdgeRDD.fromEdges[ED, VD](edges.rdd)) + implicit def apply[ED: 
ClassTag](edges: JavaRDD[Edge[ED]]) : JavaEdgeRDD[ED] = { + JavaEdgeRDD(EdgeRDD.fromEdges(edges.rdd)) } - def toEdgeRDD[ED: ClassTag, VD: ClassTag](edges: JavaEdgeRDD[ED, VD]): EdgeRDDImpl[ED, VD] = { - edges.edgeRDD + def toEdgeRDD[ED: ClassTag](edges: JavaEdgeRDD[ED]): RDD[Edge[ED]] = { + JavaEdgeRDD(edges.edgeRDD).toRDD } - def fromRDDOfEdges[ED: ClassTag, VD: ClassTag](edges: RDD[Edge[ED]]) : JavaEdgeRDD[ED, VD] = { - JavaEdgeRDD[ED, VD](EdgeRDD.fromEdges[ED, VD](edges)) - } +// def apply[ED: ClassTag]( +// jEdges: JavaRDD[(VertexId, VertexId, ED)]): JavaEdgeRDD[ED] = { +// val edges : RDD[Edge[ED]] = jEdges.rdd.map(x => Edge(x._1, x._2, x._3)) +// new JavaEdgeRDD(edges) +// } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala index 5e4bb21c0a22..1148e28e7718 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaEdgeRDDLike.scala @@ -21,42 +21,25 @@ import java.util.{List => JList} import org.apache.spark.api.java.JavaRDDLike import org.apache.spark.graphx._ -import org.apache.spark.graphx.impl.{EdgePartition, EdgeRDDImpl} -import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, TaskContext} import scala.reflect.ClassTag -trait JavaEdgeRDDLike [ED, VD, This <: JavaEdgeRDDLike[ED, VD, This, R], -R <: JavaRDDLike[(PartitionID, EdgePartition[ED, VD]), R]] +trait JavaEdgeRDDLike [ED, This <: JavaEdgeRDDLike[ED, This, R], +R <: JavaRDDLike[(VertexId, VertexId, ED), R]] extends Serializable { - def edgeRDD: EdgeRDDImpl[ED, VD] - - def wrapRDD(edgeRDD: RDD[(PartitionID, EdgePartition[ED, VD])]) : R + def edgeRDD: EdgeRDD[ED] def setName() = edgeRDD.setName("JavaEdgeRDD") - def collect(): Array[Edge[ED]] = edgeRDD.map(_.copy()).collect().asInstanceOf[Array[Edge[ED]]] - def count() : JLong = edgeRDD.count() def compute(part: Partition, context: TaskContext): Iterator[Edge[ED]] = { edgeRDD.compute(part, context) } - def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2, VD] - - def reverse: JavaEdgeRDD[ED, VD] - - def filter - (epred: EdgeTriplet[VD, ED] => Boolean, - vpred: (VertexId, VD) => Boolean): JavaEdgeRDD[ED, VD] - - def innerJoin[ED2: ClassTag, ED3: ClassTag] - (other: EdgeRDD[ED2]) - (f: (VertexId, VertexId, ED, ED2) => ED3): JavaEdgeRDD[ED3, VD] + def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): JavaEdgeRDD[ED2] - def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag] - (f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): JavaEdgeRDD[ED2, VD2] + def reverse: JavaEdgeRDD[ED] } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala index 4190083f10ea..ca07f7d10ae0 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaGraph.scala @@ -19,7 +19,6 @@ package org.apache.spark.graphx.api.java import java.lang.{Double => JDouble, Long => JLong} import org.apache.spark.graphx._ -import org.apache.spark.graphx.impl.EdgeRDDImpl import org.apache.spark.graphx.lib.PageRank import org.apache.spark.rdd.RDD @@ -27,10 +26,10 @@ import scala.language.implicitConversions import scala.reflect.ClassTag class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] - (vertexRDD : VertexRDD[VD], edgeRDD: EdgeRDDImpl[ED, VD]) { + (vertexRDD : 
VertexRDD[VD], edgeRDD: EdgeRDD[ED]) { def vertices: JavaVertexRDD[VD] = JavaVertexRDD(vertexRDD) - def edges: JavaEdgeRDD[ED, VD] = JavaEdgeRDD(edgeRDD) + def edges: JavaEdgeRDD[ED] = JavaEdgeRDD(edgeRDD) @transient lazy val graph : Graph[VD, ED] = Graph(vertexRDD, edgeRDD) def partitionBy(partitionStrategy: PartitionStrategy, numPartitions: Int): JavaGraph[VD, ED] = { @@ -98,18 +97,18 @@ class JavaGraph[@specialized VD: ClassTag, @specialized ED: ClassTag] object JavaGraph { - implicit def apply[VD: ClassTag, ED: ClassTag] - (vertexRDD: RDD[(VertexId, VD)], edgeRDD: RDD[Edge[ED]]): JavaGraph[VD, ED] = { - new JavaGraph[VD, ED](VertexRDD(vertexRDD), EdgeRDD.fromEdges(edgeRDD)) - } +// implicit def apply[VD: ClassTag, ED: ClassTag] +// (vertexRDD: RDD[(VertexId, VD)], edges: RDD[Edge[ED]]): JavaGraph[VD, ED] = { +// new JavaGraph[VD, ED](VertexRDD(vertexRDD), EdgeRDD.fromEdges(edges)) +// } implicit def apply[VD: ClassTag, ED: ClassTag] (graph: Graph[VD, ED]): JavaGraph[VD, ED] = { - new JavaGraph[VD, ED](graph.vertices, EdgeRDD.fromEdges(graph.edges)) + new JavaGraph[VD, ED](graph.vertices, EdgeRDD.fromEdges[ED, VD](graph.edges)) } implicit def apply [VD: ClassTag, ED: ClassTag] - (vertices: JavaVertexRDD[VD], edges: JavaEdgeRDD[ED, VD]): JavaGraph[VD, ED] = { + (vertices: JavaVertexRDD[VD], edges: JavaEdgeRDD[ED]): JavaGraph[VD, ED] = { new JavaGraph(VertexRDD(vertices.toRDD), edges.edgeRDD) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala index f029d0f9fac6..0a5debdb175b 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/java/JavaVertexRDD.scala @@ -54,7 +54,7 @@ class JavaVertexRDD[VD]( * [[org.apache.spark.graphx.api.java.JavaVertexRDD]] instance */ def asJavaVertexRDD = JavaRDD.fromRDD(this.vertexRDD) - /** Persist RDDs of this DStream with the default storage level (MEMORY_ONLY_SER) */ + /** Persist RDDs of this VertexRDD with the default storage level (MEMORY_ONLY_SER) */ def cache(): this.type = { vertices.cache() this @@ -95,7 +95,7 @@ class JavaVertexRDD[VD]( this } - /** Persist the RDDs of this DStream with the given storage level */ + /** Persist the RDDs of this VertexRDD with the given storage level */ def persist(storageLevel: StorageLevel): this.type = { vertices.persist(storageLevel) this diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala index 259e17ea00ee..c353bdfdd8cc 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonEdgeRDD.scala @@ -24,22 +24,41 @@ import org.apache.spark.Accumulator import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.{PythonBroadcast, PythonRDD} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD +import org.apache.spark.graphx.Edge +import org.apache.spark.graphx.api.java.JavaEdgeRDD +import org.apache.spark.storage.StorageLevel private[graphx] class PythonEdgeRDD( - @transient parent: RDD[_], - command: Array[Byte], - envVars: JMap[String, String], - pythonIncludes: JList[String], - preservePartitioning: Boolean, - pythonExec: String, - broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]]) + @transient parent: JavaRDD[_], + 
command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitioning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends PythonRDD (parent, command, envVars, pythonIncludes, preservePartitioning, pythonExec, broadcastVars, accumulator) { - val asJavaEdgeRDD = JavaRDD.fromRDD(parent) + def this(@transient parent: JavaEdgeRDD[_], + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + preservePartitioning: Boolean, + pythonExec: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + targetStorageLevel : StorageLevel) = { + this(parent.toRDD, command, envVars, pythonIncludes, + preservePartitioning, pythonExec, broadcastVars, accumulator, targetStorageLevel) + } + + val asJavaEdgeRDD = { + val jRDD = JavaRDD.fromRDD(this) + JavaEdgeRDD.apply(jRDD.asInstanceOf[JavaRDD[Edge[Array[Byte]]]]) + } def writeToFile[T](items: java.util.Iterator[T], filename: String) { import scala.collection.JavaConverters._ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala index e701cd1c1328..dd0a42ad0310 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/api/python/PythonVertexRDD.scala @@ -56,7 +56,6 @@ private[graphx] class PythonVertexRDD( } val asJavaVertexRDD = { -// new JavaVertexRDD[Array[Byte]](parent.asInstanceOf[JavaRDD[(VertexId, Array[Byte])]]) JavaVertexRDD(JavaRDD.fromRDD(this).asInstanceOf[JavaRDD[(VertexId, Array[Byte])]]) } diff --git a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java index 82c0b1372640..36d528ff278b 100644 --- a/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java +++ b/graphx/src/test/java/org/apache/spark/graphx/JavaAPISuite.java @@ -99,8 +99,8 @@ public void testEdgeRDDCount() { ClassTag classTag = ClassTag$.MODULE$.apply(String.class); - JavaEdgeRDD javaEdgeRDD = - JavaEdgeRDD.apply(javaRDD, classTag, classTag); + JavaEdgeRDD javaEdgeRDD = + JavaEdgeRDD.apply(javaRDD, classTag); assertEquals(javaEdgeRDD.count().longValue(), 4L); } diff --git a/python/pyspark/graphx/edge.py b/python/pyspark/graphx/edge.py index 79c7d74ef295..7ea5b4c619a3 100644 --- a/python/pyspark/graphx/edge.py +++ b/python/pyspark/graphx/edge.py @@ -23,8 +23,9 @@ import itertools from tempfile import NamedTemporaryFile # from build.py4j.java_collections import MapConverter, ListConverter +from py4j.java_collections import ListConverter, MapConverter from pyspark.accumulators import PStatsParam -from pyspark import RDD, StorageLevel +from pyspark import RDD, StorageLevel, SparkContext from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer, \ NoOpSerializer from pyspark.traceback_utils import SCCallSiteSync @@ -58,8 +59,15 @@ def __str__(self): class EdgeRDD(object): - def __init__(self, jrdd, - jrdd_deserializer = BatchedSerializer(PickleSerializer())): + """ + EdgeRDD class defines the edge actions and transformations. 
The complete list of + transformations and actions is available at + `http://spark.apache.org/docs/latest/graphx-programming-guide.html` + These operations are mapped to Scala functions defined + in `org.apache.spark.graphx.impl.EdgeRDDImpl` + """ + + def __init__(self, jrdd, jrdd_deserializer = BatchedSerializer(PickleSerializer())): """ Constructor :param jrdd: A JavaRDD reference passed from the parent @@ -70,31 +78,21 @@ def __init__(self, jrdd, """ - self._jrdd = jrdd - self._ctx = jrdd._jrdd.context - self._jrdd_deserializer = jrdd_deserializer - self._preserve_partitioning = False - self._name = "VertexRDD" - self._is_cached = False - self._is_checkpointed = False - self._id = jrdd.id() - self._partitionFunc = None - self._jrdd_val = None - self._bypass_serializer = False - + self.name = "EdgeRDD" + self.jrdd = jrdd + self.is_cached = False + self.is_checkpointed = False + self.ctx = SparkContext._active_spark_context + self.jedge_rdd_deserializer = jrdd_deserializer + self.id = jrdd.id() + self.partitionFunc = None + self.bypass_serializer = False + self.preserve_partitioning = False - def id(self): - """ - VertexRDD has a unique id - """ - return self._id + self.jedge_rdd = self.getJavaEdgeRDD(jrdd, jrdd_deserializer) - # TODO: Does not work def __repr__(self): - return self._jrdd.toString() - - def context(self): - return self._ctx + return self.jedge_rdd.toString() def cache(self): """ @@ -104,124 +102,135 @@ def cache(self): self.persist(StorageLevel.MEMORY_ONLY_SER) return self - def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): - self._is_cached = True - javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) - self._jrdd.persist(javaStorageLevel) - return self - - def unpersist(self): - self._is_cached = False - self._jrdd.unpersist() - return self - def checkpoint(self): self.is_checkpointed = True - self._jrdd.rdd().checkpoint() + self.jedge_rdd.checkpoint() def count(self): - return self._jrdd.count() + return self.jedge_rdd.count() - def collect(self): + def isCheckpointed(self): """ - Return all of the elements in this vertex RDD as a list + Return whether this RDD has been checkpointed or not """ - with SCCallSiteSync(self._ctx) as css: - bytesInJava = self._jrdd.collect().iterator() - return list(self._collect_iterator_through_file(bytesInJava)) - - def _collect_iterator_through_file(self, iterator): - # Transferring lots of data through Py4J can be slow because - # socket.readline() is inefficient. Instead, we'll dump the data to a - # file and read it back. 
- tempFile = NamedTemporaryFile(delete=False, dir=self._ctx._temp_dir) - tempFile.close() - self._ctx._writeToFile(iterator, tempFile.name) - # Read the data into Python and deserialize it: - with open(tempFile.name, 'rb') as tempFile: - for item in self._jrdd_deserializer.load_stream(tempFile): - yield item - os.unlink(tempFile.name) - - def take(self, num=10): - return self._jrdd.take(num) - - def sum(self): - self._jrdd.sum() + return self.is_checkpointed def mapValues(self, f, preserves_partitioning=False): """ Return a new vertex RDD by applying a function to each vertex attributes, preserving the index - >>> rdd = sc.parallelize([Edge(1, 2, "b"), (2, 3, "a"), (3, 2, "c")]) - >>> vertices = EdgeRDD(rdd) + >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) + >>> edges = EdgeRDD(rdd) >>> sorted(edges.mapValues(lambda x: (x + ":" + x)).collect()) - [(1, 2, 'a:a'), (2, 3, 'b:b'), (3, 2, 'c:c')] + [(1, 'a:a'), (2, 'b:b'), (3, 'c:c')] """ - map_func = lambda (k, v): (k, f(v)) def func(_, iterator): - return itertools.imap(map_func, iterator) + return itertools.imap(lambda (k, v): (k, f(v)), iterator) return PipelinedEdgeRDD(self, func, preserves_partitioning) + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + self.is_cached = True + java_storage_level = self.ctx._getJavaStorageLevel(storageLevel) + self.jedge_rdd.persist(java_storage_level) + return self + + # TODO: This is a hack. take() must call JavaVertexRDD.take() + def take(self, num=10): + return self.jrdd.take(num) + + def unpersist(self, blocking = False): + self.is_cached = False + self.jedge_rdd.unpersist(blocking) + return self + def mapEdgePartitions(self, f, preserve_partitioning=False): def func(s, iterator): return f(iterator) return PipelinedEdgeRDD(self, func, preserve_partitioning) - def filter(self, f): - """ - Return a new vertex RDD containing only the elements that satisfy a predicate. + # TODO: The best way to do an innerJoin on vertex RDDs is to use the optimized inner + # TODO: technique defined in VertexRDDImpl. This solution does not scale + def innerJoin(self, other): + return self.jrdd.join(other.jrdd) - >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) - >>> vertices = VertexRDD(rdd) - >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() - [2] - """ - def func(iterator): - return itertools.ifilter(f, iterator) - return self.mapEdgePartitions(func, True) + def leftJoin(self, other, numPartitions=None): + return self.jrdd.leftOuterJoin(other.jrdd, numPartitions) - def filter(self, f): + def collect(self): """ - Return a new vertex RDD containing only the elements that satisfy a predicate. - - >>> rdd = sc.parallelize([(1, "b"), (2, "a"), (3, "c")]) - >>> vertices = VertexRDD(rdd) - >>> vertices.filter(lambda x: x._1 % 2 == 0).collect() - [2] + Return a list that contains all of the elements in this RDD. """ - def func(iterator): - return itertools.ifilter(f, iterator) - return self.maEdgePartitions(func, True) + with SCCallSiteSync(self.ctx) as css: + bytesInJava = self.jedge_rdd.collect().iterator() + return list(self._collect_iterator_through_file(bytesInJava)) + + def _collect_iterator_through_file(self, iterator): + # Transferring lots of data through Py4J can be slow because + # socket.readline() is inefficient. Instead, we'll dump the data to a + # file and read it back. 
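The command-serialization path used for vertices in getJavaVertexRDD earlier is repeated for edges in getJavaEdgeRDD below: the (func, stats, deserializer, deserializer) tuple is pickled and, once the pickle exceeds 1 MB, broadcast so workers receive only a reference. A minimal sketch of that size check with the standard pickle module (broadcast() here is a hypothetical stand-in for SparkContext.broadcast; the real code uses CloudPickleSerializer so closures can be pickled too):

    import pickle

    ONE_MB = 1 << 20

    def identity(index, iterator):
        # Module-level function so plain pickle can serialize it by reference.
        return iterator

    def broadcast(payload):
        # Hypothetical stand-in for SparkContext.broadcast: return a small handle.
        return {"broadcast_bytes": len(payload)}

    def prepare_command(func, deserializer):
        command = (func, None, deserializer, deserializer)
        pickled = pickle.dumps(command)
        if len(pickled) > ONE_MB:                       # same 1 MB threshold as the patch
            pickled = pickle.dumps(broadcast(pickled))  # workers get only the handle
        return pickled

    print(len(prepare_command(identity, "BatchedSerializer")) < ONE_MB)  # True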
+ tempFile = NamedTemporaryFile(delete=False, dir=self.ctx._temp_dir) + tempFile.close() + self.ctx._writeToFile(iterator, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + for item in self.jedge_rdd_deserializer.load_stream(tempFile): + yield item + os.unlink(tempFile.name) - def innerJoin(self, other, numPartitions=None): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - elif n == 2: - wbuf.append(v) - return [(v, w) for v in vbuf for w in wbuf] - vs = self.map(lambda (k, v): (k, (1, v))) - ws = other.map(lambda (k, v): (k, (2, v))) - return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) + def getJavaEdgeRDD(self, rdd, rdd_deserializer): + if self.bypass_serializer: + self.jedge_rdd_deserializer = NoOpSerializer() + rdd_deserializer = NoOpSerializer() + enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None + def f(index, iterator): + return iterator + command = (f, profileStats, rdd_deserializer, + rdd_deserializer) + # the serialized command will be compressed by broadcast + ser = CloudPickleSerializer() + pickled_command = ser.dumps(command) + if len(pickled_command) > (1 << 20): # 1M + self.broadcast = self.ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self.broadcast) + # the serialized command will be compressed by broadcast + broadcast_vars = ListConverter().convert( + [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], + self.ctx._gateway._gateway_client) + self.ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self.ctx.environment, + self.ctx._gateway._gateway_client) + includes = ListConverter().convert(self.ctx._python_includes, + self.ctx._gateway._gateway_client) + java_storage_level = self.ctx._getJavaStorageLevel(StorageLevel.MEMORY_ONLY) + prdd = self.ctx._jvm.PythonEdgeRDD(rdd._jrdd, + bytearray(pickled_command), + env, includes, self.preserve_partitioning, + self.ctx.pythonExec, + broadcast_vars, self.ctx._javaAccumulator, + java_storage_level) + self.jedge_rdd = prdd.asJavaEdgeRDD() + if enable_profile: + self.id = self.jedge_rdd.id() + self.ctx._add_profile(self.id, profileStats) + return self.jedge_rdd class PipelinedEdgeRDD(EdgeRDD): """ - Pipelined maps: + Pipelined mapValues in EdgeRDD: - >>> rdd = sc.parallelize([1, 2, 3, 4]) - >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect() - [4, 8, 12, 16] - >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect() - [4, 8, 12, 16] + >>> rdd = sc.parallelize([(1, ("Alice", 29)), (2, ("Bob", 30)), \ + (3, ("Charlie", 31)), (4, ("Dwayne", 32))]) + >>> vertices = VertexRDD(rdd) + >>> vertices.mapValues(lambda x: x[1] * 2).cache().collect() + [(1, ("Alice", 58)), (2, ("Bob", 60)), \ + (3, ("Charlie", 62)), (4, ("Dwayne", 64))] - Pipelined reduces: + Pipelined reduces in EdgeRDD: >>> from operator import add >>> rdd.map(lambda x: 2 * x).reduce(add) 20 @@ -230,12 +239,12 @@ class PipelinedEdgeRDD(EdgeRDD): """ def __init__(self, prev, func, preservesPartitioning=False): - if not isinstance(prev, PipelinedEdgeRDD) or not prev._is_pipelinable(): + if not isinstance(prev, PipelinedEdgeRDD) or not prev.is_pipelinable(): # This transformation is the first in its stage: self.func = func self.preservesPartitioning = preservesPartitioning - self._prev_jrdd = prev._jrdd - self._prev_jrdd_deserializer = prev._jrdd_deserializer + 
self.prev_jedge_rdd = prev.jedge_rdd + self.prev_jedge_rdd_deserializer = prev.jedge_rdd_deserializer else: prev_func = prev.func @@ -244,67 +253,67 @@ def pipeline_func(split, iterator): self.func = pipeline_func self.preservesPartitioning = \ prev.preservesPartitioning and preservesPartitioning - self._prev_jrdd = prev._prev_jrdd # maintain the pipeline - self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer + self.prev_jedge_rdd = prev.jedge_rdd + self.prev_jedge_rdd_deserializer = prev.prev_jedge_rdd_deserializer + self.is_cached = False self.is_checkpointed = False - self._ctx = prev._ctx + self.ctx = prev.ctx self.prev = prev - self._jrdd_val = None - self._id = None - self._jrdd_deserializer = self._ctx.serializer - self._bypass_serializer = False - self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None - self._broadcast = None + self.jerdd_val = None + self.id = None + self.jedge_rdd_deserializer = self.ctx.serializer + self.bypass_serializer = False + self.partitionFunc = prev._partitionFunc if self.preservesPartitioning else None + self.broadcast = None def __del__(self): - if self._broadcast: - self._broadcast.unpersist() - self._broadcast = None + if self.broadcast: + self.broadcast.unpersist() + self.broadcast = None @property - def _jrdd(self): - print "in _jrdd of edge.py" - if self._jrdd_val: - return self._jrdd_val - if self._bypass_serializer: - self._jrdd_deserializer = NoOpSerializer() - enable_profile = self._ctx._conf.get("spark.python.profile", "false") == "true" - profileStats = self._ctx.accumulator(None, PStatsParam) if enable_profile else None - command = (self.func, profileStats, self._prev_jrdd_deserializer, - self._jrdd_deserializer) + def jedge_rdd(self): + if self.jerdd_val: + return self.jerdd_val + if self.bypass_serializer: + self.jedge_rdd_deserializer = NoOpSerializer() + enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" + profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None + command = (self.func, profileStats, self.prev_jedge_rdd_deserializer, + self.jedge_rdd_deserializer) # the serialized command will be compressed by broadcast ser = CloudPickleSerializer() pickled_command = ser.dumps(command) if len(pickled_command) > (1 << 20): # 1M - self._broadcast = self._ctx.broadcast(pickled_command) - pickled_command = ser.dumps(self._broadcast) + self.broadcast = self.ctx.broadcast(pickled_command) + pickled_command = ser.dumps(self.broadcast) broadcast_vars = ListConverter().convert( - [x._jbroadcast for x in self._ctx._pickled_broadcast_vars], - self._ctx._gateway._gateway_client) - self._ctx._pickled_broadcast_vars.clear() - env = MapConverter().convert(self._ctx.environment, - self._ctx._gateway._gateway_client) - includes = ListConverter().convert(self._ctx._python_includes, - self._ctx._gateway._gateway_client) - targetStorageLevel = StorageLevel.MEMORY_ONLY - python_rdd = self._ctx._jvm.PythonEdgeRDD(self._prev_jrdd.rdd(), - bytearray(pickled_command), - env, includes, self.preservesPartitioning, - self._ctx.pythonExec, - broadcast_vars, self._ctx._javaAccumulator, - targetStorageLevel) - self._jrdd_val = python_rdd.asJavaEdgeRDD() + [x._jbroadcast for x in self.ctx._pickled_broadcast_vars], + self.ctx._gateway._gateway_client) + self.ctx._pickled_broadcast_vars.clear() + env = MapConverter().convert(self.ctx.environment, + self.ctx._gateway._gateway_client) + includes = ListConverter().convert(self.ctx._python_includes, + self.ctx._gateway._gateway_client) + 
java_storage_level = self.ctx._getJavaStorageLevel(StorageLevel.MEMORY_ONLY) + python_rdd = self.ctx._jvm.PythonEdgeRDD(self.prev_jedge_rdd, + bytearray(pickled_command), + env, includes, self.preservesPartitioning, + self.ctx.pythonExec, + broadcast_vars, self.ctx._javaAccumulator, + java_storage_level) + self.jerdd_val = python_rdd.asJavaEdgeRDD() if enable_profile: - self._id = self._jrdd_val.id() - self._ctx._add_profile(self._id, profileStats) - return self._jrdd_val + self.id = self.jerdd_val.id() + self.ctx._add_profile(self.id, profileStats) + return self.jerdd_val def id(self): - if self._id is None: - self._id = self._jrdd.id() - return self._id + if self.id is None: + self.id = self.jedge_rdd.id() + return self.id - def _is_pipelinable(self): + def is_pipelinable(self): return not (self.is_cached or self.is_checkpointed) diff --git a/python/pyspark/graphx/graph.py b/python/pyspark/graphx/graph.py index 5a3251d2d4c8..3c65c62e7316 100644 --- a/python/pyspark/graphx/graph.py +++ b/python/pyspark/graphx/graph.py @@ -165,5 +165,5 @@ def triangleCount(self): def stronglyConnectedComponents(self, iterations): return - def pregel(self, initial_message, vertex_program, send_message, combine_message): + def Pregel(self, initial_message, vertex_program, send_message, combine_message): return diff --git a/python/pyspark/graphx/tests.py b/python/pyspark/graphx/tests.py index 1dded0aff5ae..b24a1ef49ed3 100644 --- a/python/pyspark/graphx/tests.py +++ b/python/pyspark/graphx/tests.py @@ -62,24 +62,14 @@ def mapValues(self): self.assertEqual(results, [(3, ("rxin:rxin", "student:student")), (7, ("jgonzal:jgonzal", "postdoc:postdoc"))]) - def diff(self): - vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) - vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) - vertices0 = VertexRDD(vertexData0) - vertices1 = VertexRDD(vertexData1) - results = vertices0.diff(vertices1) - self.assertEqual(results, 2) - - # TODO def innerJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) - results = vertices0.innerJoin(vertices1) - self.assertEqual(results, 2) + results = vertices0.innerJoin(vertices1).collect() + self.assertEqual(results, []) - # TODO def leftJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) diff --git a/python/pyspark/graphx/vertex.py b/python/pyspark/graphx/vertex.py index dd0e5ff02c85..44e7eaf0ef14 100644 --- a/python/pyspark/graphx/vertex.py +++ b/python/pyspark/graphx/vertex.py @@ -159,22 +159,6 @@ def filter(self, f): """ return self.jvertex_rdd.filter(f) - def leftJoin(self, other, numPartitions=None): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - elif n == 2: - wbuf.append(v) - if not wbuf: - wbuf.append(None) - return [(v, w) for v in vbuf for w in wbuf] - vs = self.map(lambda (k, v): (k, (1, v))) - ws = other.map(lambda (k, v): (k, (2, v))) - return vs.union(ws).groupByKey(numPartitions)\ - .flatMapValues(lambda x: dispatch(x.__iter__())) - # TODO: The best way to do an innerJoin on vertex RDDs is to use the optimized inner # TODO: technique defined in VertexRDDImpl. 
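PipelinedVertexRDD and PipelinedEdgeRDD above collapse chained transformations by composing their per-partition functions, so a chain of mapValues calls reaches the JVM as a single function. A pure-Python sketch of that composition step (no Spark objects; the function names are illustrative):

    def compose(prev_func, func):
        # Mirror of pipeline_func: feed prev_func's output into func for the same split.
        def pipeline_func(split, iterator):
            return func(split, prev_func(split, iterator))
        return pipeline_func

    def times_two(split, iterator):
        return (x * 2 for x in iterator)

    def plus_one(split, iterator):
        return (x + 1 for x in iterator)

    pipelined = compose(times_two, plus_one)
    print(list(pipelined(0, [1, 2, 3])))  # [3, 5, 7]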
This solution does not scale def innerJoin(self, other):