map = pairRDD.collectAsMap(); // Used to crash with ClassCastException
+ pairRDD.collectAsMap(); // Used to crash with ClassCastException
}
}
diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
index de866ed7ffed8..bae3b37e267d5 100644
--- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
@@ -20,9 +20,12 @@ package org.apache.spark.deploy
import java.io.File
import java.util.Date
-import net.liftweb.json.Diff
-import net.liftweb.json.{JsonAST, JsonParser}
-import net.liftweb.json.JsonAST.{JNothing, JValue}
+import org.json4s._
+
+import org.json4s.JValue
+import org.json4s.jackson.JsonMethods
+import com.fasterxml.jackson.core.JsonParseException
+
import org.scalatest.FunSuite
import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, WorkerStateResponse}
@@ -34,31 +37,31 @@ class JsonProtocolSuite extends FunSuite {
test("writeApplicationInfo") {
val output = JsonProtocol.writeApplicationInfo(createAppInfo())
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.appInfoJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.appInfoJsonStr))
}
test("writeWorkerInfo") {
val output = JsonProtocol.writeWorkerInfo(createWorkerInfo())
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.workerInfoJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.workerInfoJsonStr))
}
test("writeApplicationDescription") {
val output = JsonProtocol.writeApplicationDescription(createAppDesc())
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.appDescJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.appDescJsonStr))
}
test("writeExecutorRunner") {
val output = JsonProtocol.writeExecutorRunner(createExecutorRunner())
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.executorRunnerJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.executorRunnerJsonStr))
}
test("writeDriverInfo") {
val output = JsonProtocol.writeDriverInfo(createDriverInfo())
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.driverInfoJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.driverInfoJsonStr))
}
test("writeMasterState") {
@@ -71,7 +74,7 @@ class JsonProtocolSuite extends FunSuite {
activeDrivers, completedDrivers, RecoveryState.ALIVE)
val output = JsonProtocol.writeMasterState(stateResponse)
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.masterStateJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.masterStateJsonStr))
}
test("writeWorkerState") {
@@ -83,7 +86,7 @@ class JsonProtocolSuite extends FunSuite {
finishedExecutors, drivers, finishedDrivers, "masterUrl", 4, 1234, 4, 1234, "masterWebUiUrl")
val output = JsonProtocol.writeWorkerState(stateResponse)
assertValidJson(output)
- assertValidDataInJson(output, JsonParser.parse(JsonConstants.workerStateJsonStr))
+ assertValidDataInJson(output, JsonMethods.parse(JsonConstants.workerStateJsonStr))
}
def createAppDesc(): ApplicationDescription = {
@@ -125,9 +128,9 @@ class JsonProtocolSuite extends FunSuite {
def assertValidJson(json: JValue) {
try {
- JsonParser.parse(JsonAST.compactRender(json))
+ JsonMethods.parse(JsonMethods.compact(json))
} catch {
- case e: JsonParser.ParseException => fail("Invalid Json detected", e)
+ case e: JsonParseException => fail("Invalid Json detected", e)
}
}
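For readers unfamiliar with json4s, the calls used above (`JsonMethods.parse` and `JsonMethods.compact`) replace lift-json's `JsonParser.parse` and `JsonAST.compactRender`. A minimal round-trip sketch, using a hypothetical JSON literal rather than one of the suite's constants:

    import org.json4s._
    import org.json4s.jackson.JsonMethods

    val parsed: JValue = JsonMethods.parse("""{"name": "app", "cores": 4}""")
    // Rendering back to a compact string and re-parsing should be lossless.
    val roundTripped: JValue = JsonMethods.parse(JsonMethods.compact(parsed))
    assert(parsed == roundTripped)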
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala
index ac07f60e284bb..c4e7a4bb7d385 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala
@@ -93,10 +93,10 @@ class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with BeforeAndA
// If this test hangs, it's probably because no resource offers were made after the task
// failed.
val scheduler: TaskSchedulerImpl = sc.taskScheduler match {
- case clusterScheduler: TaskSchedulerImpl =>
- clusterScheduler
+ case taskScheduler: TaskSchedulerImpl =>
+ taskScheduler
case _ =>
- assert(false, "Expect local cluster to use ClusterScheduler")
+ assert(false, "Expect local cluster to use TaskSchedulerImpl")
throw new ClassCastException
}
scheduler.taskResultGetter = new ResultDeletingTaskResultGetter(sc.env, scheduler)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/ClusterSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
similarity index 79%
rename from core/src/test/scala/org/apache/spark/scheduler/ClusterSchedulerSuite.scala
rename to core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
index 85e929925e3b5..f4e62c64daf12 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/ClusterSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
@@ -29,9 +29,9 @@ class FakeTaskSetManager(
initPriority: Int,
initStageId: Int,
initNumTasks: Int,
- clusterScheduler: TaskSchedulerImpl,
+ taskScheduler: TaskSchedulerImpl,
taskSet: TaskSet)
- extends TaskSetManager(clusterScheduler, taskSet, 0) {
+ extends TaskSetManager(taskScheduler, taskSet, 0) {
parent = null
weight = 1
@@ -105,7 +105,7 @@ class FakeTaskSetManager(
}
}
-class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging {
+class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Logging {
def createDummyTaskSetManager(priority: Int, stage: Int, numTasks: Int, cs: TaskSchedulerImpl, taskSet: TaskSet): FakeTaskSetManager = {
new FakeTaskSetManager(priority, stage, numTasks, cs , taskSet)
@@ -133,8 +133,8 @@ class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging
}
test("FIFO Scheduler Test") {
- sc = new SparkContext("local", "ClusterSchedulerSuite")
- val clusterScheduler = new TaskSchedulerImpl(sc)
+ sc = new SparkContext("local", "TaskSchedulerImplSuite")
+ val taskScheduler = new TaskSchedulerImpl(sc)
var tasks = ArrayBuffer[Task[_]]()
val task = new FakeTask(0)
tasks += task
@@ -144,9 +144,9 @@ class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging
val schedulableBuilder = new FIFOSchedulableBuilder(rootPool)
schedulableBuilder.buildPools()
- val taskSetManager0 = createDummyTaskSetManager(0, 0, 2, clusterScheduler, taskSet)
- val taskSetManager1 = createDummyTaskSetManager(0, 1, 2, clusterScheduler, taskSet)
- val taskSetManager2 = createDummyTaskSetManager(0, 2, 2, clusterScheduler, taskSet)
+ val taskSetManager0 = createDummyTaskSetManager(0, 0, 2, taskScheduler, taskSet)
+ val taskSetManager1 = createDummyTaskSetManager(0, 1, 2, taskScheduler, taskSet)
+ val taskSetManager2 = createDummyTaskSetManager(0, 2, 2, taskScheduler, taskSet)
schedulableBuilder.addTaskSetManager(taskSetManager0, null)
schedulableBuilder.addTaskSetManager(taskSetManager1, null)
schedulableBuilder.addTaskSetManager(taskSetManager2, null)
@@ -160,8 +160,8 @@ class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging
}
test("Fair Scheduler Test") {
- sc = new SparkContext("local", "ClusterSchedulerSuite")
- val clusterScheduler = new TaskSchedulerImpl(sc)
+ sc = new SparkContext("local", "TaskSchedulerImplSuite")
+ val taskScheduler = new TaskSchedulerImpl(sc)
var tasks = ArrayBuffer[Task[_]]()
val task = new FakeTask(0)
tasks += task
@@ -189,15 +189,15 @@ class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging
val properties2 = new Properties()
properties2.setProperty("spark.scheduler.pool","2")
- val taskSetManager10 = createDummyTaskSetManager(1, 0, 1, clusterScheduler, taskSet)
- val taskSetManager11 = createDummyTaskSetManager(1, 1, 1, clusterScheduler, taskSet)
- val taskSetManager12 = createDummyTaskSetManager(1, 2, 2, clusterScheduler, taskSet)
+ val taskSetManager10 = createDummyTaskSetManager(1, 0, 1, taskScheduler, taskSet)
+ val taskSetManager11 = createDummyTaskSetManager(1, 1, 1, taskScheduler, taskSet)
+ val taskSetManager12 = createDummyTaskSetManager(1, 2, 2, taskScheduler, taskSet)
schedulableBuilder.addTaskSetManager(taskSetManager10, properties1)
schedulableBuilder.addTaskSetManager(taskSetManager11, properties1)
schedulableBuilder.addTaskSetManager(taskSetManager12, properties1)
- val taskSetManager23 = createDummyTaskSetManager(2, 3, 2, clusterScheduler, taskSet)
- val taskSetManager24 = createDummyTaskSetManager(2, 4, 2, clusterScheduler, taskSet)
+ val taskSetManager23 = createDummyTaskSetManager(2, 3, 2, taskScheduler, taskSet)
+ val taskSetManager24 = createDummyTaskSetManager(2, 4, 2, taskScheduler, taskSet)
schedulableBuilder.addTaskSetManager(taskSetManager23, properties2)
schedulableBuilder.addTaskSetManager(taskSetManager24, properties2)
@@ -217,8 +217,8 @@ class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging
}
test("Nested Pool Test") {
- sc = new SparkContext("local", "ClusterSchedulerSuite")
- val clusterScheduler = new TaskSchedulerImpl(sc)
+ sc = new SparkContext("local", "TaskSchedulerImplSuite")
+ val taskScheduler = new TaskSchedulerImpl(sc)
var tasks = ArrayBuffer[Task[_]]()
val task = new FakeTask(0)
tasks += task
@@ -240,23 +240,23 @@ class ClusterSchedulerSuite extends FunSuite with LocalSparkContext with Logging
pool1.addSchedulable(pool10)
pool1.addSchedulable(pool11)
- val taskSetManager000 = createDummyTaskSetManager(0, 0, 5, clusterScheduler, taskSet)
- val taskSetManager001 = createDummyTaskSetManager(0, 1, 5, clusterScheduler, taskSet)
+ val taskSetManager000 = createDummyTaskSetManager(0, 0, 5, taskScheduler, taskSet)
+ val taskSetManager001 = createDummyTaskSetManager(0, 1, 5, taskScheduler, taskSet)
pool00.addSchedulable(taskSetManager000)
pool00.addSchedulable(taskSetManager001)
- val taskSetManager010 = createDummyTaskSetManager(1, 2, 5, clusterScheduler, taskSet)
- val taskSetManager011 = createDummyTaskSetManager(1, 3, 5, clusterScheduler, taskSet)
+ val taskSetManager010 = createDummyTaskSetManager(1, 2, 5, taskScheduler, taskSet)
+ val taskSetManager011 = createDummyTaskSetManager(1, 3, 5, taskScheduler, taskSet)
pool01.addSchedulable(taskSetManager010)
pool01.addSchedulable(taskSetManager011)
- val taskSetManager100 = createDummyTaskSetManager(2, 4, 5, clusterScheduler, taskSet)
- val taskSetManager101 = createDummyTaskSetManager(2, 5, 5, clusterScheduler, taskSet)
+ val taskSetManager100 = createDummyTaskSetManager(2, 4, 5, taskScheduler, taskSet)
+ val taskSetManager101 = createDummyTaskSetManager(2, 5, 5, taskScheduler, taskSet)
pool10.addSchedulable(taskSetManager100)
pool10.addSchedulable(taskSetManager101)
- val taskSetManager110 = createDummyTaskSetManager(3, 6, 5, clusterScheduler, taskSet)
- val taskSetManager111 = createDummyTaskSetManager(3, 7, 5, clusterScheduler, taskSet)
+ val taskSetManager110 = createDummyTaskSetManager(3, 6, 5, taskScheduler, taskSet)
+ val taskSetManager111 = createDummyTaskSetManager(3, 7, 5, taskScheduler, taskSet)
pool11.addSchedulable(taskSetManager110)
pool11.addSchedulable(taskSetManager111)
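As context for the Fair Scheduler tests above: at the application level, jobs are routed to a pool by setting the `spark.scheduler.pool` local property on the SparkContext before submitting work, which is what the `Properties` objects in the test simulate. A minimal sketch (the pool name is hypothetical and assumes a fair-scheduler pool of that name is configured):

    // Route subsequent jobs submitted from this thread to the "production" pool.
    sc.setLocalProperty("spark.scheduler.pool", "production")
    sc.parallelize(1 to 1000).count()

    // Reset so later jobs fall back to the default pool.
    sc.setLocalProperty("spark.scheduler.pool", null)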
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index 34a7d8cefeea2..20f6e503872ac 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark._
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.util.FakeClock
-class FakeDAGScheduler(taskScheduler: FakeClusterScheduler) extends DAGScheduler(taskScheduler) {
+class FakeDAGScheduler(taskScheduler: FakeTaskScheduler) extends DAGScheduler(taskScheduler) {
override def taskStarted(task: Task[_], taskInfo: TaskInfo) {
taskScheduler.startedTasks += taskInfo.index
}
@@ -51,12 +51,12 @@ class FakeDAGScheduler(taskScheduler: FakeClusterScheduler) extends DAGScheduler
}
/**
- * A mock ClusterScheduler implementation that just remembers information about tasks started and
+ * A mock TaskSchedulerImpl implementation that just remembers information about tasks started and
* feedback received from the TaskSetManagers. Note that it's important to initialize this with
* a list of "live" executors and their hostnames for isExecutorAlive and hasExecutorsAliveOnHost
* to work, and these are required for locality in TaskSetManager.
*/
-class FakeClusterScheduler(sc: SparkContext, liveExecutors: (String, String)* /* execId, host */)
+class FakeTaskScheduler(sc: SparkContext, liveExecutors: (String, String)* /* execId, host */)
extends TaskSchedulerImpl(sc)
{
val startedTasks = new ArrayBuffer[Long]
@@ -87,7 +87,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("TaskSet with no preferences") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc, ("exec1", "host1"))
+ val sched = new FakeTaskScheduler(sc, ("exec1", "host1"))
val taskSet = createTaskSet(1)
val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES)
@@ -113,7 +113,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("multiple offers with no preferences") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc, ("exec1", "host1"))
+ val sched = new FakeTaskScheduler(sc, ("exec1", "host1"))
val taskSet = createTaskSet(3)
val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES)
@@ -144,7 +144,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("basic delay scheduling") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc, ("exec1", "host1"), ("exec2", "host2"))
+ val sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", "host2"))
val taskSet = createTaskSet(4,
Seq(TaskLocation("host1", "exec1")),
Seq(TaskLocation("host2", "exec2")),
@@ -188,7 +188,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("delay scheduling with fallback") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc,
+ val sched = new FakeTaskScheduler(sc,
("exec1", "host1"), ("exec2", "host2"), ("exec3", "host3"))
val taskSet = createTaskSet(5,
Seq(TaskLocation("host1")),
@@ -228,7 +228,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("delay scheduling with failed hosts") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc, ("exec1", "host1"), ("exec2", "host2"))
+ val sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", "host2"))
val taskSet = createTaskSet(3,
Seq(TaskLocation("host1")),
Seq(TaskLocation("host2")),
@@ -260,7 +260,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("task result lost") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc, ("exec1", "host1"))
+ val sched = new FakeTaskScheduler(sc, ("exec1", "host1"))
val taskSet = createTaskSet(1)
val clock = new FakeClock
val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock)
@@ -277,7 +277,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("repeated failures lead to task set abortion") {
sc = new SparkContext("local", "test")
- val sched = new FakeClusterScheduler(sc, ("exec1", "host1"))
+ val sched = new FakeTaskScheduler(sc, ("exec1", "host1"))
val taskSet = createTaskSet(1)
val clock = new FakeClock
val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock)
diff --git a/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala b/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
index c51d12bfe0bc6..757476efdb789 100644
--- a/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/XORShiftRandomSuite.scala
@@ -72,4 +72,8 @@ class XORShiftRandomSuite extends FunSuite with ShouldMatchers {
}
+ test ("XORShift with zero seed") {
+ val random = new XORShiftRandom(0L)
+ assert(random.nextInt() != 0)
+ }
}
diff --git a/docs/README.md b/docs/README.md
index cc09d6e88f41e..cac65d97e488b 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,6 @@
Welcome to the Spark documentation!
-This readme will walk you through navigating and building the Spark documentation, which is included here with the Spark source code. You can also find documentation specific to release versions of Spark at http://spark.incubator.apache.org/documentation.html.
+This readme will walk you through navigating and building the Spark documentation, which is included here with the Spark source code. You can also find documentation specific to release versions of Spark at http://spark.apache.org/documentation.html.
Read on to learn more about viewing documentation in plain text (i.e., markdown) or building the documentation yourself. Why build it yourself? So that you have the docs that correspond to whichever version of Spark you currently have checked out of revision control.
diff --git a/docs/_config.yml b/docs/_config.yml
index 9e5a95fe53af6..aa5a5adbc1743 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -3,10 +3,10 @@ markdown: kramdown
# These allow the documentation to be updated with newer releases
# of Spark, Scala, and Mesos.
-SPARK_VERSION: 1.0.0-incubating-SNAPSHOT
+SPARK_VERSION: 1.0.0-SNAPSHOT
SPARK_VERSION_SHORT: 1.0.0
SCALA_BINARY_VERSION: "2.10"
SCALA_VERSION: "2.10.3"
MESOS_VERSION: 0.13.0
SPARK_ISSUE_TRACKER_URL: https://spark-project.atlassian.net
-SPARK_GITHUB_URL: https://github.com/apache/incubator-spark
+SPARK_GITHUB_URL: https://github.com/apache/spark
diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html
index 7114e1f5dd5b9..ebb58e8b9af79 100755
--- a/docs/_layouts/global.html
+++ b/docs/_layouts/global.html
@@ -159,16 +159,6 @@ Heading
-->
-
-
diff --git a/docs/bagel-programming-guide.md b/docs/bagel-programming-guide.md
index b070d8e73a38b..da6d0c9dcd97b 100644
--- a/docs/bagel-programming-guide.md
+++ b/docs/bagel-programming-guide.md
@@ -108,7 +108,7 @@ _Example_
## Operations
-Here are the actions and types in the Bagel API. See [Bagel.scala](https://github.com/apache/incubator-spark/blob/master/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala) for details.
+Here are the actions and types in the Bagel API. See [Bagel.scala](https://github.com/apache/spark/blob/master/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala) for details.
### Actions
diff --git a/docs/building-with-maven.md b/docs/building-with-maven.md
index ded12926885b9..40cac8eb4f0db 100644
--- a/docs/building-with-maven.md
+++ b/docs/building-with-maven.md
@@ -76,3 +76,7 @@ The maven build includes support for building a Debian package containing the as
$ mvn -Pdeb -DskipTests clean package
The debian package can then be found under assembly/target. We added the short commit hash to the file name so that we can distinguish individual packages built for SNAPSHOT versions.
+
+## A note about Hadoop version 0.23.x
+
+For building Spark with Hadoop 0.23.x and also YARN, you will have to manually add a dependency on Avro (org.apache.avro, avro, 1.7.4).
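For projects built with sbt rather than Maven, the equivalent coordinate would look roughly like the following sketch (not part of the Maven instructions above):

    // sbt equivalent of the Maven dependency on Avro 1.7.4 (sketch).
    libraryDependencies += "org.apache.avro" % "avro" % "1.7.4"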
diff --git a/docs/index.md b/docs/index.md
index aa9c8666e7d75..4eb297df39144 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -9,7 +9,7 @@ It also supports a rich set of higher-level tools including [Shark](http://shark
# Downloading
-Get Spark by visiting the [downloads page](http://spark.incubator.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}.
+Get Spark by visiting the [downloads page](http://spark.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}.
Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). All you need to run it is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.
@@ -96,7 +96,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui
* [Amazon EC2](ec2-scripts.html): scripts that let you launch a cluster on EC2 in about 5 minutes
* [Standalone Deploy Mode](spark-standalone.html): launch a standalone cluster quickly without a third-party cluster manager
* [Mesos](running-on-mesos.html): deploy a private cluster using
- [Apache Mesos](http://incubator.apache.org/mesos)
+ [Apache Mesos](http://mesos.apache.org)
* [YARN](running-on-yarn.html): deploy Spark on top of Hadoop NextGen (YARN)
**Other documents:**
@@ -110,20 +110,20 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui
**External resources:**
-* [Spark Homepage](http://spark.incubator.apache.org)
+* [Spark Homepage](http://spark.apache.org)
* [Shark](http://shark.cs.berkeley.edu): Apache Hive over Spark
-* [Mailing Lists](http://spark.incubator.apache.org/mailing-lists.html): ask questions about Spark here
+* [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here
* [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and
exercises about Spark, Shark, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/agenda-2012),
[slides](http://ampcamp.berkeley.edu/agenda-2012) and [exercises](http://ampcamp.berkeley.edu/exercises-2012) are
available online for free.
-* [Code Examples](http://spark.incubator.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/) of Spark
+* [Code Examples](http://spark.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/apache/spark/tree/master/examples/src/main/scala/) of Spark
* [Paper Describing Spark](http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf)
* [Paper Describing Spark Streaming](http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf)
# Community
-To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.incubator.apache.org/mailing-lists.html).
+To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html).
If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users.
diff --git a/docs/java-programming-guide.md b/docs/java-programming-guide.md
index 07732fa1229f3..5c73dbb25ede8 100644
--- a/docs/java-programming-guide.md
+++ b/docs/java-programming-guide.md
@@ -189,7 +189,7 @@ We hope to generate documentation with Java-style syntax in the future.
# Where to Go from Here
Spark includes several sample programs using the Java API in
-[`examples/src/main/java`](https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/examples). You can run them by passing the class name to the
+[`examples/src/main/java`](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples). You can run them by passing the class name to the
`bin/run-example` script included in Spark; for example:
./bin/run-example org.apache.spark.examples.JavaWordCount
diff --git a/docs/js/main.js b/docs/js/main.js
index 102699789a71a..0bd2286cced19 100755
--- a/docs/js/main.js
+++ b/docs/js/main.js
@@ -1,26 +1,3 @@
-
-// From docs.scala-lang.org
-function styleCode() {
- if (typeof disableStyleCode != "undefined") {
- return;
- }
- $(".codetabs pre code").parent().each(function() {
- if (!$(this).hasClass("prettyprint")) {
- var lang = $(this).parent().data("lang");
- if (lang == "python") {
- lang = "py"
- }
- if (lang == "bash") {
- lang = "bsh"
- }
- $(this).addClass("prettyprint lang-"+lang+" linenums");
- }
- });
- console.log("runningPrettyPrint()")
- prettyPrint();
-}
-
-
function codeTabs() {
var counter = 0;
var langImages = {
@@ -97,11 +74,7 @@ function viewSolution() {
}
-$(document).ready(function() {
+$(function() {
codeTabs();
viewSolution();
- $('#chapter-toc').toc({exclude: '', context: '.container'});
- $('#chapter-toc').prepend('');
- makeCollapsable($('#global-toc'), "", "global-toc", "Show Table of Contents");
- //styleCode();
});
diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
index 7c5283fb0b6fb..57ed54c9cf4c0 100644
--- a/docs/python-programming-guide.md
+++ b/docs/python-programming-guide.md
@@ -157,7 +157,7 @@ some example applications.
# Where to Go from Here
-PySpark also includes several sample programs in the [`python/examples` folder](https://github.com/apache/incubator-spark/tree/master/python/examples).
+PySpark also includes several sample programs in the [`python/examples` folder](https://github.com/apache/spark/tree/master/python/examples).
You can run them by passing the files to `pyspark`; e.g.:
./bin/pyspark python/examples/wordcount.py
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index cd4509ede735a..ee1d892a3b630 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -99,13 +99,12 @@ With this mode, your application is actually run on the remote machine where the
## Launch spark application with yarn-client mode.
-With yarn-client mode, the application will be launched locally. Just like running application or spark-shell on Local / Mesos / Standalone mode. The launch method is also the similar with them, just make sure that when you need to specify a master url, use "yarn-client" instead. And you also need to export the env value for SPARK_JAR and SPARK_YARN_APP_JAR
+With yarn-client mode, the application is launched locally, just like running an application or spark-shell in Local / Mesos / Standalone mode. The launch method is similar; just make sure that when you need to specify a master URL, use "yarn-client" instead. You also need to export the env value for SPARK_JAR.
Configuration in yarn-client mode:
In order to tune worker cores/number/memory etc., you need to export environment variables or add them to the spark configuration file (./conf/spark_env.sh). The following is the list of options.
-* `SPARK_YARN_APP_JAR`, Path to your application's JAR file (required)
* `SPARK_WORKER_INSTANCES`, Number of workers to start (Default: 2)
* `SPARK_WORKER_CORES`, Number of cores for the workers (Default: 1).
* `SPARK_WORKER_MEMORY`, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
@@ -118,12 +117,11 @@ In order to tune worker core/number/memory etc. You need to export environment v
For example:
SPARK_JAR=./assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \
- SPARK_YARN_APP_JAR=examples/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \
./bin/run-example org.apache.spark.examples.SparkPi yarn-client
+or
SPARK_JAR=./assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \
- SPARK_YARN_APP_JAR=examples/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \
MASTER=yarn-client ./bin/spark-shell
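The same master URL applies when constructing a SparkContext programmatically; a minimal sketch (the application name is hypothetical):

    // yarn-client mode from code: the driver stays local,
    // executors are requested from YARN.
    val sc = new SparkContext("yarn-client", "MyYarnApp")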
diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md
index 506d3faa767f3..99412733d4268 100644
--- a/docs/scala-programming-guide.md
+++ b/docs/scala-programming-guide.md
@@ -365,7 +365,7 @@ res2: Int = 10
# Where to Go from Here
-You can see some [example Spark programs](http://spark.incubator.apache.org/examples.html) on the Spark website.
+You can see some [example Spark programs](http://spark.apache.org/examples.html) on the Spark website.
In addition, Spark includes several samples in `examples/src/main/scala`. Some of them have both Spark versions and local (non-parallel) versions, allowing you to see what had to be changed to make the program run on a cluster. You can run them by passing the class name to the `bin/run-example` script included in Spark; for example:
./bin/run-example org.apache.spark.examples.SparkPi
diff --git a/docs/spark-debugger.md b/docs/spark-debugger.md
index 11c51d5cde7c9..891c2bfa8943d 100644
--- a/docs/spark-debugger.md
+++ b/docs/spark-debugger.md
@@ -2,7 +2,7 @@
layout: global
title: The Spark Debugger
---
-**Summary:** The Spark debugger provides replay debugging for deterministic (logic) errors in Spark programs. It's currently in development, but you can try it out in the [arthur branch](https://github.com/apache/incubator-spark/tree/arthur).
+**Summary:** The Spark debugger provides replay debugging for deterministic (logic) errors in Spark programs. It's currently in development, but you can try it out in the [arthur branch](https://github.com/apache/spark/tree/arthur).
## Introduction
@@ -19,7 +19,7 @@ For deterministic errors, debugging a Spark program is now as easy as debugging
## Approach
-As your Spark program runs, the slaves report key events back to the master -- for example, RDD creations, RDD contents, and uncaught exceptions. (A full list of event types is in [EventLogging.scala](https://github.com/apache/incubator-spark/blob/arthur/core/src/main/scala/spark/EventLogging.scala).) The master logs those events, and you can load the event log into the debugger after your program is done running.
+As your Spark program runs, the slaves report key events back to the master -- for example, RDD creations, RDD contents, and uncaught exceptions. (A full list of event types is in [EventLogging.scala](https://github.com/apache/spark/blob/arthur/core/src/main/scala/spark/EventLogging.scala).) The master logs those events, and you can load the event log into the debugger after your program is done running.
_A note on nondeterminism:_ For fault recovery, Spark requires RDD transformations (for example, the function passed to `RDD.map`) to be deterministic. The Spark debugger also relies on this property, and it can also warn you if your transformation is nondeterministic. This works by checksumming the contents of each RDD and comparing the checksums from the original execution to the checksums after recomputing the RDD in the debugger.
diff --git a/docs/tuning.md b/docs/tuning.md
index 6b010aed618a3..704778681cb8f 100644
--- a/docs/tuning.md
+++ b/docs/tuning.md
@@ -44,7 +44,10 @@ This setting configures the serializer used for not only shuffling data between
nodes but also when serializing RDDs to disk. The only reason Kryo is not the default is because of the custom
registration requirement, but we recommend trying it in any network-intensive application.
-Finally, to register your classes with Kryo, create a public class that extends
+Spark automatically includes Kryo serializers for the many commonly-used core Scala classes covered
+in the AllScalaRegistrar from the [Twitter chill](https://github.com/twitter/chill) library.
+
+To register your own custom classes with Kryo, create a public class that extends
[`org.apache.spark.serializer.KryoRegistrator`](api/core/index.html#org.apache.spark.serializer.KryoRegistrator) and set the
`spark.kryo.registrator` config property to point to it, as follows:
@@ -72,8 +75,8 @@ If your objects are large, you may also need to increase the `spark.kryoserializ
config property. The default is 2, but this value needs to be large enough to hold the *largest*
object you will serialize.
-Finally, if you don't register your classes, Kryo will still work, but it will have to store the
-full class name with each object, which is wasteful.
+Finally, if you don't register your custom classes, Kryo will still work, but it will have to store
+the full class name with each object, which is wasteful.
# Memory Tuning
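A registrator along the lines the guide describes might look like the following sketch (the classes being registered and the package name are hypothetical):

    import com.esotericsoftware.kryo.Kryo
    import org.apache.spark.SparkConf
    import org.apache.spark.serializer.KryoRegistrator

    // Registers application-specific classes so Kryo can avoid writing
    // full class names with every serialized object.
    class MyKryoRegistrator extends KryoRegistrator {
      override def registerClasses(kryo: Kryo) {
        kryo.register(classOf[MyClass1])
        kryo.register(classOf[MyClass2])
      }
    }

    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryo.registrator", "mypackage.MyKryoRegistrator")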
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala
index 0fc1e4df6813c..377d9d6bd5e72 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala
@@ -18,11 +18,11 @@
package org.apache.spark.graphx
import scala.reflect.ClassTag
-
import org.apache.spark.SparkContext._
import org.apache.spark.SparkException
import org.apache.spark.graphx.lib._
import org.apache.spark.rdd.RDD
+import scala.util.Random
/**
* Contains additional functionality for [[Graph]]. All operations are expressed in terms of the
@@ -137,6 +137,42 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali
}
} // end of collectNeighbor
+ /**
+ * Returns an RDD that contains, for each vertex v, its local edges,
+ * i.e., the edges that are incident on v, in the user-specified direction.
+ * Warning: singleton vertices, i.e. those with no edges in the given
+ * direction, will not be part of the return value.
+ *
+ * @note This function could be highly inefficient on power-law
+ * graphs where high degree vertices may force a large amount of
+ * information to be collected to a single location.
+ *
+ * @param edgeDirection the direction along which to collect
+ * the local edges of vertices
+ *
+ * @return the local edges for each vertex
+ */
+ def collectEdges(edgeDirection: EdgeDirection): VertexRDD[Array[Edge[ED]]] = {
+ edgeDirection match {
+ case EdgeDirection.Either =>
+ graph.mapReduceTriplets[Array[Edge[ED]]](
+ edge => Iterator((edge.srcId, Array(new Edge(edge.srcId, edge.dstId, edge.attr))),
+ (edge.dstId, Array(new Edge(edge.srcId, edge.dstId, edge.attr)))),
+ (a, b) => a ++ b)
+ case EdgeDirection.In =>
+ graph.mapReduceTriplets[Array[Edge[ED]]](
+ edge => Iterator((edge.dstId, Array(new Edge(edge.srcId, edge.dstId, edge.attr)))),
+ (a, b) => a ++ b)
+ case EdgeDirection.Out =>
+ graph.mapReduceTriplets[Array[Edge[ED]]](
+ edge => Iterator((edge.srcId, Array(new Edge(edge.srcId, edge.dstId, edge.attr)))),
+ (a, b) => a ++ b)
+ case EdgeDirection.Both =>
+ throw new SparkException("collectEdges does not support EdgeDirection.Both. Use " +
+ "EdgeDirection.Either instead.")
+ }
+ }
+
/**
* Join the vertices with an RDD and then apply a function from the
* the vertex and RDD entry to a new vertex value. The input table
@@ -209,6 +245,27 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali
graph.mask(preprocess(graph).subgraph(epred, vpred))
}
+ /**
+ * Picks a random vertex from the graph and returns its ID.
+ */
+ def pickRandomVertex(): VertexId = {
+ val probability = 50.0 / graph.numVertices // fractional, so integer division does not truncate to 0
+ var found = false
+ var retVal: VertexId = null.asInstanceOf[VertexId]
+ while (!found) {
+ val selectedVertices = graph.vertices.flatMap { vidVvals =>
+ if (Random.nextDouble() < probability) { Some(vidVvals._1) }
+ else { None }
+ }
+ if (selectedVertices.count >= 1) {
+ found = true
+ val collectedVertices = selectedVertices.collect()
+ retVal = collectedVertices(Random.nextInt(collectedVertices.size))
+ }
+ }
+ retVal
+ }
+
/**
* Execute a Pregel-like iterative vertex-parallel abstraction. The
* user-defined vertex-program `vprog` is executed in parallel on
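A usage sketch for the new `collectEdges` operator on a small hypothetical graph, assuming a SparkContext `sc` and the usual graphx imports:

    import org.apache.spark.graphx._

    // Build a tiny two-edge graph: 1 -> 2 -> 3 (default vertex/edge value 1).
    val edgeTuples = sc.parallelize(Seq((1L, 2L), (2L, 3L)))
    val graph = Graph.fromEdgeTuples(edgeTuples, 1)

    // For each vertex, gather the edges pointing out of it.
    val outEdges: VertexRDD[Array[Edge[Int]]] = graph.collectEdges(EdgeDirection.Out)
    outEdges.collect().foreach { case (vid, edges) =>
      println(s"vertex $vid has ${edges.length} outgoing edge(s)")
    }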
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
index bc2ad5677f806..6386306c048fc 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala
@@ -42,21 +42,20 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext {
test("collectNeighborIds") {
withSpark { sc =>
- val chain = (0 until 100).map(x => (x, (x+1)%100) )
- val rawEdges = sc.parallelize(chain, 3).map { case (s,d) => (s.toLong, d.toLong) }
- val graph = Graph.fromEdgeTuples(rawEdges, 1.0).cache()
+ val graph = getCycleGraph(sc, 100)
val nbrs = graph.collectNeighborIds(EdgeDirection.Either).cache()
- assert(nbrs.count === chain.size)
+ assert(nbrs.count === 100)
assert(graph.numVertices === nbrs.count)
nbrs.collect.foreach { case (vid, nbrs) => assert(nbrs.size === 2) }
- nbrs.collect.foreach { case (vid, nbrs) =>
- val s = nbrs.toSet
- assert(s.contains((vid + 1) % 100))
- assert(s.contains(if (vid > 0) vid - 1 else 99 ))
+ nbrs.collect.foreach {
+ case (vid, nbrs) =>
+ val s = nbrs.toSet
+ assert(s.contains((vid + 1) % 100))
+ assert(s.contains(if (vid > 0) vid - 1 else 99))
}
}
}
-
+
test ("filter") {
withSpark { sc =>
val n = 5
@@ -80,4 +79,121 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext {
}
}
+ test("collectEdgesCycleDirectionOut") {
+ withSpark { sc =>
+ val graph = getCycleGraph(sc, 100)
+ val edges = graph.collectEdges(EdgeDirection.Out).cache()
+ assert(edges.count == 100)
+ edges.collect.foreach { case (vid, edges) => assert(edges.size == 1) }
+ edges.collect.foreach {
+ case (vid, edges) =>
+ val s = edges.toSet
+ val edgeDstIds = s.map(e => e.dstId)
+ assert(edgeDstIds.contains((vid + 1) % 100))
+ }
+ }
+ }
+
+ test("collectEdgesCycleDirectionIn") {
+ withSpark { sc =>
+ val graph = getCycleGraph(sc, 100)
+ val edges = graph.collectEdges(EdgeDirection.In).cache()
+ assert(edges.count == 100)
+ edges.collect.foreach { case (vid, edges) => assert(edges.size == 1) }
+ edges.collect.foreach {
+ case (vid, edges) =>
+ val s = edges.toSet
+ val edgeSrcIds = s.map(e => e.srcId)
+ assert(edgeSrcIds.contains(if (vid > 0) vid - 1 else 99))
+ }
+ }
+ }
+
+ test("collectEdgesCycleDirectionEither") {
+ withSpark { sc =>
+ val graph = getCycleGraph(sc, 100)
+ val edges = graph.collectEdges(EdgeDirection.Either).cache()
+ assert(edges.count == 100)
+ edges.collect.foreach { case (vid, edges) => assert(edges.size == 2) }
+ edges.collect.foreach {
+ case (vid, edges) =>
+ val s = edges.toSet
+ val edgeIds = s.map(e => if (vid != e.srcId) e.srcId else e.dstId)
+ assert(edgeIds.contains((vid + 1) % 100))
+ assert(edgeIds.contains(if (vid > 0) vid - 1 else 99))
+ }
+ }
+ }
+
+ test("collectEdgesChainDirectionOut") {
+ withSpark { sc =>
+ val graph = getChainGraph(sc, 50)
+ val edges = graph.collectEdges(EdgeDirection.Out).cache()
+ assert(edges.count == 49)
+ edges.collect.foreach { case (vid, edges) => assert(edges.size == 1) }
+ edges.collect.foreach {
+ case (vid, edges) =>
+ val s = edges.toSet
+ val edgeDstIds = s.map(e => e.dstId)
+ assert(edgeDstIds.contains(vid + 1))
+ }
+ }
+ }
+
+ test("collectEdgesChainDirectionIn") {
+ withSpark { sc =>
+ val graph = getChainGraph(sc, 50)
+ val edges = graph.collectEdges(EdgeDirection.In).cache()
+ // We expect only 49 because collectEdges does not return vertices that do
+ // not have any edges in the specified direction.
+ assert(edges.count == 49)
+ edges.collect.foreach { case (vid, edges) => assert(edges.size == 1) }
+ edges.collect.foreach {
+ case (vid, edges) =>
+ val s = edges.toSet
+ val edgeSrcIds = s.map(e => e.srcId)
+ assert(edgeSrcIds.contains(vid - 1))
+ }
+ }
+ }
+
+ test("collectEdgesChainDirectionEither") {
+ withSpark { sc =>
+ val graph = getChainGraph(sc, 50)
+ val edges = graph.collectEdges(EdgeDirection.Either).cache()
+ // We expect all 50 vertices here because, in either direction, every vertex
+ // in the chain has at least one incident edge.
+ assert(edges.count === 50)
+ edges.collect.foreach {
+ case (vid, edges) => if (vid > 0 && vid < 49) assert(edges.size == 2)
+ else assert(edges.size == 1)
+ }
+ edges.collect.foreach {
+ case (vid, edges) =>
+ val s = edges.toSet
+ val edgeIds = s.map(e => if (vid != e.srcId) e.srcId else e.dstId)
+ if (vid == 0) { assert(edgeIds.contains(1)) }
+ else if (vid == 49) { assert(edgeIds.contains(48)) }
+ else {
+ assert(edgeIds.contains(vid + 1))
+ assert(edgeIds.contains(vid - 1))
+ }
+ }
+ }
+ }
+
+ private def getCycleGraph(sc: SparkContext, numVertices: Int): Graph[Double, Int] = {
+ val cycle = (0 until numVertices).map(x => (x, (x + 1) % numVertices))
+ getGraphFromSeq(sc, cycle)
+ }
+
+ private def getChainGraph(sc: SparkContext, numVertices: Int): Graph[Double, Int] = {
+ val chain = (0 until numVertices - 1).map(x => (x, (x + 1)))
+ getGraphFromSeq(sc, chain)
+ }
+
+ private def getGraphFromSeq(sc: SparkContext, seq: IndexedSeq[(Int, Int)]): Graph[Double, Int] = {
+ val rawEdges = sc.parallelize(seq, 3).map { case (s, d) => (s.toLong, d.toLong) }
+ Graph.fromEdgeTuples(rawEdges, 1.0).cache()
+ }
}
diff --git a/pom.xml b/pom.xml
index 3a530685b8e5a..21060ee69c041 100644
--- a/pom.xml
+++ b/pom.xml
@@ -393,9 +393,9 @@
test