Closed

Changes from all commits (28 commits)
4e96c01 Add YARN/Stable compiled classes to the CLASSPATH. (berngp, Apr 15, 2014)
1342886 The `spark-class` shell now ignores non jar files in the assembly dir… (berngp, Apr 15, 2014)
ddf2547 The `spark-shell` option `--log-conf` also enables the SPARK_PRINT_LA… (berngp, Apr 15, 2014)
2204539 Root is now Spark and qualify the assembly if it was built with YARN. (berngp, Apr 15, 2014)
889bf4e Upgrade the Maven Build to YARN 2.3.0. (berngp, Apr 16, 2014)
460510a merge https://github.com/berngp/spark/commits/feature/small-shell-cha… (witgo, Apr 29, 2014)
f1c7535 Improved build configuration Ⅱ (witgo, Apr 29, 2014)
8540e83 review commit (witgo, Apr 30, 2014)
c4c6e45 review commit (witgo, Apr 30, 2014)
9f08e80 Merge branch 'master' of https://github.com/apache/spark into improve… (witgo, May 1, 2014)
e1a7e00 improve travis tests coverage (witgo, May 1, 2014)
effe79c missing "," (witgo, May 1, 2014)
9ea1af9 add the dependency of commons-lang (witgo, May 1, 2014)
0ed124d SPARK-1693: Most of the tests throw a java.lang.SecurityException whe… (witgo, May 1, 2014)
03b136f revert .travis.yml (witgo, May 1, 2014)
d3488c6 Add the missing yarn dependencies (witgo, May 1, 2014)
779ae5d Fix SPARK-1693: Dependent on multiple versions of servlet-api jars le… (witgo, May 1, 2014)
27bd426 review commit (witgo, May 1, 2014)
54a86b0 review commit (witgo, May 2, 2014)
882e35d review commit (witgo, May 2, 2014)
31451df Compile hive optional (witgo, May 3, 2014)
d9a31db SPARK-1699: Python relative should be independence from the core, bec… (witgo, May 3, 2014)
5fb961f revert exclusion org.eclipse.jetty.orbit:javax.servlet (witgo, May 3, 2014)
ea53549 Merge branch 'master' of https://github.com/apache/spark into improve… (witgo, May 3, 2014)
a5ff7d1 revert exclusion org.eclipse.jetty.orbit:javax.servlet (witgo, May 3, 2014)
bb4c435 merge master (witgo, May 3, 2014)
bd6cca1 bugfix (witgo, May 3, 2014)
1cd8cfa add licenses (witgo, May 3, 2014)
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ conf/spark-env.sh
conf/streaming-env.sh
conf/log4j.properties
conf/spark-defaults.conf
conf/*.xml
docs/_site
docs/api
target/
20 changes: 15 additions & 5 deletions assembly/pom.xml
@@ -84,11 +84,6 @@
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.py4j</groupId>
<artifactId>py4j</artifactId>
<version>0.8.1</version>
</dependency>
</dependencies>

<build>
@@ -173,6 +168,21 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>python</id>
<dependencies>
<dependency>
<groupId>net.sf.py4j</groupId>
<artifactId>py4j</artifactId>
<version>0.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>python-api_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>spark-ganglia-lgpl</id>
<dependencies>
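With this change, py4j and the new `python-api` module are pulled into the assembly only when the `python` profile is active. As a rough illustration (the profile id and version flags mirror this diff, but the exact combination is an assumption), an assembly build that includes the Python API would look like:

$ mvn -Ppython -Pyarn -Dhadoop.version=2.3.0 -Dyarn.version=2.3.0 -DskipTests clean package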
1 change: 1 addition & 0 deletions bin/compute-classpath.sh
@@ -44,6 +44,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
4 changes: 2 additions & 2 deletions bin/spark-class
@@ -110,8 +110,8 @@ export JAVA_OPTS

if [ ! -f "$FWDIR/RELEASE" ]; then
# Exit if the user hasn't compiled Spark
num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar" | wc -l)
jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep -E "spark-assembly.*hadoop.*.jar$" | wc -l)
jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep -E "spark-assembly.*hadoop.*.jar$")
if [ "$num_jars" -eq "0" ]; then
echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
echo "You need to build Spark with 'sbt/sbt assembly' before running this program." >&2
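The `$`-anchored `grep -E` pattern makes `spark-class` ignore anything in the assembly directory that does not end in `.jar` (checksums, backups, and similar leftovers). A quick sanity check of the new pattern with hypothetical file names:

$ printf 'spark-assembly-1.0.0-hadoop2.3.0.jar\nspark-assembly-1.0.0-hadoop2.3.0.jar.sha1\n' | grep -E "spark-assembly.*hadoop.*.jar$"
spark-assembly-1.0.0-hadoop2.3.0.jar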
71 changes: 0 additions & 71 deletions core/pom.xml
@@ -254,35 +254,6 @@
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
<phase>test</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<exportAntProperties>true</exportAntProperties>
<target>
<property name="spark.classpath" refid="maven.test.classpath" />
<property environment="env" />
<fail message="Please set the SCALA_HOME (or SCALA_LIBRARY_PATH if scala is on the path) environment variables and retry.">
<condition>
<not>
<or>
<isset property="env.SCALA_HOME" />
<isset property="env.SCALA_LIBRARY_PATH" />
</or>
</not>
</condition>
</fail>
</target>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
@@ -294,48 +265,6 @@
</environmentVariables>
</configuration>
</plugin>
<!-- Unzip py4j so we can include its files in the jar -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.2.1</version>
<executions>
<execution>
<phase>generate-resources</phase>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
<configuration>
<executable>unzip</executable>
<workingDirectory>../python</workingDirectory>
<arguments>
<argument>-o</argument>
<argument>lib/py4j*.zip</argument>
<argument>-d</argument>
<argument>build</argument>
</arguments>
</configuration>
</plugin>
</plugins>

<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
<resource>
<directory>../python</directory>
<includes>
<include>pyspark/*.py</include>
</includes>
</resource>
<resource>
<directory>../python/build</directory>
<includes>
<include>py4j/*.py</include>
</includes>
</resource>
</resources>
</build>
</project>
22 changes: 2 additions & 20 deletions core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -26,7 +26,6 @@ import akka.actor._
import com.google.common.collect.MapMaker

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.python.PythonWorkerFactory
import org.apache.spark.broadcast.BroadcastManager
import org.apache.spark.metrics.MetricsSystem
import org.apache.spark.network.ConnectionManager
@@ -67,15 +66,14 @@ class SparkEnv (
// A mapping of thread ID to amount of memory used for shuffle in bytes
// All accesses should be manually synchronized
val shuffleMemoryMap = mutable.HashMap[Long, Long]()

private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]()
val closeables = mutable.ListBuffer[java.io.Closeable]()

// A general, soft-reference map for metadata needed during HadoopRDD split computation
// (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats).
private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]()

private[spark] def stop() {
pythonWorkers.foreach { case(key, worker) => worker.stop() }
closeables.toList.foreach(_.close())
httpFileServer.stop()
mapOutputTracker.stop()
shuffleFetcher.stop()
@@ -89,22 +87,6 @@
// UPDATE: In Akka 2.1.x, this hangs if there are remote actors, so we can't call it.
// actorSystem.awaitTermination()
}

private[spark]
def createPythonWorker(pythonExec: String, envVars: Map[String, String]): java.net.Socket = {
synchronized {
val key = (pythonExec, envVars)
pythonWorkers.getOrElseUpdate(key, new PythonWorkerFactory(pythonExec, envVars)).create()
}
}

private[spark]
def destroyPythonWorker(pythonExec: String, envVars: Map[String, String]) {
synchronized {
val key = (pythonExec, envVars)
pythonWorkers(key).stop()
}
}
}

object SparkEnv extends Logging {
9 changes: 6 additions & 3 deletions docs/building-with-maven.md
@@ -45,17 +45,20 @@ For Apache Hadoop versions 1.x, Cloudera CDH MRv1, and other Hadoop versions wit
For Apache Hadoop 2.x, 0.23.x, Cloudera CDH MRv2, and other Hadoop versions with YARN, you can enable the "yarn-alpha" or "yarn" profile and set the "hadoop.version", "yarn.version" property. Note that Hadoop 0.23.X requires a special `-Phadoop-0.23` profile:

# Apache Hadoop 2.0.5-alpha
$ mvn -Pyarn-alpha -Dhadoop.version=2.0.5-alpha -Dyarn.version=2.0.5-alpha -DskipTests clean package
$ mvn -Pyarn-alpha -Dhadoop.version=2.0.5-alpha -DskipTests clean package

# Cloudera CDH 4.2.0 with MapReduce v2
$ mvn -Pyarn-alpha -Dhadoop.version=2.0.0-cdh4.2.0 -Dyarn.version=2.0.0-cdh4.2.0 -DskipTests clean package
$ mvn -Pyarn-alpha -Dhadoop.version=2.0.0-cdh4.2.0 -DskipTests clean package

# Apache Hadoop 2.2.X (e.g. 2.2.0 as below) and newer
$ mvn -Pyarn -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -DskipTests clean package
$ mvn -Pyarn -Dhadoop.version=2.2.0 -DskipTests clean package

# Apache Hadoop 0.23.x
$ mvn -Pyarn-alpha -Phadoop-0.23 -Dhadoop.version=0.23.7 -Dyarn.version=0.23.7 -DskipTests clean package

# Different versions of HDFS and YARN.
$ mvn -Pyarn-alpha -Dhadoop.version=2.3.0 -Dyarn.version=0.23.7 -DskipTests clean package

## Spark Tests in Maven ##

Tests are run by default via the [ScalaTest Maven plugin](http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin). Some of them require Spark to be packaged first, so always run `mvn package` with `-DskipTests` the first time. You can then run the tests with `mvn -Dhadoop.version=... test`.
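Taken together with the updated examples above, the package-then-test sequence described in this section would look roughly like the following (the Hadoop version here is only illustrative):

$ mvn -Pyarn -Dhadoop.version=2.2.0 -DskipTests clean package
$ mvn -Pyarn -Dhadoop.version=2.2.0 test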
40 changes: 29 additions & 11 deletions make-distribution.sh
@@ -55,6 +55,7 @@ SPARK_HADOOP_VERSION=1.0.4
SPARK_YARN=false
SPARK_HIVE=false
SPARK_TACHYON=false
SPARK_PYTHON=true
MAKE_TGZ=false
NAME=none

@@ -105,6 +106,12 @@ else
echo "YARN disabled"
fi

if [ "$SPARK_PYTHON" == "true" ]; then
echo "Python enabled"
else
echo "Python disabled"
fi

if [ "$SPARK_TACHYON" == "true" ]; then
echo "Tachyon Enabled"
else
@@ -122,22 +129,31 @@
MAYBE_HIVE=""
fi

if [[ "$SPARK_HADOOP_VERSION" =~ "0.23." ]]; then
MAYBE_HADOOP023="-Phadoop-0.23"
else
MAYBE_HADOOP023=""
fi

if [ "$SPARK_YARN" == "true" ]; then
if [[ "$SPARK_HADOOP_VERSION" =~ "0.23." ]]; then
mvn clean package -DskipTests -Pyarn-alpha -Dhadoop.version=$SPARK_HADOOP_VERSION \
-Dyarn.version=$SPARK_HADOOP_VERSION $MAYBE_HIVE -Phadoop-0.23
if [[ "$SPARK_HADOOP_VERSION" =~ "0.23." || "$SPARK_HADOOP_VERSION" =~ "2.0." ]]; then
MAYBE_YARN="-Pyarn-alpha -Dyarn.version=$SPARK_HADOOP_VERSION"
else
mvn clean package -DskipTests -Pyarn -Dhadoop.version=$SPARK_HADOOP_VERSION \
-Dyarn.version=$SPARK_HADOOP_VERSION $MAYBE_HIVE
MAYBE_YARN="-Pyarn -Dyarn.version=$SPARK_HADOOP_VERSION"
fi
else
if [[ "$SPARK_HADOOP_VERSION" =~ "0.23." ]]; then
mvn clean package -Phadoop-0.23 -DskipTests -Dhadoop.version=$SPARK_HADOOP_VERSION $MAYBE_HIVE
else
mvn clean package -DskipTests -Dhadoop.version=$SPARK_HADOOP_VERSION $MAYBE_HIVE
fi
MAYBE_YARN=""
fi

if [ "$SPARK_PYTHON" == "true" ]; then
MAYBE_PYTHON="-Ppython"
else
MAYBE_PYTHON=""
fi

mvn package -Dhadoop.version=$SPARK_HADOOP_VERSION \
-DskipTests $MAYBE_HIVE $MAYBE_HADOOP023 $MAYBE_YARN $MAYBE_PYTHON

# Make directories
rm -rf "$DISTDIR"
mkdir -p "$DISTDIR/lib"
@@ -152,9 +168,11 @@ mkdir "$DISTDIR"/conf
cp "$FWDIR"/conf/*.template "$DISTDIR"/conf
cp "$FWDIR"/conf/slaves "$DISTDIR"/conf
cp -r "$FWDIR/bin" "$DISTDIR"
cp -r "$FWDIR/python" "$DISTDIR"
cp -r "$FWDIR/sbin" "$DISTDIR"

if [ "$SPARK_PYTHON" == "true" ]; then
cp -r "$FWDIR/python" "$DISTDIR"
fi

# Download and copy in tachyon, if requested
if [ "$SPARK_TACHYON" == "true" ]; then
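The script now composes a single `mvn package` command from the optional flags instead of branching into separate invocations. For a YARN, Hive, and Python build against Hadoop 2.3.0, the composed command would expand to roughly the line below (`-Phive` is assumed for `$MAYBE_HIVE`, since its assignment is outside this hunk):

$ mvn package -Dhadoop.version=2.3.0 -DskipTests -Phive -Pyarn -Dyarn.version=2.3.0 -Ppython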
mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -23,7 +23,6 @@ import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.api.python.PythonMLLibAPI


/**
@@ -68,20 +67,6 @@ class MatrixFactorizationModel(
}
}

/**
* :: DeveloperApi ::
* Predict the rating of many users for many products.
* This is a Java stub for python predictAll()
*
* @param usersProductsJRDD A JavaRDD with serialized tuples (user, product)
* @return JavaRDD of serialized Rating objects.
*/
def predict(usersProductsJRDD: JavaRDD[Array[Byte]]): JavaRDD[Array[Byte]] = {
val pythonAPI = new PythonMLLibAPI()
val usersProducts = usersProductsJRDD.rdd.map(xBytes => pythonAPI.unpackTuple(xBytes))
predict(usersProducts).map(rate => pythonAPI.serializeRating(rate))
}

// TODO: Figure out what other good bulk prediction methods would look like.
// Probably want a way to get the top users for a product or vice-versa.
}