Skip to content

Commit 9ea0832

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents d5bf3f5 + 1b46556 commit 9ea0832

File tree

46 files changed

+644
-102
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

46 files changed

+644
-102
lines changed

LICENSE

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
836836
See the License for the specific language governing permissions and
837837
limitations under the License.
838838

839+
========================================================================
840+
For vis.js (core/src/main/resources/org/apache/spark/ui/static/vis.min.js):
841+
========================================================================
842+
Copyright (C) 2010-2015 Almende B.V.
843+
844+
Vis.js is dual licensed under both
845+
846+
* The Apache 2.0 License
847+
http://www.apache.org/licenses/LICENSE-2.0
848+
849+
and
850+
851+
* The MIT License
852+
http://opensource.org/licenses/MIT
853+
854+
Vis.js may be distributed under either license.
839855

840856
========================================================================
841857
BSD-style licenses

R/pkg/R/RDD.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode,
6767
})
6868

6969
setMethod("show", "RDD",
70-
function(.Object) {
71-
cat(paste(callJMethod(.Object@jrdd, "toString"), "\n", sep=""))
70+
function(object) {
71+
cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep=""))
7272
})
7373

7474
setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) {

core/src/main/resources/org/apache/spark/ui/static/timeline-view.css

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ div#application-timeline, div#job-timeline {
1919
margin-bottom: 30px;
2020
}
2121

22-
#application-timeline div.legend-area {
22+
#application-timeline div.legend-area,
23+
#job-timeline div.legend-area {
2324
margin-top: 5px;
2425
}
2526

core/src/main/resources/org/apache/spark/ui/static/timeline-view.js

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,24 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime) {
3939

4040
function setupJobEventAction() {
4141
$(".item.range.job.application-timeline-object").each(function() {
42-
var getJobId = function(baseElem) {
42+
var getSelectorForJobEntry = function(baseElem) {
4343
var jobIdText = $($(baseElem).find(".application-timeline-content")[0]).text();
4444
var jobId = jobIdText.match("\\(Job (\\d+)\\)")[1];
45-
return jobId;
45+
return "#job-" + jobId;
4646
};
4747

4848
$(this).click(function() {
49-
window.location.href = "job/?id=" + getJobId(this);
49+
var jobPagePath = $(getSelectorForJobEntry(this)).find("a").attr("href")
50+
window.location.href = jobPagePath
5051
});
5152

5253
$(this).hover(
5354
function() {
54-
$("#job-" + getJobId(this)).addClass("corresponding-item-hover");
55+
$(getSelectorForJobEntry(this)).addClass("corresponding-item-hover");
5556
$($(this).find("div.application-timeline-content")[0]).tooltip("show");
5657
},
5758
function() {
58-
$("#job-" + getJobId(this)).removeClass("corresponding-item-hover");
59+
$(getSelectorForJobEntry(this)).removeClass("corresponding-item-hover");
5960
$($(this).find("div.application-timeline-content")[0]).tooltip("hide");
6061
}
6162
);
@@ -97,32 +98,24 @@ function drawJobTimeline(groupArray, eventObjArray, startTime) {
9798

9899
function setupStageEventAction() {
99100
$(".item.range.stage.job-timeline-object").each(function() {
100-
var getStageIdAndAttempt = function(baseElem) {
101+
var getSelectorForStageEntry = function(baseElem) {
101102
var stageIdText = $($(baseElem).find(".job-timeline-content")[0]).text();
102103
var stageIdAndAttempt = stageIdText.match("\\(Stage (\\d+\\.\\d+)\\)")[1].split(".");
103-
return stageIdAndAttempt;
104+
return "#stage-" + stageIdAndAttempt[0] + "-" + stageIdAndAttempt[1];
104105
};
105106

106107
$(this).click(function() {
107-
var idAndAttempt = getStageIdAndAttempt(this);
108-
var id = idAndAttempt[0];
109-
var attempt = idAndAttempt[1];
110-
window.location.href = "../../stages/stage/?id=" + id + "&attempt=" + attempt;
108+
var stagePagePath = $(getSelectorForStageEntry(this)).find("a").attr("href")
109+
window.location.href = stagePagePath
111110
});
112111

113112
$(this).hover(
114113
function() {
115-
var idAndAttempt = getStageIdAndAttempt(this);
116-
var id = idAndAttempt[0];
117-
var attempt = idAndAttempt[1];
118-
$("#stage-" + id + "-" + attempt).addClass("corresponding-item-hover");
114+
$(getSelectorForStageEntry(this)).addClass("corresponding-item-hover");
119115
$($(this).find("div.job-timeline-content")[0]).tooltip("show");
120116
},
121117
function() {
122-
var idAndAttempt = getStageIdAndAttempt(this);
123-
var id = idAndAttempt[0];
124-
var attempt = idAndAttempt[1];
125-
$("#stage-" + id + "-" + attempt).removeClass("corresponding-item-hover");
118+
$(getSelectorForStageEntry(this)).removeClass("corresponding-item-hover");
126119
$($(this).find("div.job-timeline-content")[0]).tooltip("hide");
127120
}
128121
);

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,8 +1161,8 @@ abstract class RDD[T: ClassTag](
11611161
*/
11621162
@Experimental
11631163
def countApproxDistinct(p: Int, sp: Int): Long = withScope {
1164-
require(p >= 4, s"p ($p) must be at least 4")
1165-
require(sp <= 32, s"sp ($sp) cannot be greater than 32")
1164+
require(p >= 4, s"p ($p) must be >= 4")
1165+
require(sp <= 32, s"sp ($sp) must be <= 32")
11661166
require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
11671167
val zeroCounter = new HyperLogLogPlus(p, sp)
11681168
aggregate(zeroCounter)(
@@ -1187,8 +1187,9 @@ abstract class RDD[T: ClassTag](
11871187
* It must be greater than 0.000017.
11881188
*/
11891189
def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope {
1190+
require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
11901191
val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
1191-
countApproxDistinct(p, 0)
1192+
countApproxDistinct(if (p < 4) 4 else p, 0)
11921193
}
11931194

11941195
/**

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
8989
val simpleRdd = sc.makeRDD(uniformDistro, 10)
9090
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
9191
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
92+
assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1)
93+
assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22)
9294
}
9395

9496
test("SparkContext.union") {

docs/mllib-dimensionality-reduction.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ statistical method to find a rotation such that the first coordinate has the lar
137137
possible, and each succeeding coordinate in turn has the largest variance possible. The columns of
138138
the rotation matrix are called principal components. PCA is used widely in dimensionality reduction.
139139

140-
MLlib supports PCA for tall-and-skinny matrices stored in row-oriented format.
140+
MLlib supports PCA for tall-and-skinny matrices stored in row-oriented format, as well as for any Vectors.
141141

142142
<div class="codetabs">
143143
<div data-lang="scala" markdown="1">
@@ -157,6 +157,23 @@ val pc: Matrix = mat.computePrincipalComponents(10) // Principal components are
157157
// Project the rows to the linear space spanned by the top 10 principal components.
158158
val projected: RowMatrix = mat.multiply(pc)
159159
{% endhighlight %}
160+
161+
The following code demonstrates how to compute principal components on source vectors
162+
and use them to project the vectors into a low-dimensional space while keeping associated labels:
163+
164+
{% highlight scala %}
165+
import org.apache.spark.mllib.regression.LabeledPoint
166+
import org.apache.spark.mllib.feature.PCA
167+
168+
val data: RDD[LabeledPoint] = ...
169+
170+
// Compute the top 10 principal components.
171+
val pca = new PCA(10).fit(data.map(_.features))
172+
173+
// Project vectors to the linear space spanned by the top 10 principal components, keeping the label
174+
val projected = data.map(p => p.copy(features = pca.transform(p.features)))
175+
{% endhighlight %}
176+
160177
</div>
161178

162179
<div data-lang="java" markdown="1">

docs/mllib-feature-extraction.md

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,6 @@ v_N
507507

508508
This example below demonstrates how to load a simple vectors file, extract a set of vectors, then transform those vectors using a transforming vector value.
509509

510-
511510
<div class="codetabs">
512511
<div data-lang="scala">
513512
{% highlight scala %}
@@ -531,3 +530,57 @@ val transformedData2 = parsedData.map(x => transformer.transform(x))
531530
</div>
532531

533532

533+
## PCA
534+
535+
A feature transformer that projects vectors to a low-dimensional space using PCA.
536+
For details, see [dimensionality reduction](mllib-dimensionality-reduction.html).
537+
538+
### Example
539+
540+
The following code demonstrates how to compute principal components on a `Vector`
541+
and use them to project the vectors into a low-dimensional space while keeping associated labels
542+
for calculating a [Linear Regression](mllib-linear-methods.html)
543+
544+
<div class="codetabs">
545+
<div data-lang="scala">
546+
{% highlight scala %}
547+
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
548+
import org.apache.spark.mllib.regression.LabeledPoint
549+
import org.apache.spark.mllib.linalg.Vectors
550+
import org.apache.spark.mllib.feature.PCA
551+
552+
val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
553+
val parts = line.split(',')
554+
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
555+
}.cache()
556+
557+
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
558+
val training = splits(0).cache()
559+
val test = splits(1)
560+
561+
val pca = new PCA(training.first().features.size/2).fit(data.map(_.features))
562+
val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
563+
val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
564+
565+
val numIterations = 100
566+
val model = LinearRegressionWithSGD.train(training, numIterations)
567+
val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)
568+
569+
val valuesAndPreds = test.map { point =>
570+
val score = model.predict(point.features)
571+
(score, point.label)
572+
}
573+
574+
val valuesAndPreds_pca = test_pca.map { point =>
575+
val score = model_pca.predict(point.features)
576+
(score, point.label)
577+
}
578+
579+
val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
580+
val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean()
581+
582+
println("Mean Squared Error = " + MSE)
583+
println("PCA Mean Squared Error = " + MSE_pca)
584+
{% endhighlight %}
585+
</div>
586+
</div>

docs/streaming-programming-guide.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1915,7 +1915,7 @@ In that case, consider
19151915
[reducing](#reducing-the-processing-time-of-each-batch) the batch processing time.
19161916

19171917
The progress of a Spark Streaming program can also be monitored using the
1918-
[StreamingListener](api/scala/index.html#org.apache.spark.scheduler.StreamingListener) interface,
1918+
[StreamingListener](api/scala/index.html#org.apache.spark.streaming.scheduler.StreamingListener) interface,
19191919
which allows you to get receiver status and processing times. Note that this is a developer API
19201920
and it is likely to be improved upon (i.e., more information reported) in the future.
19211921

examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ object DecisionTreeExample {
112112
.text(s"input path to test dataset. If given, option fracTest is ignored." +
113113
s" default: ${defaultParams.testInput}")
114114
.action((x, c) => c.copy(testInput = x))
115-
opt[String]("<dataFormat>")
115+
opt[String]("dataFormat")
116116
.text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
117117
.action((x, c) => c.copy(dataFormat = x))
118118
arg[String]("<input>")

0 commit comments

Comments
 (0)