Skip to content

Commit 9ea0832

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents d5bf3f5 + 1b46556 commit 9ea0832

File tree

46 files changed

+644
-102
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

46 files changed

+644
-102
lines changed

LICENSE

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
836836
See the License for the specific language governing permissions and
837837
limitations under the License.
838838

839+
========================================================================
840+
For vis.js (core/src/main/resources/org/apache/spark/ui/static/vis.min.js):
841+
========================================================================
842+
Copyright (C) 2010-2015 Almende B.V.
843+
844+
Vis.js is dual licensed under both
845+
846+
* The Apache 2.0 License
847+
http://www.apache.org/licenses/LICENSE-2.0
848+
849+
and
850+
851+
* The MIT License
852+
http://opensource.org/licenses/MIT
853+
854+
Vis.js may be distributed under either license.
839855

840856
========================================================================
841857
BSD-style licenses

R/pkg/R/RDD.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode,
6767
})
6868

6969
setMethod("show", "RDD",
70-
function(.Object) {
71-
cat(paste(callJMethod(.Object@jrdd, "toString"), "\n", sep=""))
70+
function(object) {
71+
cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep=""))
7272
})
7373

7474
setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) {

core/src/main/resources/org/apache/spark/ui/static/timeline-view.css

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ div#application-timeline, div#job-timeline {
1919
margin-bottom: 30px;
2020
}
2121

22-
#application-timeline div.legend-area {
22+
#application-timeline div.legend-area,
23+
#job-timeline div.legend-area {
2324
margin-top: 5px;
2425
}
2526

core/src/main/resources/org/apache/spark/ui/static/timeline-view.js

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,24 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime) {
3939

4040
function setupJobEventAction() {
4141
$(".item.range.job.application-timeline-object").each(function() {
42-
var getJobId = function(baseElem) {
42+
var getSelectorForJobEntry = function(baseElem) {
4343
var jobIdText = $($(baseElem).find(".application-timeline-content")[0]).text();
4444
var jobId = jobIdText.match("\\(Job (\\d+)\\)")[1];
45-
return jobId;
45+
return "#job-" + jobId;
4646
};
4747

4848
$(this).click(function() {
49-
window.location.href = "job/?id=" + getJobId(this);
49+
var jobPagePath = $(getSelectorForJobEntry(this)).find("a").attr("href")
50+
window.location.href = jobPagePath
5051
});
5152

5253
$(this).hover(
5354
function() {
54-
$("#job-" + getJobId(this)).addClass("corresponding-item-hover");
55+
$(getSelectorForJobEntry(this)).addClass("corresponding-item-hover");
5556
$($(this).find("div.application-timeline-content")[0]).tooltip("show");
5657
},
5758
function() {
58-
$("#job-" + getJobId(this)).removeClass("corresponding-item-hover");
59+
$(getSelectorForJobEntry(this)).removeClass("corresponding-item-hover");
5960
$($(this).find("div.application-timeline-content")[0]).tooltip("hide");
6061
}
6162
);
@@ -97,32 +98,24 @@ function drawJobTimeline(groupArray, eventObjArray, startTime) {
9798

9899
function setupStageEventAction() {
99100
$(".item.range.stage.job-timeline-object").each(function() {
100-
var getStageIdAndAttempt = function(baseElem) {
101+
var getSelectorForStageEntry = function(baseElem) {
101102
var stageIdText = $($(baseElem).find(".job-timeline-content")[0]).text();
102103
var stageIdAndAttempt = stageIdText.match("\\(Stage (\\d+\\.\\d+)\\)")[1].split(".");
103-
return stageIdAndAttempt;
104+
return "#stage-" + stageIdAndAttempt[0] + "-" + stageIdAndAttempt[1];
104105
};
105106

106107
$(this).click(function() {
107-
var idAndAttempt = getStageIdAndAttempt(this);
108-
var id = idAndAttempt[0];
109-
var attempt = idAndAttempt[1];
110-
window.location.href = "../../stages/stage/?id=" + id + "&attempt=" + attempt;
108+
var stagePagePath = $(getSelectorForStageEntry(this)).find("a").attr("href")
109+
window.location.href = stagePagePath
111110
});
112111

113112
$(this).hover(
114113
function() {
115-
var idAndAttempt = getStageIdAndAttempt(this);
116-
var id = idAndAttempt[0];
117-
var attempt = idAndAttempt[1];
118-
$("#stage-" + id + "-" + attempt).addClass("corresponding-item-hover");
114+
$(getSelectorForStageEntry(this)).addClass("corresponding-item-hover");
119115
$($(this).find("div.job-timeline-content")[0]).tooltip("show");
120116
},
121117
function() {
122-
var idAndAttempt = getStageIdAndAttempt(this);
123-
var id = idAndAttempt[0];
124-
var attempt = idAndAttempt[1];
125-
$("#stage-" + id + "-" + attempt).removeClass("corresponding-item-hover");
118+
$(getSelectorForStageEntry(this)).removeClass("corresponding-item-hover");
126119
$($(this).find("div.job-timeline-content")[0]).tooltip("hide");
127120
}
128121
);

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,8 +1161,8 @@ abstract class RDD[T: ClassTag](
11611161
*/
11621162
@Experimental
11631163
def countApproxDistinct(p: Int, sp: Int): Long = withScope {
1164-
require(p >= 4, s"p ($p) must be at least 4")
1165-
require(sp <= 32, s"sp ($sp) cannot be greater than 32")
1164+
require(p >= 4, s"p ($p) must be >= 4")
1165+
require(sp <= 32, s"sp ($sp) must be <= 32")
11661166
require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
11671167
val zeroCounter = new HyperLogLogPlus(p, sp)
11681168
aggregate(zeroCounter)(
@@ -1187,8 +1187,9 @@ abstract class RDD[T: ClassTag](
11871187
* It must be greater than 0.000017.
11881188
*/
11891189
def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope {
1190+
require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
11901191
val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
1191-
countApproxDistinct(p, 0)
1192+
countApproxDistinct(if (p < 4) 4 else p, 0)
11921193
}
11931194

11941195
/**

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
8989
val simpleRdd = sc.makeRDD(uniformDistro, 10)
9090
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
9191
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
92+
assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1)
93+
assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22)
9294
}
9395

9496
test("SparkContext.union") {

docs/mllib-dimensionality-reduction.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ statistical method to find a rotation such that the first coordinate has the lar
137137
possible, and each succeeding coordinate in turn has the largest variance possible. The columns of
138138
the rotation matrix are called principal components. PCA is used widely in dimensionality reduction.
139139

140-
MLlib supports PCA for tall-and-skinny matrices stored in row-oriented format.
140+
MLlib supports PCA for tall-and-skinny matrices stored in row-oriented format, as well as for any Vectors.
141141

142142
<div class="codetabs">
143143
<div data-lang="scala" markdown="1">
@@ -157,6 +157,23 @@ val pc: Matrix = mat.computePrincipalComponents(10) // Principal components are
157157
// Project the rows to the linear space spanned by the top 10 principal components.
158158
val projected: RowMatrix = mat.multiply(pc)
159159
{% endhighlight %}
160+
161+
The following code demonstrates how to compute principal components on source vectors
162+
and use them to project the vectors into a low-dimensional space while keeping associated labels:
163+
164+
{% highlight scala %}
165+
import org.apache.spark.mllib.regression.LabeledPoint
166+
import org.apache.spark.mllib.feature.PCA
167+
168+
val data: RDD[LabeledPoint] = ...
169+
170+
// Compute the top 10 principal components.
171+
val pca = new PCA(10).fit(data.map(_.features))
172+
173+
// Project vectors to the linear space spanned by the top 10 principal components, keeping the label
174+
val projected = data.map(p => p.copy(features = pca.transform(p.features)))
175+
{% endhighlight %}
176+
160177
</div>
161178

162179
<div data-lang="java" markdown="1">

docs/mllib-feature-extraction.md

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,6 @@ v_N
507507

508508
This example below demonstrates how to load a simple vectors file, extract a set of vectors, then transform those vectors using a transforming vector value.
509509

510-
511510
<div class="codetabs">
512511
<div data-lang="scala">
513512
{% highlight scala %}
@@ -531,3 +530,57 @@ val transformedData2 = parsedData.map(x => transformer.transform(x))
531530
</div>
532531

533532

533+
## PCA
534+
535+
A feature transformer that projects vectors to a low-dimensional space using PCA.
536+
For details, see [dimensionality reduction](mllib-dimensionality-reduction.html).
537+
538+
### Example
539+
540+
The following code demonstrates how to compute principal components on a `Vector`
541+
and use them to project the vectors into a low-dimensional space while keeping associated labels
542+
for calculating a [Linear Regression](mllib-linear-methods.html)
543+
544+
<div class="codetabs">
545+
<div data-lang="scala">
546+
{% highlight scala %}
547+
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
548+
import org.apache.spark.mllib.regression.LabeledPoint
549+
import org.apache.spark.mllib.linalg.Vectors
550+
import org.apache.spark.mllib.feature.PCA
551+
552+
val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
553+
val parts = line.split(',')
554+
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
555+
}.cache()
556+
557+
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
558+
val training = splits(0).cache()
559+
val test = splits(1)
560+
561+
val pca = new PCA(training.first().features.size/2).fit(data.map(_.features))
562+
val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
563+
val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
564+
565+
val numIterations = 100
566+
val model = LinearRegressionWithSGD.train(training, numIterations)
567+
val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)
568+
569+
val valuesAndPreds = test.map { point =>
570+
val score = model.predict(point.features)
571+
(score, point.label)
572+
}
573+
574+
val valuesAndPreds_pca = test_pca.map { point =>
575+
val score = model_pca.predict(point.features)
576+
(score, point.label)
577+
}
578+
579+
val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
580+
val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean()
581+
582+
println("Mean Squared Error = " + MSE)
583+
println("PCA Mean Squared Error = " + MSE_pca)
584+
{% endhighlight %}
585+
</div>
586+
</div>

docs/streaming-programming-guide.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1915,7 +1915,7 @@ In that case, consider
19151915
[reducing](#reducing-the-processing-time-of-each-batch) the batch processing time.
19161916

19171917
The progress of a Spark Streaming program can also be monitored using the
1918-
[StreamingListener](api/scala/index.html#org.apache.spark.scheduler.StreamingListener) interface,
1918+
[StreamingListener](api/scala/index.html#org.apache.spark.streaming.scheduler.StreamingListener) interface,
19191919
which allows you to get receiver status and processing times. Note that this is a developer API
19201920
and it is likely to be improved upon (i.e., more information reported) in the future.
19211921

examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ object DecisionTreeExample {
112112
.text(s"input path to test dataset. If given, option fracTest is ignored." +
113113
s" default: ${defaultParams.testInput}")
114114
.action((x, c) => c.copy(testInput = x))
115-
opt[String]("<dataFormat>")
115+
opt[String]("dataFormat")
116116
.text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
117117
.action((x, c) => c.copy(dataFormat = x))
118118
arg[String]("<input>")

0 commit comments

Comments
 (0)