Skip to content

Commit 195d06f

Browse files
committed
add Java example for summary stats and minor fix
1 parent 9f1ff89 commit 195d06f

File tree

1 file changed

+27
-2
lines changed

1 file changed

+27
-2
lines changed

docs/mllib-basics.md

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ which could be faster if the rows are sparse.
333333
<div class="codetabs">
334334
<div data-lang="scala" markdown="1">
335335

336-
`RowMatrix#computeColumnSummaryStatistics` returns an instance of
336+
[`RowMatrix#computeColumnSummaryStatistics`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) returns an instance of
337337
[`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary),
338338
which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the
339339
total count.
@@ -355,6 +355,31 @@ println(summary.numNonzeros) // number of nonzeros in each column
355355
val cov: Matrix = mat.computeCovariance()
356356
{% endhighlight %}
357357
</div>
358+
359+
<div data-lang="java" markdown="1">
360+
361+
[`RowMatrix#computeColumnSummaryStatistics`](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html#computeColumnSummaryStatistics()) returns an instance of
362+
[`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html),
363+
which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the
364+
total count.
365+
366+
{% highlight java %}
367+
import org.apache.spark.mllib.linalg.Matrix;
368+
import org.apache.spark.mllib.linalg.distributed.RowMatrix;
369+
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
370+
371+
RowMatrix mat = ... // a RowMatrix
372+
373+
// Compute column summary statistics.
374+
MultivariateStatisticalSummary summary = mat.computeColumnSummaryStatistics();
375+
System.out.println(summary.mean()); // a dense vector containing the mean value for each column
376+
System.out.println(summary.variance()); // column-wise variance
377+
System.out.println(summary.numNonzeros()); // number of nonzeros in each column
378+
379+
// Compute the covariance matrix.
380+
Matrix cov = mat.computeCovariance();
381+
{% endhighlight %}
382+
</div>
358383
</div>
359384

360385
### IndexedRowMatrix
@@ -467,7 +492,7 @@ import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
467492

468493
JavaRDD<MatrixEntry> entries = ... // a JavaRDD of matrix entries
469494
// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>; the constructor takes the underlying RDD, hence rdd().
470-
CoordinateMatrix mat = new CoordinateMatrix(entries);
495+
CoordinateMatrix mat = new CoordinateMatrix(entries.rdd());
471496

472497
// Get its size.
473498
long m = mat.numRows();

0 commit comments

Comments
 (0)