@@ -26,36 +26,32 @@ import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult}
2626import org .apache .spark .rdd .RDD
2727
2828/**
29+ * :: Experimental ::
2930 * API for statistical functions in MLlib.
3031 */
3132@ Experimental
3233object Statistics {
3334
3435 /**
35- * :: Experimental ::
3636 * Computes column-wise summary statistics for the input RDD[Vector].
3737 *
3838 * @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
3939 * @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics,
 * as produced by calling `computeColumnSummaryStatistics()` on a `RowMatrix` built from X.
4040 */
41- @ Experimental
4241 def colStats (X : RDD [Vector ]): MultivariateStatisticalSummary = {
4342 new RowMatrix (X ).computeColumnSummaryStatistics()
4443 }
4544
4645 /**
47- * :: Experimental ::
4846 * Compute the Pearson correlation matrix for the input RDD of Vectors.
4947 * Columns with 0 covariance produce NaN entries in the correlation matrix.
 * Delegates to `Correlations.corrMatrix(X)`; use `corr(X, method)` to select another method.
5048 *
5149 * @param X an RDD[Vector] for which the correlation matrix is to be computed.
5250 * @return Pearson correlation matrix comparing columns in X.
5351 */
54- @ Experimental
5552 def corr (X : RDD [Vector ]): Matrix = Correlations .corrMatrix(X )
5653
5754 /**
58- * :: Experimental ::
5955 * Compute the correlation matrix for the input RDD of Vectors using the specified method.
6056 * Methods currently supported: `pearson` (default), `spearman`.
 * Delegates to `Correlations.corrMatrix(X, method)`.
6157 *
@@ -69,11 +65,9 @@ object Statistics {
6965 * Supported: `pearson` (default), `spearman`
7066 * @return Correlation matrix comparing columns in X.
7167 */
72- @ Experimental
7368 def corr (X : RDD [Vector ], method : String ): Matrix = Correlations .corrMatrix(X , method)
7469
7570 /**
76- * :: Experimental ::
7771 * Compute the Pearson correlation for the input RDDs.
7872 * Returns NaN if either vector has 0 variance.
 * Delegates to `Correlations.corr(x, y)`.
7973 *
@@ -84,11 +78,9 @@ object Statistics {
8478 * @param y RDD[Double] of the same cardinality as x.
8579 * @return A Double containing the Pearson correlation between the two input RDD[Double]s.
8680 */
87- @ Experimental
8881 def corr (x : RDD [Double ], y : RDD [Double ]): Double = Correlations .corr(x, y)
8982
9083 /**
91- * :: Experimental ::
9284 * Compute the correlation for the input RDDs using the specified method.
9385 * Methods currently supported: `pearson` (default), `spearman`.
 * Delegates to `Correlations.corr(x, y, method)`.
9486 *
@@ -99,14 +91,12 @@ object Statistics {
9991 * @param y RDD[Double] of the same cardinality as x.
10092 * @param method String specifying the method to use for computing correlation.
10193 * Supported: `pearson` (default), `spearman`
102- *@return A Double containing the correlation between the two input RDD[Double]s using the
94+ * @return A Double containing the correlation between the two input RDD[Double]s using the
10395 * specified method.
10496 */
105- @ Experimental
10697 def corr (x : RDD [Double ], y : RDD [Double ], method : String ): Double = Correlations .corr(x, y, method)
10798
10899 /**
109- * :: Experimental ::
110100 * Conduct Pearson's chi-squared goodness of fit test of the observed data against the
111101 * expected distribution.
112102 *
@@ -120,13 +110,11 @@ object Statistics {
120110 * @return [[ChiSqTestResult]] object containing the test statistic, degrees of freedom, p-value,
121111 * the method used, and the null hypothesis.
122112 */
123- @ Experimental
124113 def chiSqTest (observed : Vector , expected : Vector ): ChiSqTestResult = {
125114 ChiSqTest .chiSquared(observed, expected)
126115 }
127116
128117 /**
129- * :: Experimental ::
130118 * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
131119 * distribution, with each category having an expected frequency of `1 / observed.size`.
132120 *
@@ -136,23 +124,19 @@ object Statistics {
136124 * @return [[ChiSqTestResult]] object containing the test statistic, degrees of freedom, p-value,
137125 * the method used, and the null hypothesis.
138126 */
139- @ Experimental
140127 def chiSqTest (observed : Vector ): ChiSqTestResult = ChiSqTest .chiSquared(observed)
141128
142129 /**
143- * :: Experimental ::
144130 * Conduct Pearson's independence test on the input contingency matrix, which cannot contain
145131 * negative entries or columns or rows that sum up to 0.
146132 *
147133 * @param observed The contingency matrix (containing either counts or relative frequencies).
148134 * @return [[ChiSqTestResult]] object containing the test statistic, degrees of freedom, p-value,
149135 * the method used, and the null hypothesis.
150136 */
151- @ Experimental
152137 def chiSqTest (observed : Matrix ): ChiSqTestResult = ChiSqTest .chiSquaredMatrix(observed)
153138
154139 /**
155- * :: Experimental ::
156140 * Conduct Pearson's independence test for every feature against the label across the input RDD.
157141 * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
158142 * the chi-squared statistic is computed. All label and feature values must be categorical.
@@ -162,7 +146,6 @@ object Statistics {
162146 * @return an array containing the [[ChiSqTestResult]] for every feature against the label.
163147 * The order of the elements in the returned array reflects the order of input features.
164148 */
165- @ Experimental
166149 def chiSqTest (data : RDD [LabeledPoint ]): Array [ChiSqTestResult ] = {
167150 ChiSqTest .chiSquaredFeatures(data)
168151 }
0 commit comments