From aa2dcb805d1e498a182b5fbdc500b299dd722efb Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Fri, 25 Nov 2016 19:15:48 +0900
Subject: [PATCH 01/17] Fix errors (first round)
---
.../apache/spark/ml/feature/Bucketizer.scala | 2 +-
.../spark/ml/feature/CountVectorizer.scala | 4 +-
.../apache/spark/ml/feature/HashingTF.scala | 2 +-
.../org/apache/spark/ml/feature/NGram.scala | 2 +-
.../apache/spark/ml/feature/Normalizer.scala | 2 +-
.../spark/ml/feature/OneHotEncoder.scala | 4 +-
.../org/apache/spark/ml/feature/PCA.scala | 4 +-
.../ml/feature/PolynomialExpansion.scala | 2 +-
.../ml/feature/QuantileDiscretizer.scala | 6 +--
.../spark/ml/feature/SQLTransformer.scala | 2 +-
.../spark/ml/feature/StopWordsRemover.scala | 2 +-
.../spark/ml/feature/StringIndexer.scala | 8 ++--
.../apache/spark/ml/feature/Tokenizer.scala | 2 +-
.../spark/ml/feature/VectorIndexer.scala | 8 ++--
.../spark/ml/feature/VectorSlicer.scala | 4 +-
.../apache/spark/ml/feature/package-info.java | 4 +-
.../ml/regression/AFTSurvivalRegression.scala | 2 +-
.../ml/regression/DecisionTreeRegressor.scala | 3 +-
.../spark/ml/regression/GBTRegressor.scala | 2 +-
.../GeneralizedLinearRegression.scala | 22 +++++-----
.../ml/regression/IsotonicRegression.scala | 4 +-
.../ml/regression/LinearRegression.scala | 30 +++++++-------
.../ml/regression/RandomForestRegressor.scala | 2 +-
.../apache/spark/ml/util/MetadataUtils.scala | 2 +-
.../org/apache/spark/ml/util/ReadWrite.scala | 6 +--
.../apache/spark/mllib/clustering/LDA.scala | 4 +-
.../spark/mllib/clustering/LDAModel.scala | 2 +-
.../spark/mllib/clustering/LDAOptimizer.scala | 4 +-
.../tree/configuration/BoostingStrategy.scala | 10 ++---
.../mllib/tree/configuration/Strategy.scala | 8 ++--
.../scala/org/apache/spark/sql/Column.scala | 40 +++++++++----------
.../spark/sql/DataFrameNaFunctions.scala | 36 ++++++++---------
.../org/apache/spark/sql/SQLContext.scala | 6 +--
33 files changed, 121 insertions(+), 120 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 1143f0f565ebd..546643d8f91f7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -44,7 +44,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
/**
* Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
* A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
- * also includes y. Splits should be of length >= 3 and strictly increasing.
+ * also includes y. Splits should be of length &gt;= 3 and strictly increasing.
* Values at -inf, inf must be explicitly provided to cover all Double values;
* otherwise, values outside the splits specified will be treated as errors.
*
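For context, a minimal usage sketch of the splits parameter documented above; the DataFrame df and the column names are illustrative assumptions:

  import org.apache.spark.ml.feature.Bucketizer

  // -inf and +inf endpoints cover all Double values, as the doc requires
  val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
  val bucketizer = new Bucketizer()
    .setInputCol("features")           // assumed Double input column
    .setOutputCol("bucketedFeatures")
    .setSplits(splits)                 // n+1 splits give n buckets
  val bucketed = bucketizer.transform(df)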
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 6299f74a6bf96..cc0924fd95b08 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -53,7 +53,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/**
* Specifies the minimum number of different documents a term must appear in to be included
* in the vocabulary.
- * If this is an integer >= 1, this specifies the number of documents the term must appear in;
+ * If this is an integer &gt;= 1, this specifies the number of documents the term must appear in;
* if this is a double in [0,1), then this specifies the fraction of documents.
*
* Default: 1.0
@@ -78,7 +78,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/**
* Filter to ignore rare words in a document. For each document, terms with
* frequency/count less than the given threshold are ignored.
- * If this is an integer >= 1, then this specifies a count (of times the term must appear
+ * If this is an integer &gt;= 1, then this specifies a count (of times the term must appear
* in the document);
* if this is a double in [0,1), then this specifies a fraction (out of the document's token
* count).
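A short sketch of how minDF behaves at fit time; docs is an assumed DataFrame with a Seq[String] column named "words":

  import org.apache.spark.ml.feature.CountVectorizer

  val cv = new CountVectorizer()
    .setInputCol("words")
    .setOutputCol("features")
    .setMinDF(2.0)        // >= 1.0 is read as a document count, a value in [0, 1) as a fraction
    .setVocabSize(1000)
  val cvModel = cv.fit(docs)
  val vectorized = cvModel.transform(docs)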
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index a8792a35ff4ae..8f60ec8788fac 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -52,7 +52,7 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String)
def setOutputCol(value: String): this.type = set(outputCol, value)
/**
- * Number of features. Should be > 0.
+ * Number of features. Should be &gt; 0.
* (default = 2^18^)
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
index 4463aea0097e2..c424aaa1f5634 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
@@ -41,7 +41,7 @@ class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String)
def this() = this(Identifiable.randomUID("ngram"))
/**
- * Minimum n-gram length, >= 1.
+ * Minimum n-gram length, &gt;= 1.
* Default: 2, bigram features
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
index eb0690058013f..629702051d426 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
@@ -37,7 +37,7 @@ class Normalizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
def this() = this(Identifiable.randomUID("normalizer"))
/**
- * Normalization in L^p^ space. Must be >= 1.
+ * Normalization in L^p^ space. Must be &gt;= 1.
* (default: p = 2)
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index ea401216aec7b..ba1380bdda451 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -33,14 +33,14 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType}
* at most a single one-value per row that indicates the input category index.
* For example with 5 categories, an input value of 2.0 would map to an output vector of
* `[0.0, 0.0, 1.0, 0.0]`.
- * The last category is not included by default (configurable via [[OneHotEncoder!.dropLast]]
+ * The last category is not included by default (configurable via `OneHotEncoder!.dropLast`
* because it makes the vector entries sum up to one, and hence linearly dependent.
* So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
*
* @note This is different from scikit-learn's OneHotEncoder, which keeps all categories.
* The output vectors are sparse.
*
- * @see [[StringIndexer]] for converting categorical values into category indices
+ * @see `StringIndexer` for converting categorical values into category indices
*/
@Since("1.4.0")
class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer
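A sketch of the dropLast behavior described above, chained after a StringIndexer against the 2.x API shown here; df and the column names are assumptions, and with dropLast = true the last category maps to the all-zeros vector:

  import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

  val indexed = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df)
    .transform(df)
  val encoded = new OneHotEncoder()
    .setInputCol("categoryIndex")
    .setOutputCol("categoryVec")
    .setDropLast(true)      // default; drop it to keep all categories like scikit-learn
    .transform(indexed)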
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 6e08bf059124c..4143d864d7930 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -63,7 +63,7 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC
}
/**
- * PCA trains a model to project vectors to a lower dimensional space of the top [[PCA!.k]]
+ * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k`
* principal components.
*/
@Since("1.5.0")
@@ -144,7 +144,7 @@ class PCAModel private[ml] (
* Transform a vector by computed Principal Components.
*
* @note Vectors to be transformed must be the same length as the source vectors given
- * to [[PCA.fit()]].
+ * to `PCA.fit()`.
*/
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
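A minimal sketch of PCA.fit followed by PCAModel.transform as referenced above; df, the column names and k are illustrative:

  import org.apache.spark.ml.feature.PCA

  val pcaModel = new PCA()
    .setInputCol("features")       // assumed Vector column
    .setOutputCol("pcaFeatures")
    .setK(3)                       // keep the top 3 principal components
    .fit(df)
  // vectors passed to transform must have the same length as those given to fit()
  val projected = pcaModel.transform(df)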
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 4be17da3e9f76..74526a8260a0d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -45,7 +45,7 @@ class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: Str
def this() = this(Identifiable.randomUID("poly"))
/**
- * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion.
+ * The polynomial degree to expand, which should be &gt;= 1. A value of 1 means no expansion.
* Default: 2
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index b9e01dde70d85..a1cceef51d30a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -35,7 +35,7 @@ private[feature] trait QuantileDiscretizerBase extends Params
/**
* Number of buckets (quantiles, or categories) into which data points are grouped. Must
- * be >= 2.
+ * be &gt;= 2.
*
* See also [[handleInvalid]], which can optionally create an additional bucket for NaN values.
*
@@ -52,7 +52,7 @@ private[feature] trait QuantileDiscretizerBase extends Params
/**
* Relative error (see documentation for
- * [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]] for description)
+ * `org.apache.spark.sql.DataFrameStatFunctions.approxQuantile` for description)
* Must be in the range [0, 1].
* default: 0.001
* @group param
@@ -99,7 +99,7 @@ private[feature] trait QuantileDiscretizerBase extends Params
* but NaNs will be counted in a special bucket[4].
*
* Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for
- * [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]]
+ * `org.apache.spark.sql.DataFrameStatFunctions.approxQuantile`
* for a detailed description). The precision of the approximation can be controlled with the
* `relativeError` parameter. The lower and upper bin bounds will be `-Infinity` and `+Infinity`,
* covering all real values.
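A sketch of the parameters discussed above; df and the column names are assumptions:

  import org.apache.spark.ml.feature.QuantileDiscretizer

  val discretizer = new QuantileDiscretizer()
    .setInputCol("hour")           // assumed numeric column
    .setOutputCol("hourBucket")
    .setNumBuckets(3)              // must be >= 2
    .setRelativeError(0.001)       // precision of the underlying approxQuantile call
  val bucketed = discretizer.fit(df).transform(df)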
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
index b25fff973c441..c4398bebf9be6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.StructType
* use Spark SQL built-in function and UDFs to operate on these selected columns.
* For example, [[SQLTransformer]] supports statements like:
* - SELECT a, a + b AS a_b FROM __THIS__
- * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
+ * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a &gt; 5
* - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
*/
@Since("1.6.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index a55816249c74b..3fcd84c029e61 100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -52,7 +52,7 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String
/**
* The words to be filtered out.
* Default: English stop words
- * @see [[StopWordsRemover.loadDefaultStopWords()]]
+ * @see `StopWordsRemover.loadDefaultStopWords()`
* @group param
*/
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index 8b155f00017cf..0a4d31d1654e7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -60,7 +60,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
* The indices are in [0, numLabels), ordered by label frequencies.
* So the most frequent label gets index 0.
*
- * @see [[IndexToString]] for the inverse transformation
+ * @see `IndexToString` for the inverse transformation
*/
@Since("1.4.0")
class StringIndexer @Since("1.4.0") (
@@ -116,7 +116,7 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] {
* @param labels Ordered list of labels, corresponding to indices to be assigned.
*
* @note During transformation, if the input column does not exist,
- * [[StringIndexerModel.transform]] would return the input dataset unmodified.
+ * `StringIndexerModel.transform` would return the input dataset unmodified.
* This is a temporary fix for the case when target labels do not exist during prediction.
*/
@Since("1.4.0")
@@ -247,12 +247,12 @@ object StringIndexerModel extends MLReadable[StringIndexerModel] {
}
/**
- * A [[Transformer]] that maps a column of indices back to a new column of corresponding
+ * A `Transformer` that maps a column of indices back to a new column of corresponding
* string values.
* The index-string mapping is either from the ML attributes of the input column,
* or from user-supplied labels (which take precedence over ML attributes).
*
- * @see [[StringIndexer]] for converting strings into indices
+ * @see `StringIndexer` for converting strings into indices
*/
@Since("1.5.0")
class IndexToString private[ml] (@Since("1.5.0") override val uid: String)
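A sketch of the StringIndexer / IndexToString round trip described above; df and the column names are assumptions:

  import org.apache.spark.ml.feature.{IndexToString, StringIndexer}

  val indexerModel = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")   // the most frequent label gets index 0
    .fit(df)
  val indexed = indexerModel.transform(df)

  val converted = new IndexToString() // inverse transformation
    .setInputCol("categoryIndex")
    .setOutputCol("originalCategory")
    .transform(indexed)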
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 45d8fa94a8f8f..bf2b1d7c0f777 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -70,7 +70,7 @@ class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
def this() = this(Identifiable.randomUID("regexTok"))
/**
- * Minimum token length, >= 0.
+ * Minimum token length, &gt;= 0.
* Default: 1, to avoid returning empty strings
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index d1a5c2e82581e..0ae9f264f4a8a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -41,8 +41,8 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
/**
* Threshold for the number of values a categorical feature can take.
- * If a feature is found to have > maxCategories values, then it is declared continuous.
- * Must be >= 2.
+ * If a feature is found to have &gt; maxCategories values, then it is declared continuous.
+ * Must be &gt;= 2.
*
* (default = 20)
* @group param
@@ -59,7 +59,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
}
/**
- * Class for indexing categorical feature columns in a dataset of [[Vector]].
+ * Class for indexing categorical feature columns in a dataset of `Vector`.
*
* This has 2 usage modes:
* - Automatically identify categorical features (default behavior)
@@ -76,7 +76,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
* - Warning: This can cause problems if features are continuous since this will collect ALL
* unique values to the driver.
* - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
- * If maxCategories >= 3, then both features will be declared categorical.
+ * If maxCategories &gt;= 3, then both features will be declared categorical.
*
* This returns a model which can transform categorical features to use 0-based indices.
*
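A sketch of the automatic mode with maxCategories, as described above; data and the column names are assumptions:

  import org.apache.spark.ml.feature.VectorIndexer

  val indexerModel = new VectorIndexer()
    .setInputCol("features")        // assumed Vector column
    .setOutputCol("indexedFeatures")
    .setMaxCategories(10)           // features with > 10 distinct values stay continuous
    .fit(data)
  // categoryMaps lists which feature indices were treated as categorical
  println(indexerModel.categoryMaps.keys.mkString(", "))
  val indexedData = indexerModel.transform(data)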
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala
index 966ccb85d0e0e..e3e462d07e10c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala
@@ -32,8 +32,8 @@ import org.apache.spark.sql.types.StructType
* This class takes a feature vector and outputs a new feature vector with a subarray of the
* original features.
*
- * The subset of features can be specified with either indices ([[setIndices()]])
- * or names ([[setNames()]]). At least one feature must be selected. Duplicate features
+ * The subset of features can be specified with either indices (`setIndices()`)
+ * or names (`setNames()`). At least one feature must be selected. Duplicate features
* are not allowed, so there can be no overlap between selected indices and names.
*
* The output vector will order features with the selected indices first (in the order given),
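A sketch of index-based selection as described above; dataset and the column names are assumptions:

  import org.apache.spark.ml.feature.VectorSlicer

  val slicer = new VectorSlicer()
    .setInputCol("userFeatures")    // assumed Vector column
    .setOutputCol("features")
    .setIndices(Array(0, 2))        // selected indices come first, in the order given
  // .setNames(...) selects by name instead; it needs ML attributes on the input column
  val sliced = slicer.transform(dataset)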
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
index dcff4245d1d26..ce7f335056872 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
@@ -61,12 +61,12 @@
* createStructField("id", IntegerType, false),
* createStructField("text", StringType, false),
* createStructField("rating", DoubleType, false)));
- * JavaRDD rowRDD = jsc.parallelize(
+ * JavaRDD&lt;Row&gt; rowRDD = jsc.parallelize(
* Arrays.asList(
* RowFactory.create(0, "Hi I heard about Spark", 3.0),
* RowFactory.create(1, "I wish Java could use case classes", 4.0),
* RowFactory.create(2, "Logistic regression models are neat", 4.0)));
- * Dataset dataset = jsql.createDataFrame(rowRDD, schema);
+ * Dataset&lt;Row&gt; dataset = jsql.createDataFrame(rowRDD, schema);
* // define feature transformers
* RegexTokenizer tok = new RegexTokenizer()
* .setInputCol("text")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index d6ad1ea6d1096..ede859db19d5f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -185,7 +185,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
setDefault(tol -> 1E-6)
/**
- * Suggested depth for treeAggregate (>= 2).
+ * Suggested depth for treeAggregate (&gt;= 2).
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 894b6a2ca2041..0b0c46144bfbe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -132,7 +132,8 @@ object DecisionTreeRegressor extends DefaultParamsReadable[DecisionTreeRegressor
}
/**
- * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] model for regression.
+ * <a href="http://en.wikipedia.org/wiki/Decision_tree_learning">
+ * Decision tree (Wikipedia)</a> model for regression.
* It supports both continuous and categorical features.
* @param rootNode Root of the decision tree
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index ed2d05525d611..b4d5603bdbb20 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -223,7 +223,7 @@ class GBTRegressionModel private[ml](
* (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
* and follows the implementation from scikit-learn.
*
- * @see [[DecisionTreeRegressionModel.featureImportances]]
+ * @see `DecisionTreeRegressionModel.featureImportances`
*/
@Since("2.0.0")
lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 1201ecd5e4e61..440eacd13fd32 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -131,10 +131,10 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
* It supports "gaussian", "binomial", "poisson" and "gamma" as family.
* Valid link functions for each family is listed below. The first link function of each family
* is the default one.
- * - "gaussian" -> "identity", "log", "inverse"
- * - "binomial" -> "logit", "probit", "cloglog"
- * - "poisson" -> "log", "identity", "sqrt"
- * - "gamma" -> "inverse", "identity", "log"
+ * - "gaussian" -> "identity", "log", "inverse"
+ * - "binomial" -> "logit", "probit", "cloglog"
+ * - "poisson" -> "log", "identity", "sqrt"
+ * - "gamma" -> "inverse", "identity", "log"
*/
@Experimental
@Since("2.0.0")
@@ -1066,7 +1066,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
import GeneralizedLinearRegression._
/**
- * Whether the underlying [[WeightedLeastSquares]] using the "normal" solver.
+ * Whether the underlying `WeightedLeastSquares` using the "normal" solver.
*/
private[ml] val isNormalSolver: Boolean = {
diagInvAtWA.length != 1 || diagInvAtWA(0) != 0
@@ -1074,10 +1074,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
/**
* Standard error of estimated coefficients and intercept.
- * This value is only available when the underlying [[WeightedLeastSquares]]
+ * This value is only available when the underlying `WeightedLeastSquares`
* using the "normal" solver.
*
- * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+ * If `GeneralizedLinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
@@ -1092,10 +1092,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
/**
* T-statistic of estimated coefficients and intercept.
- * This value is only available when the underlying [[WeightedLeastSquares]]
+ * This value is only available when the underlying `WeightedLeastSquares`
* using the "normal" solver.
*
- * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+ * If `GeneralizedLinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
@@ -1115,10 +1115,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
/**
* Two-sided p-value of estimated coefficients and intercept.
- * This value is only available when the underlying [[WeightedLeastSquares]]
+ * This value is only available when the underlying `WeightedLeastSquares`
* using the "normal" solver.
*
- * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+ * If `GeneralizedLinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index 4d274f3a5bbf1..c378a99e3c230 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -56,7 +56,7 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures
final def getIsotonic: Boolean = $(isotonic)
/**
- * Param for the index of the feature if [[featuresCol]] is a vector column (default: `0`), no
+ * Param for the index of the feature if `featuresCol` is a vector column (default: `0`), no
* effect otherwise.
* @group param
*/
@@ -194,7 +194,7 @@ object IsotonicRegression extends DefaultParamsReadable[IsotonicRegression] {
* Model fitted by IsotonicRegression.
* Predicts using a piecewise linear function.
*
- * For detailed rules see [[org.apache.spark.mllib.regression.IsotonicRegressionModel.predict()]].
+ * For detailed rules see `org.apache.spark.mllib.regression.IsotonicRegressionModel.predict()`.
*
* @param oldModel A [[org.apache.spark.mllib.regression.IsotonicRegressionModel]]
* model trained by [[org.apache.spark.mllib.regression.IsotonicRegression]].
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index eb4e38cc83c19..95c6625920ec0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -119,7 +119,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
/**
* Set the ElasticNet mixing parameter.
* For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
- * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
+ * For 0 &lt; alpha &lt; 1, the penalty is a combination of L1 and L2.
* Default is 0.0 which is an L2 penalty.
*
* @group setParam
@@ -165,7 +165,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
* - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton
* optimization method.
* - "normal" denotes using Normal Equation as an analytical solution to the linear regression
- * problem. This solver is limited to [[LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER]].
+ * problem. This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`.
* - "auto" (default) means that the solver algorithm is selected automatically.
* The Normal Equations solver will be used when possible, but this will automatically fall
* back to iterative optimization methods when needed.
@@ -181,7 +181,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
setDefault(solver -> "auto")
/**
- * Suggested depth for treeAggregate (>= 2).
+ * Suggested depth for treeAggregate (&gt;= 2).
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
@@ -338,12 +338,12 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
/*
Note that in Linear Regression, the objective history (loss + regularization) returned
from optimizer is computed in the scaled space given by the following formula.
-
+
$$
L &= 1/2n||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2
+ regTerms \\
$$
-
+
*/
val arrayBuilder = mutable.ArrayBuilder.make[Double]
var state: optimizer.State = null
@@ -414,7 +414,7 @@ object LinearRegression extends DefaultParamsReadable[LinearRegression] {
override def load(path: String): LinearRegression = super.load(path)
/**
- * When using [[LinearRegression.solver]] == "normal", the solver must limit the number of
+ * When using `LinearRegression.solver` == "normal", the solver must limit the number of
* features to at most this number. The entire covariance matrix X^T^X will be collected
* to the driver. This limit helps prevent memory overflow errors.
*/
@@ -584,7 +584,7 @@ class LinearRegressionTrainingSummary private[regression] (
*
* This value is only available when using the "l-bfgs" solver.
*
- * @see [[LinearRegression.solver]]
+ * @see `LinearRegression.solver`
*/
@Since("1.5.0")
val totalIterations = objectiveHistory.length
@@ -627,7 +627,7 @@ class LinearRegressionSummary private[regression] (
* Reference:
* Wikipedia explain variation
*
- * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`.
* This will change in later Spark versions.
*/
@Since("1.5.0")
@@ -637,7 +637,7 @@ class LinearRegressionSummary private[regression] (
* Returns the mean absolute error, which is a risk function corresponding to the
* expected value of the absolute error loss or l1-norm loss.
*
- * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`.
* This will change in later Spark versions.
*/
@Since("1.5.0")
@@ -647,7 +647,7 @@ class LinearRegressionSummary private[regression] (
* Returns the mean squared error, which is a risk function corresponding to the
* expected value of the squared error loss or quadratic loss.
*
- * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`.
* This will change in later Spark versions.
*/
@Since("1.5.0")
@@ -657,7 +657,7 @@ class LinearRegressionSummary private[regression] (
* Returns the root mean squared error, which is defined as the square root of
* the mean squared error.
*
- * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`.
* This will change in later Spark versions.
*/
@Since("1.5.0")
@@ -668,7 +668,7 @@ class LinearRegressionSummary private[regression] (
* Reference:
* Wikipedia coefficient of determination
*
- * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`.
* This will change in later Spark versions.
*/
@Since("1.5.0")
@@ -714,7 +714,7 @@ class LinearRegressionSummary private[regression] (
* Standard error of estimated coefficients and intercept.
* This value is only available when using the "normal" solver.
*
- * If [[LinearRegression.fitIntercept]] is set to true,
+ * If `LinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*
* @see [[LinearRegression.solver]]
@@ -742,7 +742,7 @@ class LinearRegressionSummary private[regression] (
* T-statistic of estimated coefficients and intercept.
* This value is only available when using the "normal" solver.
*
- * If [[LinearRegression.fitIntercept]] is set to true,
+ * If `LinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*
* @see [[LinearRegression.solver]]
@@ -765,7 +765,7 @@ class LinearRegressionSummary private[regression] (
* Two-sided p-value of estimated coefficients and intercept.
* This value is only available when using the "normal" solver.
*
- * If [[LinearRegression.fitIntercept]] is set to true,
+ * If `LinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*
* @see [[LinearRegression.solver]]
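A sketch tying the elasticNetParam and summary docs above together; training is an assumed DataFrame, and the standard-error / t-value / p-value fields additionally require the "normal" solver:

  import org.apache.spark.ml.regression.LinearRegression

  val lrModel = new LinearRegression()
    .setMaxIter(100)
    .setRegParam(0.3)
    .setElasticNetParam(0.5)   // 0 < alpha < 1 mixes the L1 and L2 penalties
    .fit(training)
  val trainingSummary = lrModel.summary
  println(s"RMSE: ${trainingSummary.rootMeanSquaredError}  r2: ${trainingSummary.r2}")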
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index d60f05eed58d9..59798730e7c04 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -208,7 +208,7 @@ class RandomForestRegressionModel private[ml] (
* (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
* and follows the implementation from scikit-learn.
*
- * @see [[DecisionTreeRegressionModel.featureImportances]]
+ * @see `DecisionTreeRegressionModel.featureImportances`
*/
@Since("1.5.0")
lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
index f34a8310ddf1c..5e081cce0651e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
@@ -48,7 +48,7 @@ private[spark] object MetadataUtils {
* If a feature does not have metadata, it is assumed to be continuous.
* If a feature is Nominal, then it must have the number of values
* specified.
- * @return Map: feature index --> number of categories.
+ * @return Map: feature index --&gt; number of categories.
* The map's set of keys will be the set of categorical feature indices.
*/
def getCategoricalFeatures(featuresSchema: StructField): Map[Int, Int] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index 5b7e5ec75c842..343a70c5d7a46 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -76,7 +76,7 @@ private[util] sealed trait BaseReadWrite {
*/
protected final def sqlContext: SQLContext = sparkSession.sqlContext
- /** Returns the underlying [[SparkContext]]. */
+ /** Returns the underlying `SparkContext`. */
protected final def sc: SparkContext = sparkSession.sparkContext
}
@@ -169,7 +169,7 @@ trait MLWritable {
* This only handles simple [[org.apache.spark.ml.param.Param]] types; e.g., it will not handle
* [[org.apache.spark.sql.Dataset]].
*
- * @see [[DefaultParamsReadable]], the counterpart to this trait
+ * @see `DefaultParamsReadable`, the counterpart to this trait
*/
@DeveloperApi
trait DefaultParamsWritable extends MLWritable { self: Params =>
@@ -238,7 +238,7 @@ trait MLReadable[T] {
* [[org.apache.spark.sql.Dataset]].
*
* @tparam T ML instance type
- * @see [[DefaultParamsWritable]], the counterpart to this trait
+ * @see `DefaultParamsWritable`, the counterpart to this trait
*/
@DeveloperApi
trait DefaultParamsReadable[T] extends MLReadable[T] {
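A sketch of the MLWritable / MLReadable save-and-load cycle these traits support; df and the path are illustrative:

  import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel}

  val model = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df)                                      // df assumed
  model.write.overwrite().save("/tmp/indexer")    // MLWritable.write
  val restored = StringIndexerModel.load("/tmp/indexer")   // MLReadable counterpart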
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index 16742bd284e69..63a39c1ce0274 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -261,7 +261,7 @@ class LDA private (
def getCheckpointInterval: Int = checkpointInterval
/**
- * Parameter for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that
+ * Parameter for set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that
* the cache will get checkpointed every 10 iterations. Checkpointing helps with recovery
* (when nodes fail). It also helps with eliminating temporary shuffle files on disk, which can be
* important when LDA is run for many iterations. If the checkpoint directory is not set in
@@ -340,7 +340,7 @@ class LDA private (
}
/**
- * Java-friendly version of [[run()]]
+ * Java-friendly version of `run()`
*/
@Since("1.3.0")
def run(documents: JavaPairRDD[java.lang.Long, Vector]): LDAModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 017fbc6feb0d7..436c8bf299dc6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -392,7 +392,7 @@ class LocalLDAModel private[spark] (
* literature). Returns a vector of zeros for an empty document.
*
* Note this means to allow quick query for single document. For batch documents, please refer
- * to [[topicDistributions()]] to avoid overhead.
+ * to `topicDistributions()` to avoid overhead.
*
* @param document document to predict topic mixture distributions for
* @return topic mixture distribution for the document
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 9687fc8804e89..c65fce4ef11ae 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -350,9 +350,9 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
* Mini-batch fraction in (0, 1], which sets the fraction of document sampled and used in
* each iteration.
*
- * @note This should be adjusted in synch with [[LDA.setMaxIterations()]]
+ * @note This should be adjusted in synch with `LDA.setMaxIterations()`
* so the entire corpus is used. Specifically, set both so that
- * maxIterations * miniBatchFraction >= 1.
+ * maxIterations * miniBatchFraction &gt;= 1.
*
* Default: 0.05, i.e., 5% of total documents.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index d8405d13ce904..8c7222815ea7a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -36,14 +36,14 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, Loss, SquaredError}
* @param validationTol validationTol is a condition which decides iteration termination when
* runWithValidation is used.
* The end of iteration is decided based on below logic:
- * If the current loss on the validation set is > 0.01, the diff
+ * If the current loss on the validation set is &gt; 0.01, the diff
* of validation error is compared to relative tolerance which is
* validationTol * (current loss on the validation set).
- * If the current loss on the validation set is <= 0.01, the diff
+ * If the current loss on the validation set is &lt;= 0.01, the diff
* of validation error is compared to absolute tolerance which is
* validationTol * 0.01.
* Ignored when
- * [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used.
+ * `org.apache.spark.mllib.tree.GradientBoostedTrees.run()` is used.
*/
@Since("1.2.0")
case class BoostingStrategy @Since("1.4.0") (
@@ -92,8 +92,8 @@ object BoostingStrategy {
/**
* Returns default configuration for the boosting algorithm
* @param algo Learning goal. Supported:
- * [[org.apache.spark.mllib.tree.configuration.Algo.Classification]],
- * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]]
+ * `org.apache.spark.mllib.tree.configuration.Algo.Classification`,
+ * `org.apache.spark.mllib.tree.configuration.Algo.Regression`
* @return Configuration for boosting algorithm
*/
@Since("1.3.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
index b34e1b1b56c43..b4c1e45596d51 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -28,8 +28,8 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
/**
* Stores all the configuration options for tree construction
* @param algo Learning goal. Supported:
- * [[org.apache.spark.mllib.tree.configuration.Algo.Classification]],
- * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]]
+ * `org.apache.spark.mllib.tree.configuration.Algo.Classification`,
+ * `org.apache.spark.mllib.tree.configuration.Algo.Regression`
* @param impurity Criterion used for information gain calculation.
* Supported for Classification: [[org.apache.spark.mllib.tree.impurity.Gini]],
* [[org.apache.spark.mllib.tree.impurity.Entropy]].
@@ -43,9 +43,9 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
* for choosing how to split on features at each node.
* More bins give higher granularity.
* @param quantileCalculationStrategy Algorithm for calculating quantiles. Supported:
- * [[org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort]]
+ * `org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort`
* @param categoricalFeaturesInfo A map storing information about the categorical variables and the
- * number of discrete values they take. An entry (n -> k)
+ * number of discrete values they take. An entry (n -&gt; k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param minInstancesPerNode Minimum number of instances each child must have after split.
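A sketch of building a configuration with the categoricalFeaturesInfo map described above; the values are illustrative:

  import org.apache.spark.mllib.tree.configuration.BoostingStrategy

  val boostingStrategy = BoostingStrategy.defaultParams("Classification")
  boostingStrategy.numIterations = 10
  boostingStrategy.treeStrategy.maxDepth = 4
  // feature 0 is categorical with 2 values indexed from 0, i.e. the entry (0 -> 2)
  boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int](0 -> 2)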
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index fa3b2b9de5d5d..e99d7865bda91 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -97,7 +97,7 @@ class TypedColumn[-T, U](
}
/**
- * A column that will be computed based on the data in a [[DataFrame]].
+ * A column that will be computed based on the data in a `DataFrame`.
*
* A new column is constructed based on the input columns present in a dataframe:
*
@@ -801,7 +801,7 @@ class Column(val expr: Expression) extends Logging {
/**
* An expression that gets an item at position `ordinal` out of an array,
- * or gets a value by key `key` in a [[MapType]].
+ * or gets a value by key `key` in a `MapType`.
*
* @group expr_ops
* @since 1.3.0
@@ -809,7 +809,7 @@ class Column(val expr: Expression) extends Logging {
def getItem(key: Any): Column = withExpr { UnresolvedExtractValue(expr, Literal(key)) }
/**
- * An expression that gets a field by name in a [[StructType]].
+ * An expression that gets a field by name in a `StructType`.
*
* @group expr_ops
* @since 1.3.0
@@ -1195,92 +1195,92 @@ class Column(val expr: Expression) extends Logging {
class ColumnName(name: String) extends Column(name) {
/**
- * Creates a new [[StructField]] of type boolean.
+ * Creates a new `StructField` of type boolean.
* @since 1.3.0
*/
def boolean: StructField = StructField(name, BooleanType)
/**
- * Creates a new [[StructField]] of type byte.
+ * Creates a new `StructField` of type byte.
* @since 1.3.0
*/
def byte: StructField = StructField(name, ByteType)
/**
- * Creates a new [[StructField]] of type short.
+ * Creates a new `StructField` of type short.
* @since 1.3.0
*/
def short: StructField = StructField(name, ShortType)
/**
- * Creates a new [[StructField]] of type int.
+ * Creates a new `StructField` of type int.
* @since 1.3.0
*/
def int: StructField = StructField(name, IntegerType)
/**
- * Creates a new [[StructField]] of type long.
+ * Creates a new `StructField` of type long.
* @since 1.3.0
*/
def long: StructField = StructField(name, LongType)
/**
- * Creates a new [[StructField]] of type float.
+ * Creates a new `StructField` of type float.
* @since 1.3.0
*/
def float: StructField = StructField(name, FloatType)
/**
- * Creates a new [[StructField]] of type double.
+ * Creates a new `StructField` of type double.
* @since 1.3.0
*/
def double: StructField = StructField(name, DoubleType)
/**
- * Creates a new [[StructField]] of type string.
+ * Creates a new `StructField` of type string.
* @since 1.3.0
*/
def string: StructField = StructField(name, StringType)
/**
- * Creates a new [[StructField]] of type date.
+ * Creates a new `StructField` of type date.
* @since 1.3.0
*/
def date: StructField = StructField(name, DateType)
/**
- * Creates a new [[StructField]] of type decimal.
+ * Creates a new `StructField` of type decimal.
* @since 1.3.0
*/
def decimal: StructField = StructField(name, DecimalType.USER_DEFAULT)
/**
- * Creates a new [[StructField]] of type decimal.
+ * Creates a new `StructField` of type decimal.
* @since 1.3.0
*/
def decimal(precision: Int, scale: Int): StructField =
StructField(name, DecimalType(precision, scale))
/**
- * Creates a new [[StructField]] of type timestamp.
+ * Creates a new `StructField` of type timestamp.
* @since 1.3.0
*/
def timestamp: StructField = StructField(name, TimestampType)
/**
- * Creates a new [[StructField]] of type binary.
+ * Creates a new `StructField` of type binary.
* @since 1.3.0
*/
def binary: StructField = StructField(name, BinaryType)
/**
- * Creates a new [[StructField]] of type array.
+ * Creates a new `StructField` of type array.
* @since 1.3.0
*/
def array(dataType: DataType): StructField = StructField(name, ArrayType(dataType))
/**
- * Creates a new [[StructField]] of type map.
+ * Creates a new `StructField` of type map.
* @since 1.3.0
*/
def map(keyType: DataType, valueType: DataType): StructField =
@@ -1289,13 +1289,13 @@ class ColumnName(name: String) extends Column(name) {
def map(mapType: MapType): StructField = StructField(name, mapType)
/**
- * Creates a new [[StructField]] of type struct.
+ * Creates a new `StructField` of type struct.
* @since 1.3.0
*/
def struct(fields: StructField*): StructField = struct(StructType(fields))
/**
- * Creates a new [[StructField]] of type struct.
+ * Creates a new `StructField` of type struct.
* @since 1.3.0
*/
def struct(structType: StructType): StructField = StructField(name, structType)
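In practice these ColumnName helpers are usually reached through the $"..." interpolator to build a schema; the field names are illustrative and import spark.implicits._ is assumed to be in scope:

  import org.apache.spark.sql.types.StructType

  val schema = StructType(Seq(
    $"id".long,          // StructField("id", LongType)
    $"name".string,      // StructField("name", StringType)
    $"score".double))    // StructField("score", DoubleType)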
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala
index 0d43f09bc54cd..184c5a11298d9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.types._
/**
- * Functionality for working with missing data in [[DataFrame]]s.
+ * Functionality for working with missing data in `DataFrame`s.
*
* @since 1.3.1
*/
@@ -36,14 +36,14 @@ import org.apache.spark.sql.types._
final class DataFrameNaFunctions private[sql](df: DataFrame) {
/**
- * Returns a new [[DataFrame]] that drops rows containing any null or NaN values.
+ * Returns a new `DataFrame` that drops rows containing any null or NaN values.
*
* @since 1.3.1
*/
def drop(): DataFrame = drop("any", df.columns)
/**
- * Returns a new [[DataFrame]] that drops rows containing null or NaN values.
+ * Returns a new `DataFrame` that drops rows containing null or NaN values.
*
* If `how` is "any", then drop rows containing any null or NaN values.
* If `how` is "all", then drop rows only if every column is null or NaN for that row.
@@ -53,7 +53,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def drop(how: String): DataFrame = drop(how, df.columns)
/**
- * Returns a new [[DataFrame]] that drops rows containing any null or NaN values
+ * Returns a new `DataFrame` that drops rows containing any null or NaN values
* in the specified columns.
*
* @since 1.3.1
@@ -61,7 +61,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def drop(cols: Array[String]): DataFrame = drop(cols.toSeq)
/**
- * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing any null or NaN values
+ * (Scala-specific) Returns a new `DataFrame` that drops rows containing any null or NaN values
* in the specified columns.
*
* @since 1.3.1
@@ -69,7 +69,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def drop(cols: Seq[String]): DataFrame = drop(cols.size, cols)
/**
- * Returns a new [[DataFrame]] that drops rows containing null or NaN values
+ * Returns a new `DataFrame` that drops rows containing null or NaN values
* in the specified columns.
*
* If `how` is "any", then drop rows containing any null or NaN values in the specified columns.
@@ -80,7 +80,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def drop(how: String, cols: Array[String]): DataFrame = drop(how, cols.toSeq)
/**
- * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing null or NaN values
+ * (Scala-specific) Returns a new `DataFrame` that drops rows containing null or NaN values
* in the specified columns.
*
* If `how` is "any", then drop rows containing any null or NaN values in the specified columns.
@@ -97,7 +97,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
}
/**
- * Returns a new [[DataFrame]] that drops rows containing
+ * Returns a new `DataFrame` that drops rows containing
* less than `minNonNulls` non-null and non-NaN values.
*
* @since 1.3.1
@@ -105,7 +105,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def drop(minNonNulls: Int): DataFrame = drop(minNonNulls, df.columns)
/**
- * Returns a new [[DataFrame]] that drops rows containing
+ * Returns a new `DataFrame` that drops rows containing
* less than `minNonNulls` non-null and non-NaN values in the specified columns.
*
* @since 1.3.1
@@ -113,7 +113,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def drop(minNonNulls: Int, cols: Array[String]): DataFrame = drop(minNonNulls, cols.toSeq)
/**
- * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing less than
+ * (Scala-specific) Returns a new `DataFrame` that drops rows containing less than
* `minNonNulls` non-null and non-NaN values in the specified columns.
*
* @since 1.3.1
@@ -126,21 +126,21 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
}
/**
- * Returns a new [[DataFrame]] that replaces null or NaN values in numeric columns with `value`.
+ * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`.
*
* @since 1.3.1
*/
def fill(value: Double): DataFrame = fill(value, df.columns)
/**
- * Returns a new [[DataFrame]] that replaces null values in string columns with `value`.
+ * Returns a new `DataFrame` that replaces null values in string columns with `value`.
*
* @since 1.3.1
*/
def fill(value: String): DataFrame = fill(value, df.columns)
/**
- * Returns a new [[DataFrame]] that replaces null or NaN values in specified numeric columns.
+ * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns.
* If a specified column is not a numeric column, it is ignored.
*
* @since 1.3.1
@@ -148,7 +148,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def fill(value: Double, cols: Array[String]): DataFrame = fill(value, cols.toSeq)
/**
- * (Scala-specific) Returns a new [[DataFrame]] that replaces null or NaN values in specified
+ * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified
* numeric columns. If a specified column is not a numeric column, it is ignored.
*
* @since 1.3.1
@@ -167,7 +167,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
}
/**
- * Returns a new [[DataFrame]] that replaces null values in specified string columns.
+ * Returns a new `DataFrame` that replaces null values in specified string columns.
* If a specified column is not a string column, it is ignored.
*
* @since 1.3.1
@@ -175,7 +175,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def fill(value: String, cols: Array[String]): DataFrame = fill(value, cols.toSeq)
/**
- * (Scala-specific) Returns a new [[DataFrame]] that replaces null values in
+ * (Scala-specific) Returns a new `DataFrame` that replaces null values in
* specified string columns. If a specified column is not a string column, it is ignored.
*
* @since 1.3.1
@@ -194,7 +194,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
}
/**
- * Returns a new [[DataFrame]] that replaces null values.
+ * Returns a new `DataFrame` that replaces null values.
*
* The key of the map is the column name, and the value of the map is the replacement value.
* The value must be of the following type:
@@ -213,7 +213,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
def fill(valueMap: java.util.Map[String, Any]): DataFrame = fill0(valueMap.asScala.toSeq)
/**
- * (Scala-specific) Returns a new [[DataFrame]] that replaces null values.
+ * (Scala-specific) Returns a new `DataFrame` that replaces null values.
*
* The key of the map is the column name, and the value of the map is the replacement value.
* The value must be of the following type: `Int`, `Long`, `Float`, `Double`, `String`, `Boolean`.
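A few representative calls for the drop/fill variants above; df is an assumed DataFrame with a numeric "age" column and a string "name" column:

  val cleaned = df.na.drop()                        // drop rows with any null or NaN value
  val kept    = df.na.drop(2, Seq("age", "name"))   // keep rows with at least 2 non-nulls there
  val filled  = df.na.fill(Map("age" -> 0, "name" -> "unknown"))   // per-column replacements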
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 858fa4c7609b6..a7bc7c68270f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -84,7 +84,7 @@ class SQLContext private[sql](val sparkSession: SparkSession)
/**
* Returns a [[SQLContext]] as new session, with separated SQL configurations, temporary
- * tables, registered functions, but sharing the same [[SparkContext]], cached data and
+ * tables, registered functions, but sharing the same `SparkContext`, cached data and
* other things.
*
* @since 1.6.0
@@ -883,8 +883,8 @@ class SQLContext private[sql](val sparkSession: SparkSession)
}
/**
- * Loads an JavaRDD storing JSON objects (one object per record) and applies the given
- * schema, returning the result as a `DataFrame`.
+ * Loads an JavaRDD&lt;String&gt; storing JSON objects (one object per record) and applies the
+ * given schema, returning the result as a `DataFrame`.
*
* @group specificdata
* @deprecated As of 1.4.0, replaced by `read().json()`.
From aa5acbb016483c88caad0a57d6481a8fee93e1c3 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 00:08:06 +0900
Subject: [PATCH 02/17] Fix errors (second round)
---
.../ml/regression/LinearRegression.scala | 10 ++---
.../apache/spark/mllib/clustering/LDA.scala | 2 +-
.../apache/spark/sql/DataFrameReader.scala | 43 +++++++++----------
.../spark/sql/DataFrameStatFunctions.scala | 20 ++++-----
.../apache/spark/sql/DataFrameWriter.scala | 30 ++++++-------
.../scala/org/apache/spark/sql/Dataset.scala | 38 ++++++++--------
.../org/apache/spark/sql/ForeachWriter.scala | 3 +-
.../org/apache/spark/sql/functions.scala | 20 ++++-----
8 files changed, 82 insertions(+), 84 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 95c6625920ec0..556e48a604ea7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -60,11 +60,11 @@ private[regression] trait LinearRegressionParams extends PredictorParams
* The learning objective is to minimize the squared error, with regularization.
* The specific squared error loss function used is:
*
- *
+ *
* $$
* L = 1/2n ||A coefficients - y||^2^
* $$
- *
+ *
*
* This supports multiple types of regularization:
* - none (a.k.a. ordinary least squares)
@@ -717,7 +717,7 @@ class LinearRegressionSummary private[regression] (
* If `LinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*
- * @see [[LinearRegression.solver]]
+ * @see `LinearRegression.solver`
*/
lazy val coefficientStandardErrors: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
@@ -745,7 +745,7 @@ class LinearRegressionSummary private[regression] (
* If `LinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*
- * @see [[LinearRegression.solver]]
+ * @see `LinearRegression.solver`
*/
lazy val tValues: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
@@ -768,7 +768,7 @@ class LinearRegressionSummary private[regression] (
* If `LinearRegression.fitIntercept` is set to true,
* then the last element returned corresponds to the intercept.
*
- * @see [[LinearRegression.solver]]
+ * @see `LinearRegression.solver`
*/
lazy val pValues: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index 63a39c1ce0274..14dfd3af54a71 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -216,7 +216,7 @@ class LDA private (
def getBeta: Double = getTopicConcentration
/**
- * Alias for [[setTopicConcentration()]]
+ * Alias for `setTopicConcentration()`
*/
@Since("1.3.0")
def setBeta(beta: Double): this.type = setTopicConcentration(beta)
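Since `setBeta` merely forwards to `setTopicConcentration`, the two are interchangeable; a minimal sketch against the RDD-based API:
{{{
import org.apache.spark.mllib.clustering.LDA

val lda = new LDA().setK(10)
lda.setBeta(1.1)   // identical to lda.setTopicConcentration(1.1)
assert(lda.getBeta == lda.getTopicConcentration)
}}}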
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 5be9a99369997..1af2f9afea5eb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -35,7 +35,7 @@ import org.apache.spark.sql.types.StructType
/**
* Interface used to load a [[Dataset]] from external storage systems (e.g. file systems,
- * key-value stores, etc). Use [[SparkSession.read]] to access this.
+ * key-value stores, etc). Use `SparkSession.read` to access this.
*
* @since 1.4.0
*/
@@ -116,7 +116,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads input in as a [[DataFrame]], for data sources that don't require a path (e.g. external
+ * Loads input in as a `DataFrame`, for data sources that don't require a path (e.g. external
* key-value stores).
*
* @since 1.4.0
@@ -126,7 +126,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by
+ * Loads input in as a `DataFrame`, for data sources that require a path (e.g. data backed by
* a local or distributed file system).
*
* @since 1.4.0
@@ -136,7 +136,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads input in as a [[DataFrame]], for data sources that support multiple paths.
+ * Loads input in as a `DataFrame`, for data sources that support multiple paths.
* Only works if the source is a HadoopFsRelationProvider.
*
* @since 1.6.0
@@ -153,7 +153,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+ * Construct a `DataFrame` representing the database table accessible via JDBC URL
* url named table and connection properties.
*
* @since 1.4.0
@@ -163,7 +163,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+ * Construct a `DataFrame` representing the database table accessible via JDBC URL
* url named table. Partitions of the table will be retrieved in parallel based on the parameters
* passed to this function.
*
@@ -198,10 +198,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Construct a [[DataFrame]] representing the database table accessible via JDBC URL
+ * Construct a `DataFrame` representing the database table accessible via JDBC URL
* url named table using connection properties. The `predicates` parameter gives a list
* expressions suitable for inclusion in WHERE clauses; each one defines one partition
- * of the [[DataFrame]].
+ * of the `DataFrame`.
*
* Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
* your external database systems.
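A rough sketch of the predicate-partitioned variant described here; the JDBC URL, table name, credentials and predicates are placeholders, and `spark` is assumed to be an active SparkSession:
{{{
import java.util.Properties

val connectionProperties = new Properties()
connectionProperties.put("user", "username")        // placeholder credentials
connectionProperties.put("password", "password")

// Each predicate becomes one partition of the resulting DataFrame.
val predicates = Array("region = 'EU'", "region = 'US'", "region = 'APAC'")
val regions = spark.read.jdbc(
  "jdbc:postgresql://host:5432/db",                  // placeholder URL
  "schema.table",                                    // placeholder table
  predicates,
  connectionProperties)
}}}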
@@ -240,7 +240,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads a JSON file (JSON Lines text format or
- * newline-delimited JSON) and returns the result as a [[DataFrame]].
+ * newline-delimited JSON) and returns the result as a `DataFrame`.
* See the documentation on the overloaded `json()` method with varargs for more details.
*
* @since 1.4.0
@@ -252,7 +252,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads a JSON file (JSON Lines text format or
- * newline-delimited JSON) and returns the result as a [[DataFrame]].
+ * newline-delimited JSON) and returns the result as a `DataFrame`.
*
* This function goes through the input once to determine the input schema. If you know the
* schema in advance, use the version that specifies the schema to avoid the extra scan.
@@ -299,7 +299,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads a `JavaRDD[String]` storing JSON objects (JSON
* Lines text format or newline-delimited JSON) and returns the result as
- * a [[DataFrame]].
+ * a `DataFrame`.
*
* Unless the schema is specified using [[schema]] function, this function goes through the
* input once to determine the input schema.
@@ -311,7 +311,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads an `RDD[String]` storing JSON objects (JSON Lines
- * text format or newline-delimited JSON) and returns the result as a [[DataFrame]].
+ * text format or newline-delimited JSON) and returns the result as a `DataFrame`.
*
* Unless the schema is specified using [[schema]] function, this function goes through the
* input once to determine the input schema.
@@ -341,7 +341,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads a CSV file and returns the result as a [[DataFrame]]. See the documentation on the
+ * Loads a CSV file and returns the result as a `DataFrame`. See the documentation on the
* other overloaded `csv()` method for more details.
*
* @since 2.0.0
@@ -352,7 +352,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads a CSV file and returns the result as a [[DataFrame]].
+ * Loads a CSV file and returns the result as a `DataFrame`.
*
* This function will go through the input once to determine the input schema if `inferSchema`
* is enabled. To avoid going through the entire data once, disable `inferSchema` option or
@@ -392,7 +392,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* `timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that
* indicates a timestamp format. Custom date formats follow the formats at
* `java.text.SimpleDateFormat`. This applies to timestamp type.
- * `java.sql.Timestamp.valueOf()` and `java.sql.Date.valueOf()` or ISO 8601 format.
* `maxColumns` (default `20480`): defines a hard limit of how many columns
* a record can have.
* `maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
@@ -415,7 +414,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
def csv(paths: String*): DataFrame = format("csv").load(paths : _*)
/**
- * Loads a Parquet file, returning the result as a [[DataFrame]]. See the documentation
+ * Loads a Parquet file, returning the result as a `DataFrame`. See the documentation
* on the other overloaded `parquet()` method for more details.
*
* @since 2.0.0
@@ -426,7 +425,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads a Parquet file, returning the result as a [[DataFrame]].
+ * Loads a Parquet file, returning the result as a `DataFrame`.
*
* You can set the following Parquet-specific option(s) for reading Parquet files:
*
@@ -442,7 +441,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads an ORC file and returns the result as a [[DataFrame]].
+ * Loads an ORC file and returns the result as a `DataFrame`.
*
* @param path input path
* @since 1.5.0
@@ -454,7 +453,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads an ORC file and returns the result as a [[DataFrame]].
+ * Loads an ORC file and returns the result as a `DataFrame`.
*
* @param paths input paths
* @since 2.0.0
@@ -464,7 +463,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
def orc(paths: String*): DataFrame = format("orc").load(paths: _*)
/**
- * Returns the specified table as a [[DataFrame]].
+ * Returns the specified table as a `DataFrame`.
*
* @since 1.4.0
*/
@@ -475,7 +474,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named
+ * Loads text files and returns a `DataFrame` whose schema starts with a string column named
* "value", and followed by partitioned columns if there are any. See the documentation on
* the other overloaded `text()` method for more details.
*
@@ -487,7 +486,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named
+ * Loads text files and returns a `DataFrame` whose schema starts with a string column named
* "value", and followed by partitioned columns if there are any.
*
* Each line in the text files is a new row in the resulting DataFrame. For example:
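As a hedged example of the `text` reader, assuming an active SparkSession `spark` and a placeholder path:
{{{
// Each line of the input becomes one row in a single string column named "value".
val lines = spark.read.text("/path/to/README.md")   // placeholder path
lines.printSchema()                                  // root |-- value: string (nullable = true)
lines.show(5, truncate = false)
}}}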
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index a9a861c4635b2..f48ddd54d3650 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -44,7 +44,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* of `x` is close to (p * N).
* More precisely,
*
- * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+ * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
*
* This method implements a variation of the Greenwald-Khanna algorithm (with some speed
* optimizations).
@@ -55,7 +55,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param probabilities a list of quantile probabilities
* Each number must belong to [0, 1].
* For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
- * @param relativeError The relative target precision to achieve (>= 0).
+ * @param relativeError The relative target precision to achieve (>= 0).
* If set to zero, the exact quantiles are computed, which could be very expensive.
* Note that values greater than 1 are accepted but give the same result as 1.
* @return the approximate quantiles at the given probabilities
@@ -254,7 +254,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* and Papadimitriou.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
- * backward compatibility of the schema of the resulting [[DataFrame]].
+ * backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
@@ -299,7 +299,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* Uses a `default` support of 1%.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
- * backward compatibility of the schema of the resulting [[DataFrame]].
+ * backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
@@ -317,7 +317,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* its fraction as zero.
* @param seed random seed
* @tparam T stratum type
- * @return a new [[DataFrame]] that represents the stratified sample
+ * @return a new `DataFrame` that represents the stratified sample
*
* {{{
* val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
@@ -354,7 +354,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* its fraction as zero.
* @param seed random seed
* @tparam T stratum type
- * @return a new [[DataFrame]] that represents the stratified sample
+ * @return a new `DataFrame` that represents the stratified sample
*
* @since 1.5.0
*/
@@ -369,7 +369,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param depth depth of the sketch
* @param width width of the sketch
* @param seed random seed
- * @return a [[CountMinSketch]] over column `colName`
+ * @return a `CountMinSketch` over column `colName`
* @since 2.0.0
*/
def countMinSketch(colName: String, depth: Int, width: Int, seed: Int): CountMinSketch = {
@@ -383,7 +383,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param eps relative error of the sketch
* @param confidence confidence of the sketch
* @param seed random seed
- * @return a [[CountMinSketch]] over column `colName`
+ * @return a `CountMinSketch` over column `colName`
* @since 2.0.0
*/
def countMinSketch(
@@ -398,7 +398,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param depth depth of the sketch
* @param width width of the sketch
* @param seed random seed
- * @return a [[CountMinSketch]] over column `colName`
+ * @return a `CountMinSketch` over column `colName`
* @since 2.0.0
*/
def countMinSketch(col: Column, depth: Int, width: Int, seed: Int): CountMinSketch = {
@@ -412,7 +412,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param eps relative error of the sketch
* @param confidence confidence of the sketch
* @param seed random seed
- * @return a [[CountMinSketch]] over column `colName`
+ * @return a `CountMinSketch` over column `colName`
* @since 2.0.0
*/
def countMinSketch(col: Column, eps: Double, confidence: Double, seed: Int): CountMinSketch = {
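A short sketch of these approximate-statistics helpers, assuming a hypothetical DataFrame `df` with a numeric "age" column:
{{{
// Approximate quantiles: each result's rank is within relativeError * N of the target rank.
val Array(q1, median, q3) = df.stat.approxQuantile("age", Array(0.25, 0.5, 0.75), 0.01)

// Count-min sketch over the same column, with 1% relative error and 95% confidence.
val sketch = df.stat.countMinSketch("age", eps = 0.01, confidence = 0.95, seed = 42)
println(sketch.estimateCount(30))   // approximate number of rows with age = 30
}}}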
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 2d863422fbabe..fc699095ad719 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types.StructType
/**
* Interface used to write a [[Dataset]] to external storage systems (e.g. file systems,
- * key-value stores, etc). Use [[Dataset.write]] to access this.
+ * key-value stores, etc). Use `Dataset.write` to access this.
*
* @since 1.4.0
*/
@@ -189,7 +189,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] at the specified path.
+ * Saves the content of the `DataFrame` at the specified path.
*
* @since 1.4.0
*/
@@ -199,7 +199,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] as the specified table.
+ * Saves the content of the `DataFrame` as the specified table.
*
* @since 1.4.0
*/
@@ -215,8 +215,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
dataSource.write(mode, df)
}
/**
- * Inserts the content of the [[DataFrame]] to the specified table. It requires that
- * the schema of the [[DataFrame]] is the same as the schema of the table.
+ * Inserts the content of the `DataFrame` to the specified table. It requires that
+ * the schema of the `DataFrame` is the same as the schema of the table.
*
* @note Unlike `saveAsTable`, `insertInto` ignores the column names and just uses position-based
* resolution. For example:
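A hedged illustration of that position-based behaviour, assuming `import spark.implicits._` and a catalog-backed session:
{{{
// saveAsTable matches columns by name; insertInto matches them strictly by position.
Seq((1, 2)).toDF("i", "j").write.mode("overwrite").saveAsTable("t1")
Seq((3, 4)).toDF("j", "i").write.insertInto("t1")   // stored as (i = 3, j = 4) despite the names
spark.sql("SELECT * FROM t1").show()
}}}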
@@ -322,15 +322,15 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] as the specified table.
+ * Saves the content of the `DataFrame` as the specified table.
*
* In the case the table already exists, behavior of this function depends on the
* save mode, specified by the `mode` function (default to throwing an exception).
- * When `mode` is `Overwrite`, the schema of the [[DataFrame]] does not need to be
+ * When `mode` is `Overwrite`, the schema of the `DataFrame` does not need to be
* the same as that of the existing table.
*
* When `mode` is `Append`, if there is an existing table, we will use the format and options of
- * the existing table. The column order in the schema of the [[DataFrame]] doesn't need to be same
+ * the existing table. The column order in the schema of the `DataFrame` doesn't need to be the same
* as that of the existing table. Unlike `insertInto`, `saveAsTable` will use the column names to
* find the correct column positions. For example:
*
@@ -346,7 +346,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* +---+---+
* }}}
*
- * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input
+ * When the DataFrame is created from a non-partitioned `HadoopFsRelation` with a single input
* path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC
* and Parquet), the table is persisted in a Hive compatible format, which means other systems
* like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL
@@ -401,7 +401,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] to an external database table via JDBC. In the case the
+ * Saves the content of the `DataFrame` to an external database table via JDBC. In the case the
* table already exists in the external database, behavior of this function depends on the
* save mode, specified by the `mode` function (default to throwing an exception).
*
@@ -442,7 +442,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] in JSON format (
+ * Saves the content of the `DataFrame` in JSON format (
* JSON Lines text format or newline-delimited JSON) at the specified path.
* This is equivalent to:
* {{{
@@ -469,7 +469,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] in Parquet format at the specified path.
+ * Saves the content of the `DataFrame` in Parquet format at the specified path.
* This is equivalent to:
* {{{
* format("parquet").save(path)
@@ -490,7 +490,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] in ORC format at the specified path.
+ * Saves the content of the `DataFrame` in ORC format at the specified path.
* This is equivalent to:
* {{{
* format("orc").save(path)
@@ -511,7 +511,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] in a text file at the specified path.
+ * Saves the content of the `DataFrame` in a text file at the specified path.
* The DataFrame must have only one column that is of string type.
* Each row becomes a new line in the output file. For example:
* {{{
@@ -536,7 +536,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
}
/**
- * Saves the content of the [[DataFrame]] in CSV format at the specified path.
+ * Saves the content of the `DataFrame` in CSV format at the specified path.
* This is equivalent to:
* {{{
* format("csv").save(path)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 7ba6ffce278cf..127a31a756cba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -472,8 +472,8 @@ class Dataset[T] private[sql](
/**
* Returns true if this Dataset contains one or more sources that continuously
* return data as it arrives. A Dataset that reads data from a streaming source
- * must be executed as a [[StreamingQuery]] using the `start()` method in
- * [[DataStreamWriter]]. Methods that return a single answer, e.g. `count()` or
+ * must be executed as a `StreamingQuery` using the `start()` method in
+ * `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or
* `collect()`, will throw an [[AnalysisException]] when there is a streaming
* source present.
*
@@ -685,7 +685,7 @@ class Dataset[T] private[sql](
def stat: DataFrameStatFunctions = new DataFrameStatFunctions(toDF())
/**
- * Join with another [[DataFrame]].
+ * Join with another `DataFrame`.
*
* Behaves as an INNER JOIN and requires a subsequent join predicate.
*
@@ -699,7 +699,7 @@ class Dataset[T] private[sql](
}
/**
- * Inner equi-join with another [[DataFrame]] using the given column.
+ * Inner equi-join with another `DataFrame` using the given column.
*
* Different from other join functions, the join column will only appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
@@ -713,7 +713,7 @@ class Dataset[T] private[sql](
* @param usingColumn Name of the column to join on. This column must exist on both sides.
*
* @note If you perform a self-join using this function without aliasing the input
- * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since
+ * `DataFrame`s, you will NOT be able to reference any columns after the join, since
* there is no way to disambiguate which side of the join you would like to reference.
*
* @group untypedrel
@@ -724,7 +724,7 @@ class Dataset[T] private[sql](
}
/**
- * Inner equi-join with another [[DataFrame]] using the given columns.
+ * Inner equi-join with another `DataFrame` using the given columns.
*
* Different from other join functions, the join columns will only appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
@@ -738,7 +738,7 @@ class Dataset[T] private[sql](
* @param usingColumns Names of the columns to join on. This columns must exist on both sides.
*
* @note If you perform a self-join using this function without aliasing the input
- * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since
+ * `DataFrame`s, you will NOT be able to reference any columns after the join, since
* there is no way to disambiguate which side of the join you would like to reference.
*
* @group untypedrel
@@ -749,7 +749,7 @@ class Dataset[T] private[sql](
}
/**
- * Equi-join with another [[DataFrame]] using the given columns.
+ * Equi-join with another `DataFrame` using the given columns.
*
* Different from other join functions, the join columns will only appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
@@ -759,7 +759,7 @@ class Dataset[T] private[sql](
* @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `leftsemi`.
*
* @note If you perform a self-join using this function without aliasing the input
- * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since
+ * `DataFrame`s, you will NOT be able to reference any columns after the join, since
* there is no way to disambiguate which side of the join you would like to reference.
*
* @group untypedrel
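A brief sketch of the equi-join variants and the aliasing workaround mentioned in the note, with hypothetical DataFrames `df1` and `df2` that share an "id" column:
{{{
// USING-style joins: "id" appears only once in the output.
val inner = df1.join(df2, Seq("id"))
val left  = df1.join(df2, Seq("id"), "left_outer")

// For a self-join, alias both sides first so their columns stay addressable afterwards.
val a = df1.as("a")
val b = df1.as("b")
val self = a.join(b, a("id") === b("parent_id"))    // "parent_id" is a hypothetical column
}}}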
@@ -782,7 +782,7 @@ class Dataset[T] private[sql](
}
/**
- * Inner join with another [[DataFrame]], using the given join expression.
+ * Inner join with another `DataFrame`, using the given join expression.
*
* {{{
* // The following two are equivalent:
@@ -796,7 +796,7 @@ class Dataset[T] private[sql](
def join(right: Dataset[_], joinExprs: Column): DataFrame = join(right, joinExprs, "inner")
/**
- * Join with another [[DataFrame]], using the given join expression. The following performs
+ * Join with another `DataFrame`, using the given join expression. The following performs
* a full outer join between `df1` and `df2`.
*
* {{{
@@ -860,7 +860,7 @@ class Dataset[T] private[sql](
}
/**
- * Explicit cartesian join with another [[DataFrame]].
+ * Explicit cartesian join with another `DataFrame`.
*
* @param right Right side of the join operation.
*
@@ -875,7 +875,7 @@ class Dataset[T] private[sql](
/**
* :: Experimental ::
- * Joins this Dataset returning a [[Tuple2]] for each pair where `condition` evaluates to
+ * Joins this Dataset returning a `Tuple2` for each pair where `condition` evaluates to
* true.
*
* This is similar to the relation `join` function with one important difference in the
@@ -956,7 +956,7 @@ class Dataset[T] private[sql](
/**
* :: Experimental ::
- * Using inner equi-join to join this Dataset returning a [[Tuple2]] for each pair
+ * Using inner equi-join to join this Dataset returning a `Tuple2` for each pair
* where `condition` evaluates to true.
*
* @param other Right side of the join.
@@ -2232,7 +2232,7 @@ class Dataset[T] private[sql](
}
/**
- * Returns a new [[DataFrame]] that contains the result of applying a serialized R function
+ * Returns a new `DataFrame` that contains the result of applying a serialized R function
* `func` to each partition.
*/
private[sql] def mapPartitionsInR(
@@ -2446,7 +2446,7 @@ class Dataset[T] private[sql](
/**
* Returns a new Dataset that has exactly `numPartitions` partitions.
- * Similar to coalesce defined on an [[RDD]], this operation results in a narrow dependency, e.g.
+ * Similar to coalesce defined on an `RDD`, this operation results in a narrow dependency, e.g.
* if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of
* the 100 new partitions will claim 10 of the current partitions.
*
@@ -2536,7 +2536,7 @@ class Dataset[T] private[sql](
def unpersist(): this.type = unpersist(blocking = false)
/**
- * Represents the content of the Dataset as an [[RDD]] of [[T]].
+ * Represents the content of the Dataset as an `RDD` of [[T]].
*
* @group basic
* @since 1.6.0
@@ -2550,14 +2550,14 @@ class Dataset[T] private[sql](
}
/**
- * Returns the content of the Dataset as a [[JavaRDD]] of [[T]]s.
+ * Returns the content of the Dataset as a `JavaRDD` of [[T]]s.
* @group basic
* @since 1.6.0
*/
def toJavaRDD: JavaRDD[T] = rdd.toJavaRDD()
/**
- * Returns the content of the Dataset as a [[JavaRDD]] of [[T]]s.
+ * Returns the content of the Dataset as a `JavaRDD` of [[T]]s.
* @group basic
* @since 1.6.0
*/
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala
index 1163035e315fc..b94ad59fa2f6e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala
@@ -18,11 +18,10 @@
package org.apache.spark.sql
import org.apache.spark.annotation.{Experimental, InterfaceStability}
-import org.apache.spark.sql.streaming.StreamingQuery
/**
* :: Experimental ::
- * A class to consume data generated by a [[StreamingQuery]]. Typically this is used to send the
+ * A class to consume data generated by a `StreamingQuery`. Typically this is used to send the
* generated data to external systems. Each partition will use a new deserialized instance, so you
* usually should do all the initialization (e.g. opening a connection or initiating a transaction)
* in the `open` method.
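A rough sketch of a concrete writer honouring that contract (the sink is just the console here):
{{{
import org.apache.spark.sql.ForeachWriter

class ConsoleSinkWriter extends ForeachWriter[String] {
  // Called once per partition and epoch; return false to skip processing this partition.
  override def open(partitionId: Long, version: Long): Boolean = {
    // open a connection or start a transaction here
    true
  }

  // Called for every record of the partition.
  override def process(value: String): Unit = {
    println(s"record: $value")
  }

  // Called when the partition is finished, with the error if one occurred.
  override def close(errorOrNull: Throwable): Unit = {
    // commit and release resources here
  }
}
}}}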
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index d5940c638acdb..fbeebb9c2a5fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -474,7 +474,7 @@ object functions {
/**
* Aggregate function: returns the level of grouping, equals to
*
- * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
+ * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
*
* @note The list of columns should match with grouping columns exactly, or empty (means all the
* grouping columns).
@@ -487,7 +487,7 @@ object functions {
/**
* Aggregate function: returns the level of grouping, equals to
*
- * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
+ * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
*
* @note The list of columns should match with grouping columns exactly.
*
@@ -1048,9 +1048,9 @@ object functions {
* within each partition in the lower 33 bits. The assumption is that the data frame has
* less than 1 billion partitions, and each partition has less than 8 billion records.
*
- * As an example, consider a [[DataFrame]] with two partitions, each with 3 records.
+ * As an example, consider a `DataFrame` with two partitions, each with 3 records.
* This expression would return the following IDs:
- * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+ * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
*
* @group normal_funcs
* @since 1.4.0
@@ -1066,9 +1066,9 @@ object functions {
* within each partition in the lower 33 bits. The assumption is that the data frame has
* less than 1 billion partitions, and each partition has less than 8 billion records.
*
- * As an example, consider a [[DataFrame]] with two partitions, each with 3 records.
+ * As an example, consider a `DataFrame` with two partitions, each with 3 records.
* This expression would return the following IDs:
- * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+ * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
*
* @group normal_funcs
* @since 1.6.0
@@ -1184,7 +1184,7 @@ object functions {
/**
* Creates a new struct column.
- * If the input column is a column in a [[DataFrame]], or a derived column expression
+ * If the input column is a column in a `DataFrame`, or a derived column expression
* that is named (i.e. aliased), its name would be remained as the StructField's name,
* otherwise, the newly generated StructField's name would be auto generated as col${index + 1},
* i.e. col1, col2, col3, ...
@@ -1846,8 +1846,8 @@ object functions {
def round(e: Column): Column = round(e, 0)
/**
- * Round the value of `e` to `scale` decimal places if `scale` >= 0
- * or at integral part when `scale` < 0.
+ * Round the value of `e` to `scale` decimal places if `scale` >= 0
+ * or at integral part when `scale` < 0.
*
* @group math_funcs
* @since 1.5.0
@@ -1864,7 +1864,7 @@ object functions {
/**
* Round the value of `e` to `scale` decimal places with HALF_EVEN round mode
- * if `scale` >= 0 or at integral part when `scale` < 0.
+ * if `scale` >= 0 or at integral part when `scale` < 0.
*
* @group math_funcs
* @since 2.0.0
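Tying the two documented behaviours together, a small sketch assuming a hypothetical DataFrame `df` with a Double "price" column:
{{{
import org.apache.spark.sql.functions.{col, monotonically_increasing_id, round}

val withIds = df
  .withColumn("row_id", monotonically_increasing_id())     // unique and monotonic within a partition
  .withColumn("price_2dp", round(col("price"), 2))          // scale >= 0: decimal places
  .withColumn("price_hundreds", round(col("price"), -2))    // scale < 0: rounds the integral part
}}}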
From ff17c3a03f681947a6bc2729bd29fd14f07fe20d Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 00:38:32 +0900
Subject: [PATCH 03/17] Fix errors (third round)
---
.../scala/org/apache/spark/Accumulator.scala | 2 +-
.../scala/org/apache/spark/SparkConf.scala | 2 +-
.../org/apache/spark/TaskEndReason.scala | 2 +-
.../spark/scheduler/InputFormatInfo.scala | 2 +-
.../spark/streaming/kafka/KafkaCluster.scala | 8 +--
.../spark/streaming/kafka/KafkaUtils.scala | 18 +++---
.../spark/streaming/kafka/OffsetRange.scala | 2 +-
.../stat/test/KolmogorovSmirnovTest.scala | 3 +-
.../spark/mllib/stat/test/StreamingTest.scala | 6 +-
.../mllib/stat/test/StreamingTestMethod.scala | 4 +-
.../spark/sql/InternalOutputModes.scala | 2 +-
.../main/scala/org/apache/spark/sql/Row.scala | 2 +-
.../spark/sql/DataFrameStatFunctions.scala | 6 +-
.../scala/org/apache/spark/sql/Dataset.scala | 6 +-
.../spark/sql/KeyValueGroupedDataset.scala | 8 +--
.../spark/sql/RelationalGroupedDataset.scala | 28 ++++-----
.../org/apache/spark/sql/RuntimeConfig.scala | 4 +-
.../org/apache/spark/sql/SparkSession.scala | 60 +++++++++----------
.../apache/spark/sql/UDFRegistration.scala | 2 +-
.../org/apache/spark/sql/functions.scala | 16 ++---
20 files changed, 92 insertions(+), 91 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala
index 9d1f1d59dbce1..1a45e15cdc728 100644
--- a/core/src/main/scala/org/apache/spark/Accumulator.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulator.scala
@@ -26,7 +26,7 @@ package org.apache.spark
*
* An accumulator is created from an initial value `v` by calling
* [[SparkContext#accumulator SparkContext.accumulator]].
- * Tasks running on the cluster can then add to it using the [[Accumulable#+= +=]] operator.
+ * Tasks running on the cluster can then add to it using the [[Accumulable.+=]] operator.
* However, they cannot read its value. Only the driver program can read the accumulator's value,
* using its [[#value]] method.
*
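A minimal sketch of that pattern against the pre-2.0 API documented here, assuming a SparkContext `sc`:
{{{
// Driver side: create the accumulator from an initial value.
val errorCount = sc.accumulator(0, "errorCount")

// Executor side: tasks may only add to it.
sc.parallelize(1 to 100).foreach { x =>
  if (x % 10 == 0) errorCount += 1
}

// Only the driver can read the final value.
println(errorCount.value)   // 10
}}}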
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 04d657c09afd0..bc8010eca2e1b 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -262,7 +262,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a time parameter as seconds; throws a NoSuchElementException if it's not set. If no
* suffix is provided then seconds are assumed.
- * @throws NoSuchElementException
+ * @note Throws `NoSuchElementException`
*/
def getTimeAsSeconds(key: String): Long = {
Utils.timeStringAsSeconds(get(key))
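For illustration, with suffixed and unsuffixed values (the key name is arbitrary):
{{{
import org.apache.spark.SparkConf

val conf = new SparkConf(false)
conf.set("spark.network.timeout", "2m")
conf.getTimeAsSeconds("spark.network.timeout")   // 120

conf.set("spark.network.timeout", "90")          // no suffix: interpreted as seconds
conf.getTimeAsSeconds("spark.network.timeout")   // 90

// conf.getTimeAsSeconds("missing.key")          // would throw NoSuchElementException
}}}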
diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
index 7ca3c103dbf5b..7745387dbceba 100644
--- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala
+++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
@@ -65,7 +65,7 @@ sealed trait TaskFailedReason extends TaskEndReason {
/**
* :: DeveloperApi ::
- * A [[org.apache.spark.scheduler.ShuffleMapTask]] that completed successfully earlier, but we
+ * A `org.apache.spark.scheduler.ShuffleMapTask` that completed successfully earlier, but we
* lost the executor before the stage completed. This means Spark needs to reschedule the task
* to be re-executed on a different executor.
*/
diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala
index a6b032cc0084c..5f23d657e1155 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala
@@ -153,7 +153,7 @@ object InputFormatInfo {
a) For each host, count number of splits hosted on that host.
b) Decrement the currently allocated containers on that host.
- c) Compute rack info for each host and update rack -> count map based on (b).
+ c) Compute rack info for each host and update rack -> count map based on (b).
d) Allocate nodes based on (c)
e) On the allocation result, ensure that we don't allocate "too many" jobs on a single node
(even if data locality on that is very high) : this is to prevent fragility of job if a
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
index 35acb7b09f12b..c419221aa607a 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
@@ -231,7 +231,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
// this 0 here indicates api version, in this case the original ZK backed api.
private def defaultConsumerApiVersion: Short = 0
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
def getConsumerOffsets(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
@@ -250,7 +250,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
}
}
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
def getConsumerOffsetMetadata(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
@@ -287,7 +287,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
Left(errs)
}
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
def setConsumerOffsets(
groupId: String,
offsets: Map[TopicAndPartition, Long]
@@ -305,7 +305,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
setConsumerOffsetMetadata(groupId, meta, consumerApiVersion)
}
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
def setConsumerOffsetMetadata(
groupId: String,
metadata: Map[TopicAndPartition, OffsetAndMetadata]
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
index 56f0cb0b166a2..59f4e408569f6 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
@@ -47,7 +47,7 @@ object KafkaUtils {
* @param ssc StreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..)
* @param groupId The group id for this consumer
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
* in its own thread
* @param storageLevel Storage level to use for storing the received objects
* (default: StorageLevel.MEMORY_AND_DISK_SER_2)
@@ -72,7 +72,7 @@ object KafkaUtils {
* @param ssc StreamingContext object
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel Storage level to use for storing the received objects
* @tparam K type of Kafka message key
@@ -97,7 +97,7 @@ object KafkaUtils {
* @param jssc JavaStreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..)
* @param groupId The group id for this consumer
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
* in its own thread
* @return DStream of (Kafka message key, Kafka message value)
*/
@@ -115,7 +115,7 @@ object KafkaUtils {
* @param jssc JavaStreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..).
* @param groupId The group id for this consumer.
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel RDD storage level.
* @return DStream of (Kafka message key, Kafka message value)
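A compact sketch of the receiver-based `createStream` call these parameters describe; the quorum, group id and topic names are placeholders, and `ssc` is an existing StreamingContext:
{{{
import org.apache.spark.streaming.kafka.KafkaUtils

// "events" is consumed with 2 threads, "metrics" with 1, all inside a single receiver.
val topics = Map("events" -> 2, "metrics" -> 1)
val stream = KafkaUtils.createStream(
  ssc,
  "zk1:2181,zk2:2181",        // placeholder ZooKeeper quorum
  "my-consumer-group",        // placeholder group id
  topics)

stream.map(_._2).print()      // message values only
}}}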
@@ -140,7 +140,7 @@ object KafkaUtils {
* @param valueDecoderClass Type of kafka value decoder
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
* in its own thread
* @param storageLevel RDD storage level.
* @tparam K type of Kafka message key
@@ -396,7 +396,7 @@ object KafkaUtils {
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
- * in the [[StreamingContext]]. The information on consumed offset can be
+ * in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every records is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
@@ -448,7 +448,7 @@ object KafkaUtils {
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
- * in the [[StreamingContext]]. The information on consumed offset can be
+ * in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every records is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
@@ -499,7 +499,7 @@ object KafkaUtils {
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
- * in the [[StreamingContext]]. The information on consumed offset can be
+ * in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every records is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
@@ -565,7 +565,7 @@ object KafkaUtils {
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
- * in the [[StreamingContext]]. The information on consumed offset can be
+ * in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every records is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala
index d9b856e4697a0..10d364f987405 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala
@@ -22,7 +22,7 @@ import kafka.common.TopicAndPartition
/**
* Represents any object that has a collection of [[OffsetRange]]s. This can be used to access the
* offset ranges in RDDs generated by the direct Kafka DStream (see
- * [[KafkaUtils.createDirectStream()]]).
+ * `KafkaUtils.createDirectStream()`).
* {{{
* KafkaUtils.createDirectStream(...).foreachRDD { rdd =>
* val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
index a8b5955a7285d..d17f7047c5b2b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -31,7 +31,8 @@ import org.apache.spark.rdd.RDD
* distribution of the sample data and the theoretical distribution we can provide a test for the
* the null hypothesis that the sample data comes from that theoretical distribution.
* For more information on KS Test:
- * @see [[https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test]]
+ * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test">
+ * Kolmogorov-Smirnov test (Wikipedia)</a>
*
* Implementation note: We seek to implement the KS test with a minimal number of distributed
* passes. We sort the RDD, and then perform the following operations on a per-partition basis:
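At the usage level (separate from the implementation note above), a sketch assuming a SparkContext `sc` and made-up sample values:
{{{
import org.apache.spark.mllib.stat.Statistics

val sample = sc.parallelize(Seq(0.1, -0.4, 0.8, 1.2, -0.3, 0.05))   // made-up data
// Test the sample against a standard normal distribution N(0, 1).
val result = Statistics.kolmogorovSmirnovTest(sample, "norm", 0.0, 1.0)
println(result.statistic)
println(result.pValue)
}}}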
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala
index 97c032de7a813..d680237bf687f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala
@@ -47,7 +47,7 @@ case class BinarySample @Since("1.6.0") (
* of the observation.
*
* To address novelty affects, the `peacePeriod` specifies a set number of initial
- * [[org.apache.spark.rdd.RDD]] batches of the [[DStream]] to be dropped from significance testing.
+ * [[org.apache.spark.rdd.RDD]] batches of the `DStream` to be dropped from significance testing.
*
* The `windowSize` sets the number of batches each significance test is to be performed over. The
* window is sliding with a stride length of 1 batch. Setting windowSize to 0 will perform
@@ -97,7 +97,7 @@ class StreamingTest @Since("1.6.0") () extends Logging with Serializable {
}
/**
- * Register a [[DStream]] of values for significance testing.
+ * Register a `DStream` of values for significance testing.
*
* @param data stream of BinarySample(key,value) pairs where the key denotes group membership
* (true = experiment, false = control) and the value is the numerical metric to
@@ -114,7 +114,7 @@ class StreamingTest @Since("1.6.0") () extends Logging with Serializable {
}
/**
- * Register a [[JavaDStream]] of values for significance testing.
+ * Register a `JavaDStream` of values for significance testing.
*
* @param data stream of BinarySample(isExperiment,value) pairs where the isExperiment denotes
* group (true = experiment, false = control) and the value is the numerical metric
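A hedged sketch of registering such a stream; `pairs` is only a placeholder for a real `DStream[BinarySample]`:
{{{
import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest}
import org.apache.spark.streaming.dstream.DStream

val pairs: DStream[BinarySample] = ???   // placeholder stream of (isExperiment, metric) samples

val test = new StreamingTest()
  .setPeacePeriod(10)        // drop the first 10 batches to dampen novelty effects
  .setWindowSize(0)          // 0 = cumulative test over all batches seen so far
  .setTestMethod("welch")    // or "student"

val results = test.registerStream(pairs)
results.print()
}}}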
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala
index ff27f28459e26..14ac14d6d61f4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala
@@ -73,7 +73,7 @@ private[stat] sealed trait StreamingTestMethod extends Serializable {
* This test does not assume equal variance between the two samples and does not assume equal
* sample size.
*
- * @see http://en.wikipedia.org/wiki/Welch%27s_t_test
+ * @see <a href="http://en.wikipedia.org/wiki/Welch%27s_t_test">Welch's t-test (Wikipedia)</a>
*/
private[stat] object WelchTTest extends StreamingTestMethod with Logging {
@@ -115,7 +115,7 @@ private[stat] object WelchTTest extends StreamingTestMethod with Logging {
* mean. This test assumes equal variance between the two samples and does not assume equal sample
* size. For unequal variances, Welch's t-test should be used instead.
*
- * @see http://en.wikipedia.org/wiki/Student%27s_t-test
+ * @see <a href="http://en.wikipedia.org/wiki/Student%27s_t-test">Student's t-test (Wikipedia)</a>
*/
private[stat] object StudentTTest extends StreamingTestMethod with Logging {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala
index 153f9f57faf42..594c41c2c7446 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql
import org.apache.spark.sql.streaming.OutputMode
/**
- * Internal helper class to generate objects representing various [[OutputMode]]s,
+ * Internal helper class to generate objects representing various `OutputMode`s,
*/
private[sql] object InternalOutputModes {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index a821d2ca34579..71c5151c74478 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -74,7 +74,7 @@ object Row {
* It is invalid to use the native primitive interface to retrieve a value that is null, instead a
* user must check `isNullAt` before attempting to retrieve a value that might be null.
*
- * To create a new Row, use [[RowFactory.create()]] in Java or [[Row.apply()]] in Scala.
+ * To create a new Row, use `RowFactory.create()` in Java or `Row.apply()` in Scala.
*
* A [[Row]] object can be constructed by providing field values. Example:
* {{{
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index f48ddd54d3650..f27ca9aeb9235 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.types._
import org.apache.spark.util.sketch.{BloomFilter, CountMinSketch}
/**
- * Statistic functions for [[DataFrame]]s.
+ * Statistic functions for `DataFrame`s.
*
* @since 1.4.0
*/
@@ -189,7 +189,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* The `support` should be greater than 1e-4.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
- * backward compatibility of the schema of the resulting [[DataFrame]].
+ * backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols the names of the columns to search frequent items in.
* @param support The minimum frequency for an item to be considered `frequent`. Should be greater
@@ -236,7 +236,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* Uses a `default` support of 1%.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
- * backward compatibility of the schema of the resulting [[DataFrame]].
+ * backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 127a31a756cba..fcc02e5eb3ef9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -68,7 +68,7 @@ private[sql] object Dataset {
/**
* A Dataset is a strongly typed collection of domain-specific objects that can be transformed
* in parallel using functional or relational operations. Each Dataset also has an untyped view
- * called a [[DataFrame]], which is a Dataset of [[Row]].
+ * called a `DataFrame`, which is a Dataset of [[Row]].
*
* Operations available on Datasets are divided into transformations and actions. Transformations
* are the ones that produce new Datasets, and actions are the ones that trigger computation and
@@ -363,7 +363,7 @@ class Dataset[T] private[sql](
* - When `U` is a tuple, the columns will be be mapped by ordinal (i.e. the first column will
* be assigned to `_1`).
* - When `U` is a primitive type (i.e. String, Int, etc), then the first column of the
- * [[DataFrame]] will be used.
+ * `DataFrame` will be used.
*
* If the schema of the Dataset does not match the desired `U` type, you can use `select`
* along with `alias` or `as` to rearrange or rename as required.
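A brief sketch of that conversion, assuming `import spark.implicits._` is in scope (the class and values are illustrative):
{{{
case class Person(id: Int, name: String)

val df = Seq((1, "Alice"), (2, "Bob")).toDF("id", "name")   // untyped DataFrame
val ds = df.as[Person]                                      // columns are matched by name
ds.map(_.name).show()
}}}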
@@ -377,7 +377,7 @@ class Dataset[T] private[sql](
/**
* Converts this strongly typed collection of data to generic `DataFrame` with columns renamed.
- * This can be quite convenient in conversion from an RDD of tuples into a [[DataFrame]] with
+ * This can be quite convenient in conversion from an RDD of tuples into a `DataFrame` with
* meaningful names. For example:
* {{{
* val rdd: RDD[(Int, String)] = ...
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala
index 31ce8eb25e808..395d709f26591 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala
@@ -131,7 +131,7 @@ class KeyValueGroupedDataset[K, V] private[sql](
* This function does not support partial aggregation, and as a result requires shuffling all
* the data in the [[Dataset]]. If an application intends to perform an aggregation over each
* key, it is best to use the reduce function or an
- * [[org.apache.spark.sql.expressions#Aggregator Aggregator]].
+ * `org.apache.spark.sql.expressions#Aggregator`.
*
* Internally, the implementation will spill to disk if any given group is too large to fit into
* memory. However, users must take care to avoid materializing the whole iterator for a group
@@ -160,7 +160,7 @@ class KeyValueGroupedDataset[K, V] private[sql](
* This function does not support partial aggregation, and as a result requires shuffling all
* the data in the [[Dataset]]. If an application intends to perform an aggregation over each
* key, it is best to use the reduce function or an
- * [[org.apache.spark.sql.expressions#Aggregator Aggregator]].
+ * `org.apache.spark.sql.expressions#Aggregator`.
*
* Internally, the implementation will spill to disk if any given group is too large to fit into
* memory. However, users must take care to avoid materializing the whole iterator for a group
@@ -182,7 +182,7 @@ class KeyValueGroupedDataset[K, V] private[sql](
* This function does not support partial aggregation, and as a result requires shuffling all
* the data in the [[Dataset]]. If an application intends to perform an aggregation over each
* key, it is best to use the reduce function or an
- * [[org.apache.spark.sql.expressions#Aggregator Aggregator]].
+ * `org.apache.spark.sql.expressions#Aggregator`.
*
* Internally, the implementation will spill to disk if any given group is too large to fit into
* memory. However, users must take care to avoid materializing the whole iterator for a group
@@ -205,7 +205,7 @@ class KeyValueGroupedDataset[K, V] private[sql](
* This function does not support partial aggregation, and as a result requires shuffling all
* the data in the [[Dataset]]. If an application intends to perform an aggregation over each
* key, it is best to use the reduce function or an
- * [[org.apache.spark.sql.expressions#Aggregator Aggregator]].
+ * `org.apache.spark.sql.expressions#Aggregator`.
*
* Internally, the implementation will spill to disk if any given group is too large to fit into
* memory. However, users must take care to avoid materializing the whole iterator for a group
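A short sketch of this shuffle-heavy path and the partial-aggregation alternative, assuming `import spark.implicits._` (the case class and data are illustrative):
{{{
case class Purchase(user: String, amount: Double)

val purchases = Seq(Purchase("a", 3.0), Purchase("a", 2.5), Purchase("b", 1.0)).toDS()

// mapGroups shuffles all the data for each key to one task.
val totals = purchases
  .groupByKey(_.user)
  .mapGroups { (user, rows) => (user, rows.map(_.amount).sum) }

// reduceGroups (or an Aggregator) keeps the aggregation partial and is usually preferable.
val totals2 = purchases
  .groupByKey(_.user)
  .reduceGroups((a, b) => Purchase(a.user, a.amount + b.amount))
}}}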
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
index f019d1e9daceb..0b1e191a1cd99 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
@@ -129,7 +129,7 @@ class RelationalGroupedDataset protected[sql](
/**
* (Scala-specific) Compute aggregates by specifying the column names and
- * aggregate methods. The resulting [[DataFrame]] will also contain the grouping columns.
+ * aggregate methods. The resulting `DataFrame` will also contain the grouping columns.
*
* The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`.
* {{{
@@ -150,7 +150,7 @@ class RelationalGroupedDataset protected[sql](
/**
* (Scala-specific) Compute aggregates by specifying a map from column name to
- * aggregate methods. The resulting [[DataFrame]] will also contain the grouping columns.
+ * aggregate methods. The resulting `DataFrame` will also contain the grouping columns.
*
* The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`.
* {{{
@@ -171,7 +171,7 @@ class RelationalGroupedDataset protected[sql](
/**
* (Java-specific) Compute aggregates by specifying a map from column name to
- * aggregate methods. The resulting [[DataFrame]] will also contain the grouping columns.
+ * aggregate methods. The resulting `DataFrame` will also contain the grouping columns.
*
* The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`.
* {{{
@@ -228,7 +228,7 @@ class RelationalGroupedDataset protected[sql](
/**
* Count the number of rows for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
+ * The resulting `DataFrame` will also contain the grouping columns.
*
* @since 1.3.0
*/
@@ -236,7 +236,7 @@ class RelationalGroupedDataset protected[sql](
/**
* Compute the average value for each numeric columns for each group. This is an alias for `avg`.
- * The resulting [[DataFrame]] will also contain the grouping columns.
+ * The resulting `DataFrame` will also contain the grouping columns.
* When specified columns are given, only compute the average values for them.
*
* @since 1.3.0
@@ -248,7 +248,7 @@ class RelationalGroupedDataset protected[sql](
/**
* Compute the max value for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
+ * The resulting `DataFrame` will also contain the grouping columns.
* When specified columns are given, only compute the max values for them.
*
* @since 1.3.0
@@ -260,7 +260,7 @@ class RelationalGroupedDataset protected[sql](
/**
* Compute the mean value for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
+ * The resulting `DataFrame` will also contain the grouping columns.
* When specified columns are given, only compute the mean values for them.
*
* @since 1.3.0
@@ -272,7 +272,7 @@ class RelationalGroupedDataset protected[sql](
/**
* Compute the min value for each numeric column for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
+ * The resulting `DataFrame` will also contain the grouping columns.
* When specified columns are given, only compute the min values for them.
*
* @since 1.3.0
@@ -284,7 +284,7 @@ class RelationalGroupedDataset protected[sql](
/**
* Compute the sum for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
+ * The resulting `DataFrame` will also contain the grouping columns.
* When specified columns are given, only compute the sum for them.
*
* @since 1.3.0
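As a quick illustration of the aggregate methods documented in the hunks above (the `df` and column names are hypothetical):
{{{
  df.groupBy("department").count()
  df.groupBy("department").avg("salary")
  df.groupBy("department").agg("salary" -> "avg", "age" -> "max")
}}}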
@@ -295,7 +295,7 @@ class RelationalGroupedDataset protected[sql](
}
/**
- * Pivots a column of the current [[DataFrame]] and perform the specified aggregation.
+ * Pivots a column of the current `DataFrame` and performs the specified aggregation.
* There are two versions of pivot function: one that requires the caller to specify the list
* of distinct values to pivot on, and one that does not. The latter is more concise but less
* efficient, because Spark needs to first compute the list of distinct values internally.
@@ -335,7 +335,7 @@ class RelationalGroupedDataset protected[sql](
}
/**
- * Pivots a column of the current [[DataFrame]] and perform the specified aggregation.
+ * Pivots a column of the current `DataFrame` and performs the specified aggregation.
* There are two versions of pivot function: one that requires the caller to specify the list
* of distinct values to pivot on, and one that does not. The latter is more concise but less
* efficient, because Spark needs to first compute the list of distinct values internally.
@@ -367,7 +367,7 @@ class RelationalGroupedDataset protected[sql](
}
/**
- * Pivots a column of the current [[DataFrame]] and perform the specified aggregation.
+ * Pivots a column of the current `DataFrame` and performs the specified aggregation.
* There are two versions of pivot function: one that requires the caller to specify the list
* of distinct values to pivot on, and one that does not. The latter is more concise but less
* efficient, because Spark needs to first compute the list of distinct values internally.
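A short usage sketch of the two `pivot` variants described above, borrowing the course/earnings example from the Spark API docs:
{{{
  // Explicit pivot values: no extra pass to discover the distinct values.
  df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")

  // Without explicit values: Spark first computes the distinct values of "course".
  df.groupBy("year").pivot("course").sum("earnings")
}}}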
@@ -392,12 +392,12 @@ class RelationalGroupedDataset protected[sql](
* Applies the given serialized R function `func` to each group of data. For each unique group,
* the function will be passed the group key and an iterator that contains all of the elements in
* the group. The function can return an iterator containing elements of an arbitrary type which
- * will be returned as a new [[DataFrame]].
+ * will be returned as a new `DataFrame`.
*
* This function does not support partial aggregation, and as a result requires shuffling all
* the data in the [[Dataset]]. If an application intends to perform an aggregation over each
* key, it is best to use the reduce function or an
- * [[org.apache.spark.sql.expressions#Aggregator Aggregator]].
+ * `org.apache.spark.sql.expressions#Aggregator`.
*
* Internally, the implementation will spill to disk if any given group is too large to fit into
* memory. However, users must take care to avoid materializing the whole iterator for a group
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala
index 9108d19d0a0c2..43684abc13629 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
/**
- * Runtime configuration interface for Spark. To access this, use [[SparkSession.conf]].
+ * Runtime configuration interface for Spark. To access this, use `SparkSession.conf`.
*
* Options set here are automatically propagated to the Hadoop configuration during I/O.
*
@@ -65,7 +65,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) {
/**
* Returns the value of Spark runtime configuration property for the given key.
*
- * @throws NoSuchElementException if the key is not set and does not have a default value
+ * @note Throws `NoSuchElementException` if the key is not set and does not have a default value
* @since 2.0.0
*/
@throws[NoSuchElementException]("if the key is not set")
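For reference, a tiny sketch of getting and setting runtime configuration through `SparkSession.conf` (the key and the `spark` variable are illustrative):
{{{
  spark.conf.set("spark.sql.shuffle.partitions", "200")
  spark.conf.get("spark.sql.shuffle.partitions")   // "200"
  spark.conf.get("some.unset.key")                 // throws NoSuchElementException
}}}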
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 71b1880dc0715..08d74ac0185b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -93,7 +93,7 @@ class SparkSession private(
* ----------------------- */
/**
- * State shared across sessions, including the [[SparkContext]], cached data, listener,
+ * State shared across sessions, including the `SparkContext`, cached data, listener,
* and a catalog that interacts with external systems.
*/
@transient
@@ -125,7 +125,7 @@ class SparkSession private(
*
* This is the interface through which the user can get and set all Spark and Hadoop
* configurations that are relevant to Spark SQL. When getting the value of a config,
- * this defaults to the value set in the underlying [[SparkContext]], if any.
+ * this defaults to the value set in the underlying `SparkContext`, if any.
*
* @since 2.0.0
*/
@@ -189,8 +189,8 @@ class SparkSession private(
/**
* :: Experimental ::
- * Returns a [[StreamingQueryManager]] that allows managing all the
- * [[StreamingQuery StreamingQueries]] active on `this`.
+ * Returns a `StreamingQueryManager` that allows managing all the
+ * `StreamingQuery`s active on `this`.
*
* @since 2.0.0
*/
@@ -200,9 +200,9 @@ class SparkSession private(
/**
* Start a new session with isolated SQL configurations, temporary tables, registered
- * functions are isolated, but sharing the underlying [[SparkContext]] and cached data.
+ * functions are isolated, but sharing the underlying `SparkContext` and cached data.
*
- * @note Other than the [[SparkContext]], all shared state is initialized lazily.
+ * @note Other than the `SparkContext`, all shared state is initialized lazily.
* This method will force the initialization of the shared state to ensure that parent
* and child sessions are set up with the same shared state. If the underlying catalog
* implementation is Hive, this will initialize the metastore, which may take some time.
@@ -219,7 +219,7 @@ class SparkSession private(
* --------------------------------- */
/**
- * Returns a [[DataFrame]] with no rows or columns.
+ * Returns a `DataFrame` with no rows or columns.
*
* @since 2.0.0
*/
@@ -243,7 +243,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, tuples).
+ * Creates a `DataFrame` from an RDD of Product (e.g. case classes, tuples).
*
* @since 2.0.0
*/
@@ -257,7 +257,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Creates a [[DataFrame]] from a local Seq of Product.
+ * Creates a `DataFrame` from a local Seq of Product.
*
* @since 2.0.0
*/
@@ -272,7 +272,7 @@ class SparkSession private(
/**
* :: DeveloperApi ::
- * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s using the given schema.
+ * Creates a `DataFrame` from an `RDD` containing [[Row]]s using the given schema.
* It is important to make sure that the structure of every [[Row]] of the provided RDD matches
* the provided schema. Otherwise, there will be runtime exception.
* Example:
@@ -309,7 +309,7 @@ class SparkSession private(
/**
* :: DeveloperApi ::
- * Creates a [[DataFrame]] from a [[JavaRDD]] containing [[Row]]s using the given schema.
+ * Creates a `DataFrame` from a `JavaRDD` containing [[Row]]s using the given schema.
* It is important to make sure that the structure of every [[Row]] of the provided RDD matches
* the provided schema. Otherwise, there will be runtime exception.
*
@@ -323,7 +323,7 @@ class SparkSession private(
/**
* :: DeveloperApi ::
- * Creates a [[DataFrame]] from a [[java.util.List]] containing [[Row]]s using the given schema.
+ * Creates a `DataFrame` from a [[java.util.List]] containing [[Row]]s using the given schema.
* It is important to make sure that the structure of every [[Row]] of the provided List matches
* the provided schema. Otherwise, there will be runtime exception.
*
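A minimal sketch of building a `DataFrame` from rows plus an explicit schema, as the docs above describe (field names and data are made up):
{{{
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.types._

  val schema = StructType(Seq(
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)))
  val rows = spark.sparkContext.parallelize(Seq(Row("Alice", 30), Row("Bob", 25)))
  val people = spark.createDataFrame(rows, schema)
}}}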
@@ -381,7 +381,7 @@ class SparkSession private(
}
/**
- * Convert a [[BaseRelation]] created for external data sources into a [[DataFrame]].
+ * Convert a `BaseRelation` created for external data sources into a `DataFrame`.
*
* @since 2.0.0
*/
@@ -470,7 +470,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements
+ * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements
* in a range from 0 to `end` (exclusive) with step value 1.
*
* @since 2.0.0
@@ -481,7 +481,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements
+ * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements
* in a range from `start` to `end` (exclusive) with step value 1.
*
* @since 2.0.0
@@ -494,7 +494,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements
+ * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements
* in a range from `start` to `end` (exclusive) with a step value.
*
* @since 2.0.0
@@ -507,7 +507,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements
+ * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements
* in a range from `start` to `end` (exclusive) with a step value, with partition number
* specified.
*
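The `range` overloads above differ only in which of start/end/step/partitions they take, e.g. (values arbitrary):
{{{
  spark.range(100)            // ids 0..99, step 1
  spark.range(10, 20)         // ids 10..19
  spark.range(0, 100, 5)      // ids 0, 5, ..., 95
  spark.range(0, 100, 5, 4)   // same values, spread over 4 partitions
}}}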
@@ -520,7 +520,7 @@ class SparkSession private(
}
/**
- * Creates a [[DataFrame]] from an RDD[Row].
+ * Creates a `DataFrame` from an RDD[Row].
* User can specify whether the input rows should be converted to Catalyst rows.
*/
private[sql] def internalCreateDataFrame(
@@ -533,7 +533,7 @@ class SparkSession private(
}
/**
- * Creates a [[DataFrame]] from an RDD[Row].
+ * Creates a `DataFrame` from an RDD[Row].
* User can specify whether the input rows should be converted to Catalyst rows.
*/
private[sql] def createDataFrame(
@@ -566,7 +566,7 @@ class SparkSession private(
@transient lazy val catalog: Catalog = new CatalogImpl(self)
/**
- * Returns the specified table as a [[DataFrame]].
+ * Returns the specified table as a `DataFrame`.
*
* @since 2.0.0
*/
@@ -583,7 +583,7 @@ class SparkSession private(
* ----------------- */
/**
- * Executes a SQL query using Spark, returning the result as a [[DataFrame]].
+ * Executes a SQL query using Spark, returning the result as a `DataFrame`.
* The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect'.
*
* @since 2.0.0
@@ -594,7 +594,7 @@ class SparkSession private(
/**
* Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a
- * [[DataFrame]].
+ * `DataFrame`.
* {{{
* sparkSession.read.parquet("/path/to/file.parquet")
* sparkSession.read.schema(schema).json("/path/to/file.json")
@@ -606,7 +606,7 @@ class SparkSession private(
/**
* :: Experimental ::
- * Returns a [[DataStreamReader]] that can be used to read streaming data in as a [[DataFrame]].
+ * Returns a `DataStreamReader` that can be used to read streaming data in as a `DataFrame`.
* {{{
* sparkSession.readStream.parquet("/path/to/directory/of/parquet/files")
* sparkSession.readStream.schema(schema).json("/path/to/directory/of/json/files")
@@ -624,7 +624,7 @@ class SparkSession private(
/**
* :: Experimental ::
* (Scala-specific) Implicit methods available in Scala for converting
- * common Scala objects into [[DataFrame]]s.
+ * common Scala objects into `DataFrame`s.
*
* {{{
* val sparkSession = SparkSession.builder.getOrCreate()
@@ -641,7 +641,7 @@ class SparkSession private(
// scalastyle:on
/**
- * Stop the underlying [[SparkContext]].
+ * Stop the underlying `SparkContext`.
*
* @since 2.0.0
*/
@@ -726,7 +726,7 @@ object SparkSession {
/**
* Sets a config option. Options set using this method are automatically propagated to
- * both [[SparkConf]] and SparkSession's own configuration.
+ * both `SparkConf` and SparkSession's own configuration.
*
* @since 2.0.0
*/
@@ -737,7 +737,7 @@ object SparkSession {
/**
* Sets a config option. Options set using this method are automatically propagated to
- * both [[SparkConf]] and SparkSession's own configuration.
+ * both `SparkConf` and SparkSession's own configuration.
*
* @since 2.0.0
*/
@@ -748,7 +748,7 @@ object SparkSession {
/**
* Sets a config option. Options set using this method are automatically propagated to
- * both [[SparkConf]] and SparkSession's own configuration.
+ * both `SparkConf` and SparkSession's own configuration.
*
* @since 2.0.0
*/
@@ -759,7 +759,7 @@ object SparkSession {
/**
* Sets a config option. Options set using this method are automatically propagated to
- * both [[SparkConf]] and SparkSession's own configuration.
+ * both `SparkConf` and SparkSession's own configuration.
*
* @since 2.0.0
*/
@@ -769,7 +769,7 @@ object SparkSession {
}
/**
- * Sets a list of config options based on the given [[SparkConf]].
+ * Sets a list of config options based on the given `SparkConf`.
*
* @since 2.0.0
*/
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
index 6043c5ee14b54..c8be89c646957 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
@@ -38,7 +38,7 @@ import org.apache.spark.sql.types.{DataType, DataTypes}
import org.apache.spark.util.Utils
/**
- * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this.
+ * Functions for registering user-defined functions. Use `SQLContext.udf` to access this.
*
* @note The user-defined functions must be deterministic.
*
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index fbeebb9c2a5fe..93e7229b20c1f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2172,7 +2172,7 @@ object functions {
* and returns the result as a string column.
*
* If d is 0, the result has no decimal point or fractional part.
- * If d < 0, the result will be null.
+ * If d &lt; 0, the result will be null.
*
* @group string_funcs
* @since 1.5.0
@@ -2888,7 +2888,7 @@ object functions {
}
/**
- * (Scala-specific) Parses a column containing a JSON string into a [[StructType]] with the
+ * (Scala-specific) Parses a column containing a JSON string into a `StructType` with the
* specified schema. Returns `null`, in the case of an unparseable string.
*
* @param e a string column containing JSON data.
@@ -2904,7 +2904,7 @@ object functions {
}
/**
- * (Java-specific) Parses a column containing a JSON string into a [[StructType]] with the
+ * (Java-specific) Parses a column containing a JSON string into a `StructType` with the
* specified schema. Returns `null`, in the case of an unparseable string.
*
* @param e a string column containing JSON data.
@@ -2919,7 +2919,7 @@ object functions {
from_json(e, schema, options.asScala.toMap)
/**
- * Parses a column containing a JSON string into a [[StructType]] with the specified schema.
+ * Parses a column containing a JSON string into a `StructType` with the specified schema.
* Returns `null`, in the case of an unparseable string.
*
* @param e a string column containing JSON data.
@@ -2932,7 +2932,7 @@ object functions {
from_json(e, schema, Map.empty[String, String])
/**
- * Parses a column containing a JSON string into a [[StructType]] with the specified schema.
+ * Parses a column containing a JSON string into a `StructType` with the specified schema.
* Returns `null`, in the case of an unparseable string.
*
* @param e a string column containing JSON data.
@@ -2946,7 +2946,7 @@ object functions {
/**
- * (Scala-specific) Converts a column containing a [[StructType]] into a JSON string with the
+ * (Scala-specific) Converts a column containing a `StructType` into a JSON string with the
* specified schema. Throws an exception, in the case of an unsupported type.
*
* @param e a struct column.
@@ -2961,7 +2961,7 @@ object functions {
}
/**
- * (Java-specific) Converts a column containing a [[StructType]] into a JSON string with the
+ * (Java-specific) Converts a column containing a `StructType` into a JSON string with the
* specified schema. Throws an exception, in the case of an unsupported type.
*
* @param e a struct column.
@@ -2975,7 +2975,7 @@ object functions {
to_json(e, options.asScala.toMap)
/**
- * Converts a column containing a [[StructType]] into a JSON string with the
+ * Converts a column containing a `StructType` into a JSON string with the
* specified schema. Throws an exception, in the case of an unsupported type.
*
* @param e a struct column.
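A brief sketch of the `from_json`/`to_json` pair documented above (the `df`, column name, and schema are hypothetical):
{{{
  import org.apache.spark.sql.functions._
  import org.apache.spark.sql.types._

  val schema = new StructType().add("a", IntegerType).add("b", StringType)
  val parsed = df.select(from_json(col("json"), schema).as("parsed"))  // null for unparseable rows
  val back   = parsed.select(to_json(col("parsed")).as("json"))
}}}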
From 1593545d367009c17b3c271e23ae35723af9bd46 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 00:39:20 +0900
Subject: [PATCH 04/17] Fix the linelength style in KafkaUtils.scala
---
.../scala/org/apache/spark/streaming/kafka/KafkaUtils.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
index 59f4e408569f6..437c797e55605 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
@@ -72,8 +72,8 @@ object KafkaUtils {
* @param ssc StreamingContext object
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
- * in its own thread.
+ * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is
+ * consumed in its own thread.
* @param storageLevel Storage level to use for storing the received objects
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
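For reference, the `topics` map ties each topic name to the number of consumer threads for it; a hedged sketch (addresses, group id, and topic names are placeholders):
{{{
  val stream = KafkaUtils.createStream(
    ssc,
    "zkhost:2181",                   // ZooKeeper quorum
    "my-consumer-group",             // consumer group id
    Map("events" -> 2, "logs" -> 1)) // topic -> number of receiver threads
}}}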
From 558d5e3982bd7af552fa6f5a1b581355f7b316ff Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 01:41:44 +0900
Subject: [PATCH 05/17] Fix errors (fourth round)
---
.../scala/org/apache/spark/Accumulator.scala | 2 +-
.../scala/org/apache/spark/SparkConf.scala | 10 ++---
.../scala/org/apache/spark/SparkContext.scala | 14 +++----
.../scala/org/apache/spark/TaskContext.scala | 4 +-
.../scala/org/apache/spark/TestUtils.scala | 2 +-
.../apache/spark/rdd/DoubleRDDFunctions.scala | 2 +-
.../org/apache/spark/rdd/HadoopRDD.scala | 2 +-
.../scala/org/apache/spark/rdd/JdbcRDD.scala | 6 +--
.../org/apache/spark/rdd/NewHadoopRDD.scala | 2 +-
.../apache/spark/rdd/PairRDDFunctions.scala | 16 ++++----
.../apache/spark/rdd/RDDCheckpointData.scala | 2 +-
.../apache/spark/rdd/coalesce-public.scala | 4 +-
.../spark/storage/BlockManagerMessages.scala | 2 +-
.../spark/util/random/SamplingUtils.scala | 16 ++++----
.../util/random/StratifiedSamplingUtils.scala | 8 ++--
.../org/apache/spark/ml/param/params.scala | 38 +++++++++----------
.../spark/mllib/classification/SVM.scala | 2 +-
.../BinaryClassificationMetrics.scala | 5 ++-
.../linalg/EigenValueDecomposition.scala | 2 +-
.../apache/spark/mllib/linalg/Vectors.scala | 4 +-
.../spark/mllib/random/RandomRDDs.scala | 8 ++--
.../spark/mllib/tree/DecisionTree.scala | 6 +--
.../mllib/tree/GradientBoostedTrees.scala | 6 +--
.../spark/mllib/tree/RandomForest.scala | 16 ++++----
.../spark/sql/RelationalGroupedDataset.scala | 2 +-
.../apache/spark/sql/sources/interfaces.scala | 12 +++---
.../apache/spark/streaming/StateSpec.scala | 2 +-
.../streaming/api/java/JavaPairDStream.scala | 4 +-
.../api/java/JavaStreamingContext.scala | 2 +-
.../dstream/PairDStreamFunctions.scala | 4 +-
30 files changed, 104 insertions(+), 101 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala
index 1a45e15cdc728..bcf157078813d 100644
--- a/core/src/main/scala/org/apache/spark/Accumulator.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulator.scala
@@ -26,7 +26,7 @@ package org.apache.spark
*
* An accumulator is created from an initial value `v` by calling
* [[SparkContext#accumulator SparkContext.accumulator]].
- * Tasks running on the cluster can then add to it using the [[Accumulable.+=]] operator.
+ * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]].
* However, they cannot read its value. Only the driver program can read the accumulator's value,
* using its [[#value]] method.
*
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index bc8010eca2e1b..3f5b19eda2cc3 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -279,7 +279,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no
* suffix is provided then milliseconds are assumed.
- * @throws NoSuchElementException
+ * @note Throws `NoSuchElementException`
*/
def getTimeAsMs(key: String): Long = {
Utils.timeStringAsMs(get(key))
@@ -296,7 +296,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then bytes are assumed.
- * @throws NoSuchElementException
+ * @note Throws `NoSuchElementException`
*/
def getSizeAsBytes(key: String): Long = {
Utils.byteStringAsBytes(get(key))
@@ -320,7 +320,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then Kibibytes are assumed.
- * @throws NoSuchElementException
+ * @note Throws `NoSuchElementException`
*/
def getSizeAsKb(key: String): Long = {
Utils.byteStringAsKb(get(key))
@@ -337,7 +337,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then Mebibytes are assumed.
- * @throws NoSuchElementException
+ * @note Throws `NoSuchElementException`
*/
def getSizeAsMb(key: String): Long = {
Utils.byteStringAsMb(get(key))
@@ -354,7 +354,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then Gibibytes are assumed.
- * @throws NoSuchElementException
+ * @note Throws `NoSuchElementException`
*/
def getSizeAsGb(key: String): Long = {
Utils.byteStringAsGb(get(key))
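These getters parse suffixed time/size strings; a small illustration (the keys are made up, not real Spark settings):
{{{
  val conf = new SparkConf()
    .set("example.timeout", "120s")
    .set("example.buffer.size", "64k")

  conf.getTimeAsMs("example.timeout")     // 120000
  conf.getSizeAsKb("example.buffer.size") // 64
  conf.getTimeAsMs("missing.key")         // throws NoSuchElementException
}}}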
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 1261e3e735761..872c46ab689e1 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -645,7 +645,7 @@ class SparkContext(config: SparkConf) extends Logging {
/**
* Get a local property set in this thread, or null if it is missing. See
- * [[org.apache.spark.SparkContext.setLocalProperty]].
+ * `org.apache.spark.SparkContext.setLocalProperty`.
*/
def getLocalProperty(key: String): String =
Option(localProperties.get).map(_.getProperty(key)).orNull
@@ -663,7 +663,7 @@ class SparkContext(config: SparkConf) extends Logging {
* Application programmers can use this method to group all those jobs together and give a
* group description. Once set, the Spark web UI will associate such jobs with this group.
*
- * The application can also use [[org.apache.spark.SparkContext.cancelJobGroup]] to cancel all
+ * The application can also use `org.apache.spark.SparkContext.cancelJobGroup` to cancel all
* running jobs in this group. For example,
* {{{
* // In the main thread:
@@ -1384,7 +1384,7 @@ class SparkContext(config: SparkConf) extends Logging {
}
/**
- * Create and register a [[CollectionAccumulator]], which starts with empty list and accumulates
+ * Create and register a `CollectionAccumulator`, which starts with empty list and accumulates
* inputs by adding them into the list.
*/
def collectionAccumulator[T]: CollectionAccumulator[T] = {
@@ -1394,7 +1394,7 @@ class SparkContext(config: SparkConf) extends Logging {
}
/**
- * Create and register a [[CollectionAccumulator]], which starts with empty list and accumulates
+ * Create and register a `CollectionAccumulator`, which starts with empty list and accumulates
* inputs by adding them into the list.
*/
def collectionAccumulator[T](name: String): CollectionAccumulator[T] = {
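Usage sketch for the collection accumulator described above (the `sc` and `rdd` values are assumed; only the driver should read `value`):
{{{
  val names = sc.collectionAccumulator[String]("names")
  rdd.foreach(x => names.add(x.toString))
  names.value   // java.util.List[String], read on the driver
}}}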
@@ -2043,7 +2043,7 @@ class SparkContext(config: SparkConf) extends Logging {
}
/**
- * Cancel active jobs for the specified group. See [[org.apache.spark.SparkContext.setJobGroup]]
+ * Cancel active jobs for the specified group. See `org.apache.spark.SparkContext.setJobGroup`
* for more information.
*/
def cancelJobGroup(groupId: String) {
@@ -2061,7 +2061,7 @@ class SparkContext(config: SparkConf) extends Logging {
* Cancel a given job if it's scheduled or running.
*
* @param jobId the job ID to cancel
- * @throws InterruptedException if the cancel message cannot be sent
+ * @note Throws `InterruptedException` if the cancel message cannot be sent
*/
def cancelJob(jobId: Int) {
dagScheduler.cancelJob(jobId)
@@ -2071,7 +2071,7 @@ class SparkContext(config: SparkConf) extends Logging {
* Cancel a given stage and all jobs associated with it.
*
* @param stageId the stage ID to cancel
- * @throws InterruptedException if the cancel message cannot be sent
+ * @note Throws `InterruptedException` if the cancel message cannot be sent
*/
def cancelStage(stageId: Int) {
dagScheduler.cancelStage(stageId)
diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala
index 27abccf5ac2a9..0fd777ed12829 100644
--- a/core/src/main/scala/org/apache/spark/TaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -164,7 +164,7 @@ abstract class TaskContext extends Serializable {
/**
* Get a local property set upstream in the driver, or null if it is missing. See also
- * [[org.apache.spark.SparkContext.setLocalProperty]].
+ * `org.apache.spark.SparkContext.setLocalProperty`.
*/
def getLocalProperty(key: String): String
@@ -174,7 +174,7 @@ abstract class TaskContext extends Serializable {
/**
* ::DeveloperApi::
* Returns all metrics sources with the given name which are associated with the instance
- * which runs the task. For more information see [[org.apache.spark.metrics.MetricsSystem!]].
+ * which runs the task. For more information see `org.apache.spark.metrics.MetricsSystem`.
*/
@DeveloperApi
def getMetricsSources(sourceName: String): Seq[Source]
diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala
index 871b9d1ad575b..2909191bd6f14 100644
--- a/core/src/main/scala/org/apache/spark/TestUtils.scala
+++ b/core/src/main/scala/org/apache/spark/TestUtils.scala
@@ -186,7 +186,7 @@ private[spark] object TestUtils {
/**
- * A [[SparkListener]] that detects whether spills have occurred in Spark jobs.
+ * A `SparkListener` that detects whether spills have occurred in Spark jobs.
*/
private class SpillListener extends SparkListener {
private val stageIdToTaskMetrics = new mutable.HashMap[Int, ArrayBuffer[TaskMetrics]]
diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
index f3ab324d59119..f4bc3e3021447 100644
--- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
@@ -155,7 +155,7 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
* to the right except for the last which is closed
* e.g. for the array
* [1, 10, 20, 50] the buckets are [1, 10) [10, 20) [20, 50]
- * e.g 1<=x<10 , 10<=x<20, 20<=x<=50
+ * e.g 1&lt;=x&lt;10 , 10&lt;=x&lt;20, 20&lt;=x&lt;=50
* And on the input of 1 and 50 we would have a histogram of 1, 0, 1
*
* @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index 86351b8c575e5..ae4320d4583d6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -96,7 +96,7 @@ private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: Inp
* @param minPartitions Minimum number of HadoopRDD partitions (Hadoop Splits) to generate.
*
* @note Instantiating this class directly is not recommended, please use
- * [[org.apache.spark.SparkContext.hadoopRDD()]]
+ * `org.apache.spark.SparkContext.hadoopRDD()`
*/
@DeveloperApi
class HadoopRDD[K, V](
diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala
index 0970b98071675..3197c57d1c4ad 100644
--- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala
@@ -41,7 +41,7 @@ private[spark] class JdbcPartition(idx: Int, val lower: Long, val upper: Long) e
* The RDD takes care of closing the connection.
* @param sql the text of the query.
* The query must contain two ? placeholders for parameters used to partition the results.
- * E.g. "select title, author from books where ? <= id and id <= ?"
+ * E.g. "select title, author from books where ? &lt;= id and id &lt;= ?"
* @param lowerBound the minimum value of the first placeholder
* @param upperBound the maximum value of the second placeholder
* The lower and upper bounds are inclusive.
@@ -151,7 +151,7 @@ object JdbcRDD {
* The RDD takes care of closing the connection.
* @param sql the text of the query.
* The query must contain two ? placeholders for parameters used to partition the results.
- * E.g. "select title, author from books where ? <= id and id <= ?"
+ * E.g. "select title, author from books where ? &lt;= id and id &lt;= ?"
* @param lowerBound the minimum value of the first placeholder
* @param upperBound the maximum value of the second placeholder
* The lower and upper bounds are inclusive.
@@ -191,7 +191,7 @@ object JdbcRDD {
* The RDD takes care of closing the connection.
* @param sql the text of the query.
* The query must contain two ? placeholders for parameters used to partition the results.
+ * E.g. "select title, author from books where ? &lt;= id and id &lt;= ?"
+ * E.g. "select title, author from books where ? <= id and id <= ?"
* @param lowerBound the minimum value of the first placeholder
* @param upperBound the maximum value of the second placeholder
* The lower and upper bounds are inclusive.
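A minimal constructor sketch matching the parameter docs above; the JDBC URL, table, and mapping function are placeholders, and the query keeps the two `?` partition-bound placeholders:
{{{
  import java.sql.DriverManager
  import org.apache.spark.rdd.JdbcRDD

  val books = new JdbcRDD(
    sc,
    () => DriverManager.getConnection("jdbc:derby:books_db"),
    "SELECT title, author FROM books WHERE ? <= id AND id <= ?",
    1, 100, 3,   // lowerBound, upperBound, numPartitions
    rs => (rs.getString(1), rs.getString(2)))
}}}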
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index a5965f597038d..c783e1375283a 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -63,7 +63,7 @@ private[spark] class NewHadoopPartition(
* @param valueClass Class of the value associated with the inputFormatClass.
*
* @note Instantiating this class directly is not recommended, please use
- * [[org.apache.spark.SparkContext.newAPIHadoopRDD()]]
+ * `org.apache.spark.SparkContext.newAPIHadoopRDD()`
*/
@DeveloperApi
class NewHadoopRDD[K, V](
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 33e695ec5322b..ab8582f3e44cf 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -399,7 +399,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* here.
*
- * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p`
+ * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero {{{ sp > p }}}
* would trigger sparse representation of registers, which may reduce the memory consumption
* and increase accuracy when the cardinality is small.
*
@@ -492,8 +492,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* each time the resulting RDD is evaluated.
*
* @note This operation may be very expensive. If you are grouping in order to perform an
- * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]]
- * or [[PairRDDFunctions.reduceByKey]] will provide much better performance.
+ * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey`
+ * or `PairRDDFunctions.reduceByKey` will provide much better performance.
*
* @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any
* key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]].
@@ -516,8 +516,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* each group is not guaranteed, and may even differ each time the resulting RDD is evaluated.
*
* @note This operation may be very expensive. If you are grouping in order to perform an
- * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]]
- * or [[PairRDDFunctions.reduceByKey]] will provide much better performance.
+ * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey`
+ * or `PairRDDFunctions.reduceByKey` will provide much better performance.
*
* @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any
* key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]].
@@ -637,8 +637,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* evaluated.
*
* @note This operation may be very expensive. If you are grouping in order to perform an
- * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]]
- * or [[PairRDDFunctions.reduceByKey]] will provide much better performance.
+ * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey`
+ * or `PairRDDFunctions.reduceByKey` will provide much better performance.
*/
def groupByKey(): RDD[(K, Iterable[V])] = self.withScope {
groupByKey(defaultPartitioner(self))
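A brief sketch of the note above: the same per-key sum written with `groupByKey` and with `reduceByKey` (the latter combines map-side and is usually far cheaper):
{{{
  val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

  // Materializes every value per key before summing:
  pairs.groupByKey().mapValues(_.sum)

  // Aggregates map-side first; preferred for sums and averages:
  pairs.reduceByKey(_ + _)
}}}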
@@ -908,7 +908,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Return an RDD with the pairs from `this` whose keys are not in `other`.
*
* Uses `this` partitioner/partition size, because even if `other` is huge, the resulting
- * RDD will be <= us.
+ * RDD will be &lt;= us.
*/
def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)] = self.withScope {
subtractByKey(other, self.partitioner.getOrElse(new HashPartitioner(self.partitions.length)))
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala
index 1070bb96b2524..9f70d79c52cb4 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala
@@ -23,7 +23,7 @@ import org.apache.spark.Partition
/**
* Enumeration to manage state transitions of an RDD through checkpointing
- * [ Initialized --> checkpointing in progress --> checkpointed ].
+ * [ Initialized --&gt; checkpointing in progress --&gt; checkpointed ].
*/
private[spark] object CheckpointState extends Enumeration {
type CheckpointState = Value
diff --git a/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala b/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala
index d8a80aa5aeb15..e00bc22aba44d 100644
--- a/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala
@@ -35,14 +35,14 @@ trait PartitionCoalescer {
* @param maxPartitions the maximum number of partitions to have after coalescing
* @param parent the parent RDD whose partitions to coalesce
* @return an array of [[PartitionGroup]]s, where each element is itself an array of
- * [[Partition]]s and represents a partition after coalescing is performed.
+ * `Partition`s and represents a partition after coalescing is performed.
*/
def coalesce(maxPartitions: Int, parent: RDD[_]): Array[PartitionGroup]
}
/**
* ::DeveloperApi::
- * A group of [[Partition]]s
+ * A group of `Partition`s
* @param prefLoc preferred location for the partition group
*/
@DeveloperApi
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
index 6bded92700504..ce82e43b2d58b 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
@@ -43,7 +43,7 @@ private[spark] object BlockManagerMessages {
extends ToBlockManagerSlave
/**
- * Driver -> Executor message to trigger a thread dump.
+ * Driver -&gt; Executor message to trigger a thread dump.
*/
case object TriggerThreadDump extends ToBlockManagerSlave
diff --git a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala
index f98932a470165..1099747444be5 100644
--- a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala
@@ -67,17 +67,17 @@ private[spark] object SamplingUtils {
}
/**
- * Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of
+ * Returns a sampling rate that guarantees a sample of size &gt;= sampleSizeLowerBound 99.99% of
* the time.
*
* How the sampling rate is determined:
* Let p = num / total, where num is the sample size and total is the total number of
- * datapoints in the RDD. We're trying to compute q > p such that
- * - when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),
- * where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),
- * i.e. the failure rate of not having a sufficiently large sample < 0.0001.
+ * datapoints in the RDD. We're trying to compute q &gt; p such that
+ * - when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), where
+ * we want to guarantee Pr[s &lt; num] &lt; 0.0001 for s = sum(prob_i for i from 0 to total),
+ * i.e. the failure rate of not having a sufficiently large sample &lt; 0.0001.
* Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for
- * num > 12, but we need a slightly larger q (9 empirically determined).
+ * num &gt; 12, but we need a slightly larger q (9 empirically determined).
* - when sampling without replacement, we're drawing each datapoint with prob_i
* ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
* rate, where success rate is defined the same as in sampling with replacement.
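As a rough illustration only of the rule quoted in this comment (not the actual private implementation, which also consults `PoissonBounds`), the oversampling factor can be sketched as:
{{{
  // q = p + 9 * sqrt(p / total), using the empirically determined constant mentioned above.
  def illustrativeFraction(sampleSizeLowerBound: Int, total: Long): Double = {
    val p = sampleSizeLowerBound.toDouble / total
    p + 9.0 * math.sqrt(p / total)
  }
}}}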
@@ -108,14 +108,14 @@ private[spark] object SamplingUtils {
private[spark] object PoissonBounds {
/**
- * Returns a lambda such that Pr[X > s] is very small, where X ~ Pois(lambda).
+ * Returns a lambda such that Pr[X &gt; s] is very small, where X ~ Pois(lambda).
*/
def getLowerBound(s: Double): Double = {
math.max(s - numStd(s) * math.sqrt(s), 1e-15)
}
/**
- * Returns a lambda such that Pr[X < s] is very small, where X ~ Pois(lambda).
+ * Returns a lambda such that Pr[X &lt; s] is very small, where X ~ Pois(lambda).
*
* @param s sample size
*/
diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
index 67822749112c6..83547072a08b2 100644
--- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
@@ -160,9 +160,11 @@ private[spark] object StratifiedSamplingUtils extends Logging {
*
* To do so, we compute sampleSize = math.ceil(size * samplingRate) for each stratum and compare
* it to the number of items that were accepted instantly and the number of items in the waitlist
- * for that stratum. Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted),
- * which means we need to sort the elements in the waitlist by their associated values in order
- * to find the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize.
+ * for that stratum.
+ *
+ * Most of the time, numAccepted &lt;= sampleSize &lt;= (numAccepted + numWaitlisted), which
+ * means we need to sort the elements in the waitlist by their associated values in order to find
+ * the value T s.t. |{elements in the stratum whose associated values &lt;= T}| = sampleSize.
* Note that all elements in the waitlist have values >= bound for instant accept, so a T value
* in the waitlist range would allow all elements that were instantly accepted on the first pass
* to be included in the sample.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 96206e0b7ad88..4850a9e43f91c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -87,7 +87,7 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
def ->(value: T): ParamPair[T] = ParamPair(this, value)
// scalastyle:on
- /** Encodes a param value into JSON, which can be decoded by [[jsonDecode()]]. */
+ /** Encodes a param value into JSON, which can be decoded by `jsonDecode()`. */
def jsonEncode(value: T): String = {
value match {
case x: String =>
@@ -140,7 +140,7 @@ private[ml] object Param {
/**
* :: DeveloperApi ::
- * Factory methods for common validation functions for [[Param.isValid]].
+ * Factory methods for common validation functions for `Param.isValid`.
* The numerical methods only support Int, Long, Float, and Double.
*/
@DeveloperApi
@@ -165,32 +165,32 @@ object ParamValidators {
s" of unexpected input type: ${value.getClass}")
}
- /** Check if value > lowerBound */
+ /** Check if value &gt; lowerBound */
def gt[T](lowerBound: Double): T => Boolean = { (value: T) =>
getDouble(value) > lowerBound
}
- /** Check if value >= lowerBound */
+ /** Check if value &gt;= lowerBound */
def gtEq[T](lowerBound: Double): T => Boolean = { (value: T) =>
getDouble(value) >= lowerBound
}
- /** Check if value < upperBound */
+ /** Check if value &lt; upperBound */
def lt[T](upperBound: Double): T => Boolean = { (value: T) =>
getDouble(value) < upperBound
}
- /** Check if value <= upperBound */
+ /** Check if value &lt;= upperBound */
def ltEq[T](upperBound: Double): T => Boolean = { (value: T) =>
getDouble(value) <= upperBound
}
/**
* Check for value in range lowerBound to upperBound.
- * @param lowerInclusive If true, check for value >= lowerBound.
- * If false, check for value > lowerBound.
- * @param upperInclusive If true, check for value <= upperBound.
- * If false, check for value < upperBound.
+ * @param lowerInclusive If true, check for value &gt;= lowerBound.
+ * If false, check for value &gt; lowerBound.
+ * @param upperInclusive If true, check for value &lt;= upperBound.
+ * If false, check for value &lt; upperBound.
*/
def inRange[T](
lowerBound: Double,
@@ -203,7 +203,7 @@ object ParamValidators {
lowerValid && upperValid
}
- /** Version of [[inRange()]] which uses inclusive be default: [lowerBound, upperBound] */
+ /** Version of `inRange()` which uses inclusive by default: [lowerBound, upperBound] */
def inRange[T](lowerBound: Double, upperBound: Double): T => Boolean = {
inRange[T](lowerBound, upperBound, lowerInclusive = true, upperInclusive = true)
}
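As a usage sketch, these validators are normally passed as the `isValid` argument when a param is declared inside a `Params` implementation (the param names below are hypothetical):
{{{
  // Inside a class that mixes in Params:
  val maxIter = new IntParam(this, "maxIter", "maximum number of iterations (>= 0)",
    ParamValidators.gtEq(0))
  val subsamplingRate = new DoubleParam(this, "subsamplingRate", "fraction in (0, 1]",
    ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true))
}}}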
@@ -228,7 +228,7 @@ object ParamValidators {
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Double]]] for Java.
+ * Specialized version of `Param[Double]` for Java.
*/
@DeveloperApi
class DoubleParam(parent: String, name: String, doc: String, isValid: Double => Boolean)
@@ -288,7 +288,7 @@ private[param] object DoubleParam {
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Int]]] for Java.
+ * Specialized version of `Param[Int]` for Java.
*/
@DeveloperApi
class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolean)
@@ -317,7 +317,7 @@ class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolea
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Float]]] for Java.
+ * Specialized version of `Param[Float]` for Java.
*/
@DeveloperApi
class FloatParam(parent: String, name: String, doc: String, isValid: Float => Boolean)
@@ -378,7 +378,7 @@ private object FloatParam {
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Long]]] for Java.
+ * Specialized version of `Param[Long]` for Java.
*/
@DeveloperApi
class LongParam(parent: String, name: String, doc: String, isValid: Long => Boolean)
@@ -407,7 +407,7 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Boolean]]] for Java.
+ * Specialized version of `Param[Boolean]` for Java.
*/
@DeveloperApi
class BooleanParam(parent: String, name: String, doc: String) // No need for isValid
@@ -430,7 +430,7 @@ class BooleanParam(parent: String, name: String, doc: String) // No need for isV
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Array[String]]]] for Java.
+ * Specialized version of `Param[Array[String]]` for Java.
*/
@DeveloperApi
class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array[String] => Boolean)
@@ -455,7 +455,7 @@ class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Array[Double]]]] for Java.
+ * Specialized version of `Param[Array[Double]]` for Java.
*/
@DeveloperApi
class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array[Double] => Boolean)
@@ -485,7 +485,7 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array
/**
* :: DeveloperApi ::
- * Specialized version of [[Param[Array[Int]]]] for Java.
+ * Specialized version of `Param[Array[Int]]` for Java.
*/
@DeveloperApi
class IntArrayParam(parent: Params, name: String, doc: String, isValid: Array[Int] => Boolean)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
index aec1526b55c49..5fb04ed0ee9a2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -124,7 +124,7 @@ object SVMModel extends Loader[SVMModel] {
/**
* Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. By default L2
- * regularization is used, which can be changed via [[SVMWithSGD.optimizer]].
+ * regularization is used, which can be changed via `SVMWithSGD.optimizer`.
*
* @note Labels used in SVM should be {0, 1}.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
index 92cd7f22dc439..a8588bf182641 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
@@ -78,7 +78,8 @@ class BinaryClassificationMetrics @Since("1.3.0") (
* Returns the receiver operating characteristic (ROC) curve,
* which is an RDD of (false positive rate, true positive rate)
* with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
- * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+ * @see <a href="http://en.wikipedia.org/wiki/Receiver_operating_characteristic">
+ * Receiver operating characteristic (Wikipedia)</a>
*/
@Since("1.0.0")
def roc(): RDD[(Double, Double)] = {
@@ -98,7 +99,7 @@ class BinaryClassificationMetrics @Since("1.3.0") (
/**
* Returns the precision-recall curve, which is an RDD of (recall, precision),
* NOT (precision, recall), with (0.0, 1.0) prepended to it.
- * @see http://en.wikipedia.org/wiki/Precision_and_recall
+ * @see <a href="http://en.wikipedia.org/wiki/Precision_and_recall">Precision and recall</a>
*/
@Since("1.0.0")
def pr(): RDD[(Double, Double)] = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index bb94745f078e8..7a1d2577c20e0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -32,7 +32,7 @@ private[mllib] object EigenValueDecomposition {
*
* @param mul a function that multiplies the symmetric matrix with a DenseVector.
* @param n dimension of the square matrix (maximum Int.MaxValue).
- * @param k number of leading eigenvalues required, 0 < k < n.
+ * @param k number of leading eigenvalues required, 0 &lt; k &lt; n.
* @param tol tolerance of the eigs computation.
* @param maxIterations the maximum number of Arnoldi update iterations.
* @return a dense vector of eigenvalues in descending order and a dense matrix of eigenvectors
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index c94d7890cf557..2a226dc341762 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -77,7 +77,7 @@ sealed trait Vector extends Serializable {
/**
* Returns a hash code value for the vector. The hash code is based on its size and its first 128
- * nonzero entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]].
+ * nonzero entries, using a hash algorithm similar to `java.util.Arrays.hashCode`.
*/
override def hashCode(): Int = {
// This is a reference implementation. It calls return in foreachActive, which is slow.
@@ -351,7 +351,7 @@ object Vectors {
}
/**
- * Parses a string resulted from [[Vector.toString]] into a [[Vector]].
+ * Parses a string resulting from [[Vector.toString()]] into a [[Vector]].
*/
@Since("1.1.0")
def parse(s: String): Vector = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index 6d60136ddc38f..b2e37bad3cf69 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -249,8 +249,8 @@ object RandomRDDs {
* shape and scale.
*
* @param sc SparkContext used to create the RDD.
- * @param shape shape parameter (> 0) for the gamma distribution
- * @param scale scale parameter (> 0) for the gamma distribution
+ * @param shape shape parameter (&gt; 0) for the gamma distribution
+ * @param scale scale parameter (&gt; 0) for the gamma distribution
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
@@ -766,8 +766,8 @@ object RandomRDDs {
* gamma distribution with the input shape and scale.
*
* @param sc SparkContext used to create the RDD.
- * @param shape shape parameter (> 0) for the gamma distribution.
- * @param scale scale parameter (> 0) for the gamma distribution.
+ * @param shape shape parameter (&gt; 0) for the gamma distribution.
+ * @param scale scale parameter (&gt; 0) for the gamma distribution.
* @param numRows Number of Vectors in the RDD.
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
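For example, generating gamma-distributed data with the shape/scale parameters documented above (all values arbitrary, `sc` assumed):
{{{
  import org.apache.spark.mllib.random.RandomRDDs

  val g  = RandomRDDs.gammaRDD(sc, 2.0, 1.5, 1000000L)          // RDD[Double]
  val gv = RandomRDDs.gammaVectorRDD(sc, 2.0, 1.5, 10000L, 10)  // RDD[Vector] with 10 columns
}}}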
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index d846c43cf2913..95b155b037194 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -162,7 +162,7 @@ object DecisionTree extends Serializable with Logging {
* @param numClasses Number of classes for classification. Default value of 2.
* @param maxBins Maximum number of bins used for splitting features.
* @param quantileCalculationStrategy Algorithm for calculating quantiles.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -&gt; k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @return DecisionTreeModel that can be used for prediction.
@@ -192,7 +192,7 @@ object DecisionTree extends Serializable with Logging {
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels should take values {0, 1, ..., numClasses-1}.
* @param numClasses Number of classes for classification.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -&gt; k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param impurity Criterion used for information gain calculation.
@@ -238,7 +238,7 @@ object DecisionTree extends Serializable with Logging {
*
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels are real numbers.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -&gt; k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param impurity Criterion used for information gain calculation.
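A small sketch of the `categoricalFeaturesInfo` convention above: an entry (n -> k) marks feature n as categorical with k categories (the training RDD and all values are illustrative):
{{{
  import org.apache.spark.mllib.tree.DecisionTree

  // Feature 0 has 2 categories, feature 4 has 10; other features are treated as continuous.
  val categoricalFeaturesInfo = Map(0 -> 2, 4 -> 10)
  val model = DecisionTree.trainClassifier(
    trainingData,             // RDD[LabeledPoint], labels in {0, 1}
    2,                        // numClasses
    categoricalFeaturesInfo,
    "gini",                   // impurity
    5,                        // maxDepth
    32)                       // maxBins
}}}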
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index cdeef16135015..a7017f0339101 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -74,7 +74,7 @@ class GradientBoostedTrees private[spark] (
}
/**
- * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#run]].
+ * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#run`.
*/
@Since("1.2.0")
def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = {
@@ -89,7 +89,7 @@ class GradientBoostedTrees private[spark] (
* This dataset should be different from the training dataset,
* but it should follow the same distribution.
* E.g., these two datasets could be created from an original dataset
- * by using [[org.apache.spark.rdd.RDD.randomSplit()]]
+ * by using `org.apache.spark.rdd.RDD.randomSplit()`
* @return GradientBoostedTreesModel that can be used for prediction.
*/
@Since("1.4.0")
@@ -106,7 +106,7 @@ class GradientBoostedTrees private[spark] (
}
/**
- * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]].
+ * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation`.
*/
@Since("1.4.0")
def runWithValidation(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 428af21406092..81c1bb27ea207 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -53,13 +53,13 @@ import org.apache.spark.util.Utils
* the type of random forest (classification or regression), feature type
* (continuous, categorical), depth of the tree, quantile calculation strategy,
* etc.
- * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
+ * @param numTrees If 1, then no bootstrapping is used. If &gt; 1, then bootstrapping is done.
* @param featureSubsetStrategy Number of features to consider for splits at each node.
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* Supported numerical values: "(0.0-1.0]", "[1-n]".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "sqrt" for classification and
+ * if numTrees &gt; 1 (forest) set to "sqrt" for classification and
* to "onethird" for regression.
* If a real value "n" in the range (0, 1.0] is set,
* use n * number of features.
@@ -111,7 +111,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "sqrt".
+ * if numTrees &gt; 1 (forest) set to "sqrt".
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return RandomForestModel that can be used for prediction.
*/
@@ -134,7 +134,7 @@ object RandomForest extends Serializable with Logging {
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels should take values {0, 1, ..., numClasses-1}.
* @param numClasses Number of classes for classification.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -&gt; k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param numTrees Number of trees in the random forest.
@@ -142,7 +142,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "sqrt".
+ * if numTrees &gt; 1 (forest) set to "sqrt".
* @param impurity Criterion used for information gain calculation.
* Supported values: "gini" (recommended) or "entropy".
* @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means
@@ -200,7 +200,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "onethird".
+ * if numTrees &gt; 1 (forest) set to "onethird".
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return RandomForestModel that can be used for prediction.
*/
@@ -222,7 +222,7 @@ object RandomForest extends Serializable with Logging {
*
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels are real numbers.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -&gt; k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param numTrees Number of trees in the random forest.
@@ -230,7 +230,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "onethird".
+ * if numTrees &gt; 1 (forest) set to "onethird".
* @param impurity Criterion used for information gain calculation.
* The only supported value for regression is "variance".
* @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means
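A short sketch of how `numTrees`, `featureSubsetStrategy` and `categoricalFeaturesInfo` fit together in a `RandomForest.trainRegressor` call; `labeledData` is an assumed `RDD[LabeledPoint]`.
{{{
import org.apache.spark.mllib.tree.RandomForest

// Arguments, in order: input, categoricalFeaturesInfo, numTrees, featureSubsetStrategy,
// impurity, maxDepth, maxBins, seed. Feature 0 is assumed categorical with 4 categories.
val model = RandomForest.trainRegressor(
  labeledData, Map(0 -> 4), 10, "auto", "variance", 4, 32, 12345)
// With numTrees > 1 and "auto", the subset strategy resolves to "onethird" for regression.
}}}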
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
index 0b1e191a1cd99..0fe8d87ebd6ba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.NumericType
import org.apache.spark.sql.types.StructType
/**
- * A set of methods for aggregations on a [[DataFrame]], created by [[Dataset.groupBy]].
+ * A set of methods for aggregations on a `DataFrame`, created by `Dataset.groupBy`.
*
* The main method is the agg function, which has multiple variants. This class also contains
 * some first-order statistics such as mean and sum, for convenience.
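A small usage sketch of the `Dataset.groupBy` / `agg` pattern mentioned above; `df` is an assumed DataFrame with `department`, `salary` and `age` columns.
{{{
import org.apache.spark.sql.functions.{avg, max}

val byDept = df.groupBy("department")            // returns a RelationalGroupedDataset
val summary = byDept.agg(avg("salary"), max("age"))
val meanSalary = byDept.mean("salary")           // convenience first-order statistic
}}}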
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index ff6dd8cb0cf92..a1ea748de98f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -112,7 +112,7 @@ trait SchemaRelationProvider {
/**
* ::Experimental::
- * Implemented by objects that can produce a streaming [[Source]] for a specific format or system.
+ * Implemented by objects that can produce a streaming `Source` for a specific format or system.
*
* @since 2.0.0
*/
@@ -143,7 +143,7 @@ trait StreamSourceProvider {
/**
* ::Experimental::
- * Implemented by objects that can produce a streaming [[Sink]] for a specific format or system.
+ * Implemented by objects that can produce a streaming `Sink` for a specific format or system.
*
* @since 2.0.0
*/
@@ -185,7 +185,7 @@ trait CreatableRelationProvider {
/**
* Represents a collection of tuples with a known schema. Classes that extend BaseRelation must
- * be able to produce the schema of their data in the form of a [[StructType]]. Concrete
+ * be able to produce the schema of their data in the form of a `StructType`. Concrete
* implementation should inherit from one of the descendant `Scan` classes, which define various
* abstract methods for execution.
*
@@ -216,10 +216,10 @@ abstract class BaseRelation {
/**
 * Whether it needs to convert the objects in Row to internal representation, for example:
- * java.lang.String -> UTF8String
- * java.lang.Decimal -> Decimal
+ * java.lang.String -&gt; UTF8String
+ * java.lang.Decimal -&gt; Decimal
*
- * If `needConversion` is `false`, buildScan() should return an [[RDD]] of [[InternalRow]]
+ * If `needConversion` is `false`, buildScan() should return an `RDD` of `InternalRow`
*
* @note The internal representation is not stable across releases and thus data sources outside
* of Spark SQL should leave this as true.
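A minimal sketch of a custom relation, assuming only the public `BaseRelation` and `TableScan` contracts; `RangeRelation` is a hypothetical name used for illustration.
{{{
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Hypothetical relation: exposes its schema as a StructType and scans a small RDD of Rows.
class RangeRelation(override val sqlContext: SQLContext, n: Int)
  extends BaseRelation with TableScan {

  override def schema: StructType =
    StructType(StructField("id", IntegerType, nullable = false) :: Nil)

  // needConversion is left at its default (true), so Rows are converted internally by Spark.
  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.parallelize(0 until n).map(Row(_))
}
}}}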
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
index 7c1ea2f89ddb8..ea20105892bf0 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
@@ -100,7 +100,7 @@ sealed abstract class StateSpec[KeyType, ValueType, StateType, MappedType] exten
/**
* :: Experimental ::
- * Builder object for creating instances of [[org.apache.spark.streaming.StateSpec StateSpec]]
+ * Builder object for creating instances of `org.apache.spark.streaming.StateSpec`
 * that is used for specifying the parameters of the DStream transformation
* `mapWithState` operation of a
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
index aa4003c62e1e7..2ec907c8cfd5f 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
@@ -434,8 +434,8 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
* Return a [[JavaMapWithStateDStream]] by applying a function to every key-value element of
* `this` stream, while maintaining some state data for each unique key. The mapping function
* and other specification (e.g. partitioners, timeouts, initial state data, etc.) of this
- * transformation can be specified using [[StateSpec]] class. The state data is accessible in
- * as a parameter of type [[State]] in the mapping function.
+ * transformation can be specified using `StateSpec` class. The state data is accessible in
+ * as a parameter of type `State` in the mapping function.
*
* Example of using `mapWithState`:
* {{{
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
index b43b9405def97..982e72cffbf3f 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
@@ -44,7 +44,7 @@ import org.apache.spark.streaming.scheduler.StreamingListener
* A Java-friendly version of [[org.apache.spark.streaming.StreamingContext]] which is the main
* entry point for Spark Streaming functionality. It provides methods to create
* [[org.apache.spark.streaming.api.java.JavaDStream]] and
- * [[org.apache.spark.streaming.api.java.JavaPairDStream.]] from input sources. The internal
+ * [[org.apache.spark.streaming.api.java.JavaPairDStream]] from input sources. The internal
* org.apache.spark.api.java.JavaSparkContext (see core Spark documentation) can be accessed
* using `context.sparkContext`. After creating and transforming DStreams, the streaming
* computation can be started and stopped using `context.start()` and `context.stop()`,
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
index ac739411fd212..f38c1e7996595 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
@@ -356,8 +356,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)])
* Return a [[MapWithStateDStream]] by applying a function to every key-value element of
* `this` stream, while maintaining some state data for each unique key. The mapping function
* and other specification (e.g. partitioners, timeouts, initial state data, etc.) of this
- * transformation can be specified using [[StateSpec]] class. The state data is accessible in
- * as a parameter of type [[State]] in the mapping function.
+ * transformation can be specified using `StateSpec` class. The state data is accessible in
+ * as a parameter of type `State` in the mapping function.
*
* Example of using `mapWithState`:
* {{{
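A brief sketch of `mapWithState` with a `StateSpec`, along the lines of the truncated example above; `wordCounts` (a `DStream[(String, Int)]`) is assumed.
{{{
import org.apache.spark.streaming.{Minutes, State, StateSpec}

// Keep a running sum per key across batches.
val mappingFunction = (word: String, count: Option[Int], state: State[Int]) => {
  val sum = count.getOrElse(0) + state.getOption.getOrElse(0)
  state.update(sum)
  (word, sum)
}
val spec = StateSpec.function(mappingFunction).timeout(Minutes(10))
val runningCounts = wordCounts.mapWithState(spec)
}}}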
From 9a9d0cd6f74fa32256ce6edf0832865585242ba2 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 02:06:12 +0900
Subject: [PATCH 06/17] Fix errors (last round)
---
.../org/apache/spark/rdd/PairRDDFunctions.scala | 7 ++++++-
.../apache/spark/rpc/netty/RpcEndpointVerifier.scala | 2 +-
.../org/apache/spark/scheduler/ResultTask.scala | 2 +-
.../org/apache/spark/scheduler/ShuffleMapTask.scala | 2 +-
.../main/scala/org/apache/spark/scheduler/Task.scala | 2 +-
.../org/apache/spark/scheduler/TaskDescription.scala | 2 +-
.../spark/storage/ShuffleBlockFetcherIterator.scala | 2 +-
.../spark/util/random/StratifiedSamplingUtils.scala | 6 +++---
.../streaming/flume/FlumePollingInputDStream.scala | 2 +-
.../spark/streaming/kafka/KafkaInputDStream.scala | 2 +-
.../spark/graphx/impl/VertexPartitionBase.scala | 2 +-
.../spark/graphx/impl/VertexPartitionBaseOps.scala | 2 +-
.../org/apache/spark/ml/recommendation/ALS.scala | 2 +-
.../spark/ml/tree/impl/DecisionTreeMetadata.scala | 2 +-
.../scala/org/apache/spark/ml/util/ReadWrite.scala | 2 +-
.../spark/mllib/evaluation/RankingMetrics.scala | 2 +-
.../binary/BinaryClassificationMetricComputers.scala | 2 +-
.../apache/spark/mllib/fpm/AssociationRules.scala | 4 ++--
.../scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 6 +++---
.../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++++-----
.../org/apache/spark/mllib/linalg/Vectors.scala | 2 +-
.../org/apache/spark/mllib/rdd/SlidingRDD.scala | 4 ++--
.../org/apache/spark/sql/internal/CatalogImpl.scala | 12 ++++++------
.../spark/sql/internal/VariableSubstitution.scala | 2 +-
.../scala/org/apache/spark/sql/hive/hiveUDFs.scala | 2 +-
.../apache/spark/sql/hive/hiveWriterContainers.scala | 2 +-
.../scala/org/apache/spark/streaming/StateSpec.scala | 2 +-
27 files changed, 47 insertions(+), 42 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index ab8582f3e44cf..01d203685f407 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -399,7 +399,12 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* here.
*
- * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero {{{ sp > p }}}
+ * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero
+ *
+ * {{{
+ * sp > p
+ * }}}
+ *
* would trigger sparse representation of registers, which may reduce the memory consumption
* and increase accuracy when the cardinality is small.
*
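A small sketch of the precision parameters discussed above; `pairs` is an assumed `RDD[(String, Int)]`.
{{{
import org.apache.spark.HashPartitioner

// p = 12 gives a relative accuracy of roughly 1.054 / sqrt(2^12), about 1.6%.
val expectedError = 1.054 / math.sqrt(math.pow(2, 12))

// Arguments: (p, sp, partitioner); sp > p enables the sparse register format.
val distinctPerKey = pairs.countApproxDistinctByKey(12, 16, new HashPartitioner(4))
}}}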
diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala
index 99f20da2d66aa..0e980b1089221 100644
--- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala
@@ -20,7 +20,7 @@ package org.apache.spark.rpc.netty
import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv}
/**
- * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an [[RpcEndpoint]] exists.
+ * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an `RpcEndpoint` exists.
*
* This is used when setting up a remote endpoint reference.
*/
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
index 1e7c63af2e797..d19353f2a9930 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
@@ -42,7 +42,7 @@ import org.apache.spark.rdd.RDD
* @param outputId index of the task in this job (a job can launch tasks on only a subset of the
* input RDD's partitions).
* @param localProperties copy of thread-local properties set by the user on the driver side.
- * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side.
+ * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side.
*
* The parameters below are optional:
* @param jobId id of the job this task belongs to
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
index 66d6790e168f2..31011de85bf7e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
@@ -42,7 +42,7 @@ import org.apache.spark.shuffle.ShuffleWriter
* the type should be (RDD[_], ShuffleDependency[_, _, _]).
* @param partition partition of the RDD this task is associated with
* @param locs preferred task execution locations for locality scheduling
- * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side.
+ * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side.
* @param localProperties copy of thread-local properties set by the user on the driver side.
*
* The parameters below are optional:
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index d39651a722325..1554200aeaf64 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -46,7 +46,7 @@ import org.apache.spark.util._
* @param stageId id of the stage this task belongs to
* @param stageAttemptId attempt id of the stage this task belongs to
* @param partitionId index of the number in the RDD
- * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side.
+ * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side.
* @param localProperties copy of thread-local properties set by the user on the driver side.
*
* The parameters below are optional:
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala
index 1c7c81c488c3a..45c742cbff5e7 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala
@@ -23,7 +23,7 @@ import org.apache.spark.util.SerializableBuffer
/**
* Description of a task that gets passed onto executors to be executed, usually created by
- * [[TaskSetManager.resourceOffer]].
+ * `TaskSetManager.resourceOffer`.
*/
private[spark] class TaskDescription(
val taskId: Long,
diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
index 4dc2f362329a0..a94cf0fbb10ef 100644
--- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -247,7 +247,7 @@ final class ShuffleBlockFetcherIterator(
/**
* Fetch the local blocks while we are fetching remote blocks. This is ok because
- * [[ManagedBuffer]]'s memory is allocated lazily when we create the input stream, so all we
+ * `ManagedBuffer`'s memory is allocated lazily when we create the input stream, so all we
* track in-memory are the ManagedBuffer references themselves.
*/
private[this] def fetchLocalBlocks() {
diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
index 83547072a08b2..debca177155cd 100644
--- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
@@ -35,8 +35,8 @@ import org.apache.spark.rdd.RDD
* high probability. This is achieved by maintaining a waitlist of size O(log(s)), where s is the
* desired sample size for each stratum.
*
- * Like in simple random sampling, we generate a random value for each item from the
- * uniform distribution [0.0, 1.0]. All items with values <= min(values of items in the waitlist)
+ * Like in simple random sampling, we generate a random value for each item from the uniform
+ * distribution [0.0, 1.0]. All items with values &lt;= min(values of items in the waitlist)
* are accepted into the sample instantly. The threshold for instant accept is designed so that
* s - numAccepted = O(sqrt(s)), where s is again the desired sample size. Thus, by maintaining a
* waitlist size = O(sqrt(s)), we will be able to create a sample of the exact size s by adding
@@ -165,7 +165,7 @@ private[spark] object StratifiedSamplingUtils extends Logging {
* Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted), which
* means we need to sort the elements in the waitlist by their associated values in order to find
* the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize.
- * Note that all elements in the waitlist have values >= bound for instant accept, so a T value
+ * Note that all elements in the waitlist have values &gt;= bound for instant accept, so a T value
* in the waitlist range would allow all elements that were instantly accepted on the first pass
* to be included in the sample.
*/
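The waitlist scheme above is internal; a hedged sketch of the public entry points that rely on it, assuming a pair RDD `pairs` whose keys are all covered by `fractions`:
{{{
// Per-stratum sampling fractions; every key in the RDD must appear here.
val fractions = Map("a" -> 0.5, "b" -> 1.0)

// Arguments: (withReplacement, fractions, seed).
val approxSample = pairs.sampleByKey(false, fractions, 7L)
val exactSample = pairs.sampleByKeyExact(false, fractions, 7L)
}}}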
diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala
index 54565840fa665..d84e289272c62 100644
--- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala
+++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala
@@ -36,7 +36,7 @@ import org.apache.spark.streaming.flume.sink._
import org.apache.spark.streaming.receiver.Receiver
/**
- * A [[ReceiverInputDStream]] that can be used to read data from several Flume agents running
+ * A `ReceiverInputDStream` that can be used to read data from several Flume agents running
* [[org.apache.spark.streaming.flume.sink.SparkSink]]s.
* @param _ssc Streaming context that will execute this input stream
* @param addresses List of addresses at which SparkSinks are listening
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala
index 3713bda41b8ee..fffb920e97a53 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala
@@ -38,7 +38,7 @@ import org.apache.spark.util.ThreadUtils
*
* @param kafkaParams Map of kafka configuration parameters.
* See: http://kafka.apache.org/configuration.html
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name -&gt; numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel RDD storage level.
*/
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala
index 8d608c99b1a1d..8da46db98be81 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala
@@ -57,7 +57,7 @@ private[graphx] object VertexPartitionBase {
* concrete implementation. [[VertexPartitionBaseOps]] provides a variety of operations for
* VertexPartitionBase and subclasses that provide implicit evidence of membership in the
* `VertexPartitionBaseOpsConstructor` typeclass (for example,
- * [[VertexPartition.VertexPartitionOpsConstructor]]).
+ * `VertexPartition.VertexPartitionOpsConstructor`).
*/
private[graphx] abstract class VertexPartitionBase[@specialized(Long, Int, Double) VD: ClassTag]
extends Serializable {
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala
index 43594573cf013..a8ed59b09bbb7 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala
@@ -29,7 +29,7 @@ import org.apache.spark.util.collection.BitSet
/**
* A class containing additional operations for subclasses of VertexPartitionBase that provide
* implicit evidence of membership in the `VertexPartitionBaseOpsConstructor` typeclass (for
- * example, [[VertexPartition.VertexPartitionOpsConstructor]]).
+ * example, `VertexPartition.VertexPartitionOpsConstructor`).
*/
private[graphx] abstract class VertexPartitionBaseOps
[VD: ClassTag, Self[X] <: VertexPartitionBase[X]: VertexPartitionBaseOpsConstructor]
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 6d2c59a905ec7..c6de6eb02774b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -878,7 +878,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging {
}
/**
- * Builder for [[RatingBlock]]. [[mutable.ArrayBuilder]] is used to avoid boxing/unboxing.
+ * Builder for [[RatingBlock]]. `mutable.ArrayBuilder` is used to avoid boxing/unboxing.
*/
private[recommendation] class RatingBlockBuilder[@specialized(Int, Long) ID: ClassTag]
extends Serializable {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala
index 442f52bf0231d..9d7a3bd07abd3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala
@@ -35,7 +35,7 @@ import org.apache.spark.rdd.RDD
* @param numClasses For classification: labels can take values {0, ..., numClasses - 1}.
* For regression: fixed at 0 (no meaning).
* @param maxBins Maximum number of bins, for all features.
- * @param featureArity Map: categorical feature index --> arity.
+ * @param featureArity Map: categorical feature index --&gt; arity.
* I.e., the feature takes values in {0, ..., arity - 1}.
* @param numBins Number of bins for each feature.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index 343a70c5d7a46..b0759dca718a9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -345,7 +345,7 @@ private[ml] object DefaultParamsReader {
/**
* All info from metadata file.
*
- * @param params paramMap, as a [[JValue]]
+ * @param params paramMap, as a `JValue`
* @param metadata All metadata, including the other fields
* @param metadataJson Full metadata file String (for debugging)
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index e29b51c3a19da..3d274d68f1180 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -30,7 +30,7 @@ import org.apache.spark.rdd.RDD
/**
* Evaluator for ranking algorithms.
*
- * Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance.
+ * Java users should use `RankingMetrics$.of` to create a [[RankingMetrics]] instance.
*
* @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala
index be3319d60ce25..5a4c6aef50b7b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala
@@ -62,7 +62,7 @@ private[evaluation] object Recall extends BinaryClassificationMetricComputer {
* F-Measure. Defined as 0 if both precision and recall are 0. EG in the case that all examples
* are false positives.
* @param beta the beta constant in F-Measure
- * @see http://en.wikipedia.org/wiki/F1_score
+ * @see <a href="http://en.wikipedia.org/wiki/F1_score">F1 score (Wikipedia)</a>
*/
private[evaluation] case class FMeasure(beta: Double) extends BinaryClassificationMetricComputer {
private val beta2 = beta * beta
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index 3c26d2670841b..dca031477d3b7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -28,7 +28,7 @@ import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
import org.apache.spark.rdd.RDD
/**
- * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates
+ * Generates association rules from a `RDD[FreqItemset[Item]]`. This method only generates
* association rules which have a single item as the consequent.
*
*/
@@ -56,7 +56,7 @@ class AssociationRules private[fpm] (
/**
* Computes the association rules with confidence above [[minConfidence]].
* @param freqItemsets frequent itemset model obtained from [[FPGrowth]]
- * @return a [[Set[Rule[Item]]] containing the association rules.
+ * @return a `Set[Rule[Item]]` containing the association rules.
*
*/
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index b53386012280d..5f5b3a497b9ad 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -44,7 +44,7 @@ import org.apache.spark.storage.StorageLevel
/**
* Model trained by [[FPGrowth]], which holds frequent itemsets.
- * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]]
+ * @param freqItemsets frequent itemset, which is an RDD of `FreqItemset`
* @tparam Item item type
*/
@Since("1.3.0")
@@ -69,7 +69,7 @@ class FPGrowthModel[Item: ClassTag] @Since("1.3.0") (
* - human-readable (JSON) model metadata to path/metadata/
* - Parquet formatted data to path/data/
*
- * The model may be loaded using [[FPGrowthModel.load]].
+ * The model may be loaded using `FPGrowthModel.load`.
*
* @param sc Spark context used to save model data.
* @param path Path specifying the directory in which to save this model.
@@ -309,7 +309,7 @@ object FPGrowth {
/**
* Frequent itemset.
- * @param items items in this itemset. Java users should call [[FreqItemset#javaItems]] instead.
+ * @param items items in this itemset. Java users should call `FreqItemset#javaItems` instead.
* @param freq frequency
* @tparam Item item type
*
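A compact sketch of training an `FPGrowth` model and deriving association rules from it; the three toy transactions and the SparkContext `sc` are assumptions.
{{{
import org.apache.spark.mllib.fpm.FPGrowth

// Each transaction is one basket of items.
val transactions = sc.parallelize(Seq(
  Array("a", "b", "c"),
  Array("a", "b"),
  Array("b", "c")))

val model = new FPGrowth().setMinSupport(0.5).setNumPartitions(2).run(transactions)
model.freqItemsets.collect().foreach { itemset =>
  println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
}

// Single-consequent association rules derived from the frequent itemsets.
val rules = model.generateAssociationRules(0.8)
}}}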
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index a5641672218dd..08f32ca4736bb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -211,7 +211,7 @@ class PrefixSpan private (
}
/**
- * A Java-friendly version of [[run()]] that reads sequences from a [[JavaRDD]] and returns
+ * A Java-friendly version of `run()` that reads sequences from a `JavaRDD` and returns
* frequent sequences in a [[PrefixSpanModel]].
* @param data ordered sequences of itemsets stored as Java Iterable of Iterables
* @tparam Item item type
@@ -366,13 +366,13 @@ object PrefixSpan extends Logging {
* Items are represented by positive integers, and items in each itemset must be distinct and
* ordered.
* we use 0 as the delimiter between itemsets.
- * For example, a sequence `<(12)(31)1>` is represented by `[0, 1, 2, 0, 1, 3, 0, 1, 0]`.
- * The postfix of this sequence w.r.t. to prefix `<1>` is `<(_2)(13)1>`.
+ * For example, a sequence &lt;(12)(31)1&gt; is represented by `[0, 1, 2, 0, 1, 3, 0, 1, 0]`.
+ * The postfix of this sequence w.r.t. prefix &lt;1&gt; is &lt;(_2)(13)1&gt;.
* We may reuse the original items array `[0, 1, 2, 0, 1, 3, 0, 1, 0]` to represent the postfix,
* and mark the start index of the postfix, which is `2` in this example.
* So the active items in this postfix are `[2, 0, 1, 3, 0, 1, 0]`.
* We also remember the start indices of partial projections, the ones that split an itemset.
- * For example, another possible partial projection w.r.t. `<1>` is `<(_3)1>`.
+ * For example, another possible partial projection w.r.t. &lt;1&gt; is &lt;(_3)1&gt;.
* We remember the start indices of partial projections, which is `[2, 5]` in this example.
* This data structure makes it easier to do projections.
*
@@ -583,7 +583,7 @@ class PrefixSpanModel[Item] @Since("1.5.0") (
* - human-readable (JSON) model metadata to path/metadata/
* - Parquet formatted data to path/data/
*
- * The model may be loaded using [[PrefixSpanModel.load]].
+ * The model may be loaded using `PrefixSpanModel.load`.
*
* @param sc Spark context used to save model data.
* @param path Path specifying the directory in which to save this model.
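A short sketch of running `PrefixSpan` on sequences encoded as arrays of itemsets, matching the <(12)(31)1> example above; the toy data and `sc` are assumptions.
{{{
import org.apache.spark.mllib.fpm.PrefixSpan

// Each sequence is an Array of itemsets; the first one encodes <(12)(31)1>.
val sequences = sc.parallelize(Seq(
  Array(Array(1, 2), Array(3, 1), Array(1)),
  Array(Array(1), Array(3, 2), Array(1, 2))), 2).cache()

val prefixSpan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5)
val model = prefixSpan.run(sequences)
model.freqSequences.collect().foreach { fs =>
  println(fs.sequence.map(_.mkString("(", "", ")")).mkString("<", "", ">") + ", " + fs.freq)
}
}}}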
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 2a226dc341762..63ea9d3264b0f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -351,7 +351,7 @@ object Vectors {
}
/**
- * Parses a string resulted from [[Vector.toString()]] into a [[Vector]].
+ * Parses a string resulting from `Vector.toString` into a [[Vector]].
*/
@Since("1.1.0")
def parse(s: String): Vector = {
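A tiny sketch of `Vectors.parse` round-tripping `Vector.toString` output; the sparse string follows the `(size,[indices],[values])` format.
{{{
import org.apache.spark.mllib.linalg.Vectors

val v = Vectors.dense(1.0, 0.0, 3.0)
val roundTripped = Vectors.parse(v.toString)         // dense form: "[1.0,0.0,3.0]"
val sparse = Vectors.parse("(3,[0,2],[1.0,3.0])")    // sparse form: (size,[indices],[values])
}}}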
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala
index adb5e51947f6d..365b2a06110f6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala
@@ -42,8 +42,8 @@ class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T]
* @param windowSize the window size, must be greater than 1
* @param step step size for windows
*
- * @see [[org.apache.spark.mllib.rdd.RDDFunctions.sliding(Int, Int)*]]
- * @see [[scala.collection.IterableLike.sliding(Int, Int)*]]
+ * @see `org.apache.spark.mllib.rdd.RDDFunctions.sliding(Int, Int)*`
+ * @see `scala.collection.IterableLike.sliding(Int, Int)*`
*/
private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index d3e323cb12891..822949af7d899 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types.StructType
/**
- * Internal implementation of the user-facing [[Catalog]].
+ * Internal implementation of the user-facing `Catalog`.
*/
class CatalogImpl(sparkSession: SparkSession) extends Catalog {
@@ -175,7 +175,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
}
/**
- * Get the database with the specified name. This throws an [[AnalysisException]] when no
+ * Get the database with the specified name. This throws an `AnalysisException` when no
* [[Database]] can be found.
*/
override def getDatabase(dbName: String): Database = {
@@ -184,7 +184,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* Get the table or view with the specified name. This table can be a temporary view or a
- * table/view in the current database. This throws an [[AnalysisException]] when no [[Table]]
+ * table/view in the current database. This throws an `AnalysisException` when no `Table`
* can be found.
*/
override def getTable(tableName: String): Table = {
@@ -193,7 +193,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* Get the table or view with the specified name in the specified database. This throws an
- * [[AnalysisException]] when no [[Table]] can be found.
+ * `AnalysisException` when no `Table` can be found.
*/
override def getTable(dbName: String, tableName: String): Table = {
makeTable(TableIdentifier(tableName, Option(dbName)))
@@ -201,7 +201,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* Get the function with the specified name. This function can be a temporary function or a
- * function in the current database. This throws an [[AnalysisException]] when no [[Function]]
+ * function in the current database. This throws an `AnalysisException` when no `Function`
* can be found.
*/
override def getFunction(functionName: String): Function = {
@@ -209,7 +209,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
}
/**
- * Get the function with the specified name. This returns [[None]] when no [[Function]] can be
+ * Get the function with the specified name. This returns `None` when no `Function` can be
* found.
*/
override def getFunction(dbName: String, functionName: String): Function = {
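A hedged sketch of the catalog getters above, assuming a `SparkSession` named `spark` and a hypothetical table called `people`:
{{{
import org.apache.spark.sql.AnalysisException

val db = spark.catalog.getDatabase("default")
val table = try {
  Some(spark.catalog.getTable("people"))
} catch {
  case _: AnalysisException => None     // thrown when no such table or view exists
}
}}}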
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala
index 791a9cf813b6a..4e7c813be9922 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala
@@ -23,7 +23,7 @@ import org.apache.spark.internal.config._
* A helper class that enables substitution using syntax like
* `${var}`, `${system:var}` and `${env:var}`.
*
- * Variable substitution is controlled by [[SQLConf.variableSubstituteEnabled]].
+ * Variable substitution is controlled by `SQLConf.variableSubstituteEnabled`.
*/
class VariableSubstitution(conf: SQLConf) {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 32edd4aec2865..90e86959cd0e4 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -177,7 +177,7 @@ private[hive] case class HiveGenericUDF(
/**
* Converts a Hive Generic User Defined Table Generating Function (UDTF) to a
- * [[Generator]]. Note that the semantics of Generators do not allow
+ * `Generator`. Note that the semantics of Generators do not allow
* Generators to maintain state in between input rows. Thus UDTFs that rely on partitioning
* dependent operations like calls to `close()` before producing output will not operate the same as
* in Hive. However, in practice this should not affect compatibility for most sane UDTFs
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
index a34e2e76f5838..0c9321068c4c1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
@@ -49,7 +49,7 @@ import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter
/**
* Internal helper class that saves an RDD using a Hive OutputFormat.
- * It is based on [[SparkHadoopWriter]].
+ * It is based on `SparkHadoopWriter`.
*/
private[hive] class SparkHiveWriterContainer(
@transient private val jobConf: JobConf,
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
index ea20105892bf0..c3b28bd516da5 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
@@ -30,7 +30,7 @@ import org.apache.spark.util.ClosureCleaner
* `mapWithState` operation of a
* [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a
* [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java).
- * Use [[org.apache.spark.streaming.StateSpec.function() StateSpec.function]] factory methods
+ * Use `org.apache.spark.streaming.StateSpec.function()` factory methods
* to create instances of this class.
*
* Example in Scala:
From 842a738e8966f70e3b882182a06f2d1e8d257f73 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 02:19:38 +0900
Subject: [PATCH 07/17] Fix more errors
---
core/src/main/scala/org/apache/spark/Accumulator.scala | 2 +-
.../org/apache/spark/rpc/netty/RpcEndpointVerifier.scala | 2 +-
.../apache/spark/storage/ShuffleBlockFetcherIterator.scala | 2 +-
.../spark/mllib/evaluation/BinaryClassificationMetrics.scala | 5 +++--
.../org/apache/spark/mllib/evaluation/RankingMetrics.scala | 2 +-
.../scala/org/apache/spark/sql/internal/CatalogImpl.scala | 2 +-
6 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala
index bcf157078813d..b28e9408062b5 100644
--- a/core/src/main/scala/org/apache/spark/Accumulator.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulator.scala
@@ -26,7 +26,7 @@ package org.apache.spark
*
* An accumulator is created from an initial value `v` by calling
* [[SparkContext#accumulator SparkContext.accumulator]].
- * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]].
+ * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]].
* However, they cannot read its value. Only the driver program can read the accumulator's value,
* using its [[#value]] method.
*
diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala
index 0e980b1089221..430dcc50ba711 100644
--- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala
@@ -35,6 +35,6 @@ private[netty] class RpcEndpointVerifier(override val rpcEnv: RpcEnv, dispatcher
private[netty] object RpcEndpointVerifier {
val NAME = "endpoint-verifier"
- /** A message used to ask the remote [[RpcEndpointVerifier]] if an [[RpcEndpoint]] exists. */
+ /** A message used to ask the remote [[RpcEndpointVerifier]] if an `RpcEndpoint` exists. */
case class CheckExistence(name: String)
}
diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
index a94cf0fbb10ef..269c12d6da444 100644
--- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -423,7 +423,7 @@ object ShuffleBlockFetcherIterator {
* @param address BlockManager that the block was fetched from.
* @param size estimated size of the block, used to calculate bytesInFlight.
* Note that this is NOT the exact bytes.
- * @param buf [[ManagedBuffer]] for the content.
+ * @param buf `ManagedBuffer` for the content.
* @param isNetworkReqDone Is this the last network request for this host in this fetch request.
*/
private[storage] case class SuccessFetchResult(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
index a8588bf182641..9b7cd0427f5ed 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
@@ -99,7 +99,8 @@ class BinaryClassificationMetrics @Since("1.3.0") (
/**
* Returns the precision-recall curve, which is an RDD of (recall, precision),
* NOT (precision, recall), with (0.0, 1.0) prepended to it.
- * @see Precision and recall
+ * @see <a href="http://en.wikipedia.org/wiki/Precision_and_recall">
+ * Precision and recall (Wikipedia)</a>
*/
@Since("1.0.0")
def pr(): RDD[(Double, Double)] = {
@@ -119,7 +120,7 @@ class BinaryClassificationMetrics @Since("1.3.0") (
* Returns the (threshold, F-Measure) curve.
* @param beta the beta factor in F-Measure computation.
* @return an RDD of (threshold, F-Measure) pairs.
- * @see http://en.wikipedia.org/wiki/F1_score
+ * @see <a href="http://en.wikipedia.org/wiki/F1_score">F1 score (Wikipedia)</a>
*/
@Since("1.0.0")
def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta))
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index 3d274d68f1180..cedfdbf0dc127 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -41,7 +41,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]
/**
* Compute the average precision of all the queries, truncated at ranking position k.
*
- * If for a query, the ranking algorithm returns n (n < k) results, the precision value will be
+ * If for a query, the ranking algorithm returns n (n &lt; k) results, the precision value will be
* computed as #(relevant items retrieved) / k. This formula also applies when the size of the
* ground truth set is less than k.
*
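A small sketch of `precisionAt(k)` with a query that returns fewer than k results; the id arrays and `sc` are assumptions.
{{{
import org.apache.spark.mllib.evaluation.RankingMetrics

// (predicted ranking, ground-truth set) pairs with hypothetical document ids.
val predictionAndLabels = sc.parallelize(Seq(
  (Array(1, 2, 3, 4, 5), Array(1, 2, 4)),
  (Array(6, 7, 8), Array(7, 9))))

val metrics = new RankingMetrics(predictionAndLabels)
// The second query returns only 3 results, but precisionAt(5) still divides by k = 5.
println(metrics.precisionAt(5))
println(metrics.meanAveragePrecision)
}}}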
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index 822949af7d899..6d984621ccca1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -176,7 +176,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* Get the database with the specified name. This throws an `AnalysisException` when no
- * [[Database]] can be found.
+ * `Database` can be found.
*/
override def getDatabase(dbName: String): Database = {
makeDatabase(dbName)
From 366031768acc4d93a30f23151ce23c00060a322f Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 02:27:08 +0900
Subject: [PATCH 08/17] Make markdown pretty for some links
---
core/src/main/scala/org/apache/spark/Accumulator.scala | 2 +-
.../src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 2 +-
.../org/apache/spark/mllib/tree/GradientBoostedTrees.scala | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala
index b28e9408062b5..7bea636c94aa0 100644
--- a/core/src/main/scala/org/apache/spark/Accumulator.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulator.scala
@@ -26,7 +26,7 @@ package org.apache.spark
*
* An accumulator is created from an initial value `v` by calling
* [[SparkContext#accumulator SparkContext.accumulator]].
- * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]].
+ * Tasks running on the cluster can then add to it using the `+=` operator.
* However, they cannot read its value. Only the driver program can read the accumulator's value,
* using its [[#value]] method.
*
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 5f5b3a497b9ad..e3cf0d4979ed4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -309,7 +309,7 @@ object FPGrowth {
/**
* Frequent itemset.
- * @param items items in this itemset. Java users should call `FreqItemset#javaItems` instead.
+ * @param items items in this itemset. Java users should call `FreqItemset.javaItems` instead.
* @param freq frequency
* @tparam Item item type
*
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index a7017f0339101..3e85678906b33 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -74,7 +74,7 @@ class GradientBoostedTrees private[spark] (
}
/**
- * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#run`.
+ * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees.run`.
*/
@Since("1.2.0")
def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = {
@@ -106,7 +106,7 @@ class GradientBoostedTrees private[spark] (
}
/**
- * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation`.
+ * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees.runWithValidation`.
*/
@Since("1.4.0")
def runWithValidation(
From 22bfb6854a7e0ecfc9b7d9ffe8c49715484eb3a1 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 03:01:38 +0900
Subject: [PATCH 09/17] Keep original style without new indentations
---
.../main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 01d203685f407..b31d21c354d11 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -399,12 +399,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* here.
*
- * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero
- *
- * {{{
- * sp > p
- * }}}
- *
+ * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (sp > p)
* would trigger sparse representation of registers, which may reduce the memory consumption
* and increase accuracy when the cardinality is small.
*
From 73fcd355a565c5ea433b1f8ca11e08ee6c3f2a9e Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sat, 26 Nov 2016 12:44:12 +0900
Subject: [PATCH 10/17] Use throws annotation
---
.../scala/org/apache/spark/SparkConf.scala | 18 ++++++++++++------
.../scala/org/apache/spark/SparkContext.scala | 6 ++++--
.../scala/org/apache/spark/util/Utils.scala | 3 ++-
3 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 3f5b19eda2cc3..d7df031c856c0 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -262,8 +262,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a time parameter as seconds; throws a NoSuchElementException if it's not set. If no
* suffix is provided then seconds are assumed.
- * @note Throws `NoSuchElementException`
+ * @throws java.util.NoSuchElementException
*/
+ @throws(classOf[NoSuchElementException])
def getTimeAsSeconds(key: String): Long = {
Utils.timeStringAsSeconds(get(key))
}
@@ -279,8 +280,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no
* suffix is provided then milliseconds are assumed.
- * @note Throws `NoSuchElementException`
+ * @throws java.util.NoSuchElementException
*/
+ @throws(classOf[NoSuchElementException])
def getTimeAsMs(key: String): Long = {
Utils.timeStringAsMs(get(key))
}
@@ -296,8 +298,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then bytes are assumed.
- * @note Throws `NoSuchElementException`
+ * @throws java.util.NoSuchElementException
*/
+ @throws(classOf[NoSuchElementException])
def getSizeAsBytes(key: String): Long = {
Utils.byteStringAsBytes(get(key))
}
@@ -320,8 +323,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then Kibibytes are assumed.
- * @note Throws `NoSuchElementException`
+ * @throws java.util.NoSuchElementException
*/
+ @throws(classOf[NoSuchElementException])
def getSizeAsKb(key: String): Long = {
Utils.byteStringAsKb(get(key))
}
@@ -337,8 +341,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then Mebibytes are assumed.
- * @note Throws `NoSuchElementException`
+ * @throws java.util.NoSuchElementException
*/
+ @throws(classOf[NoSuchElementException])
def getSizeAsMb(key: String): Long = {
Utils.byteStringAsMb(get(key))
}
@@ -354,8 +359,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
/**
* Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no
* suffix is provided then Gibibytes are assumed.
- * @note Throws `NoSuchElementException`
+ * @throws java.util.NoSuchElementException
*/
+ @throws(classOf[NoSuchElementException])
def getSizeAsGb(key: String): Long = {
Utils.byteStringAsGb(get(key))
}
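A brief sketch of the getters annotated above, showing both the normal path and the `NoSuchElementException` path; the key `spark.hypothetical.buffer` is deliberately unset for illustration.
{{{
import org.apache.spark.SparkConf

val conf = new SparkConf().set("spark.network.timeout", "120s")
val timeoutSeconds = conf.getTimeAsSeconds("spark.network.timeout")   // 120

// The hypothetical key below is not set, so the getter throws NoSuchElementException.
val maybeSize = try {
  Some(conf.getSizeAsMb("spark.hypothetical.buffer"))
} catch {
  case _: NoSuchElementException => None
}
}}}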
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 872c46ab689e1..bd1f1683cf82c 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -2061,8 +2061,9 @@ class SparkContext(config: SparkConf) extends Logging {
* Cancel a given job if it's scheduled or running.
*
* @param jobId the job ID to cancel
- * @note Throws `InterruptedException` if the cancel message cannot be sent
+ * @throws InterruptedException if the cancel message cannot be sent
*/
+ @throws(classOf[InterruptedException])
def cancelJob(jobId: Int) {
dagScheduler.cancelJob(jobId)
}
@@ -2071,8 +2072,9 @@ class SparkContext(config: SparkConf) extends Logging {
* Cancel a given stage and all jobs associated with it.
*
* @param stageId the stage ID to cancel
- * @note Throws `InterruptedException` if the cancel message cannot be sent
+ * @throws InterruptedException if the cancel message cannot be sent
*/
+ @throws(classOf[InterruptedException])
def cancelStage(stageId: Int) {
dagScheduler.cancelStage(stageId)
}
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index f051860a23b65..252f754e0e36e 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2354,8 +2354,9 @@ private[spark] object Utils extends Logging {
* A spark url (`spark://host:port`) is a special URI that its scheme is `spark` and only contains
* host and port.
*
- * @note Throws `SparkException` if sparkUrl is invalid.
+ * @throws org.apache.spark.SparkException if sparkUrl is invalid.
*/
+ @throws(classOf[SparkException])
def extractHostPortFromSparkUrl(sparkUrl: String): (String, Int) = {
try {
val uri = new java.net.URI(sparkUrl)
From 246bef3ce61f7fe74a97577e8cfe4a3a1531f1ff Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sun, 27 Nov 2016 01:58:49 +0900
Subject: [PATCH 11/17] Print < and > properly within this PR
---
.../apache/spark/rdd/DoubleRDDFunctions.scala | 6 ++-
.../apache/spark/rdd/PairRDDFunctions.scala | 8 ++--
.../spark/scheduler/InputFormatInfo.scala | 4 +-
.../spark/storage/BlockManagerMessages.scala | 2 +-
.../spark/util/random/SamplingUtils.scala | 19 ++++++--
.../util/random/StratifiedSamplingUtils.scala | 32 ++++++++-----
.../spark/streaming/kafka/KafkaCluster.scala | 20 ++++++--
.../spark/streaming/kafka/KafkaUtils.scala | 10 ++--
.../org/apache/spark/ml/clustering/LDA.scala | 4 +-
.../apache/spark/ml/feature/Bucketizer.scala | 2 +-
.../spark/ml/feature/CountVectorizer.scala | 5 +-
.../apache/spark/ml/feature/HashingTF.scala | 2 +-
.../org/apache/spark/ml/feature/NGram.scala | 2 +-
.../apache/spark/ml/feature/Normalizer.scala | 2 +-
.../ml/feature/PolynomialExpansion.scala | 3 +-
.../ml/feature/QuantileDiscretizer.scala | 2 +-
.../spark/ml/feature/SQLTransformer.scala | 4 +-
.../apache/spark/ml/feature/Tokenizer.scala | 2 +-
.../spark/ml/feature/VectorIndexer.scala | 7 +--
.../org/apache/spark/ml/param/params.scala | 48 +++++++++++++++----
.../ml/regression/AFTSurvivalRegression.scala | 2 +-
.../GeneralizedLinearRegression.scala | 10 ++--
.../ml/regression/LinearRegression.scala | 9 ++--
.../ml/tree/impl/DecisionTreeMetadata.scala | 2 +-
.../apache/spark/ml/util/MetadataUtils.scala | 2 +-
.../spark/mllib/clustering/LDAOptimizer.scala | 5 +-
.../mllib/evaluation/RankingMetrics.scala | 6 +--
.../linalg/EigenValueDecomposition.scala | 5 +-
.../spark/mllib/random/RandomRDDs.scala | 8 ++--
.../spark/mllib/tree/DecisionTree.scala | 6 +--
.../spark/mllib/tree/RandomForest.scala | 19 ++++----
.../tree/configuration/BoostingStrategy.scala | 6 +--
.../mllib/tree/configuration/Strategy.scala | 2 +-
.../spark/sql/DataFrameStatFunctions.scala | 6 ++-
.../org/apache/spark/sql/RuntimeConfig.scala | 3 +-
.../org/apache/spark/sql/SQLContext.scala | 2 +-
.../org/apache/spark/sql/functions.scala | 26 ++++++----
.../apache/spark/sql/sources/interfaces.scala | 7 ++-
38 files changed, 205 insertions(+), 105 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
index f4bc3e3021447..5ebddf8b41a47 100644
--- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
@@ -152,10 +152,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
/**
* Compute a histogram using the provided buckets. The buckets are all open
- * to the right except for the last which is closed
+ * to the right except for the last which is closed.
+ * {{{
* e.g. for the array
* [1, 10, 20, 50] the buckets are [1, 10) [10, 20) [20, 50]
- * e.g 1<=x<10 , 10<=x<20, 20<=x<=50
+ * e.g. 1<=x<10, 10<=x<20, 20<=x<=50
+ * }}}
* And on the input of 1 and 50 we would have a histogram of 1, 0, 1
*
* @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched
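A minimal sketch of the bucket semantics above, assuming an existing SparkContext `sc` (values are illustrative):

{{{
// Buckets [1, 10), [10, 20), [20, 50]; inputs 1.0 and 50.0 fall into the first and last buckets.
val data = sc.parallelize(Seq(1.0, 50.0))
data.histogram(Array(1.0, 10.0, 20.0, 50.0))   // Array(1L, 0L, 1L)
}}}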
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index b31d21c354d11..aad99e3eb2c5b 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -399,9 +399,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* here.
*
- * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (sp > p)
- * would trigger sparse representation of registers, which may reduce the memory consumption
- * and increase accuracy when the cardinality is small.
+ * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp` (greater
+ * than `p`) would trigger sparse representation of registers, which may reduce the memory
+ * consumption and increase accuracy when the cardinality is small.
*
* @param p The precision value for the normal set.
* `p` must be a value between 4 and `sp` if `sp` is not zero (32 max).
@@ -908,7 +908,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
* Return an RDD with the pairs from `this` whose keys are not in `other`.
*
* Uses `this` partitioner/partition size, because even if `other` is huge, the resulting
- * RDD will be <= us.
+ * RDD will be no larger than this one.
*/
def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)] = self.withScope {
subtractByKey(other, self.partitioner.getOrElse(new HashPartitioner(self.partitions.length)))
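A short sketch of the two doc snippets above, assuming an existing SparkContext `sc`; the precision value 14 is arbitrary:

{{{
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
val other = sc.parallelize(Seq(("b", 99)))
pairs.subtractByKey(other).keys.collect()   // keys "a" and "c"; never larger than `pairs`
// Relative accuracy of countApproxDistinct: 1.054 / sqrt(2^p); p = 14 gives roughly 0.0082.
pairs.keys.countApproxDistinct(14, 0)       // approximate number of distinct keys
}}}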
diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala
index 5f23d657e1155..a80e45e2c3c25 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala
@@ -151,9 +151,10 @@ object InputFormatInfo {
Computes the preferred locations based on input(s) and returned a location to block map.
Typical use of this method for allocation would follow some algo like this:
+ {{{
a) For each host, count number of splits hosted on that host.
b) Decrement the currently allocated containers on that host.
- c) Compute rack info for each host and update rack -> count map based on (b).
+ c) Compute rack info for each host and update rack -> count map based on (b).
d) Allocate nodes based on (c)
e) On the allocation result, ensure that we don't allocate "too many" jobs on a single node
(even if data locality on that is very high) : this is to prevent fragility of job if a
@@ -162,6 +163,7 @@ object InputFormatInfo {
go to (a) until required nodes are allocated.
If a node 'dies', follow same procedure.
+ }}}
PS: I know the wording here is weird, hopefully it makes some sense !
*/
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
index ce82e43b2d58b..d71acbb4cf771 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala
@@ -43,7 +43,7 @@ private[spark] object BlockManagerMessages {
extends ToBlockManagerSlave
/**
- * Driver -> Executor message to trigger a thread dump.
+ * Driver to Executor message to trigger a thread dump.
*/
case object TriggerThreadDump extends ToBlockManagerSlave
diff --git a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala
index 1099747444be5..cc467cabcd037 100644
--- a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala
@@ -67,10 +67,12 @@ private[spark] object SamplingUtils {
}
/**
- * Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of
- * the time.
+ * Returns a sampling rate that guarantees a sample of size greater than or equal to
+ * sampleSizeLowerBound 99.99% of the time.
*
* How the sampling rate is determined:
+ *
+ * {{{
* Let p = num / total, where num is the sample size and total is the total number of
* datapoints in the RDD. We're trying to compute q > p such that
* - when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), where
@@ -81,6 +83,7 @@ private[spark] object SamplingUtils {
* - when sampling without replacement, we're drawing each datapoint with prob_i
* ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
* rate, where success rate is defined the same as in sampling with replacement.
+ * }}}
*
* The smallest sampling rate supported is 1e-10 (in order to avoid running into the limit of the
* RNG's resolution).
@@ -108,14 +111,22 @@ private[spark] object SamplingUtils {
private[spark] object PoissonBounds {
/**
- * Returns a lambda such that Pr[X > s] is very small, where X ~ Pois(lambda).
+ * Returns a lambda such that
+ * {{{
+ * Pr[X > s]
+ * }}}
+ * is very small, where X ~ Pois(lambda).
*/
def getLowerBound(s: Double): Double = {
math.max(s - numStd(s) * math.sqrt(s), 1e-15)
}
/**
- * Returns a lambda such that Pr[X < s] is very small, where X ~ Pois(lambda).
+ * Returns a lambda such that
+ * {{{
+ * Pr[X < s]
+ * }}}
+ * is very small, where X ~ Pois(lambda).
*
* @param s sample size
*/
diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
index debca177155cd..ab4b125d2ac34 100644
--- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala
@@ -36,12 +36,13 @@ import org.apache.spark.rdd.RDD
* desired sample size for each stratum.
*
* Like in simple random sampling, we generate a random value for each item from the uniform
- * distribution [0.0, 1.0]. All items with values <= min(values of items in the waitlist)
- * are accepted into the sample instantly. The threshold for instant accept is designed so that
- * s - numAccepted = O(sqrt(s)), where s is again the desired sample size. Thus, by maintaining a
- * waitlist size = O(sqrt(s)), we will be able to create a sample of the exact size s by adding
- * a portion of the waitlist to the set of items that are instantly accepted. The exact threshold
- * is computed by sorting the values in the waitlist and picking the value at (s - numAccepted).
+ * distribution [0.0, 1.0]. All items with values less than or equal to min(values of items in the
+ * waitlist) are accepted into the sample instantly. The threshold for instant accept is designed
+ * so that s - numAccepted = O(sqrt(s)), where s is again the desired sample size. Thus, by
+ * maintaining a waitlist size = O(sqrt(s)), we will be able to create a sample of the exact size
+ * s by adding a portion of the waitlist to the set of items that are instantly accepted. The exact
+ * threshold is computed by sorting the values in the waitlist and picking the value at
+ * (s - numAccepted).
*
* Note that since we use the same seed for the RNG when computing the thresholds and the actual
* sample, our computed thresholds are guaranteed to produce the desired sample size.
@@ -162,12 +163,19 @@ private[spark] object StratifiedSamplingUtils extends Logging {
* it to the number of items that were accepted instantly and the number of items in the waitlist
* for that stratum.
*
- * Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted), which
- * means we need to sort the elements in the waitlist by their associated values in order to find
- * the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize.
- * Note that all elements in the waitlist have values >= bound for instant accept, so a T value
- * in the waitlist range would allow all elements that were instantly accepted on the first pass
- * to be included in the sample.
+ * Most of the time,
+ * {{{
+ * numAccepted <= sampleSize <= (numAccepted + numWaitlisted)
+ * }}},
+ * which means we need to sort the elements in the waitlist by their associated values in order
+ * to find the value T s.t.
+ * {{{
+ * |{elements in the stratum whose associated values <= T}| = sampleSize
+ * }}}.
+ *
+ * Note that all elements in the waitlist have values greater than or equal to bound for instant
+ * accept, so a T value in the waitlist range would allow all elements that were instantly
+ * accepted on the first pass to be included in the sample.
*/
def computeThresholdByKey[K](finalResult: Map[K, AcceptanceResult],
fractions: Map[K, Double]): Map[K, Double] = {
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
index c419221aa607a..a27490aad0f60 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
@@ -231,7 +231,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
// this 0 here indicates api version, in this case the original ZK backed api.
private def defaultConsumerApiVersion: Short = 0
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /**
+ * Requires Kafka 0.8.1.1 or higher.
+ * Defaults to the original ZooKeeper backed api version.
+ */
def getConsumerOffsets(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
@@ -250,7 +253,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
}
}
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /**
+ * Requires Kafka 0.8.1.1 or higher.
+ * Defaults to the original ZooKeeper backed api version.
+ */
def getConsumerOffsetMetadata(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
@@ -287,7 +293,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
Left(errs)
}
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /**
+ * Requires Kafka 0.8.1.1 or higher.
+ * Defaults to the original ZooKeeper backed api version.
+ */
def setConsumerOffsets(
groupId: String,
offsets: Map[TopicAndPartition, Long]
@@ -305,7 +314,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
setConsumerOffsetMetadata(groupId, meta, consumerApiVersion)
}
- /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */
+ /**
+ * Requires Kafka 0.8.1.1 or higher.
+ * Defaults to the original ZooKeeper backed api version.
+ */
def setConsumerOffsetMetadata(
groupId: String,
metadata: Map[TopicAndPartition, OffsetAndMetadata]
diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
index 437c797e55605..41d3b728110ed 100644
--- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
+++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
@@ -47,7 +47,7 @@ object KafkaUtils {
* @param ssc StreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..)
* @param groupId The group id for this consumer
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name, numPartitions) to consume. Each partition is consumed
* in its own thread
* @param storageLevel Storage level to use for storing the received objects
* (default: StorageLevel.MEMORY_AND_DISK_SER_2)
@@ -72,7 +72,7 @@ object KafkaUtils {
* @param ssc StreamingContext object
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is
+ * @param topics Map of (topic_name, numPartitions) to consume. Each partition is
* consumed in its own thread.
* @param storageLevel Storage level to use for storing the received objects
* @tparam K type of Kafka message key
@@ -97,7 +97,7 @@ object KafkaUtils {
* @param jssc JavaStreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..)
* @param groupId The group id for this consumer
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name, numPartitions) to consume. Each partition is consumed
* in its own thread
* @return DStream of (Kafka message key, Kafka message value)
*/
@@ -115,7 +115,7 @@ object KafkaUtils {
* @param jssc JavaStreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..).
* @param groupId The group id for this consumer.
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name, numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel RDD storage level.
* @return DStream of (Kafka message key, Kafka message value)
@@ -140,7 +140,7 @@ object KafkaUtils {
* @param valueDecoderClass Type of kafka value decoder
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
- * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
+ * @param topics Map of (topic_name, numPartitions) to consume. Each partition is consumed
* in its own thread
* @param storageLevel RDD storage level.
* @tparam K type of Kafka message key
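As a hedged sketch of the `topics` parameter described above, assuming an existing StreamingContext `ssc`; the quorum, group id and topic names are placeholders:

{{{
import org.apache.spark.streaming.kafka.KafkaUtils
// Each topic maps to the number of partitions (threads) that consume it.
val topics = Map("pageviews" -> 2, "clicks" -> 1)
val stream = KafkaUtils.createStream(ssc, "zk1:2181,zk2:2181", "my-consumer-group", topics)
}}}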
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 6032ab3db9350..865615ef4dc98 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -120,11 +120,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*
* Optimizer-specific parameter settings:
* - EM
- * - Value should be > 1.0
+ * - Value should be greater than 1.0
* - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows
* Asuncion et al. (2009), who recommend a +1 adjustment for EM.
* - Online
- * - Value should be >= 0
+ * - Value should be greater than or equal to 0
* - default = (1.0 / k), following the implementation from
* here.
*
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 546643d8f91f7..260159f8b7ac4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -44,7 +44,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
/**
* Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
* A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
- * also includes y. Splits should be of length >= 3 and strictly increasing.
+ * also includes y. Splits should be of length greater than or equal to 3 and strictly increasing.
* Values at -inf, inf must be explicitly provided to cover all Double values;
* otherwise, values outside the splits specified will be treated as errors.
*
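A minimal sketch of the splits requirement described above (at least three values, strictly increasing, with -inf/inf covering all doubles); the column names are illustrative:

{{{
import org.apache.spark.ml.feature.Bucketizer
val bucketizer = new Bucketizer()
  .setInputCol("rawFeature")
  .setOutputCol("bucketedFeature")
  .setSplits(Array(Double.NegativeInfinity, 0.0, 10.0, Double.PositiveInfinity))   // 3 buckets
}}}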
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index cc0924fd95b08..773f5daf70a8d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -53,8 +53,9 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/**
* Specifies the minimum number of different documents a term must appear in to be included
* in the vocabulary.
- * If this is an integer >= 1, this specifies the number of documents the term must appear in;
- * if this is a double in [0,1), then this specifies the fraction of documents.
+ * If this is an integer greater than or equal to 1, this specifies the number of documents
+ * the term must appear in; if this is a double in [0,1), then this specifies the fraction
+ * of documents.
*
* Default: 1.0
* @group param
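To illustrate the integer-vs-fraction behaviour of minDF described above (a sketch; column names are illustrative):

{{{
import org.apache.spark.ml.feature.CountVectorizer
// minDF = 2: keep only terms that appear in at least 2 documents.
new CountVectorizer().setInputCol("words").setOutputCol("features").setMinDF(2)
// minDF = 0.5: keep only terms that appear in at least half of the documents.
new CountVectorizer().setInputCol("words").setOutputCol("features").setMinDF(0.5)
}}}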
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 8f60ec8788fac..db432b6fefaff 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -52,7 +52,7 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String)
def setOutputCol(value: String): this.type = set(outputCol, value)
/**
- * Number of features. Should be > 0.
+ * Number of features. Should be greater than 0.
* (default = 2^18^)
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
index c424aaa1f5634..c8760f9dc178f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
@@ -41,7 +41,7 @@ class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String)
def this() = this(Identifiable.randomUID("ngram"))
/**
- * Minimum n-gram length, >= 1.
+ * Minimum n-gram length, greater than or equal to 1.
* Default: 2, bigram features
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
index 629702051d426..6e96545c8cb7a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
@@ -37,7 +37,7 @@ class Normalizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
def this() = this(Identifiable.randomUID("normalizer"))
/**
- * Normalization in L^p^ space. Must be >= 1.
+ * Normalization in L^p^ space. Must be greater than or equal to 1.
* (default: p = 2)
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 74526a8260a0d..292f9496a456c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -45,7 +45,8 @@ class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: Str
def this() = this(Identifiable.randomUID("poly"))
/**
- * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion.
+ * The polynomial degree to expand, which should be greater than or equal to 1. A value of 1 means
+ * no expansion.
* Default: 2
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index a1cceef51d30a..d8f33cd768dcd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -35,7 +35,7 @@ private[feature] trait QuantileDiscretizerBase extends Params
/**
* Number of buckets (quantiles, or categories) into which data points are grouped. Must
- * be >= 2.
+ * be greater than or equal to 2.
*
* See also [[handleInvalid]], which can optionally create an additional bucket for NaN values.
*
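A brief sketch of the numBuckets constraint noted above (must be at least 2); column names are illustrative:

{{{
import org.apache.spark.ml.feature.QuantileDiscretizer
val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("hourBucket")
  .setNumBuckets(3)   // any value >= 2 is accepted
}}}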
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
index c4398bebf9be6..a82aba10b5b1a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -32,9 +32,11 @@ import org.apache.spark.sql.types.StructType
* the output, it can be any select clause that Spark SQL supports. Users can also
* use Spark SQL built-in function and UDFs to operate on these selected columns.
* For example, [[SQLTransformer]] supports statements like:
+ * {{{
* - SELECT a, a + b AS a_b FROM __THIS__
- * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
+ * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
* - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
+ * }}}
*/
@Since("1.6.0")
class SQLTransformer @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Transformer
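The statement forms listed above can be used roughly as follows (the statement text is illustrative):

{{{
import org.apache.spark.ml.feature.SQLTransformer
val sqlTrans = new SQLTransformer()
  .setStatement("SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ WHERE a > 5")
}}}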
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index bf2b1d7c0f777..cfaf6c0e610b3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -70,7 +70,7 @@ class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
def this() = this(Identifiable.randomUID("regexTok"))
/**
- * Minimum token length, >= 0.
+ * Minimum token length, greater than or equal to 0.
* Default: 1, to avoid returning empty strings
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 0ae9f264f4a8a..16bcc22f4a631 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -41,8 +41,8 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
/**
* Threshold for the number of values a categorical feature can take.
- * If a feature is found to have > maxCategories values, then it is declared continuous.
- * Must be >= 2.
+ * If a feature is found to have more than maxCategories values, then it is declared
+ * continuous. Must be greater than or equal to 2.
*
* (default = 20)
* @group param
@@ -76,7 +76,8 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
* - Warning: This can cause problems if features are continuous since this will collect ALL
* unique values to the driver.
* - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
- * If maxCategories >= 3, then both features will be declared categorical.
+ * If maxCategories is greater than or equal to 3, then both features will be declared
+ * categorical.
*
* This returns a model which can transform categorical features to use 0-based indices.
*
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 4850a9e43f91c..7481f1d57c956 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -165,32 +165,64 @@ object ParamValidators {
s" of unexpected input type: ${value.getClass}")
}
- /** Check if value > lowerBound */
+ /**
+ * Check if
+ * {{{
+ * value > lowerBound
+ * }}}
+ */
def gt[T](lowerBound: Double): T => Boolean = { (value: T) =>
getDouble(value) > lowerBound
}
- /** Check if value >= lowerBound */
+ /**
+ * Check if
+ * {{{
+ * value >= lowerBound
+ * }}}
+ */
def gtEq[T](lowerBound: Double): T => Boolean = { (value: T) =>
getDouble(value) >= lowerBound
}
- /** Check if value < upperBound */
+ /**
+ * Check if
+ * {{{
+ * value < upperBound
+ * }}}
+ */
def lt[T](upperBound: Double): T => Boolean = { (value: T) =>
getDouble(value) < upperBound
}
- /** Check if value <= upperBound */
+ /**
+ * Check if
+ * {{{
+ * value <= upperBound
+ * }}}
+ */
def ltEq[T](upperBound: Double): T => Boolean = { (value: T) =>
getDouble(value) <= upperBound
}
/**
* Check for value in range lowerBound to upperBound.
- * @param lowerInclusive If true, check for value >= lowerBound.
- * If false, check for value > lowerBound.
- * @param upperInclusive If true, check for value <= upperBound.
- * If false, check for value < upperBound.
+ * @param lowerInclusive If true, check for
+ * {{{
+ * value >= lowerBound
+ * }}}
+ * If false, check for
+ * {{{
+ * value > lowerBound
+ * }}}
+ * @param upperInclusive If true, check for
+ * {{{
+ * value <= upperBound
+ * }}}
+ * If false, check for
+ * {{{
+ * value < upperBound
+ * }}}
*/
def inRange[T](
lowerBound: Double,
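The validators documented above are plain `T => Boolean` functions; a tiny sketch of how they evaluate (the sample values are arbitrary):

{{{
import org.apache.spark.ml.param.ParamValidators
ParamValidators.gtEq[Double](1.0)(2.0)           // true:  2.0 >= 1.0
ParamValidators.lt[Double](5.0)(5.0)             // false: 5.0 is not < 5.0
ParamValidators.inRange[Double](0.0, 1.0)(0.5)   // true:  both bounds inclusive by default
}}}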
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index ede859db19d5f..af68e7b9d5809 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -185,7 +185,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
setDefault(tol -> 1E-6)
/**
- * Suggested depth for treeAggregate (>= 2).
+ * Suggested depth for treeAggregate (greater than or equal to 2).
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 440eacd13fd32..13b80bdbe8ec7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -131,10 +131,12 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
* It supports "gaussian", "binomial", "poisson" and "gamma" as family.
* Valid link functions for each family is listed below. The first link function of each family
* is the default one.
- * - "gaussian" -> "identity", "log", "inverse"
- * - "binomial" -> "logit", "probit", "cloglog"
- * - "poisson" -> "log", "identity", "sqrt"
- * - "gamma" -> "inverse", "identity", "log"
+ * {{{
+ * - "gaussian" -> "identity", "log", "inverse"
+ * - "binomial" -> "logit", "probit", "cloglog"
+ * - "poisson" -> "log", "identity", "sqrt"
+ * - "gamma" -> "inverse", "identity", "log"
+ * }}}
*/
@Experimental
@Since("2.0.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 556e48a604ea7..9ba664ea9b186 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -118,8 +118,11 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
/**
* Set the ElasticNet mixing parameter.
- * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
- * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
+ * {{{
+ * For alpha = 0, the penalty is an L2 penalty.
+ * For alpha = 1, it is an L1 penalty.
+ * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
+ * }}}
* Default is 0.0 which is an L2 penalty.
*
* @group setParam
@@ -181,7 +184,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
setDefault(solver -> "auto")
/**
- * Suggested depth for treeAggregate (>= 2).
+ * Suggested depth for treeAggregate (greater than or equal to 2).
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
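As a short illustration of the ElasticNet mixing parameter described above (the regParam value is arbitrary):

{{{
import org.apache.spark.ml.regression.LinearRegression
new LinearRegression().setRegParam(0.3).setElasticNetParam(0.0)   // pure L2 penalty
new LinearRegression().setRegParam(0.3).setElasticNetParam(1.0)   // pure L1 penalty
new LinearRegression().setRegParam(0.3).setElasticNetParam(0.5)   // an equal mix of L1 and L2
}}}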
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala
index 9d7a3bd07abd3..70926a4ecb778 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala
@@ -35,7 +35,7 @@ import org.apache.spark.rdd.RDD
* @param numClasses For classification: labels can take values {0, ..., numClasses - 1}.
* For regression: fixed at 0 (no meaning).
* @param maxBins Maximum number of bins, for all features.
- * @param featureArity Map: categorical feature index --> arity.
+ * @param featureArity Map: categorical feature index to arity.
* I.e., the feature takes values in {0, ..., arity - 1}.
* @param numBins Number of bins for each feature.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
index 5e081cce0651e..e80ca6b73c292 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala
@@ -48,7 +48,7 @@ private[spark] object MetadataUtils {
* If a feature does not have metadata, it is assumed to be continuous.
* If a feature is Nominal, then it must have the number of values
* specified.
- * @return Map: feature index --> number of categories.
+ * @return Map: feature index to number of categories.
* The map's set of keys will be the set of categorical feature indices.
*/
def getCategoricalFeatures(featuresSchema: StructField): Map[Int, Int] = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index c65fce4ef11ae..47562c377a8e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -352,7 +352,10 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
*
* @note This should be adjusted in synch with `LDA.setMaxIterations()`
* so the entire corpus is used. Specifically, set both so that
- * maxIterations * miniBatchFraction >= 1.
+ *
+ * {{{
+ * maxIterations * miniBatchFraction >= 1.
+ * }}}
*
* Default: 0.05, i.e., 5% of total documents.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index cedfdbf0dc127..b98aa0534152b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -41,9 +41,9 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]
/**
* Compute the average precision of all the queries, truncated at ranking position k.
*
- * If for a query, the ranking algorithm returns n (n < k) results, the precision value will be
- * computed as #(relevant items retrieved) / k. This formula also applies when the size of the
- * ground truth set is less than k.
+ * If, for a query, the ranking algorithm returns n (n is less than k) results, the precision
+ * value will be computed as #(relevant items retrieved) / k. This formula also applies when
+ * the size of the ground truth set is less than k.
*
* If a query has an empty ground truth set, zero will be used as precision together with
* a log warning.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index 7a1d2577c20e0..5c55fe8b08fb2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -32,7 +32,10 @@ private[mllib] object EigenValueDecomposition {
*
* @param mul a function that multiplies the symmetric matrix with a DenseVector.
* @param n dimension of the square matrix (maximum Int.MaxValue).
- * @param k number of leading eigenvalues required, 0 < k < n.
+ * @param k number of leading eigenvalues required,
+ * {{{
+ * 0 < k < n
+ * }}}.
* @param tol tolerance of the eigs computation.
* @param maxIterations the maximum number of Arnoldi update iterations.
* @return a dense vector of eigenvalues in descending order and a dense matrix of eigenvectors
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index b2e37bad3cf69..85d4d7f37f2c0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -249,8 +249,8 @@ object RandomRDDs {
* shape and scale.
*
* @param sc SparkContext used to create the RDD.
- * @param shape shape parameter (> 0) for the gamma distribution
- * @param scale scale parameter (> 0) for the gamma distribution
+ * @param shape shape parameter (greater than 0) for the gamma distribution
+ * @param scale scale parameter (greater than 0) for the gamma distribution
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
@@ -766,8 +766,8 @@ object RandomRDDs {
* gamma distribution with the input shape and scale.
*
* @param sc SparkContext used to create the RDD.
- * @param shape shape parameter (> 0) for the gamma distribution.
- * @param scale scale parameter (> 0) for the gamma distribution.
+ * @param shape shape parameter (greater than 0) for the gamma distribution.
+ * @param scale scale parameter (greater than 0) for the gamma distribution.
* @param numRows Number of Vectors in the RDD.
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 95b155b037194..4ab63dab4d897 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -162,7 +162,7 @@ object DecisionTree extends Serializable with Logging {
* @param numClasses Number of classes for classification. Default value of 2.
* @param maxBins Maximum number of bins used for splitting features.
* @param quantileCalculationStrategy Algorithm for calculating quantiles.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -&gt; k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n, k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @return DecisionTreeModel that can be used for prediction.
@@ -192,7 +192,7 @@ object DecisionTree extends Serializable with Logging {
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels should take values {0, 1, ..., numClasses-1}.
* @param numClasses Number of classes for classification.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n, k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param impurity Criterion used for information gain calculation.
@@ -238,7 +238,7 @@ object DecisionTree extends Serializable with Logging {
*
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels are real numbers.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n, k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param impurity Criterion used for information gain calculation.
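A sketch of the categoricalFeaturesInfo map described above, assuming a training RDD `input` of LabeledPoint; the entries are illustrative:

{{{
import org.apache.spark.mllib.tree.DecisionTree
// Feature 0 is categorical with 2 categories {0, 1}; feature 4 has 10 categories {0, ..., 9}.
val categoricalFeaturesInfo = Map(0 -> 2, 4 -> 10)
// Args: numClasses = 2, impurity = "gini", maxDepth = 5, maxBins = 32.
val model = DecisionTree.trainClassifier(input, 2, categoricalFeaturesInfo, "gini", 5, 32)
}}}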
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 81c1bb27ea207..74f1a6edb26c6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -53,14 +53,15 @@ import org.apache.spark.util.Utils
* the type of random forest (classification or regression), feature type
* (continuous, categorical), depth of the tree, quantile calculation strategy,
* etc.
- * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
+ * @param numTrees If 1, then no bootstrapping is used. If greater than 1, then bootstrapping is
+ * done.
* @param featureSubsetStrategy Number of features to consider for splits at each node.
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* Supported numerical values: "(0.0-1.0]", "[1-n]".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "sqrt" for classification and
- * to "onethird" for regression.
+ * if numTrees is greater than 1 (forest) set to "sqrt" for
+ * classification and to "onethird" for regression.
* If a real value "n" in the range (0, 1.0] is set,
* use n * number of features.
* If an integer value "n" in the range (1, num features) is set,
@@ -111,7 +112,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "sqrt".
+ * if numTrees is greater than 1 (forest) set to "sqrt".
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return RandomForestModel that can be used for prediction.
*/
@@ -134,7 +135,7 @@ object RandomForest extends Serializable with Logging {
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels should take values {0, 1, ..., numClasses-1}.
* @param numClasses Number of classes for classification.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n, k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param numTrees Number of trees in the random forest.
@@ -142,7 +143,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "sqrt".
+ * if numTrees is greater than 1 (forest) set to "sqrt".
* @param impurity Criterion used for information gain calculation.
* Supported values: "gini" (recommended) or "entropy".
* @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means
@@ -200,7 +201,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "onethird".
+ * if numTrees is greater than 1 (forest) set to "onethird".
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return RandomForestModel that can be used for prediction.
*/
@@ -222,7 +223,7 @@ object RandomForest extends Serializable with Logging {
*
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* Labels are real numbers.
- * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+ * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n, k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param numTrees Number of trees in the random forest.
@@ -230,7 +231,7 @@ object RandomForest extends Serializable with Logging {
* Supported values: "auto", "all", "sqrt", "log2", "onethird".
* If "auto" is set, this parameter is set based on numTrees:
* if numTrees == 1, set to "all";
- * if numTrees > 1 (forest) set to "onethird".
+ * if numTrees is greater than 1 (forest) set to "onethird".
* @param impurity Criterion used for information gain calculation.
* The only supported value for regression is "variance".
* @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 8c7222815ea7a..d26c9f9d2e228 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -36,11 +36,11 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, Loss, SquaredError}
* @param validationTol validationTol is a condition which decides iteration termination when
* runWithValidation is used.
* The end of iteration is decided based on below logic:
- * If the current loss on the validation set is > 0.01, the diff
+ * If the current loss on the validation set is greater than 0.01, the diff
* of validation error is compared to relative tolerance which is
* validationTol * (current loss on the validation set).
- * If the current loss on the validation set is <= 0.01, the diff
- * of validation error is compared to absolute tolerance which is
+ * If the current loss on the validation set is less than or equal to 0.01,
+ * the diff of validation error is compared to absolute tolerance which is
* validationTol * 0.01.
* Ignored when
* `org.apache.spark.mllib.tree.GradientBoostedTrees.run()` is used.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
index b4c1e45596d51..b8450b722d82c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -45,7 +45,7 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
* @param quantileCalculationStrategy Algorithm for calculating quantiles. Supported:
* `org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort`
* @param categoricalFeaturesInfo A map storing information about the categorical variables and the
- * number of discrete values they take. An entry (n -> k)
+ * number of discrete values they take. An entry (n, k)
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @param minInstancesPerNode Minimum number of instances each child must have after split.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index f27ca9aeb9235..e23ca339a8f8f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -44,7 +44,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* of `x` is close to (p * N).
* More precisely,
*
- * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+ * {{{
+ * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+ * }}}
*
* This method implements a variation of the Greenwald-Khanna algorithm (with some speed
* optimizations).
@@ -55,7 +57,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param probabilities a list of quantile probabilities
* Each number must belong to [0, 1].
* For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
- * @param relativeError The relative target precision to achieve (>= 0).
+ * @param relativeError The relative target precision to achieve (greater than or equal to 0).
* If set to zero, the exact quantiles are computed, which could be very expensive.
* Note that values greater than 1 are accepted but give the same result as 1.
* @return the approximate quantiles at the given probabilities
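A short sketch of the guarantee stated above, assuming a DataFrame `df` with a numeric column "x" of N rows; the error value is illustrative:

{{{
// With relativeError = 0.01, each returned value x satisfies
// floor((p - 0.01) * N) <= rank(x) <= ceil((p + 0.01) * N).
val quartiles = df.stat.approxQuantile("x", Array(0.25, 0.5, 0.75), 0.01)
}}}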
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala
index 43684abc13629..edfcd7d56dc8b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala
@@ -65,7 +65,8 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) {
/**
* Returns the value of Spark runtime configuration property for the given key.
*
- * @note Throws `NoSuchElementException` if the key is not set and does not have a default value
+ * @throws java.util.NoSuchElementException if the key is not set and does not have a default
+ * value
* @since 2.0.0
*/
@throws[NoSuchElementException]("if the key is not set")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index a7bc7c68270f6..3b229e340efcb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -883,7 +883,7 @@ class SQLContext private[sql](val sparkSession: SparkSession)
}
/**
- * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the
+ * Loads a JavaRDD[String] storing JSON objects (one object per record) and applies the
* given schema, returning the result as a `DataFrame`.
*
* @group specificdata
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 93e7229b20c1f..650439a193015 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -474,7 +474,9 @@ object functions {
/**
* Aggregate function: returns the level of grouping, equals to
*
- * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
+ * {{{
+ * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
+ * }}}
*
* @note The list of columns should match with grouping columns exactly, or empty (means all the
* grouping columns).
@@ -487,7 +489,9 @@ object functions {
/**
* Aggregate function: returns the level of grouping, equals to
*
- * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
+ * {{{
+ * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
+ * }}}
*
* @note The list of columns should match with grouping columns exactly.
*
@@ -1050,7 +1054,10 @@ object functions {
*
* As an example, consider a `DataFrame` with two partitions, each with 3 records.
* This expression would return the following IDs:
- * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+ *
+ * {{{
+ * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+ * }}}
*
* @group normal_funcs
* @since 1.4.0
@@ -1068,7 +1075,10 @@ object functions {
*
* As an example, consider a `DataFrame` with two partitions, each with 3 records.
* This expression would return the following IDs:
- * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+ *
+ * {{{
+ * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+ * }}}
*
* @group normal_funcs
* @since 1.6.0
@@ -1846,8 +1856,8 @@ object functions {
def round(e: Column): Column = round(e, 0)
/**
- * Round the value of `e` to `scale` decimal places if `scale` >= 0
- * or at integral part when `scale` < 0.
+ * Round the value of `e` to `scale` decimal places if `scale` is greater than or equal to 0
+ * or at integral part when `scale` is less than 0.
*
* @group math_funcs
* @since 1.5.0
@@ -1864,7 +1874,7 @@ object functions {
/**
* Round the value of `e` to `scale` decimal places with HALF_EVEN round mode
- * if `scale` >= 0 or at integral part when `scale` < 0.
+ * if `scale` is greater than or equal to 0 or at integral part when `scale` is less than 0.
*
* @group math_funcs
* @since 2.0.0
@@ -2172,7 +2182,7 @@ object functions {
* and returns the result as a string column.
*
* If d is 0, the result has no decimal point or fractional part.
- * If d < 0, the result will be null.
+ * If d is less than 0, the result will be null.
*
* @group string_funcs
* @since 1.5.0
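A few of the functions touched above, applied to an illustrative column "x" (assuming `import org.apache.spark.sql.functions._`):

{{{
round(col("x"), 2)               // scale >= 0: round to 2 decimal places
round(col("x"), -1)              // scale < 0: round at the tens digit of the integral part
format_number(col("x"), 0)       // d = 0: no decimal point or fractional part
format_number(col("x"), -1)      // d < 0: result is null
monotonically_increasing_id()    // e.g. 0, 1, 2, 8589934592 (1L << 33), ... across 2 partitions
}}}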
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index a1ea748de98f9..46a90d67bded3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -216,8 +216,11 @@ abstract class BaseRelation {
/**
* Whether does it need to convert the objects in Row to internal representation, for example:
- * java.lang.String -> UTF8String
- * java.lang.Decimal -> Decimal
+ *
+ * {{{
+ * java.lang.String -> UTF8String
+ * java.lang.Decimal -> Decimal
+ * }}}
*
* If `needConversion` is `false`, buildScan() should return an `RDD` of `InternalRow`
*
From a2a2011da220b272121b01734bbf640567ef6ae3 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sun, 27 Nov 2016 02:57:18 +0900
Subject: [PATCH 12/17] Fix other existing ones for < and >
---
.../org/apache/spark/api/java/JavaRDD.scala | 8 ++++---
.../main/scala/org/apache/spark/rdd/RDD.scala | 24 ++++++++++++-------
.../scala/org/apache/spark/ui/UIUtils.scala | 9 +++++--
.../scala/org/apache/spark/util/Utils.scala | 12 ++++++++--
.../org/apache/spark/graphx/GraphLoader.scala | 2 +-
.../spark/graphx/lib/TriangleCount.scala | 2 +-
.../classification/LogisticRegression.scala | 19 +++++++++------
.../spark/ml/clustering/BisectingKMeans.scala | 4 ++--
.../spark/ml/clustering/GaussianMixture.scala | 2 +-
.../org/apache/spark/ml/clustering/LDA.scala | 6 ++---
.../apache/spark/ml/recommendation/ALS.scala | 4 ++--
.../mllib/clustering/BisectingKMeans.scala | 14 +++++------
.../mllib/clustering/GaussianMixture.scala | 2 +-
.../apache/spark/mllib/clustering/LDA.scala | 20 ++++++++--------
.../spark/mllib/clustering/LDAModel.scala | 2 +-
.../mllib/optimization/GradientDescent.scala | 6 ++---
.../spark/mllib/optimization/LBFGS.scala | 14 +++++++++--
.../spark/mllib/optimization/NNLS.scala | 5 +++-
.../spark/mllib/optimization/Updater.scala | 6 ++---
.../apache/spark/mllib/tree/model/Split.scala | 2 +-
.../main/scala/org/apache/spark/sql/Row.scala | 2 +-
.../apache/spark/sql/types/DecimalType.scala | 3 ++-
.../apache/spark/sql/jdbc/JdbcDialects.scala | 11 ++++++---
.../hive/execution/InsertIntoHiveTable.scala | 12 ++++++++--
24 files changed, 123 insertions(+), 68 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
index a20d264be5afd..94e26e687c66b 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
@@ -103,7 +103,8 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
* @param withReplacement can elements be sampled multiple times (replaced when sampled out)
* @param fraction expected size of the sample as a fraction of this RDD's size
* without replacement: probability that each element is chosen; fraction must be [0, 1]
- * with replacement: expected number of times each element is chosen; fraction must be >= 0
+ * with replacement: expected number of times each element is chosen; fraction must be greater
+ * than or equal to 0
*
* @note This is NOT guaranteed to provide exactly the fraction of the count
* of the given `RDD`.
@@ -117,7 +118,8 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
* @param withReplacement can elements be sampled multiple times (replaced when sampled out)
* @param fraction expected size of the sample as a fraction of this RDD's size
* without replacement: probability that each element is chosen; fraction must be [0, 1]
- * with replacement: expected number of times each element is chosen; fraction must be >= 0
+ * with replacement: expected number of times each element is chosen; fraction must be greater
+ * than or equal to 0
* @param seed seed for the random number generator
*
* @note This is NOT guaranteed to provide exactly the fraction of the count
@@ -167,7 +169,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
* Return an RDD with the elements from `this` that are not in `other`.
*
* Uses `this` partitioner/partition size, because even if `other` is huge, the resulting
- * RDD will be <= us.
+ * RDD will be no larger than this one.
*/
def subtract(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.subtract(other))
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 8e673447581cf..faac5cb4dbb77 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -469,7 +469,8 @@ abstract class RDD[T: ClassTag](
* @param withReplacement can elements be sampled multiple times (replaced when sampled out)
* @param fraction expected size of the sample as a fraction of this RDD's size
* without replacement: probability that each element is chosen; fraction must be [0, 1]
- * with replacement: expected number of times each element is chosen; fraction must be >= 0
+ * with replacement: expected number of times each element is chosen; fraction must be greater
+ * than or equal to 0
* @param seed seed for the random number generator
*
* @note This is NOT guaranteed to provide exactly the fraction of the count
@@ -750,8 +751,10 @@ abstract class RDD[T: ClassTag](
* print line function (like out.println()) as the 2nd parameter.
* An example of pipe the RDD data of groupBy() in a streaming way,
* instead of constructing a huge String to concat all the elements:
- * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) =
- * for (e <- record._2) {f(e)}
+ * {{{
+ * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) =
+ * for (e <- record._2) {f(e)}
+ * }}}
* @param separateWorkingDir Use separate working directories for each task.
* @param bufferSize Buffer size for the stdin writer for the piped process.
* @param encoding Char encoding used for interacting (via stdin, stdout and stderr) with
@@ -1184,8 +1187,13 @@ abstract class RDD[T: ClassTag](
*
* @note This method should only be used if the resulting map is expected to be small, as
* the whole thing is loaded into the driver's memory.
- * To handle very large results, consider using rdd.map(x => (x, 1L)).reduceByKey(_ + _), which
- * returns an RDD[T, Long] instead of a map.
+ * To handle very large results, consider using
+ *
+ * {{{
+ * rdd.map(x => (x, 1L)).reduceByKey(_ + _)
+ * }}},
+ *
+ * which returns an RDD[(T, Long)] instead of a map.
*/
def countByValue()(implicit ord: Ordering[T] = null): Map[T, Long] = withScope {
map(value => (value, null)).countByKey()
@@ -1223,9 +1231,9 @@ abstract class RDD[T: ClassTag](
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* here.
*
- * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p`
- * would trigger sparse representation of registers, which may reduce the memory consumption
- * and increase accuracy when the cardinality is small.
+ * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp` (greater
+ * than `p`) would trigger sparse representation of registers, which may reduce the memory
+ * consumption and increase accuracy when the cardinality is small.
*
* @param p The precision value for the normal set.
* `p` must be a value between 4 and `sp` if `sp` is not zero (32 max).
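For the countByValue note above, a brief sketch of the scalable alternative, assuming an existing SparkContext `sc`:

{{{
val rdd = sc.parallelize(Seq("a", "b", "a"))
rdd.countByValue()                         // small results only: Map("a" -> 2, "b" -> 1)
rdd.map(x => (x, 1L)).reduceByKey(_ + _)   // large results: stays distributed as RDD[(String, Long)]
}}}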
diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index dbeb970c81dfe..d31f956886014 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -422,8 +422,13 @@ private[spark] object UIUtils extends Logging {
* the whole string will rendered as a simple escaped text.
*
* Note: In terms of security, only anchor tags with root relative links are supported. So any
- * attempts to embed links outside Spark UI, or other tags like <script> will cause in
- * the whole description to be treated as plain text.
+ * attempts to embed links outside Spark UI, or other tags like
+ *
+ * {{{
+ *