From aa2dcb805d1e498a182b5fbdc500b299dd722efb Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 25 Nov 2016 19:15:48 +0900 Subject: [PATCH 01/17] Fix errors (first round) --- .../apache/spark/ml/feature/Bucketizer.scala | 2 +- .../spark/ml/feature/CountVectorizer.scala | 4 +- .../apache/spark/ml/feature/HashingTF.scala | 2 +- .../org/apache/spark/ml/feature/NGram.scala | 2 +- .../apache/spark/ml/feature/Normalizer.scala | 2 +- .../spark/ml/feature/OneHotEncoder.scala | 4 +- .../org/apache/spark/ml/feature/PCA.scala | 4 +- .../ml/feature/PolynomialExpansion.scala | 2 +- .../ml/feature/QuantileDiscretizer.scala | 6 +-- .../spark/ml/feature/SQLTransformer.scala | 2 +- .../spark/ml/feature/StopWordsRemover.scala | 2 +- .../spark/ml/feature/StringIndexer.scala | 8 ++-- .../apache/spark/ml/feature/Tokenizer.scala | 2 +- .../spark/ml/feature/VectorIndexer.scala | 8 ++-- .../spark/ml/feature/VectorSlicer.scala | 4 +- .../apache/spark/ml/feature/package-info.java | 4 +- .../ml/regression/AFTSurvivalRegression.scala | 2 +- .../ml/regression/DecisionTreeRegressor.scala | 3 +- .../spark/ml/regression/GBTRegressor.scala | 2 +- .../GeneralizedLinearRegression.scala | 22 +++++----- .../ml/regression/IsotonicRegression.scala | 4 +- .../ml/regression/LinearRegression.scala | 30 +++++++------- .../ml/regression/RandomForestRegressor.scala | 2 +- .../apache/spark/ml/util/MetadataUtils.scala | 2 +- .../org/apache/spark/ml/util/ReadWrite.scala | 6 +-- .../apache/spark/mllib/clustering/LDA.scala | 4 +- .../spark/mllib/clustering/LDAModel.scala | 2 +- .../spark/mllib/clustering/LDAOptimizer.scala | 4 +- .../tree/configuration/BoostingStrategy.scala | 10 ++--- .../mllib/tree/configuration/Strategy.scala | 8 ++-- .../scala/org/apache/spark/sql/Column.scala | 40 +++++++++---------- .../spark/sql/DataFrameNaFunctions.scala | 36 ++++++++--------- .../org/apache/spark/sql/SQLContext.scala | 6 +-- 33 files changed, 121 insertions(+), 120 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 1143f0f565ebd..546643d8f91f7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -44,7 +44,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String /** * Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets. * A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which - * also includes y. Splits should be of length >= 3 and strictly increasing. + * also includes y. Splits should be of length >= 3 and strictly increasing. * Values at -inf, inf must be explicitly provided to cover all Double values; * otherwise, values outside the splits specified will be treated as errors. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 6299f74a6bf96..cc0924fd95b08 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -53,7 +53,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** * Specifies the minimum number of different documents a term must appear in to be included * in the vocabulary. 
- * If this is an integer >= 1, this specifies the number of documents the term must appear in; + * If this is an integer >= 1, this specifies the number of documents the term must appear in; * if this is a double in [0,1), then this specifies the fraction of documents. * * Default: 1.0 @@ -78,7 +78,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** * Filter to ignore rare words in a document. For each document, terms with * frequency/count less than the given threshold are ignored. - * If this is an integer >= 1, then this specifies a count (of times the term must appear + * If this is an integer >= 1, then this specifies a count (of times the term must appear * in the document); * if this is a double in [0,1), then this specifies a fraction (out of the document's token * count). diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index a8792a35ff4ae..8f60ec8788fac 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -52,7 +52,7 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Number of features. Should be > 0. + * Number of features. Should be > 0. * (default = 2^18^) * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index 4463aea0097e2..c424aaa1f5634 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -41,7 +41,7 @@ class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String) def this() = this(Identifiable.randomUID("ngram")) /** - * Minimum n-gram length, >= 1. + * Minimum n-gram length, >= 1. * Default: 2, bigram features * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala index eb0690058013f..629702051d426 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala @@ -37,7 +37,7 @@ class Normalizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) def this() = this(Identifiable.randomUID("normalizer")) /** - * Normalization in L^p^ space. Must be >= 1. + * Normalization in L^p^ space. Must be >= 1. * (default: p = 2) * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index ea401216aec7b..ba1380bdda451 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -33,14 +33,14 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} * at most a single one-value per row that indicates the input category index. * For example with 5 categories, an input value of 2.0 would map to an output vector of * `[0.0, 0.0, 1.0, 0.0]`. - * The last category is not included by default (configurable via [[OneHotEncoder!.dropLast]] + * The last category is not included by default (configurable via `OneHotEncoder!.dropLast` * because it makes the vector entries sum up to one, and hence linearly dependent. 
* So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. * * @note This is different from scikit-learn's OneHotEncoder, which keeps all categories. * The output vectors are sparse. * - * @see [[StringIndexer]] for converting categorical values into category indices + * @see `StringIndexer` for converting categorical values into category indices */ @Since("1.4.0") class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 6e08bf059124c..4143d864d7930 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -63,7 +63,7 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC } /** - * PCA trains a model to project vectors to a lower dimensional space of the top [[PCA!.k]] + * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` * principal components. */ @Since("1.5.0") @@ -144,7 +144,7 @@ class PCAModel private[ml] ( * Transform a vector by computed Principal Components. * * @note Vectors to be transformed must be the same length as the source vectors given - * to [[PCA.fit()]]. + * to `PCA.fit()`. */ @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 4be17da3e9f76..74526a8260a0d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -45,7 +45,7 @@ class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: Str def this() = this(Identifiable.randomUID("poly")) /** - * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion. + * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion. * Default: 2 * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index b9e01dde70d85..a1cceef51d30a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -35,7 +35,7 @@ private[feature] trait QuantileDiscretizerBase extends Params /** * Number of buckets (quantiles, or categories) into which data points are grouped. Must - * be >= 2. + * be >= 2. * * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values. * @@ -52,7 +52,7 @@ private[feature] trait QuantileDiscretizerBase extends Params /** * Relative error (see documentation for - * [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]] for description) + * `org.apache.spark.sql.DataFrameStatFunctions.approxQuantile` for description) * Must be in the range [0, 1]. * default: 0.001 * @group param @@ -99,7 +99,7 @@ private[feature] trait QuantileDiscretizerBase extends Params * but NaNs will be counted in a special bucket[4]. 
* * Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for - * [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]] + * `org.apache.spark.sql.DataFrameStatFunctions.approxQuantile` * for a detailed description). The precision of the approximation can be controlled with the * `relativeError` parameter. The lower and upper bin bounds will be `-Infinity` and `+Infinity`, * covering all real values. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala index b25fff973c441..c4398bebf9be6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.types.StructType * use Spark SQL built-in function and UDFs to operate on these selected columns. * For example, [[SQLTransformer]] supports statements like: * - SELECT a, a + b AS a_b FROM __THIS__ - * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 + * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 * - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b */ @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index a55816249c74b..3fcd84c029e61 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -52,7 +52,7 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String /** * The words to be filtered out. * Default: English stop words - * @see [[StopWordsRemover.loadDefaultStopWords()]] + * @see `StopWordsRemover.loadDefaultStopWords()` * @group param */ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 8b155f00017cf..0a4d31d1654e7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -60,7 +60,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha * The indices are in [0, numLabels), ordered by label frequencies. * So the most frequent label gets index 0. * - * @see [[IndexToString]] for the inverse transformation + * @see `IndexToString` for the inverse transformation */ @Since("1.4.0") class StringIndexer @Since("1.4.0") ( @@ -116,7 +116,7 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] { * @param labels Ordered list of labels, corresponding to indices to be assigned. * * @note During transformation, if the input column does not exist, - * [[StringIndexerModel.transform]] would return the input dataset unmodified. + * `StringIndexerModel.transform` would return the input dataset unmodified. * This is a temporary fix for the case when target labels do not exist during prediction. */ @Since("1.4.0") @@ -247,12 +247,12 @@ object StringIndexerModel extends MLReadable[StringIndexerModel] { } /** - * A [[Transformer]] that maps a column of indices back to a new column of corresponding + * A `Transformer` that maps a column of indices back to a new column of corresponding * string values. 
* The index-string mapping is either from the ML attributes of the input column, * or from user-supplied labels (which take precedence over ML attributes). * - * @see [[StringIndexer]] for converting strings into indices + * @see `StringIndexer` for converting strings into indices */ @Since("1.5.0") class IndexToString private[ml] (@Since("1.5.0") override val uid: String) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 45d8fa94a8f8f..bf2b1d7c0f777 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -70,7 +70,7 @@ class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) def this() = this(Identifiable.randomUID("regexTok")) /** - * Minimum token length, >= 0. + * Minimum token length, >= 0. * Default: 1, to avoid returning empty strings * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index d1a5c2e82581e..0ae9f264f4a8a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -41,8 +41,8 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu /** * Threshold for the number of values a categorical feature can take. - * If a feature is found to have > maxCategories values, then it is declared continuous. - * Must be >= 2. + * If a feature is found to have > maxCategories values, then it is declared continuous. + * Must be >= 2. * * (default = 20) * @group param @@ -59,7 +59,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu } /** - * Class for indexing categorical feature columns in a dataset of [[Vector]]. + * Class for indexing categorical feature columns in a dataset of `Vector`. * * This has 2 usage modes: * - Automatically identify categorical features (default behavior) @@ -76,7 +76,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu * - Warning: This can cause problems if features are continuous since this will collect ALL * unique values to the driver. * - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}. - * If maxCategories >= 3, then both features will be declared categorical. + * If maxCategories >= 3, then both features will be declared categorical. * * This returns a model which can transform categorical features to use 0-based indices. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala index 966ccb85d0e0e..e3e462d07e10c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala @@ -32,8 +32,8 @@ import org.apache.spark.sql.types.StructType * This class takes a feature vector and outputs a new feature vector with a subarray of the * original features. * - * The subset of features can be specified with either indices ([[setIndices()]]) - * or names ([[setNames()]]). At least one feature must be selected. Duplicate features + * The subset of features can be specified with either indices (`setIndices()`) + * or names (`setNames()`). At least one feature must be selected. 
Duplicate features * are not allowed, so there can be no overlap between selected indices and names. * * The output vector will order features with the selected indices first (in the order given), diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java index dcff4245d1d26..ce7f335056872 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java @@ -61,12 +61,12 @@ * createStructField("id", IntegerType, false), * createStructField("text", StringType, false), * createStructField("rating", DoubleType, false))); - * JavaRDD rowRDD = jsc.parallelize( + * JavaRDD<Row> rowRDD = jsc.parallelize( * Arrays.asList( * RowFactory.create(0, "Hi I heard about Spark", 3.0), * RowFactory.create(1, "I wish Java could use case classes", 4.0), * RowFactory.create(2, "Logistic regression models are neat", 4.0))); - * Dataset dataset = jsql.createDataFrame(rowRDD, schema); + * Dataset<Row> dataset = jsql.createDataFrame(rowRDD, schema); * // define feature transformers * RegexTokenizer tok = new RegexTokenizer() * .setInputCol("text") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index d6ad1ea6d1096..ede859db19d5f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -185,7 +185,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S setDefault(tol -> 1E-6) /** - * Suggested depth for treeAggregate (>= 2). + * Suggested depth for treeAggregate (>= 2). * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 894b6a2ca2041..0b0c46144bfbe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -132,7 +132,8 @@ object DecisionTreeRegressor extends DefaultParamsReadable[DecisionTreeRegressor } /** - * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] model for regression. + * + * Decision tree (Wikipedia) model for regression. * It supports both continuous and categorical features. * @param rootNode Root of the decision tree */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index ed2d05525d611..b4d5603bdbb20 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -223,7 +223,7 @@ class GBTRegressionModel private[ml]( * (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) * and follows the implementation from scikit-learn. 
* - * @see [[DecisionTreeRegressionModel.featureImportances]] + * @see `DecisionTreeRegressionModel.featureImportances` */ @Since("2.0.0") lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 1201ecd5e4e61..440eacd13fd32 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -131,10 +131,10 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * It supports "gaussian", "binomial", "poisson" and "gamma" as family. * Valid link functions for each family is listed below. The first link function of each family * is the default one. - * - "gaussian" -> "identity", "log", "inverse" - * - "binomial" -> "logit", "probit", "cloglog" - * - "poisson" -> "log", "identity", "sqrt" - * - "gamma" -> "inverse", "identity", "log" + * - "gaussian" -> "identity", "log", "inverse" + * - "binomial" -> "logit", "probit", "cloglog" + * - "poisson" -> "log", "identity", "sqrt" + * - "gamma" -> "inverse", "identity", "log" */ @Experimental @Since("2.0.0") @@ -1066,7 +1066,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( import GeneralizedLinearRegression._ /** - * Whether the underlying [[WeightedLeastSquares]] using the "normal" solver. + * Whether the underlying `WeightedLeastSquares` using the "normal" solver. */ private[ml] val isNormalSolver: Boolean = { diagInvAtWA.length != 1 || diagInvAtWA(0) != 0 @@ -1074,10 +1074,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( /** * Standard error of estimated coefficients and intercept. - * This value is only available when the underlying [[WeightedLeastSquares]] + * This value is only available when the underlying `WeightedLeastSquares` * using the "normal" solver. * - * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, + * If `GeneralizedLinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. */ @Since("2.0.0") @@ -1092,10 +1092,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( /** * T-statistic of estimated coefficients and intercept. - * This value is only available when the underlying [[WeightedLeastSquares]] + * This value is only available when the underlying `WeightedLeastSquares` * using the "normal" solver. * - * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, + * If `GeneralizedLinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. */ @Since("2.0.0") @@ -1115,10 +1115,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( /** * Two-sided p-value of estimated coefficients and intercept. - * This value is only available when the underlying [[WeightedLeastSquares]] + * This value is only available when the underlying `WeightedLeastSquares` * using the "normal" solver. * - * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, + * If `GeneralizedLinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. 
*/ @Since("2.0.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index 4d274f3a5bbf1..c378a99e3c230 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -56,7 +56,7 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures final def getIsotonic: Boolean = $(isotonic) /** - * Param for the index of the feature if [[featuresCol]] is a vector column (default: `0`), no + * Param for the index of the feature if `featuresCol` is a vector column (default: `0`), no * effect otherwise. * @group param */ @@ -194,7 +194,7 @@ object IsotonicRegression extends DefaultParamsReadable[IsotonicRegression] { * Model fitted by IsotonicRegression. * Predicts using a piecewise linear function. * - * For detailed rules see [[org.apache.spark.mllib.regression.IsotonicRegressionModel.predict()]]. + * For detailed rules see `org.apache.spark.mllib.regression.IsotonicRegressionModel.predict()`. * * @param oldModel A [[org.apache.spark.mllib.regression.IsotonicRegressionModel]] * model trained by [[org.apache.spark.mllib.regression.IsotonicRegression]]. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index eb4e38cc83c19..95c6625920ec0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -119,7 +119,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String /** * Set the ElasticNet mixing parameter. * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. + * For 0 < alpha < 1, the penalty is a combination of L1 and L2. * Default is 0.0 which is an L2 penalty. * * @group setParam @@ -165,7 +165,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String * - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton * optimization method. * - "normal" denotes using Normal Equation as an analytical solution to the linear regression - * problem. This solver is limited to [[LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER]]. + * problem. This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`. * - "auto" (default) means that the solver algorithm is selected automatically. * The Normal Equations solver will be used when possible, but this will automatically fall * back to iterative optimization methods when needed. @@ -181,7 +181,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String setDefault(solver -> "auto") /** - * Suggested depth for treeAggregate (>= 2). + * Suggested depth for treeAggregate (>= 2). * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. @@ -338,12 +338,12 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String /* Note that in Linear Regression, the objective history (loss + regularization) returned from optimizer is computed in the scaled space given by the following formula. -
+
        $$
        L &= 1/2n||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2
            + regTerms \\
        $$
-
+
*/ val arrayBuilder = mutable.ArrayBuilder.make[Double] var state: optimizer.State = null @@ -414,7 +414,7 @@ object LinearRegression extends DefaultParamsReadable[LinearRegression] { override def load(path: String): LinearRegression = super.load(path) /** - * When using [[LinearRegression.solver]] == "normal", the solver must limit the number of + * When using `LinearRegression.solver` == "normal", the solver must limit the number of * features to at most this number. The entire covariance matrix X^T^X will be collected * to the driver. This limit helps prevent memory overflow errors. */ @@ -584,7 +584,7 @@ class LinearRegressionTrainingSummary private[regression] ( * * This value is only available when using the "l-bfgs" solver. * - * @see [[LinearRegression.solver]] + * @see `LinearRegression.solver` */ @Since("1.5.0") val totalIterations = objectiveHistory.length @@ -627,7 +627,7 @@ class LinearRegressionSummary private[regression] ( * Reference: * Wikipedia explain variation * - * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -637,7 +637,7 @@ class LinearRegressionSummary private[regression] ( * Returns the mean absolute error, which is a risk function corresponding to the * expected value of the absolute error loss or l1-norm loss. * - * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -647,7 +647,7 @@ class LinearRegressionSummary private[regression] ( * Returns the mean squared error, which is a risk function corresponding to the * expected value of the squared error loss or quadratic loss. * - * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -657,7 +657,7 @@ class LinearRegressionSummary private[regression] ( * Returns the root mean squared error, which is defined as the square root of * the mean squared error. * - * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -668,7 +668,7 @@ class LinearRegressionSummary private[regression] ( * Reference: * Wikipedia coefficient of determination * - * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * @note This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. * This will change in later Spark versions. */ @Since("1.5.0") @@ -714,7 +714,7 @@ class LinearRegressionSummary private[regression] ( * Standard error of estimated coefficients and intercept. * This value is only available when using the "normal" solver. * - * If [[LinearRegression.fitIntercept]] is set to true, + * If `LinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. * * @see [[LinearRegression.solver]] @@ -742,7 +742,7 @@ class LinearRegressionSummary private[regression] ( * T-statistic of estimated coefficients and intercept. 
* This value is only available when using the "normal" solver. * - * If [[LinearRegression.fitIntercept]] is set to true, + * If `LinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. * * @see [[LinearRegression.solver]] @@ -765,7 +765,7 @@ class LinearRegressionSummary private[regression] ( * Two-sided p-value of estimated coefficients and intercept. * This value is only available when using the "normal" solver. * - * If [[LinearRegression.fitIntercept]] is set to true, + * If `LinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. * * @see [[LinearRegression.solver]] diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index d60f05eed58d9..59798730e7c04 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -208,7 +208,7 @@ class RandomForestRegressionModel private[ml] ( * (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) * and follows the implementation from scikit-learn. * - * @see [[DecisionTreeRegressionModel.featureImportances]] + * @see `DecisionTreeRegressionModel.featureImportances` */ @Since("1.5.0") lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala index f34a8310ddf1c..5e081cce0651e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala @@ -48,7 +48,7 @@ private[spark] object MetadataUtils { * If a feature does not have metadata, it is assumed to be continuous. * If a feature is Nominal, then it must have the number of values * specified. - * @return Map: feature index --> number of categories. + * @return Map: feature index --> number of categories. * The map's set of keys will be the set of categorical feature indices. */ def getCategoricalFeatures(featuresSchema: StructField): Map[Int, Int] = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 5b7e5ec75c842..343a70c5d7a46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -76,7 +76,7 @@ private[util] sealed trait BaseReadWrite { */ protected final def sqlContext: SQLContext = sparkSession.sqlContext - /** Returns the underlying [[SparkContext]]. */ + /** Returns the underlying `SparkContext`. */ protected final def sc: SparkContext = sparkSession.sparkContext } @@ -169,7 +169,7 @@ trait MLWritable { * This only handles simple [[org.apache.spark.ml.param.Param]] types; e.g., it will not handle * [[org.apache.spark.sql.Dataset]]. * - * @see [[DefaultParamsReadable]], the counterpart to this trait + * @see `DefaultParamsReadable`, the counterpart to this trait */ @DeveloperApi trait DefaultParamsWritable extends MLWritable { self: Params => @@ -238,7 +238,7 @@ trait MLReadable[T] { * [[org.apache.spark.sql.Dataset]]. 
* * @tparam T ML instance type - * @see [[DefaultParamsWritable]], the counterpart to this trait + * @see `DefaultParamsWritable`, the counterpart to this trait */ @DeveloperApi trait DefaultParamsReadable[T] extends MLReadable[T] { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index 16742bd284e69..63a39c1ce0274 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -261,7 +261,7 @@ class LDA private ( def getCheckpointInterval: Int = checkpointInterval /** - * Parameter for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that + * Parameter for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that * the cache will get checkpointed every 10 iterations. Checkpointing helps with recovery * (when nodes fail). It also helps with eliminating temporary shuffle files on disk, which can be * important when LDA is run for many iterations. If the checkpoint directory is not set in @@ -340,7 +340,7 @@ class LDA private ( } /** - * Java-friendly version of [[run()]] + * Java-friendly version of `run()` */ @Since("1.3.0") def run(documents: JavaPairRDD[java.lang.Long, Vector]): LDAModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 017fbc6feb0d7..436c8bf299dc6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -392,7 +392,7 @@ class LocalLDAModel private[spark] ( * literature). Returns a vector of zeros for an empty document. * * Note this means to allow quick query for single document. For batch documents, please refer - * to [[topicDistributions()]] to avoid overhead. + * to `topicDistributions()` to avoid overhead. * * @param document document to predict topic mixture distributions for * @return topic mixture distribution for the document diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 9687fc8804e89..c65fce4ef11ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -350,9 +350,9 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * Mini-batch fraction in (0, 1], which sets the fraction of document sampled and used in * each iteration. * - * @note This should be adjusted in synch with [[LDA.setMaxIterations()]] + * @note This should be adjusted in synch with `LDA.setMaxIterations()` * so the entire corpus is used. Specifically, set both so that - * maxIterations * miniBatchFraction >= 1. + * maxIterations * miniBatchFraction >= 1. * * Default: 0.05, i.e., 5% of total documents. 
*/ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index d8405d13ce904..8c7222815ea7a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -36,14 +36,14 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, Loss, SquaredError} * @param validationTol validationTol is a condition which decides iteration termination when * runWithValidation is used. * The end of iteration is decided based on below logic: - * If the current loss on the validation set is > 0.01, the diff + * If the current loss on the validation set is > 0.01, the diff * of validation error is compared to relative tolerance which is * validationTol * (current loss on the validation set). - * If the current loss on the validation set is <= 0.01, the diff + * If the current loss on the validation set is <= 0.01, the diff * of validation error is compared to absolute tolerance which is * validationTol * 0.01. * Ignored when - * [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used. + * `org.apache.spark.mllib.tree.GradientBoostedTrees.run()` is used. */ @Since("1.2.0") case class BoostingStrategy @Since("1.4.0") ( @@ -92,8 +92,8 @@ object BoostingStrategy { /** * Returns default configuration for the boosting algorithm * @param algo Learning goal. Supported: - * [[org.apache.spark.mllib.tree.configuration.Algo.Classification]], - * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]] + * `org.apache.spark.mllib.tree.configuration.Algo.Classification`, + * `org.apache.spark.mllib.tree.configuration.Algo.Regression` * @return Configuration for boosting algorithm */ @Since("1.3.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index b34e1b1b56c43..b4c1e45596d51 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -28,8 +28,8 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance} /** * Stores all the configuration options for tree construction * @param algo Learning goal. Supported: - * [[org.apache.spark.mllib.tree.configuration.Algo.Classification]], - * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]] + * `org.apache.spark.mllib.tree.configuration.Algo.Classification`, + * `org.apache.spark.mllib.tree.configuration.Algo.Regression` * @param impurity Criterion used for information gain calculation. * Supported for Classification: [[org.apache.spark.mllib.tree.impurity.Gini]], * [[org.apache.spark.mllib.tree.impurity.Entropy]]. @@ -43,9 +43,9 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance} * for choosing how to split on features at each node. * More bins give higher granularity. * @param quantileCalculationStrategy Algorithm for calculating quantiles. Supported: - * [[org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort]] + * `org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort` * @param categoricalFeaturesInfo A map storing information about the categorical variables and the - * number of discrete values they take. An entry (n -> k) + * number of discrete values they take. 
An entry (n -> k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param minInstancesPerNode Minimum number of instances each child must have after split. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index fa3b2b9de5d5d..e99d7865bda91 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -97,7 +97,7 @@ class TypedColumn[-T, U]( } /** - * A column that will be computed based on the data in a [[DataFrame]]. + * A column that will be computed based on the data in a `DataFrame`. * * A new column is constructed based on the input columns present in a dataframe: * @@ -801,7 +801,7 @@ class Column(val expr: Expression) extends Logging { /** * An expression that gets an item at position `ordinal` out of an array, - * or gets a value by key `key` in a [[MapType]]. + * or gets a value by key `key` in a `MapType`. * * @group expr_ops * @since 1.3.0 @@ -809,7 +809,7 @@ class Column(val expr: Expression) extends Logging { def getItem(key: Any): Column = withExpr { UnresolvedExtractValue(expr, Literal(key)) } /** - * An expression that gets a field by name in a [[StructType]]. + * An expression that gets a field by name in a `StructType`. * * @group expr_ops * @since 1.3.0 @@ -1195,92 +1195,92 @@ class Column(val expr: Expression) extends Logging { class ColumnName(name: String) extends Column(name) { /** - * Creates a new [[StructField]] of type boolean. + * Creates a new `StructField` of type boolean. * @since 1.3.0 */ def boolean: StructField = StructField(name, BooleanType) /** - * Creates a new [[StructField]] of type byte. + * Creates a new `StructField` of type byte. * @since 1.3.0 */ def byte: StructField = StructField(name, ByteType) /** - * Creates a new [[StructField]] of type short. + * Creates a new `StructField` of type short. * @since 1.3.0 */ def short: StructField = StructField(name, ShortType) /** - * Creates a new [[StructField]] of type int. + * Creates a new `StructField` of type int. * @since 1.3.0 */ def int: StructField = StructField(name, IntegerType) /** - * Creates a new [[StructField]] of type long. + * Creates a new `StructField` of type long. * @since 1.3.0 */ def long: StructField = StructField(name, LongType) /** - * Creates a new [[StructField]] of type float. + * Creates a new `StructField` of type float. * @since 1.3.0 */ def float: StructField = StructField(name, FloatType) /** - * Creates a new [[StructField]] of type double. + * Creates a new `StructField` of type double. * @since 1.3.0 */ def double: StructField = StructField(name, DoubleType) /** - * Creates a new [[StructField]] of type string. + * Creates a new `StructField` of type string. * @since 1.3.0 */ def string: StructField = StructField(name, StringType) /** - * Creates a new [[StructField]] of type date. + * Creates a new `StructField` of type date. * @since 1.3.0 */ def date: StructField = StructField(name, DateType) /** - * Creates a new [[StructField]] of type decimal. + * Creates a new `StructField` of type decimal. * @since 1.3.0 */ def decimal: StructField = StructField(name, DecimalType.USER_DEFAULT) /** - * Creates a new [[StructField]] of type decimal. + * Creates a new `StructField` of type decimal. * @since 1.3.0 */ def decimal(precision: Int, scale: Int): StructField = StructField(name, DecimalType(precision, scale)) /** - * Creates a new [[StructField]] of type timestamp. 
+ * Creates a new `StructField` of type timestamp. * @since 1.3.0 */ def timestamp: StructField = StructField(name, TimestampType) /** - * Creates a new [[StructField]] of type binary. + * Creates a new `StructField` of type binary. * @since 1.3.0 */ def binary: StructField = StructField(name, BinaryType) /** - * Creates a new [[StructField]] of type array. + * Creates a new `StructField` of type array. * @since 1.3.0 */ def array(dataType: DataType): StructField = StructField(name, ArrayType(dataType)) /** - * Creates a new [[StructField]] of type map. + * Creates a new `StructField` of type map. * @since 1.3.0 */ def map(keyType: DataType, valueType: DataType): StructField = @@ -1289,13 +1289,13 @@ class ColumnName(name: String) extends Column(name) { def map(mapType: MapType): StructField = StructField(name, mapType) /** - * Creates a new [[StructField]] of type struct. + * Creates a new `StructField` of type struct. * @since 1.3.0 */ def struct(fields: StructField*): StructField = struct(StructType(fields)) /** - * Creates a new [[StructField]] of type struct. + * Creates a new `StructField` of type struct. * @since 1.3.0 */ def struct(structType: StructType): StructField = StructField(name, structType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index 0d43f09bc54cd..184c5a11298d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types._ /** - * Functionality for working with missing data in [[DataFrame]]s. + * Functionality for working with missing data in `DataFrame`s. * * @since 1.3.1 */ @@ -36,14 +36,14 @@ import org.apache.spark.sql.types._ final class DataFrameNaFunctions private[sql](df: DataFrame) { /** - * Returns a new [[DataFrame]] that drops rows containing any null or NaN values. + * Returns a new `DataFrame` that drops rows containing any null or NaN values. * * @since 1.3.1 */ def drop(): DataFrame = drop("any", df.columns) /** - * Returns a new [[DataFrame]] that drops rows containing null or NaN values. + * Returns a new `DataFrame` that drops rows containing null or NaN values. * * If `how` is "any", then drop rows containing any null or NaN values. * If `how` is "all", then drop rows only if every column is null or NaN for that row. @@ -53,7 +53,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(how: String): DataFrame = drop(how, df.columns) /** - * Returns a new [[DataFrame]] that drops rows containing any null or NaN values + * Returns a new `DataFrame` that drops rows containing any null or NaN values * in the specified columns. * * @since 1.3.1 @@ -61,7 +61,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(cols: Array[String]): DataFrame = drop(cols.toSeq) /** - * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing any null or NaN values + * (Scala-specific) Returns a new `DataFrame` that drops rows containing any null or NaN values * in the specified columns. * * @since 1.3.1 @@ -69,7 +69,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(cols: Seq[String]): DataFrame = drop(cols.size, cols) /** - * Returns a new [[DataFrame]] that drops rows containing null or NaN values + * Returns a new `DataFrame` that drops rows containing null or NaN values * in the specified columns. 
* * If `how` is "any", then drop rows containing any null or NaN values in the specified columns. @@ -80,7 +80,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(how: String, cols: Array[String]): DataFrame = drop(how, cols.toSeq) /** - * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing null or NaN values + * (Scala-specific) Returns a new `DataFrame` that drops rows containing null or NaN values * in the specified columns. * * If `how` is "any", then drop rows containing any null or NaN values in the specified columns. @@ -97,7 +97,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { } /** - * Returns a new [[DataFrame]] that drops rows containing + * Returns a new `DataFrame` that drops rows containing * less than `minNonNulls` non-null and non-NaN values. * * @since 1.3.1 @@ -105,7 +105,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(minNonNulls: Int): DataFrame = drop(minNonNulls, df.columns) /** - * Returns a new [[DataFrame]] that drops rows containing + * Returns a new `DataFrame` that drops rows containing * less than `minNonNulls` non-null and non-NaN values in the specified columns. * * @since 1.3.1 @@ -113,7 +113,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def drop(minNonNulls: Int, cols: Array[String]): DataFrame = drop(minNonNulls, cols.toSeq) /** - * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing less than + * (Scala-specific) Returns a new `DataFrame` that drops rows containing less than * `minNonNulls` non-null and non-NaN values in the specified columns. * * @since 1.3.1 @@ -126,21 +126,21 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { } /** - * Returns a new [[DataFrame]] that replaces null or NaN values in numeric columns with `value`. + * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`. * * @since 1.3.1 */ def fill(value: Double): DataFrame = fill(value, df.columns) /** - * Returns a new [[DataFrame]] that replaces null values in string columns with `value`. + * Returns a new `DataFrame` that replaces null values in string columns with `value`. * * @since 1.3.1 */ def fill(value: String): DataFrame = fill(value, df.columns) /** - * Returns a new [[DataFrame]] that replaces null or NaN values in specified numeric columns. + * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. * If a specified column is not a numeric column, it is ignored. * * @since 1.3.1 @@ -148,7 +148,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def fill(value: Double, cols: Array[String]): DataFrame = fill(value, cols.toSeq) /** - * (Scala-specific) Returns a new [[DataFrame]] that replaces null or NaN values in specified + * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified * numeric columns. If a specified column is not a numeric column, it is ignored. * * @since 1.3.1 @@ -167,7 +167,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { } /** - * Returns a new [[DataFrame]] that replaces null values in specified string columns. + * Returns a new `DataFrame` that replaces null values in specified string columns. * If a specified column is not a string column, it is ignored. 
* * @since 1.3.1 @@ -175,7 +175,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def fill(value: String, cols: Array[String]): DataFrame = fill(value, cols.toSeq) /** - * (Scala-specific) Returns a new [[DataFrame]] that replaces null values in + * (Scala-specific) Returns a new `DataFrame` that replaces null values in * specified string columns. If a specified column is not a string column, it is ignored. * * @since 1.3.1 @@ -194,7 +194,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { } /** - * Returns a new [[DataFrame]] that replaces null values. + * Returns a new `DataFrame` that replaces null values. * * The key of the map is the column name, and the value of the map is the replacement value. * The value must be of the following type: @@ -213,7 +213,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { def fill(valueMap: java.util.Map[String, Any]): DataFrame = fill0(valueMap.asScala.toSeq) /** - * (Scala-specific) Returns a new [[DataFrame]] that replaces null values. + * (Scala-specific) Returns a new `DataFrame` that replaces null values. * * The key of the map is the column name, and the value of the map is the replacement value. * The value must be of the following type: `Int`, `Long`, `Float`, `Double`, `String`, `Boolean`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 858fa4c7609b6..a7bc7c68270f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -84,7 +84,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Returns a [[SQLContext]] as new session, with separated SQL configurations, temporary - * tables, registered functions, but sharing the same [[SparkContext]], cached data and + * tables, registered functions, but sharing the same `SparkContext`, cached data and * other things. * * @since 1.6.0 @@ -883,8 +883,8 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Loads an JavaRDD storing JSON objects (one object per record) and applies the given - * schema, returning the result as a `DataFrame`. + * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the + * given schema, returning the result as a `DataFrame`. * * @group specificdata * @deprecated As of 1.4.0, replaced by `read().json()`. 
From aa5acbb016483c88caad0a57d6481a8fee93e1c3 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 00:08:06 +0900 Subject: [PATCH 02/17] Fix errors (second round) --- .../ml/regression/LinearRegression.scala | 10 ++--- .../apache/spark/mllib/clustering/LDA.scala | 2 +- .../apache/spark/sql/DataFrameReader.scala | 43 +++++++++---------- .../spark/sql/DataFrameStatFunctions.scala | 20 ++++----- .../apache/spark/sql/DataFrameWriter.scala | 30 ++++++------- .../scala/org/apache/spark/sql/Dataset.scala | 38 ++++++++-------- .../org/apache/spark/sql/ForeachWriter.scala | 3 +- .../org/apache/spark/sql/functions.scala | 20 ++++----- 8 files changed, 82 insertions(+), 84 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 95c6625920ec0..556e48a604ea7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -60,11 +60,11 @@ private[regression] trait LinearRegressionParams extends PredictorParams * The learning objective is to minimize the squared error, with regularization. * The specific squared error loss function used is: * - *
+ *
  *    $$
  *    L = 1/2n ||A coefficients - y||^2^
  *    $$
- *
+ *
* * This supports multiple types of regularization: * - none (a.k.a. ordinary least squares) @@ -717,7 +717,7 @@ class LinearRegressionSummary private[regression] ( * If `LinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. * - * @see [[LinearRegression.solver]] + * @see `LinearRegression.solver` */ lazy val coefficientStandardErrors: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { @@ -745,7 +745,7 @@ class LinearRegressionSummary private[regression] ( * If `LinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. * - * @see [[LinearRegression.solver]] + * @see `LinearRegression.solver` */ lazy val tValues: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { @@ -768,7 +768,7 @@ class LinearRegressionSummary private[regression] ( * If `LinearRegression.fitIntercept` is set to true, * then the last element returned corresponds to the intercept. * - * @see [[LinearRegression.solver]] + * @see `LinearRegression.solver` */ lazy val pValues: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index 63a39c1ce0274..14dfd3af54a71 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -216,7 +216,7 @@ class LDA private ( def getBeta: Double = getTopicConcentration /** - * Alias for [[setTopicConcentration()]] + * Alias for `setTopicConcentration()` */ @Since("1.3.0") def setBeta(beta: Double): this.type = setTopicConcentration(beta) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 5be9a99369997..1af2f9afea5eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.types.StructType /** * Interface used to load a [[Dataset]] from external storage systems (e.g. file systems, - * key-value stores, etc). Use [[SparkSession.read]] to access this. + * key-value stores, etc). Use `SparkSession.read` to access this. * * @since 1.4.0 */ @@ -116,7 +116,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads input in as a [[DataFrame]], for data sources that don't require a path (e.g. external + * Loads input in as a `DataFrame`, for data sources that don't require a path (e.g. external * key-value stores). * * @since 1.4.0 @@ -126,7 +126,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by + * Loads input in as a `DataFrame`, for data sources that require a path (e.g. data backed by * a local or distributed file system). * * @since 1.4.0 @@ -136,7 +136,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads input in as a [[DataFrame]], for data sources that support multiple paths. + * Loads input in as a `DataFrame`, for data sources that support multiple paths. * Only works if the source is a HadoopFsRelationProvider. 
* * @since 1.6.0 @@ -153,7 +153,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. * * @since 1.4.0 @@ -163,7 +163,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table. Partitions of the table will be retrieved in parallel based on the parameters * passed to this function. * @@ -198,10 +198,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table using connection properties. The `predicates` parameter gives a list * expressions suitable for inclusion in WHERE clauses; each one defines one partition - * of the [[DataFrame]]. + * of the `DataFrame`. * * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash * your external database systems. @@ -240,7 +240,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { /** * Loads a JSON file (JSON Lines text format or - * newline-delimited JSON) and returns the result as a [[DataFrame]]. + * newline-delimited JSON) and returns the result as a `DataFrame`. * See the documentation on the overloaded `json()` method with varargs for more details. * * @since 1.4.0 @@ -252,7 +252,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { /** * Loads a JSON file (JSON Lines text format or - * newline-delimited JSON) and returns the result as a [[DataFrame]]. + * newline-delimited JSON) and returns the result as a `DataFrame`. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. @@ -299,7 +299,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { /** * Loads a `JavaRDD[String]` storing JSON objects (JSON * Lines text format or newline-delimited JSON) and returns the result as - * a [[DataFrame]]. + * a `DataFrame`. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. @@ -311,7 +311,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { /** * Loads an `RDD[String]` storing JSON objects (JSON Lines - * text format or newline-delimited JSON) and returns the result as a [[DataFrame]]. + * text format or newline-delimited JSON) and returns the result as a `DataFrame`. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. @@ -341,7 +341,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a CSV file and returns the result as a [[DataFrame]]. See the documentation on the + * Loads a CSV file and returns the result as a `DataFrame`. See the documentation on the * other overloaded `csv()` method for more details. 
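A short sketch of the `csv()` reader just described, assuming a hypothetical file path; `inferSchema` is discussed below and `header` is another commonly used CSV option:

{{{
val people = spark.read
  .option("header", "true")       // first line contains column names
  .option("inferSchema", "true")  // scan the data once to guess column types
  .csv("/data/people.csv")
}}}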
* * @since 2.0.0 @@ -352,7 +352,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a CSV file and returns the result as a [[DataFrame]]. + * Loads a CSV file and returns the result as a `DataFrame`. * * This function will go through the input once to determine the input schema if `inferSchema` * is enabled. To avoid going through the entire data once, disable `inferSchema` option or @@ -392,7 +392,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { *
  • `timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that * indicates a timestamp format. Custom date formats follow the formats at * `java.text.SimpleDateFormat`. This applies to timestamp type.
  • - * `java.sql.Timestamp.valueOf()` and `java.sql.Date.valueOf()` or ISO 8601 format. *
  • `maxColumns` (default `20480`): defines a hard limit of how many columns * a record can have.
  • `maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed @@ -415,7 +414,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def csv(paths: String*): DataFrame = format("csv").load(paths : _*) /** - * Loads a Parquet file, returning the result as a [[DataFrame]]. See the documentation + * Loads a Parquet file, returning the result as a `DataFrame`. See the documentation * on the other overloaded `parquet()` method for more details. * * @since 2.0.0 @@ -426,7 +425,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a Parquet file, returning the result as a [[DataFrame]]. + * Loads a Parquet file, returning the result as a `DataFrame`. * * You can set the following Parquet-specific option(s) for reading Parquet files: *
      @@ -442,7 +441,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads an ORC file and returns the result as a [[DataFrame]]. + * Loads an ORC file and returns the result as a `DataFrame`. * * @param path input path * @since 1.5.0 @@ -454,7 +453,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads an ORC file and returns the result as a [[DataFrame]]. + * Loads an ORC file and returns the result as a `DataFrame`. * * @param paths input paths * @since 2.0.0 @@ -464,7 +463,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def orc(paths: String*): DataFrame = format("orc").load(paths: _*) /** - * Returns the specified table as a [[DataFrame]]. + * Returns the specified table as a `DataFrame`. * * @since 1.4.0 */ @@ -475,7 +474,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named + * Loads text files and returns a `DataFrame` whose schema starts with a string column named * "value", and followed by partitioned columns if there are any. See the documentation on * the other overloaded `text()` method for more details. * @@ -487,7 +486,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads text files and returns a [[DataFrame]] whose schema starts with a string column named + * Loads text files and returns a `DataFrame` whose schema starts with a string column named * "value", and followed by partitioned columns if there are any. * * Each line in the text files is a new row in the resulting DataFrame. For example: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index a9a861c4635b2..f48ddd54d3650 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -44,7 +44,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * of `x` is close to (p * N). * More precisely, * - * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). + * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). * * This method implements a variation of the Greenwald-Khanna algorithm (with some speed * optimizations). @@ -55,7 +55,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (>= 0). + * @param relativeError The relative target precision to achieve (>= 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. * @return the approximate quantiles at the given probabilities @@ -254,7 +254,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * and Papadimitriou. * * This function is meant for exploratory data analysis, as we make no guarantee about the - * backward compatibility of the schema of the resulting [[DataFrame]]. + * backward compatibility of the schema of the resulting `DataFrame`. * * @param cols the names of the columns to search frequent items in. 
* @return A Local DataFrame with the Array of frequent items for each column. @@ -299,7 +299,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * Uses a `default` support of 1%. * * This function is meant for exploratory data analysis, as we make no guarantee about the - * backward compatibility of the schema of the resulting [[DataFrame]]. + * backward compatibility of the schema of the resulting `DataFrame`. * * @param cols the names of the columns to search frequent items in. * @return A Local DataFrame with the Array of frequent items for each column. @@ -317,7 +317,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * its fraction as zero. * @param seed random seed * @tparam T stratum type - * @return a new [[DataFrame]] that represents the stratified sample + * @return a new `DataFrame` that represents the stratified sample * * {{{ * val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), @@ -354,7 +354,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * its fraction as zero. * @param seed random seed * @tparam T stratum type - * @return a new [[DataFrame]] that represents the stratified sample + * @return a new `DataFrame` that represents the stratified sample * * @since 1.5.0 */ @@ -369,7 +369,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param depth depth of the sketch * @param width width of the sketch * @param seed random seed - * @return a [[CountMinSketch]] over column `colName` + * @return a `CountMinSketch` over column `colName` * @since 2.0.0 */ def countMinSketch(colName: String, depth: Int, width: Int, seed: Int): CountMinSketch = { @@ -383,7 +383,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param eps relative error of the sketch * @param confidence confidence of the sketch * @param seed random seed - * @return a [[CountMinSketch]] over column `colName` + * @return a `CountMinSketch` over column `colName` * @since 2.0.0 */ def countMinSketch( @@ -398,7 +398,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param depth depth of the sketch * @param width width of the sketch * @param seed random seed - * @return a [[CountMinSketch]] over column `colName` + * @return a `CountMinSketch` over column `colName` * @since 2.0.0 */ def countMinSketch(col: Column, depth: Int, width: Int, seed: Int): CountMinSketch = { @@ -412,7 +412,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param eps relative error of the sketch * @param confidence confidence of the sketch * @param seed random seed - * @return a [[CountMinSketch]] over column `colName` + * @return a `CountMinSketch` over column `colName` * @since 2.0.0 */ def countMinSketch(col: Column, eps: Double, confidence: Double, seed: Int): CountMinSketch = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 2d863422fbabe..fc699095ad719 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types.StructType /** * Interface used to write a [[Dataset]] to external storage systems (e.g. file systems, - * key-value stores, etc). Use [[Dataset.write]] to access this. + * key-value stores, etc). Use `Dataset.write` to access this. 
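As a rough illustration of the writer interface described above (an existing Dataset `df` and the output path are assumed):

{{{
df.write
  .format("parquet")
  .mode("overwrite")   // replace any existing data at the target path
  .save("/tmp/output/people")
}}}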
* * @since 1.4.0 */ @@ -189,7 +189,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] at the specified path. + * Saves the content of the `DataFrame` at the specified path. * * @since 1.4.0 */ @@ -199,7 +199,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] as the specified table. + * Saves the content of the `DataFrame` as the specified table. * * @since 1.4.0 */ @@ -215,8 +215,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { dataSource.write(mode, df) } /** - * Inserts the content of the [[DataFrame]] to the specified table. It requires that - * the schema of the [[DataFrame]] is the same as the schema of the table. + * Inserts the content of the `DataFrame` to the specified table. It requires that + * the schema of the `DataFrame` is the same as the schema of the table. * * @note Unlike `saveAsTable`, `insertInto` ignores the column names and just uses position-based * resolution. For example: @@ -322,15 +322,15 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] as the specified table. + * Saves the content of the `DataFrame` as the specified table. * * In the case the table already exists, behavior of this function depends on the * save mode, specified by the `mode` function (default to throwing an exception). - * When `mode` is `Overwrite`, the schema of the [[DataFrame]] does not need to be + * When `mode` is `Overwrite`, the schema of the `DataFrame` does not need to be * the same as that of the existing table. * * When `mode` is `Append`, if there is an existing table, we will use the format and options of - * the existing table. The column order in the schema of the [[DataFrame]] doesn't need to be same + * the existing table. The column order in the schema of the `DataFrame` doesn't need to be same * as that of the existing table. Unlike `insertInto`, `saveAsTable` will use the column names to * find the correct column positions. For example: * @@ -346,7 +346,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * +---+---+ * }}} * - * When the DataFrame is created from a non-partitioned [[HadoopFsRelation]] with a single input + * When the DataFrame is created from a non-partitioned `HadoopFsRelation` with a single input * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC * and Parquet), the table is persisted in a Hive compatible format, which means other systems * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL @@ -401,7 +401,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] to an external database table via JDBC. In the case the + * Saves the content of the `DataFrame` to an external database table via JDBC. In the case the * table already exists in the external database, behavior of this function depends on the * save mode, specified by the `mode` function (default to throwing an exception). * @@ -442,7 +442,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in JSON format ( + * Saves the content of the `DataFrame` in JSON format ( * JSON Lines text format or newline-delimited JSON) at the specified path. 
* This is equivalent to: * {{{ @@ -469,7 +469,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in Parquet format at the specified path. + * Saves the content of the `DataFrame` in Parquet format at the specified path. * This is equivalent to: * {{{ * format("parquet").save(path) @@ -490,7 +490,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in ORC format at the specified path. + * Saves the content of the `DataFrame` in ORC format at the specified path. * This is equivalent to: * {{{ * format("orc").save(path) @@ -511,7 +511,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in a text file at the specified path. + * Saves the content of the `DataFrame` in a text file at the specified path. * The DataFrame must have only one column that is of string type. * Each row becomes a new line in the output file. For example: * {{{ @@ -536,7 +536,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in CSV format at the specified path. + * Saves the content of the `DataFrame` in CSV format at the specified path. * This is equivalent to: * {{{ * format("csv").save(path) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7ba6ffce278cf..127a31a756cba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -472,8 +472,8 @@ class Dataset[T] private[sql]( /** * Returns true if this Dataset contains one or more sources that continuously * return data as it arrives. A Dataset that reads data from a streaming source - * must be executed as a [[StreamingQuery]] using the `start()` method in - * [[DataStreamWriter]]. Methods that return a single answer, e.g. `count()` or + * must be executed as a `StreamingQuery` using the `start()` method in + * `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or * `collect()`, will throw an [[AnalysisException]] when there is a streaming * source present. * @@ -685,7 +685,7 @@ class Dataset[T] private[sql]( def stat: DataFrameStatFunctions = new DataFrameStatFunctions(toDF()) /** - * Join with another [[DataFrame]]. + * Join with another `DataFrame`. * * Behaves as an INNER JOIN and requires a subsequent join predicate. * @@ -699,7 +699,7 @@ class Dataset[T] private[sql]( } /** - * Inner equi-join with another [[DataFrame]] using the given column. + * Inner equi-join with another `DataFrame` using the given column. * * Different from other join functions, the join column will only appear once in the output, * i.e. similar to SQL's `JOIN USING` syntax. @@ -713,7 +713,7 @@ class Dataset[T] private[sql]( * @param usingColumn Name of the column to join on. This column must exist on both sides. * * @note If you perform a self-join using this function without aliasing the input - * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * `DataFrame`s, you will NOT be able to reference any columns after the join, since * there is no way to disambiguate which side of the join you would like to reference. * * @group untypedrel @@ -724,7 +724,7 @@ class Dataset[T] private[sql]( } /** - * Inner equi-join with another [[DataFrame]] using the given columns. 
+ * Inner equi-join with another `DataFrame` using the given columns. * * Different from other join functions, the join columns will only appear once in the output, * i.e. similar to SQL's `JOIN USING` syntax. @@ -738,7 +738,7 @@ class Dataset[T] private[sql]( * @param usingColumns Names of the columns to join on. This columns must exist on both sides. * * @note If you perform a self-join using this function without aliasing the input - * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * `DataFrame`s, you will NOT be able to reference any columns after the join, since * there is no way to disambiguate which side of the join you would like to reference. * * @group untypedrel @@ -749,7 +749,7 @@ class Dataset[T] private[sql]( } /** - * Equi-join with another [[DataFrame]] using the given columns. + * Equi-join with another `DataFrame` using the given columns. * * Different from other join functions, the join columns will only appear once in the output, * i.e. similar to SQL's `JOIN USING` syntax. @@ -759,7 +759,7 @@ class Dataset[T] private[sql]( * @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `leftsemi`. * * @note If you perform a self-join using this function without aliasing the input - * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * `DataFrame`s, you will NOT be able to reference any columns after the join, since * there is no way to disambiguate which side of the join you would like to reference. * * @group untypedrel @@ -782,7 +782,7 @@ class Dataset[T] private[sql]( } /** - * Inner join with another [[DataFrame]], using the given join expression. + * Inner join with another `DataFrame`, using the given join expression. * * {{{ * // The following two are equivalent: @@ -796,7 +796,7 @@ class Dataset[T] private[sql]( def join(right: Dataset[_], joinExprs: Column): DataFrame = join(right, joinExprs, "inner") /** - * Join with another [[DataFrame]], using the given join expression. The following performs + * Join with another `DataFrame`, using the given join expression. The following performs * a full outer join between `df1` and `df2`. * * {{{ @@ -860,7 +860,7 @@ class Dataset[T] private[sql]( } /** - * Explicit cartesian join with another [[DataFrame]]. + * Explicit cartesian join with another `DataFrame`. * * @param right Right side of the join operation. * @@ -875,7 +875,7 @@ class Dataset[T] private[sql]( /** * :: Experimental :: - * Joins this Dataset returning a [[Tuple2]] for each pair where `condition` evaluates to + * Joins this Dataset returning a `Tuple2` for each pair where `condition` evaluates to * true. * * This is similar to the relation `join` function with one important difference in the @@ -956,7 +956,7 @@ class Dataset[T] private[sql]( /** * :: Experimental :: - * Using inner equi-join to join this Dataset returning a [[Tuple2]] for each pair + * Using inner equi-join to join this Dataset returning a `Tuple2` for each pair * where `condition` evaluates to true. * * @param other Right side of the join. @@ -2232,7 +2232,7 @@ class Dataset[T] private[sql]( } /** - * Returns a new [[DataFrame]] that contains the result of applying a serialized R function + * Returns a new `DataFrame` that contains the result of applying a serialized R function * `func` to each partition. */ private[sql] def mapPartitionsInR( @@ -2446,7 +2446,7 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset that has exactly `numPartitions` partitions. 
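A small sketch of the equi-join and join-expression variants documented above, assuming two hypothetical DataFrames `df1` and `df2` that share an `id` column:

{{{
// USING-style equi-join: "id" appears only once in the result.
val joined = df1.join(df2, Seq("id"))

// Join expression plus an explicit join type; both "id" columns are kept
// and must be referenced through their parent DataFrames.
val leftOuter = df1.join(df2, df1("id") === df2("id"), "left_outer")
}}}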
- * Similar to coalesce defined on an [[RDD]], this operation results in a narrow dependency, e.g. + * Similar to coalesce defined on an `RDD`, this operation results in a narrow dependency, e.g. * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of * the 100 new partitions will claim 10 of the current partitions. * @@ -2536,7 +2536,7 @@ class Dataset[T] private[sql]( def unpersist(): this.type = unpersist(blocking = false) /** - * Represents the content of the Dataset as an [[RDD]] of [[T]]. + * Represents the content of the Dataset as an `RDD` of [[T]]. * * @group basic * @since 1.6.0 @@ -2550,14 +2550,14 @@ class Dataset[T] private[sql]( } /** - * Returns the content of the Dataset as a [[JavaRDD]] of [[T]]s. + * Returns the content of the Dataset as a `JavaRDD` of [[T]]s. * @group basic * @since 1.6.0 */ def toJavaRDD: JavaRDD[T] = rdd.toJavaRDD() /** - * Returns the content of the Dataset as a [[JavaRDD]] of [[T]]s. + * Returns the content of the Dataset as a `JavaRDD` of [[T]]s. * @group basic * @since 1.6.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala index 1163035e315fc..b94ad59fa2f6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/ForeachWriter.scala @@ -18,11 +18,10 @@ package org.apache.spark.sql import org.apache.spark.annotation.{Experimental, InterfaceStability} -import org.apache.spark.sql.streaming.StreamingQuery /** * :: Experimental :: - * A class to consume data generated by a [[StreamingQuery]]. Typically this is used to send the + * A class to consume data generated by a `StreamingQuery`. Typically this is used to send the * generated data to external systems. Each partition will use a new deserialized instance, so you * usually should do all the initialization (e.g. opening a connection or initiating a transaction) * in the `open` method. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index d5940c638acdb..fbeebb9c2a5fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -474,7 +474,7 @@ object functions { /** * Aggregate function: returns the level of grouping, equals to * - * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) + * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) * * @note The list of columns should match with grouping columns exactly, or empty (means all the * grouping columns). @@ -487,7 +487,7 @@ object functions { /** * Aggregate function: returns the level of grouping, equals to * - * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) + * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) * * @note The list of columns should match with grouping columns exactly. * @@ -1048,9 +1048,9 @@ object functions { * within each partition in the lower 33 bits. The assumption is that the data frame has * less than 1 billion partitions, and each partition has less than 8 billion records. * - * As an example, consider a [[DataFrame]] with two partitions, each with 3 records. + * As an example, consider a `DataFrame` with two partitions, each with 3 records. * This expression would return the following IDs: - * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. 
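A tiny sketch of the generated-ID layout described above (a hypothetical DataFrame `df` is assumed): the record number occupies the lower 33 bits, so the first row of partition 1 receives 1L << 33 = 8589934592.

{{{
import org.apache.spark.sql.functions.monotonically_increasing_id

// Unique, monotonically increasing, but not consecutive row ids.
val withId = df.withColumn("row_id", monotonically_increasing_id())
}}}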
+ * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. * * @group normal_funcs * @since 1.4.0 @@ -1066,9 +1066,9 @@ object functions { * within each partition in the lower 33 bits. The assumption is that the data frame has * less than 1 billion partitions, and each partition has less than 8 billion records. * - * As an example, consider a [[DataFrame]] with two partitions, each with 3 records. + * As an example, consider a `DataFrame` with two partitions, each with 3 records. * This expression would return the following IDs: - * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. * * @group normal_funcs * @since 1.6.0 @@ -1184,7 +1184,7 @@ object functions { /** * Creates a new struct column. - * If the input column is a column in a [[DataFrame]], or a derived column expression + * If the input column is a column in a `DataFrame`, or a derived column expression * that is named (i.e. aliased), its name would be remained as the StructField's name, * otherwise, the newly generated StructField's name would be auto generated as col${index + 1}, * i.e. col1, col2, col3, ... @@ -1846,8 +1846,8 @@ object functions { def round(e: Column): Column = round(e, 0) /** - * Round the value of `e` to `scale` decimal places if `scale` >= 0 - * or at integral part when `scale` < 0. + * Round the value of `e` to `scale` decimal places if `scale` >= 0 + * or at integral part when `scale` < 0. * * @group math_funcs * @since 1.5.0 @@ -1864,7 +1864,7 @@ object functions { /** * Round the value of `e` to `scale` decimal places with HALF_EVEN round mode - * if `scale` >= 0 or at integral part when `scale` < 0. + * if `scale` >= 0 or at integral part when `scale` < 0. * * @group math_funcs * @since 2.0.0 From ff17c3a03f681947a6bc2729bd29fd14f07fe20d Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 00:38:32 +0900 Subject: [PATCH 03/17] Fix errors (third round) --- .../scala/org/apache/spark/Accumulator.scala | 2 +- .../scala/org/apache/spark/SparkConf.scala | 2 +- .../org/apache/spark/TaskEndReason.scala | 2 +- .../spark/scheduler/InputFormatInfo.scala | 2 +- .../spark/streaming/kafka/KafkaCluster.scala | 8 +-- .../spark/streaming/kafka/KafkaUtils.scala | 18 +++--- .../spark/streaming/kafka/OffsetRange.scala | 2 +- .../stat/test/KolmogorovSmirnovTest.scala | 3 +- .../spark/mllib/stat/test/StreamingTest.scala | 6 +- .../mllib/stat/test/StreamingTestMethod.scala | 4 +- .../spark/sql/InternalOutputModes.scala | 2 +- .../main/scala/org/apache/spark/sql/Row.scala | 2 +- .../spark/sql/DataFrameStatFunctions.scala | 6 +- .../scala/org/apache/spark/sql/Dataset.scala | 6 +- .../spark/sql/KeyValueGroupedDataset.scala | 8 +-- .../spark/sql/RelationalGroupedDataset.scala | 28 ++++----- .../org/apache/spark/sql/RuntimeConfig.scala | 4 +- .../org/apache/spark/sql/SparkSession.scala | 60 +++++++++---------- .../apache/spark/sql/UDFRegistration.scala | 2 +- .../org/apache/spark/sql/functions.scala | 16 ++--- 20 files changed, 92 insertions(+), 91 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index 9d1f1d59dbce1..1a45e15cdc728 100644 --- a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -26,7 +26,7 @@ package org.apache.spark * * An accumulator is created from an initial value `v` by calling * [[SparkContext#accumulator SparkContext.accumulator]]. 
- * Tasks running on the cluster can then add to it using the [[Accumulable#+= +=]] operator. + * Tasks running on the cluster can then add to it using the [[Accumulable.+=]] operator. * However, they cannot read its value. Only the driver program can read the accumulator's value, * using its [[#value]] method. * diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 04d657c09afd0..bc8010eca2e1b 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -262,7 +262,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a time parameter as seconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then seconds are assumed. - * @throws NoSuchElementException + * @note Throws `NoSuchElementException` */ def getTimeAsSeconds(key: String): Long = { Utils.timeStringAsSeconds(get(key)) diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 7ca3c103dbf5b..7745387dbceba 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -65,7 +65,7 @@ sealed trait TaskFailedReason extends TaskEndReason { /** * :: DeveloperApi :: - * A [[org.apache.spark.scheduler.ShuffleMapTask]] that completed successfully earlier, but we + * A `org.apache.spark.scheduler.ShuffleMapTask` that completed successfully earlier, but we * lost the executor before the stage completed. This means Spark needs to reschedule the task * to be re-executed on a different executor. */ diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index a6b032cc0084c..5f23d657e1155 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -153,7 +153,7 @@ object InputFormatInfo { a) For each host, count number of splits hosted on that host. b) Decrement the currently allocated containers on that host. - c) Compute rack info for each host and update rack -> count map based on (b). + c) Compute rack info for each host and update rack -> count map based on (b). d) Allocate nodes based on (c) e) On the allocation result, ensure that we don't allocate "too many" jobs on a single node (even if data locality on that is very high) : this is to prevent fragility of job if a diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala index 35acb7b09f12b..c419221aa607a 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala @@ -231,7 +231,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { // this 0 here indicates api version, in this case the original ZK backed api. private def defaultConsumerApiVersion: Short = 0 - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. 
*/ def getConsumerOffsets( groupId: String, topicAndPartitions: Set[TopicAndPartition] @@ -250,7 +250,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { } } - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ def getConsumerOffsetMetadata( groupId: String, topicAndPartitions: Set[TopicAndPartition] @@ -287,7 +287,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { Left(errs) } - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ def setConsumerOffsets( groupId: String, offsets: Map[TopicAndPartition, Long] @@ -305,7 +305,7 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { setConsumerOffsetMetadata(groupId, meta, consumerApiVersion) } - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ def setConsumerOffsetMetadata( groupId: String, metadata: Map[TopicAndPartition, OffsetAndMetadata] diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 56f0cb0b166a2..59f4e408569f6 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -47,7 +47,7 @@ object KafkaUtils { * @param ssc StreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) * @param groupId The group id for this consumer - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed * in its own thread * @param storageLevel Storage level to use for storing the received objects * (default: StorageLevel.MEMORY_AND_DISK_SER_2) @@ -72,7 +72,7 @@ object KafkaUtils { * @param ssc StreamingContext object * @param kafkaParams Map of kafka configuration parameters, * see http://kafka.apache.org/08/configuration.html - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed * in its own thread. * @param storageLevel Storage level to use for storing the received objects * @tparam K type of Kafka message key @@ -97,7 +97,7 @@ object KafkaUtils { * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) * @param groupId The group id for this consumer - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed * in its own thread * @return DStream of (Kafka message key, Kafka message value) */ @@ -115,7 +115,7 @@ object KafkaUtils { * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..). * @param groupId The group id for this consumer. - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name -> numPartitions) to consume. 
Each partition is consumed * in its own thread. * @param storageLevel RDD storage level. * @return DStream of (Kafka message key, Kafka message value) @@ -140,7 +140,7 @@ object KafkaUtils { * @param valueDecoderClass Type of kafka value decoder * @param kafkaParams Map of kafka configuration parameters, * see http://kafka.apache.org/08/configuration.html - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed * in its own thread * @param storageLevel RDD storage level. * @tparam K type of Kafka message key @@ -396,7 +396,7 @@ object KafkaUtils { * You can access the offsets used in each batch from the generated RDDs (see * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). * - Failure Recovery: To recover from driver failures, you have to enable checkpointing - * in the [[StreamingContext]]. The information on consumed offset can be + * in the `StreamingContext`. The information on consumed offset can be * recovered from the checkpoint. See the programming guide for details (constraints, etc.). * - End-to-end semantics: This stream ensures that every records is effectively received and * transformed exactly once, but gives no guarantees on whether the transformed data are @@ -448,7 +448,7 @@ object KafkaUtils { * You can access the offsets used in each batch from the generated RDDs (see * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). * - Failure Recovery: To recover from driver failures, you have to enable checkpointing - * in the [[StreamingContext]]. The information on consumed offset can be + * in the `StreamingContext`. The information on consumed offset can be * recovered from the checkpoint. See the programming guide for details (constraints, etc.). * - End-to-end semantics: This stream ensures that every records is effectively received and * transformed exactly once, but gives no guarantees on whether the transformed data are @@ -499,7 +499,7 @@ object KafkaUtils { * You can access the offsets used in each batch from the generated RDDs (see * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). * - Failure Recovery: To recover from driver failures, you have to enable checkpointing - * in the [[StreamingContext]]. The information on consumed offset can be + * in the `StreamingContext`. The information on consumed offset can be * recovered from the checkpoint. See the programming guide for details (constraints, etc.). * - End-to-end semantics: This stream ensures that every records is effectively received and * transformed exactly once, but gives no guarantees on whether the transformed data are @@ -565,7 +565,7 @@ object KafkaUtils { * You can access the offsets used in each batch from the generated RDDs (see * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). * - Failure Recovery: To recover from driver failures, you have to enable checkpointing - * in the [[StreamingContext]]. The information on consumed offset can be + * in the `StreamingContext`. The information on consumed offset can be * recovered from the checkpoint. See the programming guide for details (constraints, etc.). 
* - End-to-end semantics: This stream ensures that every records is effectively received and * transformed exactly once, but gives no guarantees on whether the transformed data are diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala index d9b856e4697a0..10d364f987405 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala @@ -22,7 +22,7 @@ import kafka.common.TopicAndPartition /** * Represents any object that has a collection of [[OffsetRange]]s. This can be used to access the * offset ranges in RDDs generated by the direct Kafka DStream (see - * [[KafkaUtils.createDirectStream()]]). + * `KafkaUtils.createDirectStream()`). * {{{ * KafkaUtils.createDirectStream(...).foreachRDD { rdd => * val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala index a8b5955a7285d..d17f7047c5b2b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala @@ -31,7 +31,8 @@ import org.apache.spark.rdd.RDD * distribution of the sample data and the theoretical distribution we can provide a test for the * the null hypothesis that the sample data comes from that theoretical distribution. * For more information on KS Test: - * @see [[https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test]] + * @see + * Kolmogorov-Smirnov test (Wikipedia) * * Implementation note: We seek to implement the KS test with a minimal number of distributed * passes. We sort the RDD, and then perform the following operations on a per-partition basis: diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala index 97c032de7a813..d680237bf687f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala @@ -47,7 +47,7 @@ case class BinarySample @Since("1.6.0") ( * of the observation. * * To address novelty affects, the `peacePeriod` specifies a set number of initial - * [[org.apache.spark.rdd.RDD]] batches of the [[DStream]] to be dropped from significance testing. + * [[org.apache.spark.rdd.RDD]] batches of the `DStream` to be dropped from significance testing. * * The `windowSize` sets the number of batches each significance test is to be performed over. The * window is sliding with a stride length of 1 batch. Setting windowSize to 0 will perform @@ -97,7 +97,7 @@ class StreamingTest @Since("1.6.0") () extends Logging with Serializable { } /** - * Register a [[DStream]] of values for significance testing. + * Register a `DStream` of values for significance testing. * * @param data stream of BinarySample(key,value) pairs where the key denotes group membership * (true = experiment, false = control) and the value is the numerical metric to @@ -114,7 +114,7 @@ class StreamingTest @Since("1.6.0") () extends Logging with Serializable { } /** - * Register a [[JavaDStream]] of values for significance testing. 
+ * Register a `JavaDStream` of values for significance testing. * * @param data stream of BinarySample(isExperiment,value) pairs where the isExperiment denotes * group (true = experiment, false = control) and the value is the numerical metric diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala index ff27f28459e26..14ac14d6d61f4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala @@ -73,7 +73,7 @@ private[stat] sealed trait StreamingTestMethod extends Serializable { * This test does not assume equal variance between the two samples and does not assume equal * sample size. * - * @see http://en.wikipedia.org/wiki/Welch%27s_t_test + * @see Welch's t-test (Wikipedia) */ private[stat] object WelchTTest extends StreamingTestMethod with Logging { @@ -115,7 +115,7 @@ private[stat] object WelchTTest extends StreamingTestMethod with Logging { * mean. This test assumes equal variance between the two samples and does not assume equal sample * size. For unequal variances, Welch's t-test should be used instead. * - * @see http://en.wikipedia.org/wiki/Student%27s_t-test + * @see Student's t-test (Wikipedia) */ private[stat] object StudentTTest extends StreamingTestMethod with Logging { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala index 153f9f57faf42..594c41c2c7446 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/InternalOutputModes.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import org.apache.spark.sql.streaming.OutputMode /** - * Internal helper class to generate objects representing various [[OutputMode]]s, + * Internal helper class to generate objects representing various `OutputMode`s, */ private[sql] object InternalOutputModes { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index a821d2ca34579..71c5151c74478 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -74,7 +74,7 @@ object Row { * It is invalid to use the native primitive interface to retrieve a value that is null, instead a * user must check `isNullAt` before attempting to retrieve a value that might be null. * - * To create a new Row, use [[RowFactory.create()]] in Java or [[Row.apply()]] in Scala. + * To create a new Row, use `RowFactory.create()` in Java or `Row.apply()` in Scala. * * A [[Row]] object can be constructed by providing field values. Example: * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index f48ddd54d3650..f27ca9aeb9235 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.sketch.{BloomFilter, CountMinSketch} /** - * Statistic functions for [[DataFrame]]s. + * Statistic functions for `DataFrame`s. 
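A brief sketch of the statistic functions gathered in this class, with hypothetical column names on an existing DataFrame `df`:

{{{
// Approximate quartiles of a numeric column with 1% relative error.
val quartiles = df.stat.approxQuantile("age", Array(0.25, 0.5, 0.75), 0.01)

// Frequent items in two columns, using a minimum support of 0.4.
val freq = df.stat.freqItems(Seq("a", "b"), 0.4)
}}}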
* * @since 1.4.0 */ @@ -189,7 +189,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * The `support` should be greater than 1e-4. * * This function is meant for exploratory data analysis, as we make no guarantee about the - * backward compatibility of the schema of the resulting [[DataFrame]]. + * backward compatibility of the schema of the resulting `DataFrame`. * * @param cols the names of the columns to search frequent items in. * @param support The minimum frequency for an item to be considered `frequent`. Should be greater @@ -236,7 +236,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * Uses a `default` support of 1%. * * This function is meant for exploratory data analysis, as we make no guarantee about the - * backward compatibility of the schema of the resulting [[DataFrame]]. + * backward compatibility of the schema of the resulting `DataFrame`. * * @param cols the names of the columns to search frequent items in. * @return A Local DataFrame with the Array of frequent items for each column. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 127a31a756cba..fcc02e5eb3ef9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -68,7 +68,7 @@ private[sql] object Dataset { /** * A Dataset is a strongly typed collection of domain-specific objects that can be transformed * in parallel using functional or relational operations. Each Dataset also has an untyped view - * called a [[DataFrame]], which is a Dataset of [[Row]]. + * called a `DataFrame`, which is a Dataset of [[Row]]. * * Operations available on Datasets are divided into transformations and actions. Transformations * are the ones that produce new Datasets, and actions are the ones that trigger computation and @@ -363,7 +363,7 @@ class Dataset[T] private[sql]( * - When `U` is a tuple, the columns will be be mapped by ordinal (i.e. the first column will * be assigned to `_1`). * - When `U` is a primitive type (i.e. String, Int, etc), then the first column of the - * [[DataFrame]] will be used. + * `DataFrame` will be used. * * If the schema of the Dataset does not match the desired `U` type, you can use `select` * along with `alias` or `as` to rearrange or rename as required. @@ -377,7 +377,7 @@ class Dataset[T] private[sql]( /** * Converts this strongly typed collection of data to generic `DataFrame` with columns renamed. - * This can be quite convenient in conversion from an RDD of tuples into a [[DataFrame]] with + * This can be quite convenient in conversion from an RDD of tuples into a `DataFrame` with * meaningful names. For example: * {{{ * val rdd: RDD[(Int, String)] = ... diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 31ce8eb25e808..395d709f26591 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -131,7 +131,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( * This function does not support partial aggregation, and as a result requires shuffling all * the data in the [[Dataset]]. If an application intends to perform an aggregation over each * key, it is best to use the reduce function or an - * [[org.apache.spark.sql.expressions#Aggregator Aggregator]]. 
+ * `org.apache.spark.sql.expressions#Aggregator`. * * Internally, the implementation will spill to disk if any given group is too large to fit into * memory. However, users must take care to avoid materializing the whole iterator for a group @@ -160,7 +160,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( * This function does not support partial aggregation, and as a result requires shuffling all * the data in the [[Dataset]]. If an application intends to perform an aggregation over each * key, it is best to use the reduce function or an - * [[org.apache.spark.sql.expressions#Aggregator Aggregator]]. + * `org.apache.spark.sql.expressions#Aggregator`. * * Internally, the implementation will spill to disk if any given group is too large to fit into * memory. However, users must take care to avoid materializing the whole iterator for a group @@ -182,7 +182,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( * This function does not support partial aggregation, and as a result requires shuffling all * the data in the [[Dataset]]. If an application intends to perform an aggregation over each * key, it is best to use the reduce function or an - * [[org.apache.spark.sql.expressions#Aggregator Aggregator]]. + * `org.apache.spark.sql.expressions#Aggregator`. * * Internally, the implementation will spill to disk if any given group is too large to fit into * memory. However, users must take care to avoid materializing the whole iterator for a group @@ -205,7 +205,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( * This function does not support partial aggregation, and as a result requires shuffling all * the data in the [[Dataset]]. If an application intends to perform an aggregation over each * key, it is best to use the reduce function or an - * [[org.apache.spark.sql.expressions#Aggregator Aggregator]]. + * `org.apache.spark.sql.expressions#Aggregator`. * * Internally, the implementation will spill to disk if any given group is too large to fit into * memory. However, users must take care to avoid materializing the whole iterator for a group diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index f019d1e9daceb..0b1e191a1cd99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -129,7 +129,7 @@ class RelationalGroupedDataset protected[sql]( /** * (Scala-specific) Compute aggregates by specifying the column names and - * aggregate methods. The resulting [[DataFrame]] will also contain the grouping columns. + * aggregate methods. The resulting `DataFrame` will also contain the grouping columns. * * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. * {{{ @@ -150,7 +150,7 @@ class RelationalGroupedDataset protected[sql]( /** * (Scala-specific) Compute aggregates by specifying a map from column name to - * aggregate methods. The resulting [[DataFrame]] will also contain the grouping columns. + * aggregate methods. The resulting `DataFrame` will also contain the grouping columns. * * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. * {{{ @@ -171,7 +171,7 @@ class RelationalGroupedDataset protected[sql]( /** * (Java-specific) Compute aggregates by specifying a map from column name to - * aggregate methods. The resulting [[DataFrame]] will also contain the grouping columns. + * aggregate methods. 
The resulting `DataFrame` will also contain the grouping columns. * * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. * {{{ @@ -228,7 +228,7 @@ class RelationalGroupedDataset protected[sql]( /** * Count the number of rows for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. + * The resulting `DataFrame` will also contain the grouping columns. * * @since 1.3.0 */ @@ -236,7 +236,7 @@ class RelationalGroupedDataset protected[sql]( /** * Compute the average value for each numeric columns for each group. This is an alias for `avg`. - * The resulting [[DataFrame]] will also contain the grouping columns. + * The resulting `DataFrame` will also contain the grouping columns. * When specified columns are given, only compute the average values for them. * * @since 1.3.0 @@ -248,7 +248,7 @@ class RelationalGroupedDataset protected[sql]( /** * Compute the max value for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. + * The resulting `DataFrame` will also contain the grouping columns. * When specified columns are given, only compute the max values for them. * * @since 1.3.0 @@ -260,7 +260,7 @@ class RelationalGroupedDataset protected[sql]( /** * Compute the mean value for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. + * The resulting `DataFrame` will also contain the grouping columns. * When specified columns are given, only compute the mean values for them. * * @since 1.3.0 @@ -272,7 +272,7 @@ class RelationalGroupedDataset protected[sql]( /** * Compute the min value for each numeric column for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. + * The resulting `DataFrame` will also contain the grouping columns. * When specified columns are given, only compute the min values for them. * * @since 1.3.0 @@ -284,7 +284,7 @@ class RelationalGroupedDataset protected[sql]( /** * Compute the sum for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. + * The resulting `DataFrame` will also contain the grouping columns. * When specified columns are given, only compute the sum for them. * * @since 1.3.0 @@ -295,7 +295,7 @@ class RelationalGroupedDataset protected[sql]( } /** - * Pivots a column of the current [[DataFrame]] and perform the specified aggregation. + * Pivots a column of the current `DataFrame` and perform the specified aggregation. * There are two versions of pivot function: one that requires the caller to specify the list * of distinct values to pivot on, and one that does not. The latter is more concise but less * efficient, because Spark needs to first compute the list of distinct values internally. @@ -335,7 +335,7 @@ class RelationalGroupedDataset protected[sql]( } /** - * Pivots a column of the current [[DataFrame]] and perform the specified aggregation. + * Pivots a column of the current `DataFrame` and perform the specified aggregation. * There are two versions of pivot function: one that requires the caller to specify the list * of distinct values to pivot on, and one that does not. The latter is more concise but less * efficient, because Spark needs to first compute the list of distinct values internally. @@ -367,7 +367,7 @@ class RelationalGroupedDataset protected[sql]( } /** - * Pivots a column of the current [[DataFrame]] and perform the specified aggregation. 
+ * Pivots a column of the current `DataFrame` and perform the specified aggregation. * There are two versions of pivot function: one that requires the caller to specify the list * of distinct values to pivot on, and one that does not. The latter is more concise but less * efficient, because Spark needs to first compute the list of distinct values internally. @@ -392,12 +392,12 @@ class RelationalGroupedDataset protected[sql]( * Applies the given serialized R function `func` to each group of data. For each unique group, * the function will be passed the group key and an iterator that contains all of the elements in * the group. The function can return an iterator containing elements of an arbitrary type which - * will be returned as a new [[DataFrame]]. + * will be returned as a new `DataFrame`. * * This function does not support partial aggregation, and as a result requires shuffling all * the data in the [[Dataset]]. If an application intends to perform an aggregation over each * key, it is best to use the reduce function or an - * [[org.apache.spark.sql.expressions#Aggregator Aggregator]]. + * `org.apache.spark.sql.expressions#Aggregator`. * * Internally, the implementation will spill to disk if any given group is too large to fit into * memory. However, users must take care to avoid materializing the whole iterator for a group diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 9108d19d0a0c2..43684abc13629 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} /** - * Runtime configuration interface for Spark. To access this, use [[SparkSession.conf]]. + * Runtime configuration interface for Spark. To access this, use `SparkSession.conf`. * * Options set here are automatically propagated to the Hadoop configuration during I/O. * @@ -65,7 +65,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { /** * Returns the value of Spark runtime configuration property for the given key. * - * @throws NoSuchElementException if the key is not set and does not have a default value + * @note Throws `NoSuchElementException` if the key is not set and does not have a default value * @since 2.0.0 */ @throws[NoSuchElementException]("if the key is not set") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 71b1880dc0715..08d74ac0185b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -93,7 +93,7 @@ class SparkSession private( * ----------------------- */ /** - * State shared across sessions, including the [[SparkContext]], cached data, listener, + * State shared across sessions, including the `SparkContext`, cached data, listener, * and a catalog that interacts with external systems. */ @transient @@ -125,7 +125,7 @@ class SparkSession private( * * This is the interface through which the user can get and set all Spark and Hadoop * configurations that are relevant to Spark SQL. When getting the value of a config, - * this defaults to the value set in the underlying [[SparkContext]], if any. + * this defaults to the value set in the underlying `SparkContext`, if any. 
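A quick sketch of the runtime configuration interface referred to above; nothing here is part of the patch and the config key is only an example:

{{{
// Illustrative only; "spark.sql.shuffle.partitions" is just an example key.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", "64")
spark.conf.get("spark.sql.shuffle.partitions")     // "64"
spark.conf.get("some.unset.key", "fallback")       // supplying a default avoids the exception
// spark.conf.get("some.unset.key") throws NoSuchElementException, as the note above says.
}}}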
* * @since 2.0.0 */ @@ -189,8 +189,8 @@ class SparkSession private( /** * :: Experimental :: - * Returns a [[StreamingQueryManager]] that allows managing all the - * [[StreamingQuery StreamingQueries]] active on `this`. + * Returns a `StreamingQueryManager` that allows managing all the + * `StreamingQuery`s active on `this`. * * @since 2.0.0 */ @@ -200,9 +200,9 @@ class SparkSession private( /** * Start a new session with isolated SQL configurations, temporary tables, registered - * functions are isolated, but sharing the underlying [[SparkContext]] and cached data. + * functions are isolated, but sharing the underlying `SparkContext` and cached data. * - * @note Other than the [[SparkContext]], all shared state is initialized lazily. + * @note Other than the `SparkContext`, all shared state is initialized lazily. * This method will force the initialization of the shared state to ensure that parent * and child sessions are set up with the same shared state. If the underlying catalog * implementation is Hive, this will initialize the metastore, which may take some time. @@ -219,7 +219,7 @@ class SparkSession private( * --------------------------------- */ /** - * Returns a [[DataFrame]] with no rows or columns. + * Returns a `DataFrame` with no rows or columns. * * @since 2.0.0 */ @@ -243,7 +243,7 @@ class SparkSession private( /** * :: Experimental :: - * Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, tuples). + * Creates a `DataFrame` from an RDD of Product (e.g. case classes, tuples). * * @since 2.0.0 */ @@ -257,7 +257,7 @@ class SparkSession private( /** * :: Experimental :: - * Creates a [[DataFrame]] from a local Seq of Product. + * Creates a `DataFrame` from a local Seq of Product. * * @since 2.0.0 */ @@ -272,7 +272,7 @@ class SparkSession private( /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s using the given schema. + * Creates a `DataFrame` from an `RDD` containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. * Example: @@ -309,7 +309,7 @@ class SparkSession private( /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from a [[JavaRDD]] containing [[Row]]s using the given schema. + * Creates a `DataFrame` from a `JavaRDD` containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. * @@ -323,7 +323,7 @@ class SparkSession private( /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from a [[java.util.List]] containing [[Row]]s using the given schema. + * Creates a `DataFrame` from a [[java.util.List]] containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided List matches * the provided schema. Otherwise, there will be runtime exception. * @@ -381,7 +381,7 @@ class SparkSession private( } /** - * Convert a [[BaseRelation]] created for external data sources into a [[DataFrame]]. + * Convert a `BaseRelation` created for external data sources into a `DataFrame`. 
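For the `createDataFrame(rowRDD, schema)` variant documented above, a small self-contained sketch; the column names and values are made up, not taken from the patch:

{{{
// Illustrative only; every Row must match the schema or a runtime exception is thrown.
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val spark = SparkSession.builder().getOrCreate()
val schema = StructType(Seq(
  StructField("name", StringType, nullable = false),
  StructField("age", IntegerType, nullable = true)))
val rows = spark.sparkContext.parallelize(Seq(Row("alice", 29), Row("bob", 31)))
val people = spark.createDataFrame(rows, schema)
}}}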
* * @since 2.0.0 */ @@ -470,7 +470,7 @@ class SparkSession private( /** * :: Experimental :: - * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements * in a range from 0 to `end` (exclusive) with step value 1. * * @since 2.0.0 @@ -481,7 +481,7 @@ class SparkSession private( /** * :: Experimental :: - * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements * in a range from `start` to `end` (exclusive) with step value 1. * * @since 2.0.0 @@ -494,7 +494,7 @@ class SparkSession private( /** * :: Experimental :: - * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements * in a range from `start` to `end` (exclusive) with a step value. * * @since 2.0.0 @@ -507,7 +507,7 @@ class SparkSession private( /** * :: Experimental :: - * Creates a [[Dataset]] with a single [[LongType]] column named `id`, containing elements + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements * in a range from `start` to `end` (exclusive) with a step value, with partition number * specified. * @@ -520,7 +520,7 @@ class SparkSession private( } /** - * Creates a [[DataFrame]] from an RDD[Row]. + * Creates a `DataFrame` from an RDD[Row]. * User can specify whether the input rows should be converted to Catalyst rows. */ private[sql] def internalCreateDataFrame( @@ -533,7 +533,7 @@ class SparkSession private( } /** - * Creates a [[DataFrame]] from an RDD[Row]. + * Creates a `DataFrame` from an RDD[Row]. * User can specify whether the input rows should be converted to Catalyst rows. */ private[sql] def createDataFrame( @@ -566,7 +566,7 @@ class SparkSession private( @transient lazy val catalog: Catalog = new CatalogImpl(self) /** - * Returns the specified table as a [[DataFrame]]. + * Returns the specified table as a `DataFrame`. * * @since 2.0.0 */ @@ -583,7 +583,7 @@ class SparkSession private( * ----------------- */ /** - * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. + * Executes a SQL query using Spark, returning the result as a `DataFrame`. * The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect'. * * @since 2.0.0 @@ -594,7 +594,7 @@ class SparkSession private( /** * Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a - * [[DataFrame]]. + * `DataFrame`. * {{{ * sparkSession.read.parquet("/path/to/file.parquet") * sparkSession.read.schema(schema).json("/path/to/file.json") @@ -606,7 +606,7 @@ class SparkSession private( /** * :: Experimental :: - * Returns a [[DataStreamReader]] that can be used to read streaming data in as a [[DataFrame]]. + * Returns a `DataStreamReader` that can be used to read streaming data in as a `DataFrame`. * {{{ * sparkSession.readStream.parquet("/path/to/directory/of/parquet/files") * sparkSession.readStream.schema(schema).json("/path/to/directory/of/json/files") @@ -624,7 +624,7 @@ class SparkSession private( /** * :: Experimental :: * (Scala-specific) Implicit methods available in Scala for converting - * common Scala objects into [[DataFrame]]s. + * common Scala objects into `DataFrame`s. 
* * {{{ * val sparkSession = SparkSession.builder.getOrCreate() @@ -641,7 +641,7 @@ class SparkSession private( // scalastyle:on /** - * Stop the underlying [[SparkContext]]. + * Stop the underlying `SparkContext`. * * @since 2.0.0 */ @@ -726,7 +726,7 @@ object SparkSession { /** * Sets a config option. Options set using this method are automatically propagated to - * both [[SparkConf]] and SparkSession's own configuration. + * both `SparkConf` and SparkSession's own configuration. * * @since 2.0.0 */ @@ -737,7 +737,7 @@ object SparkSession { /** * Sets a config option. Options set using this method are automatically propagated to - * both [[SparkConf]] and SparkSession's own configuration. + * both `SparkConf` and SparkSession's own configuration. * * @since 2.0.0 */ @@ -748,7 +748,7 @@ object SparkSession { /** * Sets a config option. Options set using this method are automatically propagated to - * both [[SparkConf]] and SparkSession's own configuration. + * both `SparkConf` and SparkSession's own configuration. * * @since 2.0.0 */ @@ -759,7 +759,7 @@ object SparkSession { /** * Sets a config option. Options set using this method are automatically propagated to - * both [[SparkConf]] and SparkSession's own configuration. + * both `SparkConf` and SparkSession's own configuration. * * @since 2.0.0 */ @@ -769,7 +769,7 @@ object SparkSession { } /** - * Sets a list of config options based on the given [[SparkConf]]. + * Sets a list of config options based on the given `SparkConf`. * * @since 2.0.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 6043c5ee14b54..c8be89c646957 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.{DataType, DataTypes} import org.apache.spark.util.Utils /** - * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. + * Functions for registering user-defined functions. Use `SQLContext.udf` to access this. * * @note The user-defined functions must be deterministic. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index fbeebb9c2a5fe..93e7229b20c1f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2172,7 +2172,7 @@ object functions { * and returns the result as a string column. * * If d is 0, the result has no decimal point or fractional part. - * If d < 0, the result will be null. + * If d < 0, the result will be null. * * @group string_funcs * @since 1.5.0 @@ -2888,7 +2888,7 @@ object functions { } /** - * (Scala-specific) Parses a column containing a JSON string into a [[StructType]] with the + * (Scala-specific) Parses a column containing a JSON string into a `StructType` with the * specified schema. Returns `null`, in the case of an unparseable string. * * @param e a string column containing JSON data. @@ -2904,7 +2904,7 @@ object functions { } /** - * (Java-specific) Parses a column containing a JSON string into a [[StructType]] with the + * (Java-specific) Parses a column containing a JSON string into a `StructType` with the * specified schema. Returns `null`, in the case of an unparseable string. * * @param e a string column containing JSON data. 
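A hedged round-trip sketch of the `from_json`/`to_json` functions documented above; the DataFrame `df` and its "payload" column are assumptions, not part of the patch:

{{{
// Illustrative only; `df` is assumed to have a string column "payload" holding JSON.
import org.apache.spark.sql.functions.{col, from_json, to_json}
import org.apache.spark.sql.types.{LongType, StringType, StructType}

val schema = new StructType().add("id", LongType).add("name", StringType)
val parsed = df.select(from_json(col("payload"), schema).as("data"))  // null for unparseable rows
val back   = parsed.select(to_json(col("data")).as("payload"))
}}}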
@@ -2919,7 +2919,7 @@ object functions { from_json(e, schema, options.asScala.toMap) /** - * Parses a column containing a JSON string into a [[StructType]] with the specified schema. + * Parses a column containing a JSON string into a `StructType` with the specified schema. * Returns `null`, in the case of an unparseable string. * * @param e a string column containing JSON data. @@ -2932,7 +2932,7 @@ object functions { from_json(e, schema, Map.empty[String, String]) /** - * Parses a column containing a JSON string into a [[StructType]] with the specified schema. + * Parses a column containing a JSON string into a `StructType` with the specified schema. * Returns `null`, in the case of an unparseable string. * * @param e a string column containing JSON data. @@ -2946,7 +2946,7 @@ object functions { /** - * (Scala-specific) Converts a column containing a [[StructType]] into a JSON string with the + * (Scala-specific) Converts a column containing a `StructType` into a JSON string with the * specified schema. Throws an exception, in the case of an unsupported type. * * @param e a struct column. @@ -2961,7 +2961,7 @@ object functions { } /** - * (Java-specific) Converts a column containing a [[StructType]] into a JSON string with the + * (Java-specific) Converts a column containing a `StructType` into a JSON string with the * specified schema. Throws an exception, in the case of an unsupported type. * * @param e a struct column. @@ -2975,7 +2975,7 @@ object functions { to_json(e, options.asScala.toMap) /** - * Converts a column containing a [[StructType]] into a JSON string with the + * Converts a column containing a `StructType` into a JSON string with the * specified schema. Throws an exception, in the case of an unsupported type. * * @param e a struct column. From 1593545d367009c17b3c271e23ae35723af9bd46 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 00:39:20 +0900 Subject: [PATCH 04/17] Fix the linelength style in KafkaUtils.scala --- .../scala/org/apache/spark/streaming/kafka/KafkaUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 59f4e408569f6..437c797e55605 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -72,8 +72,8 @@ object KafkaUtils { * @param ssc StreamingContext object * @param kafkaParams Map of kafka configuration parameters, * see http://kafka.apache.org/08/configuration.html - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed - * in its own thread. + * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is + * consumed in its own thread. 
* @param storageLevel Storage level to use for storing the received objects * @tparam K type of Kafka message key * @tparam V type of Kafka message value From 558d5e3982bd7af552fa6f5a1b581355f7b316ff Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 01:41:44 +0900 Subject: [PATCH 05/17] Fix errors (fourth round) --- .../scala/org/apache/spark/Accumulator.scala | 2 +- .../scala/org/apache/spark/SparkConf.scala | 10 ++--- .../scala/org/apache/spark/SparkContext.scala | 14 +++---- .../scala/org/apache/spark/TaskContext.scala | 4 +- .../scala/org/apache/spark/TestUtils.scala | 2 +- .../apache/spark/rdd/DoubleRDDFunctions.scala | 2 +- .../org/apache/spark/rdd/HadoopRDD.scala | 2 +- .../scala/org/apache/spark/rdd/JdbcRDD.scala | 6 +-- .../org/apache/spark/rdd/NewHadoopRDD.scala | 2 +- .../apache/spark/rdd/PairRDDFunctions.scala | 16 ++++---- .../apache/spark/rdd/RDDCheckpointData.scala | 2 +- .../apache/spark/rdd/coalesce-public.scala | 4 +- .../spark/storage/BlockManagerMessages.scala | 2 +- .../spark/util/random/SamplingUtils.scala | 16 ++++---- .../util/random/StratifiedSamplingUtils.scala | 8 ++-- .../org/apache/spark/ml/param/params.scala | 38 +++++++++---------- .../spark/mllib/classification/SVM.scala | 2 +- .../BinaryClassificationMetrics.scala | 5 ++- .../linalg/EigenValueDecomposition.scala | 2 +- .../apache/spark/mllib/linalg/Vectors.scala | 4 +- .../spark/mllib/random/RandomRDDs.scala | 8 ++-- .../spark/mllib/tree/DecisionTree.scala | 6 +-- .../mllib/tree/GradientBoostedTrees.scala | 6 +-- .../spark/mllib/tree/RandomForest.scala | 16 ++++---- .../spark/sql/RelationalGroupedDataset.scala | 2 +- .../apache/spark/sql/sources/interfaces.scala | 12 +++--- .../apache/spark/streaming/StateSpec.scala | 2 +- .../streaming/api/java/JavaPairDStream.scala | 4 +- .../api/java/JavaStreamingContext.scala | 2 +- .../dstream/PairDStreamFunctions.scala | 4 +- 30 files changed, 104 insertions(+), 101 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index 1a45e15cdc728..bcf157078813d 100644 --- a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -26,7 +26,7 @@ package org.apache.spark * * An accumulator is created from an initial value `v` by calling * [[SparkContext#accumulator SparkContext.accumulator]]. - * Tasks running on the cluster can then add to it using the [[Accumulable.+=]] operator. + * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]]. * However, they cannot read its value. Only the driver program can read the accumulator's value, * using its [[#value]] method. * diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index bc8010eca2e1b..3f5b19eda2cc3 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -279,7 +279,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then milliseconds are assumed. 
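To make the suffix handling above concrete, a small sketch of `getTimeAsMs` together with the analogous size getters touched in the following hunks; the keys are real settings used purely as examples:

{{{
// Illustrative only; both getters throw NoSuchElementException for unset keys with no default.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.network.timeout", "120s")
  .set("spark.driver.maxResultSize", "2g")
val timeoutMs   = conf.getTimeAsMs("spark.network.timeout")        // 120000
val resultBytes = conf.getSizeAsBytes("spark.driver.maxResultSize")
}}}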
- * @throws NoSuchElementException + * @note Throws `NoSuchElementException` */ def getTimeAsMs(key: String): Long = { Utils.timeStringAsMs(get(key)) @@ -296,7 +296,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then bytes are assumed. - * @throws NoSuchElementException + * @note Throws `NoSuchElementException` */ def getSizeAsBytes(key: String): Long = { Utils.byteStringAsBytes(get(key)) @@ -320,7 +320,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Kibibytes are assumed. - * @throws NoSuchElementException + * @note Throws `NoSuchElementException` */ def getSizeAsKb(key: String): Long = { Utils.byteStringAsKb(get(key)) @@ -337,7 +337,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Mebibytes are assumed. - * @throws NoSuchElementException + * @note Throws `NoSuchElementException` */ def getSizeAsMb(key: String): Long = { Utils.byteStringAsMb(get(key)) @@ -354,7 +354,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Gibibytes are assumed. - * @throws NoSuchElementException + * @note Throws `NoSuchElementException` */ def getSizeAsGb(key: String): Long = { Utils.byteStringAsGb(get(key)) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1261e3e735761..872c46ab689e1 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -645,7 +645,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Get a local property set in this thread, or null if it is missing. See - * [[org.apache.spark.SparkContext.setLocalProperty]]. + * `org.apache.spark.SparkContext.setLocalProperty`. */ def getLocalProperty(key: String): String = Option(localProperties.get).map(_.getProperty(key)).orNull @@ -663,7 +663,7 @@ class SparkContext(config: SparkConf) extends Logging { * Application programmers can use this method to group all those jobs together and give a * group description. Once set, the Spark web UI will associate such jobs with this group. * - * The application can also use [[org.apache.spark.SparkContext.cancelJobGroup]] to cancel all + * The application can also use `org.apache.spark.SparkContext.cancelJobGroup` to cancel all * running jobs in this group. For example, * {{{ * // In the main thread: @@ -1384,7 +1384,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Create and register a [[CollectionAccumulator]], which starts with empty list and accumulates + * Create and register a `CollectionAccumulator`, which starts with empty list and accumulates * inputs by adding them into the list. 
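A minimal sketch of the collection accumulator usage described above; `sc` is an assumed SparkContext and the record values are invented:

{{{
// Illustrative only; tasks may add to the accumulator, but only the driver should read it.
val badRecords = sc.collectionAccumulator[String]("badRecords")
sc.parallelize(Seq("ok", "bad", "ok")).foreach { rec =>
  if (rec == "bad") badRecords.add(rec)
}
badRecords.value   // contains "bad" when read on the driver
}}}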
*/ def collectionAccumulator[T]: CollectionAccumulator[T] = { @@ -1394,7 +1394,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Create and register a [[CollectionAccumulator]], which starts with empty list and accumulates + * Create and register a `CollectionAccumulator`, which starts with empty list and accumulates * inputs by adding them into the list. */ def collectionAccumulator[T](name: String): CollectionAccumulator[T] = { @@ -2043,7 +2043,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Cancel active jobs for the specified group. See [[org.apache.spark.SparkContext.setJobGroup]] + * Cancel active jobs for the specified group. See `org.apache.spark.SparkContext.setJobGroup` * for more information. */ def cancelJobGroup(groupId: String) { @@ -2061,7 +2061,7 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel a given job if it's scheduled or running. * * @param jobId the job ID to cancel - * @throws InterruptedException if the cancel message cannot be sent + * @note Throws `InterruptedException` if the cancel message cannot be sent */ def cancelJob(jobId: Int) { dagScheduler.cancelJob(jobId) @@ -2071,7 +2071,7 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel a given stage and all jobs associated with it. * * @param stageId the stage ID to cancel - * @throws InterruptedException if the cancel message cannot be sent + * @note Throws `InterruptedException` if the cancel message cannot be sent */ def cancelStage(stageId: Int) { dagScheduler.cancelStage(stageId) diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 27abccf5ac2a9..0fd777ed12829 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -164,7 +164,7 @@ abstract class TaskContext extends Serializable { /** * Get a local property set upstream in the driver, or null if it is missing. See also - * [[org.apache.spark.SparkContext.setLocalProperty]]. + * `org.apache.spark.SparkContext.setLocalProperty`. */ def getLocalProperty(key: String): String @@ -174,7 +174,7 @@ abstract class TaskContext extends Serializable { /** * ::DeveloperApi:: * Returns all metrics sources with the given name which are associated with the instance - * which runs the task. For more information see [[org.apache.spark.metrics.MetricsSystem!]]. + * which runs the task. For more information see `org.apache.spark.metrics.MetricsSystem`. */ @DeveloperApi def getMetricsSources(sourceName: String): Seq[Source] diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index 871b9d1ad575b..2909191bd6f14 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -186,7 +186,7 @@ private[spark] object TestUtils { /** - * A [[SparkListener]] that detects whether spills have occurred in Spark jobs. + * A `SparkListener` that detects whether spills have occurred in Spark jobs. 
*/ private class SpillListener extends SparkListener { private val stageIdToTaskMetrics = new mutable.HashMap[Int, ArrayBuffer[TaskMetrics]] diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index f3ab324d59119..f4bc3e3021447 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -155,7 +155,7 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { * to the right except for the last which is closed * e.g. for the array * [1, 10, 20, 50] the buckets are [1, 10) [10, 20) [20, 50] - * e.g 1<=x<10 , 10<=x<20, 20<=x<=50 + * e.g 1<=x<10 , 10<=x<20, 20<=x<=50 * And on the input of 1 and 50 we would have a histogram of 1, 0, 1 * * @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 86351b8c575e5..ae4320d4583d6 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -96,7 +96,7 @@ private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: Inp * @param minPartitions Minimum number of HadoopRDD partitions (Hadoop Splits) to generate. * * @note Instantiating this class directly is not recommended, please use - * [[org.apache.spark.SparkContext.hadoopRDD()]] + * `org.apache.spark.SparkContext.hadoopRDD()` */ @DeveloperApi class HadoopRDD[K, V]( diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala index 0970b98071675..3197c57d1c4ad 100644 --- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala @@ -41,7 +41,7 @@ private[spark] class JdbcPartition(idx: Int, val lower: Long, val upper: Long) e * The RDD takes care of closing the connection. * @param sql the text of the query. * The query must contain two ? placeholders for parameters used to partition the results. - * E.g. "select title, author from books where ? <= id and id <= ?" + * E.g. "select title, author from books where ? <= id and id <= ?" * @param lowerBound the minimum value of the first placeholder * @param upperBound the maximum value of the second placeholder * The lower and upper bounds are inclusive. @@ -151,7 +151,7 @@ object JdbcRDD { * The RDD takes care of closing the connection. * @param sql the text of the query. * The query must contain two ? placeholders for parameters used to partition the results. - * E.g. "select title, author from books where ? <= id and id <= ?" + * E.g. "select title, author from books where ? <= id and id <= ?" * @param lowerBound the minimum value of the first placeholder * @param upperBound the maximum value of the second placeholder * The lower and upper bounds are inclusive. @@ -191,7 +191,7 @@ object JdbcRDD { * The RDD takes care of closing the connection. * @param sql the text of the query. * The query must contain two ? placeholders for parameters used to partition the results. - * E.g. "select title, author from books where ? <= id and id <= ?" + * E.g. "select title, author from books where ? <= id and id <= ?" * @param lowerBound the minimum value of the first placeholder * @param upperBound the maximum value of the second placeholder * The lower and upper bounds are inclusive. 
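As a sketch of the JdbcRDD constructor whose parameters are documented above, with the JDBC URL, table and query as placeholders (not part of the patch):

{{{
// Illustrative only; the two '?' placeholders are bound to the inclusive lower/upper ids.
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD

val users = new JdbcRDD(
  sc,
  () => DriverManager.getConnection("jdbc:h2:mem:testdb"),
  "SELECT id, name FROM users WHERE ? <= id AND id <= ?",
  lowerBound = 1, upperBound = 1000, numPartitions = 4,
  mapRow = rs => (rs.getLong(1), rs.getString(2)))
}}}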
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index a5965f597038d..c783e1375283a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -63,7 +63,7 @@ private[spark] class NewHadoopPartition( * @param valueClass Class of the value associated with the inputFormatClass. * * @note Instantiating this class directly is not recommended, please use - * [[org.apache.spark.SparkContext.newAPIHadoopRDD()]] + * `org.apache.spark.SparkContext.newAPIHadoopRDD()` */ @DeveloperApi class NewHadoopRDD[K, V]( diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 33e695ec5322b..ab8582f3e44cf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -399,7 +399,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p` + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero {{{ sp > p }}} * would trigger sparse representation of registers, which may reduce the memory consumption * and increase accuracy when the cardinality is small. * @@ -492,8 +492,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. * * @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]]. @@ -516,8 +516,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * each group is not guaranteed, and may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. * * @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]]. @@ -637,8 +637,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. 
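A minimal sketch contrasting `groupByKey` with the recommended `reduceByKey`, as the note above advises; `sc` is an assumed SparkContext:

{{{
// Illustrative only.
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

// Preferred: values are combined map-side before the shuffle.
val sums = pairs.reduceByKey(_ + _)

// groupByKey shuffles every value and must hold all values of a key in memory.
val sumsViaGroup = pairs.groupByKey().mapValues(_.sum)
}}}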
+ * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupByKey(): RDD[(K, Iterable[V])] = self.withScope { groupByKey(defaultPartitioner(self)) @@ -908,7 +908,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Return an RDD with the pairs from `this` whose keys are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be <= us. */ def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)] = self.withScope { subtractByKey(other, self.partitioner.getOrElse(new HashPartitioner(self.partitions.length))) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala index 1070bb96b2524..9f70d79c52cb4 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala @@ -23,7 +23,7 @@ import org.apache.spark.Partition /** * Enumeration to manage state transitions of an RDD through checkpointing - * [ Initialized --> checkpointing in progress --> checkpointed ]. + * [ Initialized --> checkpointing in progress --> checkpointed ]. */ private[spark] object CheckpointState extends Enumeration { type CheckpointState = Value diff --git a/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala b/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala index d8a80aa5aeb15..e00bc22aba44d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala +++ b/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala @@ -35,14 +35,14 @@ trait PartitionCoalescer { * @param maxPartitions the maximum number of partitions to have after coalescing * @param parent the parent RDD whose partitions to coalesce * @return an array of [[PartitionGroup]]s, where each element is itself an array of - * [[Partition]]s and represents a partition after coalescing is performed. + * `Partition`s and represents a partition after coalescing is performed. */ def coalesce(maxPartitions: Int, parent: RDD[_]): Array[PartitionGroup] } /** * ::DeveloperApi:: - * A group of [[Partition]]s + * A group of `Partition`s * @param prefLoc preferred location for the partition group */ @DeveloperApi diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala index 6bded92700504..ce82e43b2d58b 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala @@ -43,7 +43,7 @@ private[spark] object BlockManagerMessages { extends ToBlockManagerSlave /** - * Driver -> Executor message to trigger a thread dump. + * Driver -> Executor message to trigger a thread dump. 
*/ case object TriggerThreadDump extends ToBlockManagerSlave diff --git a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala index f98932a470165..1099747444be5 100644 --- a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala @@ -67,17 +67,17 @@ private[spark] object SamplingUtils { } /** - * Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of + * Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of * the time. * * How the sampling rate is determined: * Let p = num / total, where num is the sample size and total is the total number of - * datapoints in the RDD. We're trying to compute q > p such that - * - when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), - * where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total), - * i.e. the failure rate of not having a sufficiently large sample < 0.0001. + * datapoints in the RDD. We're trying to compute q > p such that + * - when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), where + * we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total), + * i.e. the failure rate of not having a sufficiently large sample < 0.0001. * Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for - * num > 12, but we need a slightly larger q (9 empirically determined). + * num > 12, but we need a slightly larger q (9 empirically determined). * - when sampling without replacement, we're drawing each datapoint with prob_i * ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success * rate, where success rate is defined the same as in sampling with replacement. @@ -108,14 +108,14 @@ private[spark] object SamplingUtils { private[spark] object PoissonBounds { /** - * Returns a lambda such that Pr[X > s] is very small, where X ~ Pois(lambda). + * Returns a lambda such that Pr[X > s] is very small, where X ~ Pois(lambda). */ def getLowerBound(s: Double): Double = { math.max(s - numStd(s) * math.sqrt(s), 1e-15) } /** - * Returns a lambda such that Pr[X < s] is very small, where X ~ Pois(lambda). + * Returns a lambda such that Pr[X < s] is very small, where X ~ Pois(lambda). * * @param s sample size */ diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala index 67822749112c6..83547072a08b2 100644 --- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala @@ -160,9 +160,11 @@ private[spark] object StratifiedSamplingUtils extends Logging { * * To do so, we compute sampleSize = math.ceil(size * samplingRate) for each stratum and compare * it to the number of items that were accepted instantly and the number of items in the waitlist - * for that stratum. Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted), - * which means we need to sort the elements in the waitlist by their associated values in order - * to find the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize. + * for that stratum. 
+ * + * Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted), which + * means we need to sort the elements in the waitlist by their associated values in order to find + * the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize. * Note that all elements in the waitlist have values >= bound for instant accept, so a T value * in the waitlist range would allow all elements that were instantly accepted on the first pass * to be included in the sample. diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 96206e0b7ad88..4850a9e43f91c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -87,7 +87,7 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali def ->(value: T): ParamPair[T] = ParamPair(this, value) // scalastyle:on - /** Encodes a param value into JSON, which can be decoded by [[jsonDecode()]]. */ + /** Encodes a param value into JSON, which can be decoded by `jsonDecode()`. */ def jsonEncode(value: T): String = { value match { case x: String => @@ -140,7 +140,7 @@ private[ml] object Param { /** * :: DeveloperApi :: - * Factory methods for common validation functions for [[Param.isValid]]. + * Factory methods for common validation functions for `Param.isValid`. * The numerical methods only support Int, Long, Float, and Double. */ @DeveloperApi @@ -165,32 +165,32 @@ object ParamValidators { s" of unexpected input type: ${value.getClass}") } - /** Check if value > lowerBound */ + /** Check if value > lowerBound */ def gt[T](lowerBound: Double): T => Boolean = { (value: T) => getDouble(value) > lowerBound } - /** Check if value >= lowerBound */ + /** Check if value >= lowerBound */ def gtEq[T](lowerBound: Double): T => Boolean = { (value: T) => getDouble(value) >= lowerBound } - /** Check if value < upperBound */ + /** Check if value < upperBound */ def lt[T](upperBound: Double): T => Boolean = { (value: T) => getDouble(value) < upperBound } - /** Check if value <= upperBound */ + /** Check if value <= upperBound */ def ltEq[T](upperBound: Double): T => Boolean = { (value: T) => getDouble(value) <= upperBound } /** * Check for value in range lowerBound to upperBound. - * @param lowerInclusive If true, check for value >= lowerBound. - * If false, check for value > lowerBound. - * @param upperInclusive If true, check for value <= upperBound. - * If false, check for value < upperBound. + * @param lowerInclusive If true, check for value >= lowerBound. + * If false, check for value > lowerBound. + * @param upperInclusive If true, check for value <= upperBound. + * If false, check for value < upperBound. */ def inRange[T]( lowerBound: Double, @@ -203,7 +203,7 @@ object ParamValidators { lowerValid && upperValid } - /** Version of [[inRange()]] which uses inclusive be default: [lowerBound, upperBound] */ + /** Version of `inRange()` which uses inclusive be default: [lowerBound, upperBound] */ def inRange[T](lowerBound: Double, upperBound: Double): T => Boolean = { inRange[T](lowerBound, upperBound, lowerInclusive = true, upperInclusive = true) } @@ -228,7 +228,7 @@ object ParamValidators { /** * :: DeveloperApi :: - * Specialized version of [[Param[Double]]] for Java. + * Specialized version of `Param[Double]` for Java. 
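To show how the validators documented above are typically wired into a Param, a sketch that assumes the surrounding class mixes in Params; the param names are invented:

{{{
// Illustrative only; `this` must be a Params instance.
import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamValidators}

val maxIter = new IntParam(this, "maxIter", "maximum number of iterations (>= 0)",
  ParamValidators.gtEq(0))
val subsample = new DoubleParam(this, "subsample", "fraction of data in (0, 1]",
  ParamValidators.inRange(0.0, 1.0, lowerInclusive = false, upperInclusive = true))
}}}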
*/ @DeveloperApi class DoubleParam(parent: String, name: String, doc: String, isValid: Double => Boolean) @@ -288,7 +288,7 @@ private[param] object DoubleParam { /** * :: DeveloperApi :: - * Specialized version of [[Param[Int]]] for Java. + * Specialized version of `Param[Int]` for Java. */ @DeveloperApi class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolean) @@ -317,7 +317,7 @@ class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolea /** * :: DeveloperApi :: - * Specialized version of [[Param[Float]]] for Java. + * Specialized version of `Param[Float]` for Java. */ @DeveloperApi class FloatParam(parent: String, name: String, doc: String, isValid: Float => Boolean) @@ -378,7 +378,7 @@ private object FloatParam { /** * :: DeveloperApi :: - * Specialized version of [[Param[Long]]] for Java. + * Specialized version of `Param[Long]` for Java. */ @DeveloperApi class LongParam(parent: String, name: String, doc: String, isValid: Long => Boolean) @@ -407,7 +407,7 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool /** * :: DeveloperApi :: - * Specialized version of [[Param[Boolean]]] for Java. + * Specialized version of `Param[Boolean]` for Java. */ @DeveloperApi class BooleanParam(parent: String, name: String, doc: String) // No need for isValid @@ -430,7 +430,7 @@ class BooleanParam(parent: String, name: String, doc: String) // No need for isV /** * :: DeveloperApi :: - * Specialized version of [[Param[Array[String]]]] for Java. + * Specialized version of `Param[Array[String]]` for Java. */ @DeveloperApi class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array[String] => Boolean) @@ -455,7 +455,7 @@ class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array /** * :: DeveloperApi :: - * Specialized version of [[Param[Array[Double]]]] for Java. + * Specialized version of `Param[Array[Double]]` for Java. */ @DeveloperApi class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array[Double] => Boolean) @@ -485,7 +485,7 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array /** * :: DeveloperApi :: - * Specialized version of [[Param[Array[Int]]]] for Java. + * Specialized version of `Param[Array[Int]]` for Java. */ @DeveloperApi class IntArrayParam(parent: Params, name: String, doc: String, isValid: Array[Int] => Boolean) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index aec1526b55c49..5fb04ed0ee9a2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -124,7 +124,7 @@ object SVMModel extends Loader[SVMModel] { /** * Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. By default L2 - * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. + * regularization is used, which can be changed via `SVMWithSGD.optimizer`. * * @note Labels used in SVM should be {0, 1}. 
*/ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 92cd7f22dc439..a8588bf182641 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -78,7 +78,8 @@ class BinaryClassificationMetrics @Since("1.3.0") ( * Returns the receiver operating characteristic (ROC) curve, * which is an RDD of (false positive rate, true positive rate) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic + * @see + * Receiver operating characteristic (Wikipedia) */ @Since("1.0.0") def roc(): RDD[(Double, Double)] = { @@ -98,7 +99,7 @@ class BinaryClassificationMetrics @Since("1.3.0") ( /** * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. - * @see http://en.wikipedia.org/wiki/Precision_and_recall + * @see Precision and recall */ @Since("1.0.0") def pr(): RDD[(Double, Double)] = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala index bb94745f078e8..7a1d2577c20e0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala @@ -32,7 +32,7 @@ private[mllib] object EigenValueDecomposition { * * @param mul a function that multiplies the symmetric matrix with a DenseVector. * @param n dimension of the square matrix (maximum Int.MaxValue). - * @param k number of leading eigenvalues required, 0 < k < n. + * @param k number of leading eigenvalues required, 0 < k < n. * @param tol tolerance of the eigs computation. * @param maxIterations the maximum number of Arnoldi update iterations. * @return a dense vector of eigenvalues in descending order and a dense matrix of eigenvectors diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index c94d7890cf557..2a226dc341762 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -77,7 +77,7 @@ sealed trait Vector extends Serializable { /** * Returns a hash code value for the vector. The hash code is based on its size and its first 128 - * nonzero entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]]. + * nonzero entries, using a hash algorithm similar to `java.util.Arrays.hashCode`. */ override def hashCode(): Int = { // This is a reference implementation. It calls return in foreachActive, which is slow. @@ -351,7 +351,7 @@ object Vectors { } /** - * Parses a string resulted from [[Vector.toString]] into a [[Vector]]. + * Parses a string resulted from [[Vector.toString()]] into a [[Vector]]. 
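A short round trip through `Vector.toString` and `Vectors.parse` as documented above, with illustrative values only:

{{{
// Illustrative only.
import org.apache.spark.mllib.linalg.Vectors

val v = Vectors.sparse(4, Array(0, 3), Array(1.0, -2.0))
val s = v.toString              // "(4,[0,3],[1.0,-2.0])"
val parsed = Vectors.parse(s)   // equal to v
}}}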
*/ @Since("1.1.0") def parse(s: String): Vector = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala index 6d60136ddc38f..b2e37bad3cf69 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala @@ -249,8 +249,8 @@ object RandomRDDs { * shape and scale. * * @param sc SparkContext used to create the RDD. - * @param shape shape parameter (> 0) for the gamma distribution - * @param scale scale parameter (> 0) for the gamma distribution + * @param shape shape parameter (> 0) for the gamma distribution + * @param scale scale parameter (> 0) for the gamma distribution * @param size Size of the RDD. * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`). * @param seed Random seed (default: a random long integer). @@ -766,8 +766,8 @@ object RandomRDDs { * gamma distribution with the input shape and scale. * * @param sc SparkContext used to create the RDD. - * @param shape shape parameter (> 0) for the gamma distribution. - * @param scale scale parameter (> 0) for the gamma distribution. + * @param shape shape parameter (> 0) for the gamma distribution. + * @param scale scale parameter (> 0) for the gamma distribution. * @param numRows Number of Vectors in the RDD. * @param numCols Number of elements in each Vector. * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index d846c43cf2913..95b155b037194 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -162,7 +162,7 @@ object DecisionTree extends Serializable with Logging { * @param numClasses Number of classes for classification. Default value of 2. * @param maxBins Maximum number of bins used for splitting features. * @param quantileCalculationStrategy Algorithm for calculating quantiles. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -$gt; k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @return DecisionTreeModel that can be used for prediction. @@ -192,7 +192,7 @@ object DecisionTree extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels should take values {0, 1, ..., numClasses-1}. * @param numClasses Number of classes for classification. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param impurity Criterion used for information gain calculation. @@ -238,7 +238,7 @@ object DecisionTree extends Serializable with Logging { * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels are real numbers. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. 
An entry (n -> k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param impurity Criterion used for information gain calculation. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index cdeef16135015..a7017f0339101 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -74,7 +74,7 @@ class GradientBoostedTrees private[spark] ( } /** - * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#run]]. + * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#run`. */ @Since("1.2.0") def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { @@ -89,7 +89,7 @@ class GradientBoostedTrees private[spark] ( * This dataset should be different from the training dataset, * but it should follow the same distribution. * E.g., these two datasets could be created from an original dataset - * by using [[org.apache.spark.rdd.RDD.randomSplit()]] + * by using `org.apache.spark.rdd.RDD.randomSplit()` * @return GradientBoostedTreesModel that can be used for prediction. */ @Since("1.4.0") @@ -106,7 +106,7 @@ class GradientBoostedTrees private[spark] ( } /** - * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]]. + * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation`. */ @Since("1.4.0") def runWithValidation( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 428af21406092..81c1bb27ea207 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -53,13 +53,13 @@ import org.apache.spark.util.Utils * the type of random forest (classification or regression), feature type * (continuous, categorical), depth of the tree, quantile calculation strategy, * etc. - * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done. + * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done. * @param featureSubsetStrategy Number of features to consider for splits at each node. * Supported values: "auto", "all", "sqrt", "log2", "onethird". * Supported numerical values: "(0.0-1.0]", "[1-n]". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and + * if numTrees > 1 (forest) set to "sqrt" for classification and * to "onethird" for regression. * If a real value "n" in the range (0, 1.0] is set, * use n * number of features. @@ -111,7 +111,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt". + * if numTrees > 1 (forest) set to "sqrt". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return RandomForestModel that can be used for prediction. */ @@ -134,7 +134,7 @@ object RandomForest extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. 
* Labels should take values {0, 1, ..., numClasses-1}. * @param numClasses Number of classes for classification. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param numTrees Number of trees in the random forest. @@ -142,7 +142,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt". + * if numTrees > 1 (forest) set to "sqrt". * @param impurity Criterion used for information gain calculation. * Supported values: "gini" (recommended) or "entropy". * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means @@ -200,7 +200,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "onethird". + * if numTrees > 1 (forest) set to "onethird". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return RandomForestModel that can be used for prediction. */ @@ -222,7 +222,7 @@ object RandomForest extends Serializable with Logging { * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels are real numbers. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param numTrees Number of trees in the random forest. @@ -230,7 +230,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "onethird". + * if numTrees > 1 (forest) set to "onethird". * @param impurity Criterion used for information gain calculation. * The only supported value for regression is "variance". * @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 0b1e191a1cd99..0fe8d87ebd6ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.types.NumericType import org.apache.spark.sql.types.StructType /** - * A set of methods for aggregations on a [[DataFrame]], created by [[Dataset.groupBy]]. + * A set of methods for aggregations on a `DataFrame`, created by `Dataset.groupBy`. * * The main method is the agg function, which has multiple variants. This class also contains * convenience some first order statistics such as mean, sum for convenience. 
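For the `RandomForest.trainClassifier` overload whose parameters are documented in the hunks above, an illustrative call; `trainingData` is an assumed RDD[LabeledPoint] and every value is an example:

{{{
// Illustrative only; labels must be in {0, ..., numClasses - 1}.
import org.apache.spark.mllib.tree.RandomForest

val model = RandomForest.trainClassifier(
  trainingData,
  numClasses = 2,
  categoricalFeaturesInfo = Map(0 -> 3),   // feature 0 is categorical with 3 categories
  numTrees = 50,
  featureSubsetStrategy = "auto",          // resolves to "sqrt" when numTrees > 1
  impurity = "gini",
  maxDepth = 5,
  maxBins = 32)
}}}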
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index ff6dd8cb0cf92..a1ea748de98f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -112,7 +112,7 @@ trait SchemaRelationProvider { /** * ::Experimental:: - * Implemented by objects that can produce a streaming [[Source]] for a specific format or system. + * Implemented by objects that can produce a streaming `Source` for a specific format or system. * * @since 2.0.0 */ @@ -143,7 +143,7 @@ trait StreamSourceProvider { /** * ::Experimental:: - * Implemented by objects that can produce a streaming [[Sink]] for a specific format or system. + * Implemented by objects that can produce a streaming `Sink` for a specific format or system. * * @since 2.0.0 */ @@ -185,7 +185,7 @@ trait CreatableRelationProvider { /** * Represents a collection of tuples with a known schema. Classes that extend BaseRelation must - * be able to produce the schema of their data in the form of a [[StructType]]. Concrete + * be able to produce the schema of their data in the form of a `StructType`. Concrete * implementation should inherit from one of the descendant `Scan` classes, which define various * abstract methods for execution. * @@ -216,10 +216,10 @@ abstract class BaseRelation { /** * Whether does it need to convert the objects in Row to internal representation, for example: - * java.lang.String -> UTF8String - * java.lang.Decimal -> Decimal + * java.lang.String -> UTF8String + * java.lang.Decimal -> Decimal * - * If `needConversion` is `false`, buildScan() should return an [[RDD]] of [[InternalRow]] + * If `needConversion` is `false`, buildScan() should return an `RDD` of `InternalRow` * * @note The internal representation is not stable across releases and thus data sources outside * of Spark SQL should leave this as true. diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala index 7c1ea2f89ddb8..ea20105892bf0 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala @@ -100,7 +100,7 @@ sealed abstract class StateSpec[KeyType, ValueType, StateType, MappedType] exten /** * :: Experimental :: - * Builder object for creating instances of [[org.apache.spark.streaming.StateSpec StateSpec]] + * Builder object for creating instances of `org.apache.spark.streaming.StateSpec` * that is used for specifying the parameters of the DStream transformation `mapWithState` * that is used for specifying the parameters of the DStream transformation * `mapWithState` operation of a diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala index aa4003c62e1e7..2ec907c8cfd5f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala @@ -434,8 +434,8 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Return a [[JavaMapWithStateDStream]] by applying a function to every key-value element of * `this` stream, while maintaining some state data for each unique key. The mapping function * and other specification (e.g. 
partitioners, timeouts, initial state data, etc.) of this - * transformation can be specified using [[StateSpec]] class. The state data is accessible in - * as a parameter of type [[State]] in the mapping function. + * transformation can be specified using `StateSpec` class. The state data is accessible in + * as a parameter of type `State` in the mapping function. * * Example of using `mapWithState`: * {{{ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index b43b9405def97..982e72cffbf3f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.scheduler.StreamingListener * A Java-friendly version of [[org.apache.spark.streaming.StreamingContext]] which is the main * entry point for Spark Streaming functionality. It provides methods to create * [[org.apache.spark.streaming.api.java.JavaDStream]] and - * [[org.apache.spark.streaming.api.java.JavaPairDStream.]] from input sources. The internal + * [[org.apache.spark.streaming.api.java.JavaPairDStream]] from input sources. The internal * org.apache.spark.api.java.JavaSparkContext (see core Spark documentation) can be accessed * using `context.sparkContext`. After creating and transforming DStreams, the streaming * computation can be started and stopped using `context.start()` and `context.stop()`, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala index ac739411fd212..f38c1e7996595 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala @@ -356,8 +356,8 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)]) * Return a [[MapWithStateDStream]] by applying a function to every key-value element of * `this` stream, while maintaining some state data for each unique key. The mapping function * and other specification (e.g. partitioners, timeouts, initial state data, etc.) of this - * transformation can be specified using [[StateSpec]] class. The state data is accessible in - * as a parameter of type [[State]] in the mapping function. + * transformation can be specified using `StateSpec` class. The state data is accessible in + * as a parameter of type `State` in the mapping function. 
* * Example of using `mapWithState`: * {{{ From 9a9d0cd6f74fa32256ce6edf0832865585242ba2 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 02:06:12 +0900 Subject: [PATCH 06/17] Fix errors (last round) --- .../org/apache/spark/rdd/PairRDDFunctions.scala | 7 ++++++- .../apache/spark/rpc/netty/RpcEndpointVerifier.scala | 2 +- .../org/apache/spark/scheduler/ResultTask.scala | 2 +- .../org/apache/spark/scheduler/ShuffleMapTask.scala | 2 +- .../main/scala/org/apache/spark/scheduler/Task.scala | 2 +- .../org/apache/spark/scheduler/TaskDescription.scala | 2 +- .../spark/storage/ShuffleBlockFetcherIterator.scala | 2 +- .../spark/util/random/StratifiedSamplingUtils.scala | 6 +++--- .../streaming/flume/FlumePollingInputDStream.scala | 2 +- .../spark/streaming/kafka/KafkaInputDStream.scala | 2 +- .../spark/graphx/impl/VertexPartitionBase.scala | 2 +- .../spark/graphx/impl/VertexPartitionBaseOps.scala | 2 +- .../org/apache/spark/ml/recommendation/ALS.scala | 2 +- .../spark/ml/tree/impl/DecisionTreeMetadata.scala | 2 +- .../scala/org/apache/spark/ml/util/ReadWrite.scala | 2 +- .../spark/mllib/evaluation/RankingMetrics.scala | 2 +- .../binary/BinaryClassificationMetricComputers.scala | 2 +- .../apache/spark/mllib/fpm/AssociationRules.scala | 4 ++-- .../scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 6 +++--- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++++----- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 +- .../org/apache/spark/mllib/rdd/SlidingRDD.scala | 4 ++-- .../org/apache/spark/sql/internal/CatalogImpl.scala | 12 ++++++------ .../spark/sql/internal/VariableSubstitution.scala | 2 +- .../scala/org/apache/spark/sql/hive/hiveUDFs.scala | 2 +- .../apache/spark/sql/hive/hiveWriterContainers.scala | 2 +- .../scala/org/apache/spark/streaming/StateSpec.scala | 2 +- 27 files changed, 47 insertions(+), 42 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index ab8582f3e44cf..01d203685f407 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -399,7 +399,12 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero {{{ sp > p }}} + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero + * + * {{{ + * sp > p + * }}} + * * would trigger sparse representation of registers, which may reduce the memory consumption * and increase accuracy when the cardinality is small. * diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala index 99f20da2d66aa..0e980b1089221 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala @@ -20,7 +20,7 @@ package org.apache.spark.rpc.netty import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv} /** - * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an [[RpcEndpoint]] exists. + * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an `RpcEndpoint` exists. * * This is used when setting up a remote endpoint reference. 
*/ diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala index 1e7c63af2e797..d19353f2a9930 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala @@ -42,7 +42,7 @@ import org.apache.spark.rdd.RDD * @param outputId index of the task in this job (a job can launch tasks on only a subset of the * input RDD's partitions). * @param localProperties copy of thread-local properties set by the user on the driver side. - * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side. + * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side. * * The parameters below are optional: * @param jobId id of the job this task belongs to diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala index 66d6790e168f2..31011de85bf7e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala @@ -42,7 +42,7 @@ import org.apache.spark.shuffle.ShuffleWriter * the type should be (RDD[_], ShuffleDependency[_, _, _]). * @param partition partition of the RDD this task is associated with * @param locs preferred task execution locations for locality scheduling - * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side. + * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side. * @param localProperties copy of thread-local properties set by the user on the driver side. * * The parameters below are optional: diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index d39651a722325..1554200aeaf64 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -46,7 +46,7 @@ import org.apache.spark.util._ * @param stageId id of the stage this task belongs to * @param stageAttemptId attempt id of the stage this task belongs to * @param partitionId index of the number in the RDD - * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side. + * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side. * @param localProperties copy of thread-local properties set by the user on the driver side. * * The parameters below are optional: diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 1c7c81c488c3a..45c742cbff5e7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -23,7 +23,7 @@ import org.apache.spark.util.SerializableBuffer /** * Description of a task that gets passed onto executors to be executed, usually created by - * [[TaskSetManager.resourceOffer]]. + * `TaskSetManager.resourceOffer`. 
*/ private[spark] class TaskDescription( val taskId: Long, diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 4dc2f362329a0..a94cf0fbb10ef 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -247,7 +247,7 @@ final class ShuffleBlockFetcherIterator( /** * Fetch the local blocks while we are fetching remote blocks. This is ok because - * [[ManagedBuffer]]'s memory is allocated lazily when we create the input stream, so all we + * `ManagedBuffer`'s memory is allocated lazily when we create the input stream, so all we * track in-memory are the ManagedBuffer references themselves. */ private[this] def fetchLocalBlocks() { diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala index 83547072a08b2..debca177155cd 100644 --- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala @@ -35,8 +35,8 @@ import org.apache.spark.rdd.RDD * high probability. This is achieved by maintaining a waitlist of size O(log(s)), where s is the * desired sample size for each stratum. * - * Like in simple random sampling, we generate a random value for each item from the - * uniform distribution [0.0, 1.0]. All items with values <= min(values of items in the waitlist) + * Like in simple random sampling, we generate a random value for each item from the uniform + * distribution [0.0, 1.0]. All items with values <= min(values of items in the waitlist) * are accepted into the sample instantly. The threshold for instant accept is designed so that * s - numAccepted = O(sqrt(s)), where s is again the desired sample size. Thus, by maintaining a * waitlist size = O(sqrt(s)), we will be able to create a sample of the exact size s by adding @@ -165,7 +165,7 @@ private[spark] object StratifiedSamplingUtils extends Logging { * Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted), which * means we need to sort the elements in the waitlist by their associated values in order to find * the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize. - * Note that all elements in the waitlist have values >= bound for instant accept, so a T value + * Note that all elements in the waitlist have values >= bound for instant accept, so a T value * in the waitlist range would allow all elements that were instantly accepted on the first pass * to be included in the sample. 
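These utilities back the exact stratified sampling entry point on pair RDDs; a minimal sketch, assuming a `SparkContext` named `sc`:

{{{
// Each key is sampled at its own fraction; sampleByKeyExact relies on the
// waitlist/threshold machinery described above to hit the exact sample sizes.
val data = sc.parallelize(Seq(("a", 1), ("a", 2), ("a", 3), ("b", 4), ("b", 5)))
val fractions = Map("a" -> 0.5, "b" -> 1.0)
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions, seed = 42L)
}}}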
*/ diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala index 54565840fa665..d84e289272c62 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala @@ -36,7 +36,7 @@ import org.apache.spark.streaming.flume.sink._ import org.apache.spark.streaming.receiver.Receiver /** - * A [[ReceiverInputDStream]] that can be used to read data from several Flume agents running + * A `ReceiverInputDStream` that can be used to read data from several Flume agents running * [[org.apache.spark.streaming.flume.sink.SparkSink]]s. * @param _ssc Streaming context that will execute this input stream * @param addresses List of addresses at which SparkSinks are listening diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala index 3713bda41b8ee..fffb920e97a53 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala @@ -38,7 +38,7 @@ import org.apache.spark.util.ThreadUtils * * @param kafkaParams Map of kafka configuration parameters. * See: http://kafka.apache.org/configuration.html - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed * in its own thread. * @param storageLevel RDD storage level. */ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala index 8d608c99b1a1d..8da46db98be81 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala @@ -57,7 +57,7 @@ private[graphx] object VertexPartitionBase { * concrete implementation. [[VertexPartitionBaseOps]] provides a variety of operations for * VertexPartitionBase and subclasses that provide implicit evidence of membership in the * `VertexPartitionBaseOpsConstructor` typeclass (for example, - * [[VertexPartition.VertexPartitionOpsConstructor]]). + * `VertexPartition.VertexPartitionOpsConstructor`). */ private[graphx] abstract class VertexPartitionBase[@specialized(Long, Int, Double) VD: ClassTag] extends Serializable { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala index 43594573cf013..a8ed59b09bbb7 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala @@ -29,7 +29,7 @@ import org.apache.spark.util.collection.BitSet /** * A class containing additional operations for subclasses of VertexPartitionBase that provide * implicit evidence of membership in the `VertexPartitionBaseOpsConstructor` typeclass (for - * example, [[VertexPartition.VertexPartitionOpsConstructor]]). + * example, `VertexPartition.VertexPartitionOpsConstructor`). 
*/ private[graphx] abstract class VertexPartitionBaseOps [VD: ClassTag, Self[X] <: VertexPartitionBase[X]: VertexPartitionBaseOpsConstructor] diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 6d2c59a905ec7..c6de6eb02774b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -878,7 +878,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { } /** - * Builder for [[RatingBlock]]. [[mutable.ArrayBuilder]] is used to avoid boxing/unboxing. + * Builder for [[RatingBlock]]. `mutable.ArrayBuilder` is used to avoid boxing/unboxing. */ private[recommendation] class RatingBlockBuilder[@specialized(Int, Long) ID: ClassTag] extends Serializable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index 442f52bf0231d..9d7a3bd07abd3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -35,7 +35,7 @@ import org.apache.spark.rdd.RDD * @param numClasses For classification: labels can take values {0, ..., numClasses - 1}. * For regression: fixed at 0 (no meaning). * @param maxBins Maximum number of bins, for all features. - * @param featureArity Map: categorical feature index --> arity. + * @param featureArity Map: categorical feature index --> arity. * I.e., the feature takes values in {0, ..., arity - 1}. * @param numBins Number of bins for each feature. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 343a70c5d7a46..b0759dca718a9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -345,7 +345,7 @@ private[ml] object DefaultParamsReader { /** * All info from metadata file. * - * @param params paramMap, as a [[JValue]] + * @param params paramMap, as a `JValue` * @param metadata All metadata, including the other fields * @param metadataJson Full metadata file String (for debugging) */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index e29b51c3a19da..3d274d68f1180 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -30,7 +30,7 @@ import org.apache.spark.rdd.RDD /** * Evaluator for ranking algorithms. * - * Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance. + * Java users should use `RankingMetrics$.of` to create a [[RankingMetrics]] instance. * * @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs. 
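A minimal sketch of building `RankingMetrics` from such (predicted ranking, ground truth set) pairs, assuming a `SparkContext` named `sc`:

{{{
import org.apache.spark.mllib.evaluation.RankingMetrics

// Each element pairs a predicted ranking with the set of relevant ids.
val predictionAndLabels = sc.parallelize(Seq(
  (Array(1, 2, 3, 4, 5), Array(1, 3, 6)),
  (Array(4, 1, 2), Array(2, 4))))
val metrics = new RankingMetrics(predictionAndLabels)
val precisionAt5 = metrics.precisionAt(5)
val map = metrics.meanAveragePrecision
}}}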
*/ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala index be3319d60ce25..5a4c6aef50b7b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala @@ -62,7 +62,7 @@ private[evaluation] object Recall extends BinaryClassificationMetricComputer { * F-Measure. Defined as 0 if both precision and recall are 0. EG in the case that all examples * are false positives. * @param beta the beta constant in F-Measure - * @see http://en.wikipedia.org/wiki/F1_score + * @see F1 score (Wikipedia) */ private[evaluation] case class FMeasure(beta: Double) extends BinaryClassificationMetricComputer { private val beta2 = beta * beta diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 3c26d2670841b..dca031477d3b7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -28,7 +28,7 @@ import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset import org.apache.spark.rdd.RDD /** - * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates + * Generates association rules from a `RDD[FreqItemset[Item]]`. This method only generates * association rules which have a single item as the consequent. * */ @@ -56,7 +56,7 @@ class AssociationRules private[fpm] ( /** * Computes the association rules with confidence above [[minConfidence]]. * @param freqItemsets frequent itemset model obtained from [[FPGrowth]] - * @return a [[Set[Rule[Item]]] containing the association rules. + * @return a `Set[Rule[Item]]` containing the association rules. * */ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index b53386012280d..5f5b3a497b9ad 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -44,7 +44,7 @@ import org.apache.spark.storage.StorageLevel /** * Model trained by [[FPGrowth]], which holds frequent itemsets. - * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]] + * @param freqItemsets frequent itemset, which is an RDD of `FreqItemset` * @tparam Item item type */ @Since("1.3.0") @@ -69,7 +69,7 @@ class FPGrowthModel[Item: ClassTag] @Since("1.3.0") ( * - human-readable (JSON) model metadata to path/metadata/ * - Parquet formatted data to path/data/ * - * The model may be loaded using [[FPGrowthModel.load]]. + * The model may be loaded using `FPGrowthModel.load`. * * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. @@ -309,7 +309,7 @@ object FPGrowth { /** * Frequent itemset. - * @param items items in this itemset. Java users should call [[FreqItemset#javaItems]] instead. + * @param items items in this itemset. Java users should call `FreqItemset#javaItems` instead. 
* @param freq frequency * @tparam Item item type * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index a5641672218dd..08f32ca4736bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -211,7 +211,7 @@ class PrefixSpan private ( } /** - * A Java-friendly version of [[run()]] that reads sequences from a [[JavaRDD]] and returns + * A Java-friendly version of `run()` that reads sequences from a `JavaRDD` and returns * frequent sequences in a [[PrefixSpanModel]]. * @param data ordered sequences of itemsets stored as Java Iterable of Iterables * @tparam Item item type @@ -366,13 +366,13 @@ object PrefixSpan extends Logging { * Items are represented by positive integers, and items in each itemset must be distinct and * ordered. * we use 0 as the delimiter between itemsets. - * For example, a sequence `<(12)(31)1>` is represented by `[0, 1, 2, 0, 1, 3, 0, 1, 0]`. - * The postfix of this sequence w.r.t. to prefix `<1>` is `<(_2)(13)1>`. + * For example, a sequence <(12)(31)1> is represented by `[0, 1, 2, 0, 1, 3, 0, 1, 0]`. + * The postfix of this sequence w.r.t. to prefix <1> is <(_2)(13)1>. * We may reuse the original items array `[0, 1, 2, 0, 1, 3, 0, 1, 0]` to represent the postfix, * and mark the start index of the postfix, which is `2` in this example. * So the active items in this postfix are `[2, 0, 1, 3, 0, 1, 0]`. * We also remember the start indices of partial projections, the ones that split an itemset. - * For example, another possible partial projection w.r.t. `<1>` is `<(_3)1>`. + * For example, another possible partial projection w.r.t. <1> is <(_3)1>. * We remember the start indices of partial projections, which is `[2, 5]` in this example. * This data structure makes it easier to do projections. * @@ -583,7 +583,7 @@ class PrefixSpanModel[Item] @Since("1.5.0") ( * - human-readable (JSON) model metadata to path/metadata/ * - Parquet formatted data to path/data/ * - * The model may be loaded using [[PrefixSpanModel.load]]. + * The model may be loaded using `PrefixSpanModel.load`. * * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 2a226dc341762..63ea9d3264b0f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -351,7 +351,7 @@ object Vectors { } /** - * Parses a string resulted from [[Vector.toString()]] into a [[Vector]]. + * Parses a string resulted from `Vector.toString` into a [[Vector]]. 
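A quick round trip illustrating the `Vectors.parse` behavior referred to above:

{{{
import org.apache.spark.mllib.linalg.Vectors

// toString of a dense vector looks like "[1.0,0.0,3.0]"; parse reverses it.
val v = Vectors.dense(1.0, 0.0, 3.0)
val roundTripped = Vectors.parse(v.toString)  // equal to v
}}}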
*/ @Since("1.1.0") def parse(s: String): Vector = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala index adb5e51947f6d..365b2a06110f6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala @@ -42,8 +42,8 @@ class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T] * @param windowSize the window size, must be greater than 1 * @param step step size for windows * - * @see [[org.apache.spark.mllib.rdd.RDDFunctions.sliding(Int, Int)*]] - * @see [[scala.collection.IterableLike.sliding(Int, Int)*]] + * @see `org.apache.spark.mllib.rdd.RDDFunctions.sliding(Int, Int)*` + * @see `scala.collection.IterableLike.sliding(Int, Int)*` */ private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index d3e323cb12891..822949af7d899 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types.StructType /** - * Internal implementation of the user-facing [[Catalog]]. + * Internal implementation of the user-facing `Catalog`. */ class CatalogImpl(sparkSession: SparkSession) extends Catalog { @@ -175,7 +175,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { } /** - * Get the database with the specified name. This throws an [[AnalysisException]] when no + * Get the database with the specified name. This throws an `AnalysisException` when no * [[Database]] can be found. */ override def getDatabase(dbName: String): Database = { @@ -184,7 +184,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { /** * Get the table or view with the specified name. This table can be a temporary view or a - * table/view in the current database. This throws an [[AnalysisException]] when no [[Table]] + * table/view in the current database. This throws an `AnalysisException` when no `Table` * can be found. */ override def getTable(tableName: String): Table = { @@ -193,7 +193,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { /** * Get the table or view with the specified name in the specified database. This throws an - * [[AnalysisException]] when no [[Table]] can be found. + * `AnalysisException` when no `Table` can be found. */ override def getTable(dbName: String, tableName: String): Table = { makeTable(TableIdentifier(tableName, Option(dbName))) @@ -201,7 +201,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { /** * Get the function with the specified name. This function can be a temporary function or a - * function in the current database. This throws an [[AnalysisException]] when no [[Function]] + * function in the current database. This throws an `AnalysisException` when no `Function` * can be found. */ override def getFunction(functionName: String): Function = { @@ -209,7 +209,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { } /** - * Get the function with the specified name. This returns [[None]] when no [[Function]] can be + * Get the function with the specified name. This returns `None` when no `Function` can be * found. 
*/ override def getFunction(dbName: String, functionName: String): Function = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala index 791a9cf813b6a..4e7c813be9922 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala @@ -23,7 +23,7 @@ import org.apache.spark.internal.config._ * A helper class that enables substitution using syntax like * `${var}`, `${system:var}` and `${env:var}`. * - * Variable substitution is controlled by [[SQLConf.variableSubstituteEnabled]]. + * Variable substitution is controlled by `SQLConf.variableSubstituteEnabled`. */ class VariableSubstitution(conf: SQLConf) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index 32edd4aec2865..90e86959cd0e4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -177,7 +177,7 @@ private[hive] case class HiveGenericUDF( /** * Converts a Hive Generic User Defined Table Generating Function (UDTF) to a - * [[Generator]]. Note that the semantics of Generators do not allow + * `Generator`. Note that the semantics of Generators do not allow * Generators to maintain state in between input rows. Thus UDTFs that rely on partitioning * dependent operations like calls to `close()` before producing output will not operate the same as * in Hive. However, in practice this should not affect compatibility for most sane UDTFs diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index a34e2e76f5838..0c9321068c4c1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -49,7 +49,7 @@ import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter /** * Internal helper class that saves an RDD using a Hive OutputFormat. - * It is based on [[SparkHadoopWriter]]. + * It is based on `SparkHadoopWriter`. */ private[hive] class SparkHiveWriterContainer( @transient private val jobConf: JobConf, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala index ea20105892bf0..c3b28bd516da5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala @@ -30,7 +30,7 @@ import org.apache.spark.util.ClosureCleaner * `mapWithState` operation of a * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java). - * Use [[org.apache.spark.streaming.StateSpec.function() StateSpec.function]] factory methods + * Use `org.apache.spark.streaming.StateSpec.function()` factory methods * to create instances of this class. 
* * Example in Scala: From 842a738e8966f70e3b882182a06f2d1e8d257f73 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 02:19:38 +0900 Subject: [PATCH 07/17] Fix more errors --- core/src/main/scala/org/apache/spark/Accumulator.scala | 2 +- .../org/apache/spark/rpc/netty/RpcEndpointVerifier.scala | 2 +- .../apache/spark/storage/ShuffleBlockFetcherIterator.scala | 2 +- .../spark/mllib/evaluation/BinaryClassificationMetrics.scala | 5 +++-- .../org/apache/spark/mllib/evaluation/RankingMetrics.scala | 2 +- .../scala/org/apache/spark/sql/internal/CatalogImpl.scala | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index bcf157078813d..b28e9408062b5 100644 --- a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -26,7 +26,7 @@ package org.apache.spark * * An accumulator is created from an initial value `v` by calling * [[SparkContext#accumulator SparkContext.accumulator]]. - * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]]. + * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]]. * However, they cannot read its value. Only the driver program can read the accumulator's value, * using its [[#value]] method. * diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala index 0e980b1089221..430dcc50ba711 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala @@ -35,6 +35,6 @@ private[netty] class RpcEndpointVerifier(override val rpcEnv: RpcEnv, dispatcher private[netty] object RpcEndpointVerifier { val NAME = "endpoint-verifier" - /** A message used to ask the remote [[RpcEndpointVerifier]] if an [[RpcEndpoint]] exists. */ + /** A message used to ask the remote [[RpcEndpointVerifier]] if an `RpcEndpoint` exists. */ case class CheckExistence(name: String) } diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index a94cf0fbb10ef..269c12d6da444 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -423,7 +423,7 @@ object ShuffleBlockFetcherIterator { * @param address BlockManager that the block was fetched from. * @param size estimated size of the block, used to calculate bytesInFlight. * Note that this is NOT the exact bytes. - * @param buf [[ManagedBuffer]] for the content. + * @param buf `ManagedBuffer` for the content. * @param isNetworkReqDone Is this the last network request for this host in this fetch request. 
*/ private[storage] case class SuccessFetchResult( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index a8588bf182641..9b7cd0427f5ed 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -99,7 +99,8 @@ class BinaryClassificationMetrics @Since("1.3.0") ( /** * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. - * @see Precision and recall + * @see + * Precision and recall (Wikipedia) */ @Since("1.0.0") def pr(): RDD[(Double, Double)] = { @@ -119,7 +120,7 @@ class BinaryClassificationMetrics @Since("1.3.0") ( * Returns the (threshold, F-Measure) curve. * @param beta the beta factor in F-Measure computation. * @return an RDD of (threshold, F-Measure) pairs. - * @see http://en.wikipedia.org/wiki/F1_score + * @see F1 score (Wikipedia) */ @Since("1.0.0") def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index 3d274d68f1180..cedfdbf0dc127 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -41,7 +41,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] /** * Compute the average precision of all the queries, truncated at ranking position k. * - * If for a query, the ranking algorithm returns n (n < k) results, the precision value will be + * If for a query, the ranking algorithm returns n (n < k) results, the precision value will be * computed as #(relevant items retrieved) / k. This formula also applies when the size of the * ground truth set is less than k. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 822949af7d899..6d984621ccca1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -176,7 +176,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { /** * Get the database with the specified name. This throws an `AnalysisException` when no - * [[Database]] can be found. + * `Database` can be found. 
*/ override def getDatabase(dbName: String): Database = { makeDatabase(dbName) From 366031768acc4d93a30f23151ce23c00060a322f Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 02:27:08 +0900 Subject: [PATCH 08/17] Make markdown pretty for some links --- core/src/main/scala/org/apache/spark/Accumulator.scala | 2 +- .../src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 2 +- .../org/apache/spark/mllib/tree/GradientBoostedTrees.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index b28e9408062b5..7bea636c94aa0 100644 --- a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -26,7 +26,7 @@ package org.apache.spark * * An accumulator is created from an initial value `v` by calling * [[SparkContext#accumulator SparkContext.accumulator]]. - * Tasks running on the cluster can then add to it using the `+=` operator in [[Accumulable]]. + * Tasks running on the cluster can then add to it using the `+=` operator. * However, they cannot read its value. Only the driver program can read the accumulator's value, * using its [[#value]] method. * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index 5f5b3a497b9ad..e3cf0d4979ed4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -309,7 +309,7 @@ object FPGrowth { /** * Frequent itemset. - * @param items items in this itemset. Java users should call `FreqItemset#javaItems` instead. + * @param items items in this itemset. Java users should call `FreqItemset.javaItems` instead. * @param freq frequency * @tparam Item item type * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index a7017f0339101..3e85678906b33 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -74,7 +74,7 @@ class GradientBoostedTrees private[spark] ( } /** - * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#run`. + * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees.run`. */ @Since("1.2.0") def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { @@ -106,7 +106,7 @@ class GradientBoostedTrees private[spark] ( } /** - * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation`. + * Java-friendly API for `org.apache.spark.mllib.tree.GradientBoostedTrees.runWithValidation`. 
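A minimal sketch of the `run`/`runWithValidation` API documented above, assuming `trainingData` and `validationData` are RDDs of `LabeledPoint`:

{{{
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy

val boostingStrategy = BoostingStrategy.defaultParams("Regression")
boostingStrategy.numIterations = 10

// Train with a held-out validation set to allow early stopping.
val model = new GradientBoostedTrees(boostingStrategy)
  .runWithValidation(trainingData, validationData)
}}}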
*/ @Since("1.4.0") def runWithValidation( From 22bfb6854a7e0ecfc9b7d9ffe8c49715484eb3a1 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 03:01:38 +0900 Subject: [PATCH 09/17] Keep original style without new indentations --- .../main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 01d203685f407..b31d21c354d11 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -399,12 +399,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero - * - * {{{ - * sp > p - * }}} - * + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (sp > p) * would trigger sparse representation of registers, which may reduce the memory consumption * and increase accuracy when the cardinality is small. * From 73fcd355a565c5ea433b1f8ca11e08ee6c3f2a9e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 26 Nov 2016 12:44:12 +0900 Subject: [PATCH 10/17] Use throws annotation --- .../scala/org/apache/spark/SparkConf.scala | 18 ++++++++++++------ .../scala/org/apache/spark/SparkContext.scala | 6 ++++-- .../scala/org/apache/spark/util/Utils.scala | 3 ++- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 3f5b19eda2cc3..d7df031c856c0 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -262,8 +262,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a time parameter as seconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then seconds are assumed. - * @note Throws `NoSuchElementException` + * @throws java.util.NoSuchElementException */ + @throws(classOf[NoSuchElementException]) def getTimeAsSeconds(key: String): Long = { Utils.timeStringAsSeconds(get(key)) } @@ -279,8 +280,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then milliseconds are assumed. - * @note Throws `NoSuchElementException` + * @throws java.util.NoSuchElementException */ + @throws(classOf[NoSuchElementException]) def getTimeAsMs(key: String): Long = { Utils.timeStringAsMs(get(key)) } @@ -296,8 +298,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then bytes are assumed. - * @note Throws `NoSuchElementException` + * @throws java.util.NoSuchElementException */ + @throws(classOf[NoSuchElementException]) def getSizeAsBytes(key: String): Long = { Utils.byteStringAsBytes(get(key)) } @@ -320,8 +323,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Kibibytes are assumed. 
- * @note Throws `NoSuchElementException` + * @throws java.util.NoSuchElementException */ + @throws(classOf[NoSuchElementException]) def getSizeAsKb(key: String): Long = { Utils.byteStringAsKb(get(key)) } @@ -337,8 +341,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Mebibytes are assumed. - * @note Throws `NoSuchElementException` + * @throws java.util.NoSuchElementException */ + @throws(classOf[NoSuchElementException]) def getSizeAsMb(key: String): Long = { Utils.byteStringAsMb(get(key)) } @@ -354,8 +359,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Gibibytes are assumed. - * @note Throws `NoSuchElementException` + * @throws java.util.NoSuchElementException */ + @throws(classOf[NoSuchElementException]) def getSizeAsGb(key: String): Long = { Utils.byteStringAsGb(get(key)) } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 872c46ab689e1..bd1f1683cf82c 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2061,8 +2061,9 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel a given job if it's scheduled or running. * * @param jobId the job ID to cancel - * @note Throws `InterruptedException` if the cancel message cannot be sent + * @throws InterruptedException if the cancel message cannot be sent */ + @throws(classOf[InterruptedException]) def cancelJob(jobId: Int) { dagScheduler.cancelJob(jobId) } @@ -2071,8 +2072,9 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel a given stage and all jobs associated with it. * * @param stageId the stage ID to cancel - * @note Throws `InterruptedException` if the cancel message cannot be sent + * @throws InterruptedException if the cancel message cannot be sent */ + @throws(classOf[InterruptedException]) def cancelStage(stageId: Int) { dagScheduler.cancelStage(stageId) } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index f051860a23b65..252f754e0e36e 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2354,8 +2354,9 @@ private[spark] object Utils extends Logging { * A spark url (`spark://host:port`) is a special URI that its scheme is `spark` and only contains * host and port. * - * @note Throws `SparkException` if sparkUrl is invalid. + * @throws org.apache.spark.SparkException if sparkUrl is invalid. 
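For the `SparkConf` time/size getters annotated with `@throws` above, a small sketch of the suffix handling:

{{{
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.network.timeout", "120s")
  .set("spark.driver.maxResultSize", "1g")

// Suffixes are parsed; asking for a key that is not set throws NoSuchElementException.
val timeoutSec = conf.getTimeAsSeconds("spark.network.timeout")   // 120
val maxResultKb = conf.getSizeAsKb("spark.driver.maxResultSize")  // 1048576
}}}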
*/ + @throws(classOf[SparkException]) def extractHostPortFromSparkUrl(sparkUrl: String): (String, Int) = { try { val uri = new java.net.URI(sparkUrl) From 246bef3ce61f7fe74a97577e8cfe4a3a1531f1ff Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 27 Nov 2016 01:58:49 +0900 Subject: [PATCH 11/17] Print < and > properly within this PR --- .../apache/spark/rdd/DoubleRDDFunctions.scala | 6 ++- .../apache/spark/rdd/PairRDDFunctions.scala | 8 ++-- .../spark/scheduler/InputFormatInfo.scala | 4 +- .../spark/storage/BlockManagerMessages.scala | 2 +- .../spark/util/random/SamplingUtils.scala | 19 ++++++-- .../util/random/StratifiedSamplingUtils.scala | 32 ++++++++----- .../spark/streaming/kafka/KafkaCluster.scala | 20 ++++++-- .../spark/streaming/kafka/KafkaUtils.scala | 10 ++-- .../org/apache/spark/ml/clustering/LDA.scala | 4 +- .../apache/spark/ml/feature/Bucketizer.scala | 2 +- .../spark/ml/feature/CountVectorizer.scala | 5 +- .../apache/spark/ml/feature/HashingTF.scala | 2 +- .../org/apache/spark/ml/feature/NGram.scala | 2 +- .../apache/spark/ml/feature/Normalizer.scala | 2 +- .../ml/feature/PolynomialExpansion.scala | 3 +- .../ml/feature/QuantileDiscretizer.scala | 2 +- .../spark/ml/feature/SQLTransformer.scala | 4 +- .../apache/spark/ml/feature/Tokenizer.scala | 2 +- .../spark/ml/feature/VectorIndexer.scala | 7 +-- .../org/apache/spark/ml/param/params.scala | 48 +++++++++++++++---- .../ml/regression/AFTSurvivalRegression.scala | 2 +- .../GeneralizedLinearRegression.scala | 10 ++-- .../ml/regression/LinearRegression.scala | 9 ++-- .../ml/tree/impl/DecisionTreeMetadata.scala | 2 +- .../apache/spark/ml/util/MetadataUtils.scala | 2 +- .../spark/mllib/clustering/LDAOptimizer.scala | 5 +- .../mllib/evaluation/RankingMetrics.scala | 6 +-- .../linalg/EigenValueDecomposition.scala | 5 +- .../spark/mllib/random/RandomRDDs.scala | 8 ++-- .../spark/mllib/tree/DecisionTree.scala | 6 +-- .../spark/mllib/tree/RandomForest.scala | 19 ++++---- .../tree/configuration/BoostingStrategy.scala | 6 +-- .../mllib/tree/configuration/Strategy.scala | 2 +- .../spark/sql/DataFrameStatFunctions.scala | 6 ++- .../org/apache/spark/sql/RuntimeConfig.scala | 3 +- .../org/apache/spark/sql/SQLContext.scala | 2 +- .../org/apache/spark/sql/functions.scala | 26 ++++++---- .../apache/spark/sql/sources/interfaces.scala | 7 ++- 38 files changed, 205 insertions(+), 105 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index f4bc3e3021447..5ebddf8b41a47 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -152,10 +152,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { /** * Compute a histogram using the provided buckets. The buckets are all open - * to the right except for the last which is closed + * to the right except for the last which is closed. + * {{{ * e.g. for the array * [1, 10, 20, 50] the buckets are [1, 10) [10, 20) [20, 50] - * e.g 1<=x<10 , 10<=x<20, 20<=x<=50 + * e.g 1<=x<10 , 10<=x<20, 20<=x<=50 + * }}} * And on the input of 1 and 50 we would have a histogram of 1, 0, 1 * * @note If your histogram is evenly spaced (e.g. 
[0, 10, 20, 30]) this can be switched diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index b31d21c354d11..aad99e3eb2c5b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -399,9 +399,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (sp > p) - * would trigger sparse representation of registers, which may reduce the memory consumption - * and increase accuracy when the cardinality is small. + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (`sp` is + * greater than `p`) would trigger sparse representation of registers, which may reduce the + * memory consumption and increase accuracy when the cardinality is small. * * @param p The precision value for the normal set. * `p` must be a value between 4 and `sp` if `sp` is not zero (32 max). @@ -908,7 +908,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Return an RDD with the pairs from `this` whose keys are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be less than or equal to us. */ def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)] = self.withScope { subtractByKey(other, self.partitioner.getOrElse(new HashPartitioner(self.partitions.length))) diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index 5f23d657e1155..a80e45e2c3c25 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -151,9 +151,10 @@ object InputFormatInfo { Computes the preferred locations based on input(s) and returned a location to block map. Typical use of this method for allocation would follow some algo like this: + {{{ a) For each host, count number of splits hosted on that host. b) Decrement the currently allocated containers on that host. - c) Compute rack info for each host and update rack -> count map based on (b). + c) Compute rack info for each host and update rack -> count map based on (b). d) Allocate nodes based on (c) e) On the allocation result, ensure that we don't allocate "too many" jobs on a single node (even if data locality on that is very high) : this is to prevent fragility of job if a @@ -162,6 +163,7 @@ object InputFormatInfo { go to (a) until required nodes are allocated. If a node 'dies', follow same procedure. + }}} PS: I know the wording here is weird, hopefully it makes some sense ! */ diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala index ce82e43b2d58b..d71acbb4cf771 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala @@ -43,7 +43,7 @@ private[spark] object BlockManagerMessages { extends ToBlockManagerSlave /** - * Driver -> Executor message to trigger a thread dump. + * Driver to Executor message to trigger a thread dump. 
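A small sketch of the bucketed `histogram` described above, assuming a `SparkContext` named `sc`; the buckets are [1, 10), [10, 20) and the closed last bucket [20, 50]:

{{{
val values = sc.parallelize(Seq(1.0, 9.9, 10.0, 20.0, 50.0))
val counts = values.histogram(Array(1.0, 10.0, 20.0, 50.0))
// counts: Array(2, 1, 2) -- 1.0 and 9.9; 10.0; 20.0 and 50.0 (last bucket is closed)
}}}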
*/ case object TriggerThreadDump extends ToBlockManagerSlave diff --git a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala index 1099747444be5..cc467cabcd037 100644 --- a/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala @@ -67,10 +67,12 @@ private[spark] object SamplingUtils { } /** - * Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of - * the time. + * Returns a sampling rate that guarantees a sample of size greater than or equal to + * sampleSizeLowerBound 99.99% of the time. * * How the sampling rate is determined: + * + * {{{ * Let p = num / total, where num is the sample size and total is the total number of * datapoints in the RDD. We're trying to compute q > p such that * - when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q), where @@ -81,6 +83,7 @@ private[spark] object SamplingUtils { * - when sampling without replacement, we're drawing each datapoint with prob_i * ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success * rate, where success rate is defined the same as in sampling with replacement. + * }}} * * The smallest sampling rate supported is 1e-10 (in order to avoid running into the limit of the * RNG's resolution). @@ -108,14 +111,22 @@ private[spark] object SamplingUtils { private[spark] object PoissonBounds { /** - * Returns a lambda such that Pr[X > s] is very small, where X ~ Pois(lambda). + * Returns a lambda such that + * {{{ + * Pr[X > s] + * }}} + * is very small, where X ~ Pois(lambda). */ def getLowerBound(s: Double): Double = { math.max(s - numStd(s) * math.sqrt(s), 1e-15) } /** - * Returns a lambda such that Pr[X < s] is very small, where X ~ Pois(lambda). + * Returns a lambda such that + * {{{ + * Pr[X < s] + * }}} + * is very small, where X ~ Pois(lambda). * * @param s sample size */ diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala index debca177155cd..ab4b125d2ac34 100644 --- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala @@ -36,12 +36,13 @@ import org.apache.spark.rdd.RDD * desired sample size for each stratum. * * Like in simple random sampling, we generate a random value for each item from the uniform - * distribution [0.0, 1.0]. All items with values <= min(values of items in the waitlist) - * are accepted into the sample instantly. The threshold for instant accept is designed so that - * s - numAccepted = O(sqrt(s)), where s is again the desired sample size. Thus, by maintaining a - * waitlist size = O(sqrt(s)), we will be able to create a sample of the exact size s by adding - * a portion of the waitlist to the set of items that are instantly accepted. The exact threshold - * is computed by sorting the values in the waitlist and picking the value at (s - numAccepted). + * distribution [0.0, 1.0]. All items with values less than or equal to min(values of items in the + * waitlist) are accepted into the sample instantly. The threshold for instant accept is designed + * so that s - numAccepted = O(sqrt(s)), where s is again the desired sample size. 
Thus, by + * maintaining a waitlist size = O(sqrt(s)), we will be able to create a sample of the exact size + * s by adding a portion of the waitlist to the set of items that are instantly accepted. The exact + * threshold is computed by sorting the values in the waitlist and picking the value at + * (s - numAccepted). * * Note that since we use the same seed for the RNG when computing the thresholds and the actual * sample, our computed thresholds are guaranteed to produce the desired sample size. @@ -162,12 +163,19 @@ private[spark] object StratifiedSamplingUtils extends Logging { * it to the number of items that were accepted instantly and the number of items in the waitlist * for that stratum. * - * Most of the time, numAccepted <= sampleSize <= (numAccepted + numWaitlisted), which - * means we need to sort the elements in the waitlist by their associated values in order to find - * the value T s.t. |{elements in the stratum whose associated values <= T}| = sampleSize. - * Note that all elements in the waitlist have values >= bound for instant accept, so a T value - * in the waitlist range would allow all elements that were instantly accepted on the first pass - * to be included in the sample. + * Most of the time, + * {{{ + * numAccepted <= sampleSize <= (numAccepted + numWaitlisted) + * }}}, + * which means we need to sort the elements in the waitlist by their associated values in order + * to find the value T s.t. + * {{{ + * |{elements in the stratum whose associated values <= T}| = sampleSize + * }}}. + * + * Note that all elements in the waitlist have values greater than or equal to bound for instant + * accept, so a T value in the waitlist range would allow all elements that were instantly + * accepted on the first pass to be included in the sample. */ def computeThresholdByKey[K](finalResult: Map[K, AcceptanceResult], fractions: Map[K, Double]): Map[K, Double] = { diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala index c419221aa607a..a27490aad0f60 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala @@ -231,7 +231,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { // this 0 here indicates api version, in this case the original ZK backed api. private def defaultConsumerApiVersion: Short = 0 - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** + * Requires Kafka higher than or equal to 0.8.1.1. + * Defaults to the original ZooKeeper backed api version. + */ def getConsumerOffsets( groupId: String, topicAndPartitions: Set[TopicAndPartition] @@ -250,7 +253,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { } } - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** + * Requires Kafka higher than or equal to 0.8.1.1. + * Defaults to the original ZooKeeper backed api version. + */ def getConsumerOffsetMetadata( groupId: String, topicAndPartitions: Set[TopicAndPartition] @@ -287,7 +293,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { Left(errs) } - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** + * Requires Kafka higher than or equal to 0.8.1.1. 
+ * Defaults to the original ZooKeeper backed api version. + */ def setConsumerOffsets( groupId: String, offsets: Map[TopicAndPartition, Long] @@ -305,7 +314,10 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable { setConsumerOffsetMetadata(groupId, meta, consumerApiVersion) } - /** Requires Kafka >= 0.8.1.1. Defaults to the original ZooKeeper backed api version. */ + /** + * Requires Kafka higher than or equal to 0.8.1.1. + * Defaults to the original ZooKeeper backed api version. + */ def setConsumerOffsetMetadata( groupId: String, metadata: Map[TopicAndPartition, OffsetAndMetadata] diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 437c797e55605..41d3b728110ed 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -47,7 +47,7 @@ object KafkaUtils { * @param ssc StreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) * @param groupId The group id for this consumer - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name and numPartitions) to consume. Each partition is consumed * in its own thread * @param storageLevel Storage level to use for storing the received objects * (default: StorageLevel.MEMORY_AND_DISK_SER_2) @@ -72,7 +72,7 @@ object KafkaUtils { * @param ssc StreamingContext object * @param kafkaParams Map of kafka configuration parameters, * see http://kafka.apache.org/08/configuration.html - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is + * @param topics Map of (topic_name and numPartitions) to consume. Each partition is * consumed in its own thread. * @param storageLevel Storage level to use for storing the received objects * @tparam K type of Kafka message key @@ -97,7 +97,7 @@ object KafkaUtils { * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) * @param groupId The group id for this consumer - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name and numPartitions) to consume. Each partition is consumed * in its own thread * @return DStream of (Kafka message key, Kafka message value) */ @@ -115,7 +115,7 @@ object KafkaUtils { * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..). * @param groupId The group id for this consumer. - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name and numPartitions) to consume. Each partition is consumed * in its own thread. * @param storageLevel RDD storage level. * @return DStream of (Kafka message key, Kafka message value) @@ -140,7 +140,7 @@ object KafkaUtils { * @param valueDecoderClass Type of kafka value decoder * @param kafkaParams Map of kafka configuration parameters, * see http://kafka.apache.org/08/configuration.html - * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed + * @param topics Map of (topic_name and numPartitions) to consume. Each partition is consumed * in its own thread * @param storageLevel RDD storage level. 
* @tparam K type of Kafka message key diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 6032ab3db9350..865615ef4dc98 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -120,11 +120,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM * * Optimizer-specific parameter settings: * - EM - * - Value should be > 1.0 + * - Value should be greater than 1.0 * - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows * Asuncion et al. (2009), who recommend a +1 adjustment for EM. * - Online - * - Value should be >= 0 + * - Value should be greater than or equal to 0 * - default = (1.0 / k), following the implementation from * here. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 546643d8f91f7..260159f8b7ac4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -44,7 +44,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String /** * Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets. * A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which - * also includes y. Splits should be of length >= 3 and strictly increasing. + * also includes y. Splits should be of length greater than or equal to 3 and strictly increasing. * Values at -inf, inf must be explicitly provided to cover all Double values; * otherwise, values outside the splits specified will be treated as errors. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index cc0924fd95b08..773f5daf70a8d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -53,8 +53,9 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** * Specifies the minimum number of different documents a term must appear in to be included * in the vocabulary. - * If this is an integer >= 1, this specifies the number of documents the term must appear in; - * if this is a double in [0,1), then this specifies the fraction of documents. + * If this is an integer greater than or equal to 1, this specifies the number of documents + * the term must appear in; if this is a double in [0,1), then this specifies the fraction + * of documents. * * Default: 1.0 * @group param diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 8f60ec8788fac..db432b6fefaff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -52,7 +52,7 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Number of features. Should be > 0. + * Number of features. Should be greater than 0. 
* (default = 2^18^) * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index c424aaa1f5634..c8760f9dc178f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -41,7 +41,7 @@ class NGram @Since("1.5.0") (@Since("1.5.0") override val uid: String) def this() = this(Identifiable.randomUID("ngram")) /** - * Minimum n-gram length, >= 1. + * Minimum n-gram length, greater than or equal to 1. * Default: 2, bigram features * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala index 629702051d426..6e96545c8cb7a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala @@ -37,7 +37,7 @@ class Normalizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) def this() = this(Identifiable.randomUID("normalizer")) /** - * Normalization in L^p^ space. Must be >= 1. + * Normalization in L^p^ space. Must be greater than equal to 1. * (default: p = 2) * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 74526a8260a0d..292f9496a456c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -45,7 +45,8 @@ class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: Str def this() = this(Identifiable.randomUID("poly")) /** - * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion. + * The polynomial degree to expand, which should be greater than equal to 1. A value of 1 means + * no expansion. * Default: 2 * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index a1cceef51d30a..d8f33cd768dcd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -35,7 +35,7 @@ private[feature] trait QuantileDiscretizerBase extends Params /** * Number of buckets (quantiles, or categories) into which data points are grouped. Must - * be >= 2. + * be greater than or equal to 2. * * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala index c4398bebf9be6..a82aba10b5b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala @@ -32,9 +32,11 @@ import org.apache.spark.sql.types.StructType * the output, it can be any select clause that Spark SQL supports. Users can also * use Spark SQL built-in function and UDFs to operate on these selected columns. 
* For example, [[SQLTransformer]] supports statements like: + * {{{ * - SELECT a, a + b AS a_b FROM __THIS__ - * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 + * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 * - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b + * }}} */ @Since("1.6.0") class SQLTransformer @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Transformer diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index bf2b1d7c0f777..cfaf6c0e610b3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -70,7 +70,7 @@ class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) def this() = this(Identifiable.randomUID("regexTok")) /** - * Minimum token length, >= 0. + * Minimum token length, greater than or equal to 0. * Default: 1, to avoid returning empty strings * @group param */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 0ae9f264f4a8a..16bcc22f4a631 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -41,8 +41,8 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu /** * Threshold for the number of values a categorical feature can take. - * If a feature is found to have > maxCategories values, then it is declared continuous. - * Must be >= 2. + * If a feature is found to have greater than maxCategories values, then it is declared + * continuous. Must be greater than or equal to 2. * * (default = 20) * @group param @@ -76,7 +76,8 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu * - Warning: This can cause problems if features are continuous since this will collect ALL * unique values to the driver. * - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}. - * If maxCategories >= 3, then both features will be declared categorical. + * If maxCategories greater than or equal to 3, then both features will be declared + * categorical. * * This returns a model which can transform categorical features to use 0-based indices. 
* diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 4850a9e43f91c..7481f1d57c956 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -165,32 +165,64 @@ object ParamValidators { s" of unexpected input type: ${value.getClass}") } - /** Check if value > lowerBound */ + /** + * Check if + * {{{ + * value > lowerBound + * }}} + */ def gt[T](lowerBound: Double): T => Boolean = { (value: T) => getDouble(value) > lowerBound } - /** Check if value >= lowerBound */ + /** + * Check if + * {{{ + * value >= lowerBound + * }}} + */ def gtEq[T](lowerBound: Double): T => Boolean = { (value: T) => getDouble(value) >= lowerBound } - /** Check if value < upperBound */ + /** + * Check if + * {{{ + * value < upperBound + * }}} + */ def lt[T](upperBound: Double): T => Boolean = { (value: T) => getDouble(value) < upperBound } - /** Check if value <= upperBound */ + /** + * Check if + * {{{ + * value <= upperBound + * }}} + */ def ltEq[T](upperBound: Double): T => Boolean = { (value: T) => getDouble(value) <= upperBound } /** * Check for value in range lowerBound to upperBound. - * @param lowerInclusive If true, check for value >= lowerBound. - * If false, check for value > lowerBound. - * @param upperInclusive If true, check for value <= upperBound. - * If false, check for value < upperBound. + * @param lowerInclusive If true, check for + * {{{ + * value >= lowerBound + * }}} + * If false, check for + * {{{ + * value > lowerBound + * }}} + * @param upperInclusive If true, check for + * {{{ + * value <= upperBound. + * }}} + * If false, check for + * {{{ + * value < upperBound. + * }}} */ def inRange[T]( lowerBound: Double, diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index ede859db19d5f..af68e7b9d5809 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -185,7 +185,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S setDefault(tol -> 1E-6) /** - * Suggested depth for treeAggregate (>= 2). + * Suggested depth for treeAggregate (greater than or equal to 2). * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 440eacd13fd32..13b80bdbe8ec7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -131,10 +131,12 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam * It supports "gaussian", "binomial", "poisson" and "gamma" as family. * Valid link functions for each family is listed below. The first link function of each family * is the default one. 
- * - "gaussian" -> "identity", "log", "inverse" - * - "binomial" -> "logit", "probit", "cloglog" - * - "poisson" -> "log", "identity", "sqrt" - * - "gamma" -> "inverse", "identity", "log" + * {{{ + * - "gaussian" -> "identity", "log", "inverse" + * - "binomial" -> "logit", "probit", "cloglog" + * - "poisson" -> "log", "identity", "sqrt" + * - "gamma" -> "inverse", "identity", "log" + * }}} */ @Experimental @Since("2.0.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 556e48a604ea7..9ba664ea9b186 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -118,8 +118,11 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String /** * Set the ElasticNet mixing parameter. - * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. + * {{{ + * For alpha = 0, the penalty is an L2 penalty. + * For alpha = 1, it is an L1 penalty. + * For 0 < alpha < 1, the penalty is a combination of L1 and L2. + * }}} * Default is 0.0 which is an L2 penalty. * * @group setParam @@ -181,7 +184,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String setDefault(solver -> "auto") /** - * Suggested depth for treeAggregate (>= 2). + * Suggested depth for treeAggregate (greater than or equal to 2). * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index 9d7a3bd07abd3..70926a4ecb778 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -35,7 +35,7 @@ import org.apache.spark.rdd.RDD * @param numClasses For classification: labels can take values {0, ..., numClasses - 1}. * For regression: fixed at 0 (no meaning). * @param maxBins Maximum number of bins, for all features. - * @param featureArity Map: categorical feature index --> arity. + * @param featureArity Map: categorical feature index and arity. * I.e., the feature takes values in {0, ..., arity - 1}. * @param numBins Number of bins for each feature. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala index 5e081cce0651e..e80ca6b73c292 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala @@ -48,7 +48,7 @@ private[spark] object MetadataUtils { * If a feature does not have metadata, it is assumed to be continuous. * If a feature is Nominal, then it must have the number of values * specified. - * @return Map: feature index --> number of categories. + * @return Map: feature index and number of categories. * The map's set of keys will be the set of categorical feature indices. 
*/ def getCategoricalFeatures(featuresSchema: StructField): Map[Int, Int] = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index c65fce4ef11ae..47562c377a8e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -352,7 +352,10 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * * @note This should be adjusted in synch with `LDA.setMaxIterations()` * so the entire corpus is used. Specifically, set both so that - * maxIterations * miniBatchFraction >= 1. + * + * {{{ + * maxIterations * miniBatchFraction >= 1. + * }}} * * Default: 0.05, i.e., 5% of total documents. */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index cedfdbf0dc127..b98aa0534152b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -41,9 +41,9 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] /** * Compute the average precision of all the queries, truncated at ranking position k. * - * If for a query, the ranking algorithm returns n (n < k) results, the precision value will be - * computed as #(relevant items retrieved) / k. This formula also applies when the size of the - * ground truth set is less than k. + * If for a query, the ranking algorithm returns n (n is less than k) results, the precision + * value will be computed as #(relevant items retrieved) / k. This formula also applies when + * the size of the ground truth set is less than k. * * If a query has an empty ground truth set, zero will be used as precision together with * a log warning. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala index 7a1d2577c20e0..5c55fe8b08fb2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala @@ -32,7 +32,10 @@ private[mllib] object EigenValueDecomposition { * * @param mul a function that multiplies the symmetric matrix with a DenseVector. * @param n dimension of the square matrix (maximum Int.MaxValue). - * @param k number of leading eigenvalues required, 0 < k < n. + * @param k number of leading eigenvalues required, + * {{{ + * 0 < k < n + * }}}. * @param tol tolerance of the eigs computation. * @param maxIterations the maximum number of Arnoldi update iterations. * @return a dense vector of eigenvalues in descending order and a dense matrix of eigenvectors diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala index b2e37bad3cf69..85d4d7f37f2c0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala @@ -249,8 +249,8 @@ object RandomRDDs { * shape and scale. * * @param sc SparkContext used to create the RDD. 
- * @param shape shape parameter (> 0) for the gamma distribution - * @param scale scale parameter (> 0) for the gamma distribution + * @param shape shape parameter (greater than 0) for the gamma distribution + * @param scale scale parameter (greater than 0) for the gamma distribution * @param size Size of the RDD. * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`). * @param seed Random seed (default: a random long integer). @@ -766,8 +766,8 @@ object RandomRDDs { * gamma distribution with the input shape and scale. * * @param sc SparkContext used to create the RDD. - * @param shape shape parameter (> 0) for the gamma distribution. - * @param scale scale parameter (> 0) for the gamma distribution. + * @param shape shape parameter (greater than 0) for the gamma distribution. + * @param scale scale parameter (greater than 0) for the gamma distribution. * @param numRows Number of Vectors in the RDD. * @param numCols Number of elements in each Vector. * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 95b155b037194..4ab63dab4d897 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -162,7 +162,7 @@ object DecisionTree extends Serializable with Logging { * @param numClasses Number of classes for classification. Default value of 2. * @param maxBins Maximum number of bins used for splitting features. * @param quantileCalculationStrategy Algorithm for calculating quantiles. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -$gt; k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n and k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @return DecisionTreeModel that can be used for prediction. @@ -192,7 +192,7 @@ object DecisionTree extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels should take values {0, 1, ..., numClasses-1}. * @param numClasses Number of classes for classification. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n and k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param impurity Criterion used for information gain calculation. @@ -238,7 +238,7 @@ object DecisionTree extends Serializable with Logging { * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels are real numbers. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n and k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param impurity Criterion used for information gain calculation. 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 81c1bb27ea207..74f1a6edb26c6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -53,14 +53,15 @@ import org.apache.spark.util.Utils * the type of random forest (classification or regression), feature type * (continuous, categorical), depth of the tree, quantile calculation strategy, * etc. - * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done. + * @param numTrees If 1, then no bootstrapping is used. If greater than 1, then bootstrapping is + * done. * @param featureSubsetStrategy Number of features to consider for splits at each node. * Supported values: "auto", "all", "sqrt", "log2", "onethird". * Supported numerical values: "(0.0-1.0]", "[1-n]". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees greater than 1 (forest) set to "sqrt" for + * classification and to "onethird" for regression. * If a real value "n" in the range (0, 1.0] is set, * use n * number of features. * If an integer value "n" in the range (1, num features) is set, @@ -111,7 +112,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt". + * if numTrees is greater than 1 (forest) set to "sqrt". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return RandomForestModel that can be used for prediction. */ @@ -134,7 +135,7 @@ object RandomForest extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels should take values {0, 1, ..., numClasses-1}. * @param numClasses Number of classes for classification. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n and k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param numTrees Number of trees in the random forest. @@ -142,7 +143,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt". + * if numTrees is greater than 1 (forest) set to "sqrt". * @param impurity Criterion used for information gain calculation. * Supported values: "gini" (recommended) or "entropy". * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means @@ -200,7 +201,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "onethird". + * if numTrees is greater than 1 (forest) set to "onethird". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return RandomForestModel that can be used for prediction. 
*/ @@ -222,7 +223,7 @@ object RandomForest extends Serializable with Logging { * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels are real numbers. - * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n and k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param numTrees Number of trees in the random forest. @@ -230,7 +231,7 @@ object RandomForest extends Serializable with Logging { * Supported values: "auto", "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "onethird". + * if numTrees is greater than 1 (forest) set to "onethird". * @param impurity Criterion used for information gain calculation. * The only supported value for regression is "variance". * @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 8c7222815ea7a..d26c9f9d2e228 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -36,11 +36,11 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, Loss, SquaredError} * @param validationTol validationTol is a condition which decides iteration termination when * runWithValidation is used. * The end of iteration is decided based on below logic: - * If the current loss on the validation set is > 0.01, the diff + * If the current loss on the validation set is greater than 0.01, the diff * of validation error is compared to relative tolerance which is * validationTol * (current loss on the validation set). - * If the current loss on the validation set is <= 0.01, the diff - * of validation error is compared to absolute tolerance which is + * If the current loss on the validation set is less than or euqal to 0.01, + * the diff of validation error is compared to absolute tolerance which is * validationTol * 0.01. * Ignored when * `org.apache.spark.mllib.tree.GradientBoostedTrees.run()` is used. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index b4c1e45596d51..b8450b722d82c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -45,7 +45,7 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance} * @param quantileCalculationStrategy Algorithm for calculating quantiles. Supported: * `org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort` * @param categoricalFeaturesInfo A map storing information about the categorical variables and the - * number of discrete values they take. An entry (n -> k) + * number of discrete values they take. An entry (n and k) * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @param minInstancesPerNode Minimum number of instances each child must have after split. 
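(Editorial note, not part of the patch itself.) Several of the hunks above reword the `categoricalFeaturesInfo` documentation; a minimal Scala sketch of how that map is typically supplied may help. The training data `input` and all parameter values below are illustrative assumptions, not taken from the patch:

{{{
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD

// An entry (n, k) marks feature n as categorical with k categories indexed 0 .. k-1;
// features absent from the map are treated as continuous.
val categoricalFeaturesInfo: Map[Int, Int] = Map(
  0 -> 2,  // feature 0 is binary
  3 -> 5   // feature 3 takes values {0, 1, 2, 3, 4}
)

def trainForest(input: RDD[LabeledPoint]): RandomForestModel =
  RandomForest.trainRegressor(input, categoricalFeaturesInfo,
    numTrees = 10, featureSubsetStrategy = "auto", impurity = "variance",
    maxDepth = 4, maxBins = 32, seed = 12345)
}}}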
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index f27ca9aeb9235..e23ca339a8f8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -44,7 +44,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * of `x` is close to (p * N). * More precisely, * - * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). + * {{{ + * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). + * }}} * * This method implements a variation of the Greenwald-Khanna algorithm (with some speed * optimizations). @@ -55,7 +57,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. * For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - * @param relativeError The relative target precision to achieve (>= 0). + * @param relativeError The relative target precision to achieve (greater or equal to 0). * If set to zero, the exact quantiles are computed, which could be very expensive. * Note that values greater than 1 are accepted but give the same result as 1. * @return the approximate quantiles at the given probabilities diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 43684abc13629..edfcd7d56dc8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -65,7 +65,8 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { /** * Returns the value of Spark runtime configuration property for the given key. * - * @note Throws `NoSuchElementException` if the key is not set and does not have a default value + * @throws java.util.NoSuchElementException if the key is not set and does not have a default + * value * @since 2.0.0 */ @throws[NoSuchElementException]("if the key is not set") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index a7bc7c68270f6..3b229e340efcb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -883,7 +883,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Loads an JavaRDD<String> storing JSON objects (one object per record) and applies the + * Loads an JavaRDD[String] storing JSON objects (one object per record) and applies the * given schema, returning the result as a `DataFrame`. * * @group specificdata diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 93e7229b20c1f..650439a193015 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -474,7 +474,9 @@ object functions { /** * Aggregate function: returns the level of grouping, equals to * - * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) + * {{{ + * (grouping(c1) <<; (n-1)) + (grouping(c2) <<; (n-2)) + ... + grouping(cn) + * }}} * * @note The list of columns should match with grouping columns exactly, or empty (means all the * grouping columns). 
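(Editorial note, not part of the patch.) For readers unfamiliar with the grouping-level formula being wrapped in {{{ }}} above, a hedged usage sketch follows; it assumes a DataFrame `df` with grouping columns `a`, `b` and a numeric column `c`:

{{{
import org.apache.spark.sql.functions.{grouping_id, sum}

// With cube("a", "b"), grouping_id() encodes which grouping columns were
// aggregated away in each output row: bit 1 for "a", bit 0 for "b".
df.cube("a", "b")
  .agg(grouping_id().as("grouping_level"), sum("c").as("c_sum"))
  .show()
}}}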
@@ -487,7 +489,9 @@ object functions { /** * Aggregate function: returns the level of grouping, equals to * - * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) + * {{{ + * (grouping(c1) <<; (n-1)) + (grouping(c2) <<; (n-2)) + ... + grouping(cn) + * }}} * * @note The list of columns should match with grouping columns exactly. * @@ -1050,7 +1054,10 @@ object functions { * * As an example, consider a `DataFrame` with two partitions, each with 3 records. * This expression would return the following IDs: - * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * + * {{{ + * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * }}} * * @group normal_funcs * @since 1.4.0 @@ -1068,7 +1075,10 @@ object functions { * * As an example, consider a `DataFrame` with two partitions, each with 3 records. * This expression would return the following IDs: - * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * + * {{{ + * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * }}} * * @group normal_funcs * @since 1.6.0 @@ -1846,8 +1856,8 @@ object functions { def round(e: Column): Column = round(e, 0) /** - * Round the value of `e` to `scale` decimal places if `scale` >= 0 - * or at integral part when `scale` < 0. + * Round the value of `e` to `scale` decimal places if `scale` is greater than or equal to 0 + * or at integral part when `scale` is less than 0. * * @group math_funcs * @since 1.5.0 @@ -1864,7 +1874,7 @@ object functions { /** * Round the value of `e` to `scale` decimal places with HALF_EVEN round mode - * if `scale` >= 0 or at integral part when `scale` < 0. + * if `scale` is greater than or equal to 0 or at integral part when `scale` is less than 0. * * @group math_funcs * @since 2.0.0 @@ -2172,7 +2182,7 @@ object functions { * and returns the result as a string column. * * If d is 0, the result has no decimal point or fractional part. - * If d < 0, the result will be null. + * If d is less than 0, the result will be null. 
* * @group string_funcs * @since 1.5.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index a1ea748de98f9..46a90d67bded3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -216,8 +216,11 @@ abstract class BaseRelation { /** * Whether does it need to convert the objects in Row to internal representation, for example: - * java.lang.String -> UTF8String - * java.lang.Decimal -> Decimal + * + * {{{ + * java.lang.String -> UTF8String + * java.lang.Decimal -> Decimal + * }}} * * If `needConversion` is `false`, buildScan() should return an `RDD` of `InternalRow` * From a2a2011da220b272121b01734bbf640567ef6ae3 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 27 Nov 2016 02:57:18 +0900 Subject: [PATCH 12/17] Fix other existing ones for < and > --- .../org/apache/spark/api/java/JavaRDD.scala | 8 ++++--- .../main/scala/org/apache/spark/rdd/RDD.scala | 24 ++++++++++++------- .../scala/org/apache/spark/ui/UIUtils.scala | 9 +++++-- .../scala/org/apache/spark/util/Utils.scala | 12 ++++++++-- .../org/apache/spark/graphx/GraphLoader.scala | 2 +- .../spark/graphx/lib/TriangleCount.scala | 2 +- .../classification/LogisticRegression.scala | 19 +++++++++------ .../spark/ml/clustering/BisectingKMeans.scala | 4 ++-- .../spark/ml/clustering/GaussianMixture.scala | 2 +- .../org/apache/spark/ml/clustering/LDA.scala | 6 ++--- .../apache/spark/ml/recommendation/ALS.scala | 4 ++-- .../mllib/clustering/BisectingKMeans.scala | 14 +++++------ .../mllib/clustering/GaussianMixture.scala | 2 +- .../apache/spark/mllib/clustering/LDA.scala | 20 ++++++++-------- .../spark/mllib/clustering/LDAModel.scala | 2 +- .../mllib/optimization/GradientDescent.scala | 6 ++--- .../spark/mllib/optimization/LBFGS.scala | 14 +++++++++-- .../spark/mllib/optimization/NNLS.scala | 5 +++- .../spark/mllib/optimization/Updater.scala | 6 ++--- .../apache/spark/mllib/tree/model/Split.scala | 2 +- .../main/scala/org/apache/spark/sql/Row.scala | 2 +- .../apache/spark/sql/types/DecimalType.scala | 3 ++- .../apache/spark/sql/jdbc/JdbcDialects.scala | 11 ++++++--- .../hive/execution/InsertIntoHiveTable.scala | 12 ++++++++-- 24 files changed, 123 insertions(+), 68 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index a20d264be5afd..94e26e687c66b 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -103,7 +103,8 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be greater + * than or equal to 0 * * @note This is NOT guaranteed to provide exactly the fraction of the count * of the given `RDD`. 
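(Editorial note, not part of the patch.) The `fraction` semantics described above can be illustrated with a short sketch against the Scala `RDD` API; the SparkContext `sc` and the values used are assumptions for illustration only:

{{{
// Assumes an existing SparkContext named `sc`.
val data = sc.parallelize(1 to 10000)

// Without replacement: fraction is a probability in [0, 1].
val roughTenPercent = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)

// With replacement: fraction is the expected number of times each element is
// drawn, so it only needs to be greater than or equal to 0.
val oversampled = data.sample(withReplacement = true, fraction = 2.0, seed = 42L)
}}}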
@@ -117,7 +118,8 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be greater + * than or equal to 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count @@ -167,7 +169,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * Return an RDD with the elements from `this` that are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be less than or equal to us. */ def subtract(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.subtract(other)) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 8e673447581cf..faac5cb4dbb77 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -469,7 +469,8 @@ abstract class RDD[T: ClassTag]( * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be greater + * than or equal to 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count @@ -750,8 +751,10 @@ abstract class RDD[T: ClassTag]( * print line function (like out.println()) as the 2nd parameter. * An example of pipe the RDD data of groupBy() in a streaming way, * instead of constructing a huge String to concat all the elements: - * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = - * for (e <- record._2) {f(e)} + * {{{ + * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = + * for (e <- record._2) {f(e)} + * }}} * @param separateWorkingDir Use separate working directories for each task. * @param bufferSize Buffer size for the stdin writer for the piped process. * @param encoding Char encoding used for interacting (via stdin, stdout and stderr) with @@ -1184,8 +1187,13 @@ abstract class RDD[T: ClassTag]( * * @note This method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. - * To handle very large results, consider using rdd.map(x => (x, 1L)).reduceByKey(_ + _), which - * returns an RDD[T, Long] instead of a map. + * To handle very large results, consider using + * + * {{{ + * rdd.map(x => (x, 1L)).reduceByKey(_ + _) + * }}}, + * + * which returns an RDD[T, Long] instead of a map. */ def countByValue()(implicit ord: Ordering[T] = null): Map[T, Long] = withScope { map(value => (value, null)).countByKey() @@ -1223,9 +1231,9 @@ abstract class RDD[T: ClassTag]( * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. 
* - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p` - * would trigger sparse representation of registers, which may reduce the memory consumption - * and increase accuracy when the cardinality is small. + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (sp is greater + * than p) would trigger sparse representation of registers, which may reduce the memory + * consumption and increase accuracy when the cardinality is small. * * @param p The precision value for the normal set. * `p` must be a value between 4 and `sp` if `sp` is not zero (32 max). diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index dbeb970c81dfe..d31f956886014 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -422,8 +422,13 @@ private[spark] object UIUtils extends Logging { * the whole string will rendered as a simple escaped text. * * Note: In terms of security, only anchor tags with root relative links are supported. So any - * attempts to embed links outside Spark UI, or other tags like <script> will cause in - * the whole description to be treated as plain text. + * attempts to embed links outside Spark UI, or other tags like + * + * {{{ + *