diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index 08b0cb9b8f6a5..8e05e4b31e39e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -115,7 +115,7 @@ abstract class Predictor[
       case _ => labelCasted
     }
 
-    copyValues(train(casted).setParent(this))
+    copyValues(train(casted).setParent(this), extra = ParamMap.empty, copyDefault = false)
   }
 
   override def copy(extra: ParamMap): Learner
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 9f60f0896ec52..c978f1b5ba650 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -18,7 +18,7 @@ package org.apache.spark.ml.classification
 
 import org.apache.hadoop.fs.Path
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -256,8 +256,9 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numClasses" -> instance.numClasses)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numClasses" -> JInt(instance.numClasses),
+        "impurity" -> JString(instance.getImpurity))
       DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
       val (nodeData, _) = NodeData.build(instance.rootNode, 0)
       val dataPath = new Path(path, "data").toString
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index f11bc1d8fe415..d7a2ca56fec92 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -18,7 +18,7 @@ package org.apache.spark.ml.classification
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -343,6 +343,7 @@ object GBTClassificationModel extends MLReadable[GBTClassificationModel] {
 
   private val numFeaturesKey: String = "numFeatures"
   private val numTreesKey: String = "numTrees"
+  private val impurityKey: String = "impurity"
 
   @Since("2.0.0")
   override def read: MLReader[GBTClassificationModel] = new GBTClassificationModelReader
@@ -356,8 +357,9 @@ object GBTClassificationModel extends MLReadable[GBTClassificationModel] {
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        numFeaturesKey -> instance.numFeatures,
-        numTreesKey -> instance.getNumTrees)
+        numFeaturesKey -> JInt(instance.numFeatures),
+        numTreesKey -> JInt(instance.getNumTrees),
+        impurityKey -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
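A note on the explicit `JInt`/`JString` wrappers introduced above: the extra metadata now mixes integer and string values in a single map, so each value is wrapped in an explicit `JValue` instead of relying on json4s' implicit conversions for a uniform value type. Storing `impurity` in the extra metadata also lets the readers further down in this patch recover it from the metadata object itself, since the persisted param map no longer carries defaults. A minimal, standalone sketch of the resulting JSON, with made-up values, would be:

```scala
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

object ExtraMetadataSketch {
  def main(args: Array[String]): Unit = {
    // Mixed Int/String values, hence the explicit JValue wrappers.
    val extraMetadata: JObject = Map(
      "numFeatures" -> JInt(4),
      "numClasses" -> JInt(3),
      "impurity" -> JString("gini"))

    // e.g. {"numFeatures":4,"numClasses":3,"impurity":"gini"}
    println(compact(render(extraMetadata)))
  }
}
```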
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index fa191604218db..f2bf2f37e1a0b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -244,6 +244,16 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
   @Since("2.2.0")
   def getUpperBoundsOnIntercepts: Vector = $(upperBoundsOnIntercepts)
 
+  setDefault(regParam -> 0.0)
+  setDefault(elasticNetParam -> 0.0)
+  setDefault(maxIter -> 100)
+  setDefault(tol -> 1E-6)
+  setDefault(fitIntercept -> true)
+  setDefault(standardization -> true)
+  setDefault(threshold -> 0.5)
+  setDefault(aggregationDepth -> 2)
+  setDefault(family -> "auto")
+
   protected def usingBoundConstrainedOptimization: Boolean = {
     isSet(lowerBoundsOnCoefficients) || isSet(upperBoundsOnCoefficients) ||
       isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts)
@@ -291,7 +301,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.2.0")
   def setRegParam(value: Double): this.type = set(regParam, value)
-  setDefault(regParam -> 0.0)
 
   /**
    * Set the ElasticNet mixing parameter.
@@ -307,7 +316,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.4.0")
   def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
-  setDefault(elasticNetParam -> 0.0)
 
   /**
    * Set the maximum number of iterations.
@@ -317,7 +325,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.2.0")
   def setMaxIter(value: Int): this.type = set(maxIter, value)
-  setDefault(maxIter -> 100)
 
   /**
    * Set the convergence tolerance of iterations.
@@ -328,7 +335,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.4.0")
   def setTol(value: Double): this.type = set(tol, value)
-  setDefault(tol -> 1E-6)
 
   /**
    * Whether to fit an intercept term.
@@ -338,7 +344,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.4.0")
   def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
-  setDefault(fitIntercept -> true)
 
   /**
    * Sets the value of param [[family]].
@@ -348,7 +353,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("2.1.0")
   def setFamily(value: String): this.type = set(family, value)
-  setDefault(family -> "auto")
 
   /**
    * Whether to standardize the training features before fitting the model.
@@ -362,11 +366,9 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.5.0")
   def setStandardization(value: Boolean): this.type = set(standardization, value)
-  setDefault(standardization -> true)
 
   @Since("1.5.0")
   override def setThreshold(value: Double): this.type = super.setThreshold(value)
-  setDefault(threshold -> 0.5)
 
   @Since("1.5.0")
   override def getThreshold: Double = super.getThreshold
@@ -397,7 +399,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("2.1.0")
   def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
-  setDefault(aggregationDepth -> 2)
 
   /**
    * Set the lower bounds on coefficients if fitting under bound constrained optimization.
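The pattern above, repeated for the other algorithms below, moves every `setDefault` call out of the concrete estimator and into the shared params trait. Because the model class mixes in the same trait, estimator and model agree on defaults by construction, so `copyValues` no longer needs to propagate them. A bare-bones sketch of the pattern with a hypothetical trait (not part of this patch, shown only to illustrate where `setDefault` now lives):

```scala
import org.apache.spark.ml.param.{IntParam, Params}

// Hypothetical params trait: both the estimator and the model would mix this
// in, so each instance resolves the same default without it ever being copied
// from one object to the other.
trait MyAlgoParams extends Params {

  final val maxIter: IntParam = new IntParam(this, "maxIter", "maximum number of iterations")
  setDefault(maxIter -> 100)

  final def getMaxIter: Int = $(maxIter)
}
```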
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 0293e03d47435..7e20f798878ca 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -41,6 +41,7 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW
    */
   final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.",
     ParamValidators.gtEq(0))
+  setDefault(smoothing -> 1.0)
 
   /** @group getParam */
   final def getSmoothing: Double = $(smoothing)
@@ -54,6 +55,7 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW
   final val modelType: Param[String] = new Param[String](this, "modelType", "The model type " +
     "which is a string (case-sensitive). Supported options: multinomial (default) and bernoulli.",
     ParamValidators.inArray[String](NaiveBayes.supportedModelTypes.toArray))
+  setDefault(modelType -> NaiveBayes.Multinomial)
 
   /** @group getParam */
   final def getModelType: String = $(modelType)
@@ -91,7 +93,6 @@ class NaiveBayes @Since("1.5.0") (
    */
   @Since("1.5.0")
   def setSmoothing(value: Double): this.type = set(smoothing, value)
-  setDefault(smoothing -> 1.0)
 
   /**
    * Set the model type using a string (case-sensitive).
@@ -101,7 +102,6 @@ class NaiveBayes @Since("1.5.0") (
    */
   @Since("1.5.0")
   def setModelType(value: String): this.type = set(modelType, value)
-  setDefault(modelType -> NaiveBayes.Multinomial)
 
   /**
    * Sets the value of param [[weightCol]].
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 78a4972adbdbb..991902424c172 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -17,7 +17,7 @@ package org.apache.spark.ml.classification
 
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -293,9 +293,10 @@ object RandomForestClassificationModel extends MLReadable[RandomForestClassifica
 
     override protected def saveImpl(path: String): Unit = {
       // Note: numTrees is not currently used, but could be nice to store for fast querying.
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numClasses" -> instance.numClasses,
-        "numTrees" -> instance.getNumTrees)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numClasses" -> JInt(instance.numClasses),
+        "numTrees" -> JInt(instance.getNumTrees),
+        "impurity" -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index f19ad7a5a6938..19837dffe7d2c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -419,7 +419,8 @@ class GaussianMixture @Since("2.0.0") (
       new MultivariateGaussian(mean, cov)
     }
 
-    val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
+    val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists),
+      extra = ParamMap.empty, copyDefault = false).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
       $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index c8145de564cbe..719c2a0d159e8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -93,6 +93,14 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
   @Since("1.5.0")
   def getInitSteps: Int = $(initSteps)
 
+  setDefault(
+    k -> 2,
+    maxIter -> 20,
+    initMode -> MLlibKMeans.K_MEANS_PARALLEL,
+    initSteps -> 2,
+    tol -> 1e-4,
+    distanceMeasure -> DistanceMeasure.EUCLIDEAN)
+
   /**
    * Validates and transforms the input schema.
    * @param schema input schema
@@ -264,14 +272,6 @@ class KMeans @Since("1.5.0") (
     @Since("1.5.0") override val uid: String)
   extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
 
-  setDefault(
-    k -> 2,
-    maxIter -> 20,
-    initMode -> MLlibKMeans.K_MEANS_PARALLEL,
-    initSteps -> 2,
-    tol -> 1e-4,
-    distanceMeasure -> DistanceMeasure.EUCLIDEAN)
-
   @Since("1.5.0")
   override def copy(extra: ParamMap): KMeans = defaultCopy(extra)
 
@@ -339,7 +339,8 @@ class KMeans @Since("1.5.0") (
       .setEpsilon($(tol))
       .setDistanceMeasure($(distanceMeasure))
     val parentModel = algo.run(instances, Option(instr))
-    val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
+    val model = copyValues(new KMeansModel(uid, parentModel).setParent(this),
+      extra = ParamMap.empty, copyDefault = false)
     val summary = new KMeansSummary(
       model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 4bab670cc159f..a1001bb0d99f5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -195,8 +195,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
     " with estimates of the topic mixture distribution for each document (often called \"theta\"" +
Returns a vector of zeros for an empty document.") - setDefault(topicDistributionCol -> "topicDistribution") - /** @group getParam */ @Since("1.6.0") def getTopicDistributionCol: String = $(topicDistributionCol) @@ -311,6 +309,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM @Since("2.0.0") def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint) + setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10, + learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05, + optimizeDocConcentration -> true, keepLastCheckpoint -> true, + topicDistributionCol -> "topicDistribution") + /** * Validates and transforms the input schema. * @@ -818,10 +821,6 @@ class LDA @Since("1.6.0") ( @Since("1.6.0") def this() = this(Identifiable.randomUID("lda")) - setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10, - learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05, - optimizeDocConcentration -> true, keepLastCheckpoint -> true) - /** * The features for LDA should be a `Vector` representing the word counts in a document. * The vector should be of length vocabSize, with counts for each term (word). @@ -918,7 +917,7 @@ class LDA @Since("1.6.0") ( } instr.logNumFeatures(newModel.vocabSize) - val model = copyValues(newModel).setParent(this) + val model = copyValues(newModel, extra = ParamMap.empty, copyDefault = false).setParent(this) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 9a83a5882ce29..16594fc5b6661 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -865,7 +865,7 @@ trait Params extends Identifiable with Serializable { } /** Internal param map for user-supplied values. */ - private val paramMap: ParamMap = ParamMap.empty + private[ml] val paramMap: ParamMap = ParamMap.empty /** Internal param map for default values. */ private val defaultParamMap: ParamMap = ParamMap.empty @@ -886,14 +886,18 @@ trait Params extends Identifiable with Serializable { * * @param to the target instance, which should work with the same set of default Params as this * source instance + * @param copyDefault whether to copy default Params. Default is `true`. * @param extra extra params to be copied to the target's `paramMap` * @return the target instance with param values copied */ - protected def copyValues[T <: Params](to: T, extra: ParamMap = ParamMap.empty): T = { + protected def copyValues[T <: Params]( + to: T, + extra: ParamMap = ParamMap.empty, + copyDefault: Boolean = true): T = { val map = paramMap ++ extra params.foreach { param => // copy default Params - if (defaultParamMap.contains(param) && to.hasParam(param.name)) { + if (copyDefault && defaultParamMap.contains(param) && to.hasParam(param.name)) { to.defaultParamMap.put(to.getParam(param.name), defaultParamMap(param)) } // copy explicitly set Params diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 81a8f50761e0e..2846571a76f7b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -54,6 +54,7 @@ import org.apache.spark.util.random.XORShiftRandom * Common params for ALS and ALSModel. 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 81a8f50761e0e..2846571a76f7b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -54,6 +54,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * Common params for ALS and ALSModel.
  */
 private[recommendation] trait ALSModelParams extends Params with HasPredictionCol {
+
   /**
    * Param for the column name for user ids. Ids must be integers. Other
    * numeric types are supported for this column, but will be cast to integers as long as they
@@ -124,6 +125,8 @@ private[recommendation] trait ALSModelParams extends Params with HasPredictionCo
 
   /** @group expertGetParam */
   def getColdStartStrategy: String = $(coldStartStrategy).toLowerCase(Locale.ROOT)
+
+  setDefault(userCol -> "user", itemCol -> "item", coldStartStrategy -> "nan")
 }
 
 /**
@@ -235,10 +238,9 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w
   def getFinalStorageLevel: String = $(finalStorageLevel)
 
   setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10,
-    implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item",
+    implicitPrefs -> false, alpha -> 1.0,
     ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10,
-    intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK",
-    coldStartStrategy -> "nan")
+    intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK")
 
   /**
    * Validates and transforms the input schema.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 0291a57487c47..66869b87a5b0b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -18,7 +18,7 @@ package org.apache.spark.ml.regression
 
 import org.apache.hadoop.fs.Path
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -261,7 +261,8 @@ object DecisionTreeRegressionModel extends MLReadable[DecisionTreeRegressionMode
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "impurity" -> JString(instance.getImpurity))
       DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
       val (nodeData, _) = NodeData.build(instance.rootNode, 0)
       val dataPath = new Path(path, "data").toString
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index f41d15b62dddd..8ee93d7f37c03 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -18,7 +18,7 @@ package org.apache.spark.ml.regression
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -287,8 +287,9 @@ object GBTRegressionModel extends MLReadable[GBTRegressionModel] {
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numTrees" -> instance.getNumTrees)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numTrees" -> JInt(instance.getNumTrees),
+        "impurity" -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 917a4d238d467..0cee10a0092db 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -62,6 +62,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
     s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.",
     (value: String) => supportedFamilyNames.contains(value.toLowerCase(Locale.ROOT)))
 
+  setDefault(family -> Gaussian.name)
+
   /** @group getParam */
   @Since("2.0.0")
   def getFamily: String = $(family)
@@ -85,6 +87,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
     "Only applicable to the Tweedie family. Supported values: 0 and [1, Inf).",
     (x: Double) => x >= 1.0 || x == 0.0)
 
+  setDefault(variancePower -> 0.0)
+
   /** @group getParam */
   @Since("2.2.0")
   def getVariancePower: Double = $(variancePower)
@@ -179,6 +183,11 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
     s"${supportedSolvers.mkString(", ")}. (Default irls)",
     ParamValidators.inArray[String](supportedSolvers))
 
+  setDefault(solver -> IRLS)
+  setDefault(maxIter -> 25)
+  setDefault(tol -> 1E-6)
+  setDefault(regParam -> 0.0)
+
   @Since("2.0.0")
   override def validateAndTransformSchema(
       schema: StructType,
@@ -256,7 +265,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
    */
   @Since("2.0.0")
   def setFamily(value: String): this.type = set(family, value)
-  setDefault(family -> Gaussian.name)
 
   /**
    * Sets the value of param [[variancePower]].
@@ -267,7 +275,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
    */
   @Since("2.2.0")
   def setVariancePower(value: Double): this.type = set(variancePower, value)
-  setDefault(variancePower -> 0.0)
 
   /**
    * Sets the value of param [[linkPower]].
@@ -304,7 +311,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
    */
   @Since("2.0.0")
   def setMaxIter(value: Int): this.type = set(maxIter, value)
-  setDefault(maxIter -> 25)
 
   /**
    * Sets the convergence tolerance of iterations.
@@ -315,7 +321,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
    */
   @Since("2.0.0")
   def setTol(value: Double): this.type = set(tol, value)
-  setDefault(tol -> 1E-6)
 
   /**
    * Sets the regularization parameter for L2 regularization.
@@ -331,7 +336,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
    */
   @Since("2.0.0")
   def setRegParam(value: Double): this.type = set(regParam, value)
-  setDefault(regParam -> 0.0)
 
   /**
    * Sets the value of param [[weightCol]].
@@ -363,7 +367,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
    */
   @Since("2.0.0")
   def setSolver(value: String): this.type = set(solver, value)
-  setDefault(solver -> IRLS)
 
   /**
    * Sets the link prediction (linear predictor) column name.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 6d3fe7a6c748c..709cd312d2a4b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -103,6 +103,17 @@ private[regression] trait LinearRegressionParams extends PredictorParams
   @Since("2.3.0")
   def getEpsilon: Double = $(epsilon)
 
+  setDefault(regParam -> 0.0)
+  setDefault(fitIntercept -> true)
+  setDefault(standardization -> true)
+  setDefault(elasticNetParam -> 0.0)
+  setDefault(maxIter -> 100)
+  setDefault(tol -> 1E-6)
+  setDefault(solver -> Auto)
+  setDefault(aggregationDepth -> 2)
+  setDefault(loss -> SquaredError)
+  setDefault(epsilon -> 1.35)
+
   override protected def validateAndTransformSchema(
       schema: StructType,
       fitting: Boolean,
@@ -188,7 +199,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.3.0")
   def setRegParam(value: Double): this.type = set(regParam, value)
-  setDefault(regParam -> 0.0)
 
   /**
    * Set if we should fit the intercept.
@@ -198,7 +208,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.5.0")
   def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
-  setDefault(fitIntercept -> true)
 
   /**
    * Whether to standardize the training features before fitting the model.
@@ -214,7 +223,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.5.0")
   def setStandardization(value: Boolean): this.type = set(standardization, value)
-  setDefault(standardization -> true)
 
   /**
    * Set the ElasticNet mixing parameter.
@@ -230,7 +238,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.4.0")
   def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
-  setDefault(elasticNetParam -> 0.0)
 
   /**
    * Set the maximum number of iterations.
@@ -240,7 +247,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.3.0")
   def setMaxIter(value: Int): this.type = set(maxIter, value)
-  setDefault(maxIter -> 100)
 
   /**
    * Set the convergence tolerance of iterations.
@@ -251,7 +257,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.4.0")
   def setTol(value: Double): this.type = set(tol, value)
-  setDefault(tol -> 1E-6)
 
   /**
    * Whether to over-/under-sample training instances according to the given weights in weightCol.
@@ -280,7 +285,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("1.6.0")
   def setSolver(value: String): this.type = set(solver, value)
-  setDefault(solver -> Auto)
 
   /**
    * Suggested depth for treeAggregate (greater than or equal to 2).
@@ -292,7 +296,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("2.1.0")
   def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
-  setDefault(aggregationDepth -> 2)
 
   /**
    * Sets the value of param [[loss]].
@@ -302,7 +305,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("2.3.0")
   def setLoss(value: String): this.type = set(loss, value)
-  setDefault(loss -> SquaredError)
 
   /**
    * Sets the value of param [[epsilon]].
@@ -312,7 +314,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
    */
   @Since("2.3.0")
   def setEpsilon(value: Double): this.type = set(epsilon, value)
-  setDefault(epsilon -> 1.35)
 
   override protected def train(dataset: Dataset[_]): LinearRegressionModel = {
     // Extract the number of features before deciding optimization solver.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 200b234b79978..74f59680d4d03 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -17,7 +17,7 @@ package org.apache.spark.ml.regression
 
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -254,8 +254,9 @@ object RandomForestRegressionModel extends MLReadable[RandomForestRegressionMode
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numTrees" -> instance.getNumTrees)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numTrees" -> JInt(instance.getNumTrees),
+        "impurity" -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
index 4aa4c3617e7fd..f72779ed14890 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
@@ -333,7 +333,7 @@ private[ml] object DecisionTreeModelReadWrite {
 
     // Get impurity to construct ImpurityCalculator for each node
     val impurityType: String = {
-      val impurityJson: JValue = metadata.getParamValue("impurity")
+      val impurityJson: JValue = metadata.getMetadataValue("impurity")
       Param.jsonDecode[String](compact(render(impurityJson)))
     }
 
@@ -428,7 +428,7 @@ private[ml] object EnsembleModelReadWrite {
 
     // Get impurity to construct ImpurityCalculator for each node
     val impurityType: String = {
-      val impurityJson: JValue = metadata.getParamValue("impurity")
+      val impurityJson: JValue = metadata.getMetadataValue("impurity")
       Param.jsonDecode[String](compact(render(impurityJson)))
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index a616907800969..25efc9121be71 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -296,7 +296,7 @@ private[ml] object DefaultParamsWriter {
       paramMap: Option[JValue] = None): String = {
     val uid = instance.uid
     val cls = instance.getClass.getName
-    val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]]
+    val params = instance.paramMap.toSeq.asInstanceOf[Seq[ParamPair[Any]]]
     val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) =>
       p.name -> parse(p.jsonEncode(v))
     }.toList))
@@ -354,26 +354,40 @@ private[ml] object DefaultParamsReader {
       metadata: JValue,
       metadataJson: String) {
 
-    /**
-     * Get the JSON value of the [[org.apache.spark.ml.param.Param]] of the given name.
-     * This can be useful for getting a Param value before an instance of `Params`
-     * is available.
-     */
-    def getParamValue(paramName: String): JValue = {
+    private def getValue(paramName: String, target: JValue): Seq[(String, JValue)] = {
       implicit val format = DefaultFormats
-      params match {
+      target match {
         case JObject(pairs) =>
-          val values = pairs.filter { case (pName, jsonValue) =>
+          pairs.filter { case (pName, jsonValue) =>
             pName == paramName
-          }.map(_._2)
-          assert(values.length == 1, s"Expected one instance of Param '$paramName' but found" +
-            s" ${values.length} in JSON Params: " + pairs.map(_.toString).mkString(", "))
-          values.head
+          }
         case _ =>
           throw new IllegalArgumentException(
             s"Cannot recognize JSON metadata: $metadataJson.")
       }
     }
+
+    /**
+     * Get the JSON value of the given name from the metadata.
+     */
+    def getMetadataValue(metadataName: String): JValue = {
+      val values = getValue(metadataName, metadata)
+      assert(values.length == 1, s"Expected one instance of metadata '$metadataName' but found" +
+        s" ${values.length} in JSON Metadata: " + values.map(_.toString).mkString(", "))
+      values.head._2
+    }
+
+    /**
+     * Get the JSON value of the [[org.apache.spark.ml.param.Param]] of the given name.
+     * This can be useful for getting a Param value before an instance of `Params`
+     * is available.
+     */
+    def getParamValue(paramName: String): JValue = {
+      val values = getValue(paramName, params)
+      assert(values.length == 1, s"Expected one instance of Param '$paramName' but found" +
+        s" ${values.length} in JSON Params: " + values.map(_.toString).mkString(", "))
+      values.head._2
+    }
   }
 
   /**
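Because `DefaultParamsWriter.saveMetadata` now persists only explicitly set params, `impurity` can no longer be assumed to appear under `paramMap`; the tree-model readers instead pull it from the top level of the metadata via the new `getMetadataValue`. A rough illustration of the two lookup locations, using a hand-written metadata JSON (the shape mirrors what `saveMetadata` produces, but the values and uid are made up):

```scala
import org.json4s._
import org.json4s.jackson.JsonMethods._

object MetadataLookupSketch {
  implicit val formats: Formats = DefaultFormats

  def main(args: Array[String]): Unit = {
    // Hand-written stand-in for a persisted metadata file: "impurity" lives at
    // the top level (extra metadata), not inside "paramMap", because defaults
    // are no longer written out as params.
    val metadataJson =
      """{"class":"org.apache.spark.ml.classification.DecisionTreeClassificationModel",
        |"timestamp":0,"sparkVersion":"2.4.0","uid":"dtc_1234",
        |"paramMap":{"maxDepth":3},
        |"numFeatures":4,"numClasses":3,"impurity":"gini"}""".stripMargin
    val metadata = parse(metadataJson)

    // Equivalent of getMetadataValue("impurity"): look it up at the top level.
    val impurity = (metadata \ "impurity").extract[String]

    // Equivalent of getParamValue("maxDepth"): look it up under "paramMap".
    val maxDepth = (metadata \ "paramMap" \ "maxDepth").extract[Int]

    println(s"impurity=$impurity, maxDepth=$maxDepth")  // impurity=gini, maxDepth=3
  }
}
```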
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
index 7403680ae3fdc..0acf38164d240 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
@@ -172,7 +172,10 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
       .setInputCol("myInputCol")
       .setOutputCol("myOutputCol")
       .setSplits(Array(0.1, 0.8, 0.9))
-    testDefaultReadWrite(t)
+
+    val bucketizer = testDefaultReadWrite(t)
+    val data = Seq((1.0, 2.0), (10.0, 100.0), (101.0, -1.0)).toDF("myInputCol", "myInputCol2")
+    bucketizer.transform(data)
   }
 
   test("Bucket numeric features") {
@@ -327,7 +330,10 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
       .setInputCols(Array("myInputCol"))
       .setOutputCols(Array("myOutputCol"))
      .setSplitsArray(Array(Array(0.1, 0.8, 0.9)))
-    testDefaultReadWrite(t)
+
+    val bucketizer = testDefaultReadWrite(t)
+    val data = Seq((1.0, 2.0), (10.0, 100.0), (101.0, -1.0)).toDF("myInputCol", "myInputCol2")
+    bucketizer.transform(data)
   }
 
   test("Bucketizer in a pipeline") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
index e9a75e931e6a8..383b32955ff28 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
@@ -27,6 +27,8 @@ import org.apache.spark.sql.functions.udf
 class QuantileDiscretizerSuite
   extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
 
+  import testImplicits._
+
   test("Test observed number of buckets and their sizes match expected values") {
     val spark = this.spark
     import spark.implicits._
@@ -132,7 +134,10 @@ class QuantileDiscretizerSuite
       .setInputCol("myInputCol")
       .setOutputCol("myOutputCol")
       .setNumBuckets(6)
-    testDefaultReadWrite(t)
+
+    val readDiscretizer = testDefaultReadWrite(t)
+    val data = sc.parallelize(1 to 100).map(Tuple1.apply).toDF("myInputCol")
+    readDiscretizer.fit(data)
   }
 
   test("Verify resulting model has parent") {
@@ -379,7 +384,10 @@ class QuantileDiscretizerSuite
       .setInputCols(Array("input1", "input2"))
       .setOutputCols(Array("result1", "result2"))
       .setNumBucketsArray(Array(5, 10))
-    testDefaultReadWrite(discretizer)
+    val readDiscretizer = testDefaultReadWrite(discretizer)
+
+    val data = Seq((1.0, 2.0), (2.0, 3.0), (3.0, 4.0)).toDF("input1", "input2")
+    readDiscretizer.fit(data)
   }
 
   test("Multiple Columns: Both inputCol and inputCols are set") {
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index d35c50e1d00fe..bc159299db780 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -135,7 +135,17 @@ object MimaExcludes {
     ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.Bucketizer.getHandleInvalid"),
     ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexer.getHandleInvalid"),
     ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.QuantileDiscretizer.getHandleInvalid"),
-    ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexerModel.getHandleInvalid")
+    ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexerModel.getHandleInvalid"),
+
+    // [SPARK-23377][ML] Fixes Bucketizer with multiple columns persistence bug
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.param.Params.org$apache$spark$ml$param$Params$_setter_$paramMap_="),
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.param.Params.paramMap"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.PipelineStage.copyValues"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.param.JavaParams.copyValues"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.param.Params.copyValues"),
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.param.Params.copyValues"),
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.param.Params.copyValues$default$3"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.evaluation.Evaluator.copyValues")
   )
 
   // Exclude rules for 2.2.x