2 changes: 1 addition & 1 deletion mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -115,7 +115,7 @@ abstract class Predictor[
       case _ => labelCasted
     }
 
-    copyValues(train(casted).setParent(this))
+    copyValues(train(casted).setParent(this), extra = ParamMap.empty, copyDefault = false)
   }
 
   override def copy(extra: ParamMap): Learner
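
Predictor.fit() is the shared entry point for every ML predictor, so this one-line change applies across the board: explicitly set params are still copied onto the fitted model, while defaults are left to be resolved by the model's own shared params trait. A minimal sketch of the observable behavior, assuming a local SparkSession and a toy two-row dataset:

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder.master("local[2]").appName("copyDefault-sketch").getOrCreate()
    import spark.implicits._

    val train = Seq((0.0, Vectors.dense(0.0)), (1.0, Vectors.dense(1.0)))
      .toDF("label", "features")

    val model = new LogisticRegression().setMaxIter(5).fit(train)
    assert(model.getMaxIter == 5)     // explicitly set: copied from the estimator
    assert(model.getRegParam == 0.0)  // default: resolved from LogisticRegressionParams,
                                      // no longer copied into the model's default map
    spark.stop()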
mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.classification
 
 import org.apache.hadoop.fs.Path
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -256,8 +256,9 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numClasses" -> instance.numClasses)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numClasses" -> JInt(instance.numClasses),
+        "impurity" -> JString(instance.getImpurity))
       DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
       val (nodeData, _) = NodeData.build(instance.rootNode, 0)
       val dataPath = new Path(path, "data").toString
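
Why the explicit JInt/JString wrappers: with org.json4s.JsonDSL, a homogeneous Map[String, Int] converts to a JObject implicitly, but adding the String-valued "impurity" entry widens the map's value type to Any, for which no implicit JValue conversion exists. Wrapping every value keeps the map's value type at JValue, so the ascription to JObject still compiles. A small self-contained sketch (the literals stand in for instance.numFeatures, instance.numClasses, and instance.getImpurity):

    import org.json4s.{JInt, JObject, JString}
    import org.json4s.JsonDSL._
    import org.json4s.jackson.JsonMethods.{compact, render}

    val extraMetadata: JObject = Map(
      "numFeatures" -> JInt(100),
      "numClasses" -> JInt(2),
      "impurity" -> JString("gini"))

    println(compact(render(extraMetadata)))
    // {"numFeatures":100,"numClasses":2,"impurity":"gini"}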
mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.classification
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -343,6 +343,7 @@ object GBTClassificationModel extends MLReadable[GBTClassificationModel] {
 
   private val numFeaturesKey: String = "numFeatures"
   private val numTreesKey: String = "numTrees"
+  private val impurityKey: String = "impurity"
 
   @Since("2.0.0")
   override def read: MLReader[GBTClassificationModel] = new GBTClassificationModelReader
@@ -356,8 +357,9 @@ object GBTClassificationModel extends MLReadable[GBTClassificationModel] {
     override protected def saveImpl(path: String): Unit = {
 
       val extraMetadata: JObject = Map(
-        numFeaturesKey -> instance.numFeatures,
-        numTreesKey -> instance.getNumTrees)
+        numFeaturesKey -> JInt(instance.numFeatures),
+        numTreesKey -> JInt(instance.getNumTrees),
+        impurityKey -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -244,6 +244,16 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
   @Since("2.2.0")
   def getUpperBoundsOnIntercepts: Vector = $(upperBoundsOnIntercepts)
 
+  setDefault(regParam -> 0.0)
+  setDefault(elasticNetParam -> 0.0)
+  setDefault(maxIter -> 100)
+  setDefault(tol -> 1E-6)
+  setDefault(fitIntercept -> true)
+  setDefault(standardization -> true)
+  setDefault(threshold -> 0.5)
+  setDefault(aggregationDepth -> 2)
+  setDefault(family -> "auto")
+
   protected def usingBoundConstrainedOptimization: Boolean = {
     isSet(lowerBoundsOnCoefficients) || isSet(upperBoundsOnCoefficients) ||
       isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts)
@@ -291,7 +301,6 @@ class LogisticRegression @Since("1.2.0") (
    */
   @Since("1.2.0")
   def setRegParam(value: Double): this.type = set(regParam, value)
-  setDefault(regParam -> 0.0)
 
   /**
    * Set the ElasticNet mixing parameter.
@@ -307,7 +316,6 @@
    */
   @Since("1.4.0")
   def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
-  setDefault(elasticNetParam -> 0.0)
 
   /**
    * Set the maximum number of iterations.
@@ -317,7 +325,6 @@
    */
   @Since("1.2.0")
   def setMaxIter(value: Int): this.type = set(maxIter, value)
-  setDefault(maxIter -> 100)
 
   /**
    * Set the convergence tolerance of iterations.
@@ -328,7 +335,6 @@
    */
   @Since("1.4.0")
   def setTol(value: Double): this.type = set(tol, value)
-  setDefault(tol -> 1E-6)
 
   /**
    * Whether to fit an intercept term.
@@ -338,7 +344,6 @@
    */
   @Since("1.4.0")
   def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
-  setDefault(fitIntercept -> true)
 
   /**
    * Sets the value of param [[family]].
@@ -348,7 +353,6 @@
    */
   @Since("2.1.0")
   def setFamily(value: String): this.type = set(family, value)
-  setDefault(family -> "auto")
 
   /**
    * Whether to standardize the training features before fitting the model.
@@ -362,11 +366,9 @@
    */
   @Since("1.5.0")
   def setStandardization(value: Boolean): this.type = set(standardization, value)
-  setDefault(standardization -> true)
 
   @Since("1.5.0")
   override def setThreshold(value: Double): this.type = super.setThreshold(value)
-  setDefault(threshold -> 0.5)
 
   @Since("1.5.0")
   override def getThreshold: Double = super.getThreshold
@@ -397,7 +399,6 @@
    */
   @Since("2.1.0")
   def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
-  setDefault(aggregationDepth -> 2)
 
   /**
    * Set the lower bounds on coefficients if fitting under bound constrained optimization.
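
Moving these setDefault calls from the LogisticRegression constructor into LogisticRegressionParams means every class that mixes in the trait, the estimator and LogisticRegressionModel alike, declares the same defaults itself, rather than the model receiving them only through copyValues. A quick check of the trait-level defaults (values as in the diff above):

    import org.apache.spark.ml.classification.LogisticRegression

    val lr = new LogisticRegression()
    assert(lr.getRegParam == 0.0)
    assert(lr.getMaxIter == 100)
    assert(lr.getTol == 1e-6)
    assert(lr.getFamily == "auto")
    assert(lr.getThreshold == 0.5)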
mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -41,6 +41,7 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW
    */
   final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.",
     ParamValidators.gtEq(0))
+  setDefault(smoothing -> 1.0)
 
   /** @group getParam */
   final def getSmoothing: Double = $(smoothing)
@@ -54,6 +55,7 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW
   final val modelType: Param[String] = new Param[String](this, "modelType", "The model type " +
     "which is a string (case-sensitive). Supported options: multinomial (default) and bernoulli.",
     ParamValidators.inArray[String](NaiveBayes.supportedModelTypes.toArray))
+  setDefault(modelType -> NaiveBayes.Multinomial)
 
   /** @group getParam */
   final def getModelType: String = $(modelType)
@@ -91,7 +93,6 @@ class NaiveBayes @Since("1.5.0") (
    */
   @Since("1.5.0")
   def setSmoothing(value: Double): this.type = set(smoothing, value)
-  setDefault(smoothing -> 1.0)
 
   /**
    * Set the model type using a string (case-sensitive).
@@ -101,7 +102,6 @@
    */
   @Since("1.5.0")
   def setModelType(value: String): this.type = set(modelType, value)
-  setDefault(modelType -> NaiveBayes.Multinomial)
 
   /**
    * Sets the value of param [[weightCol]].
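
For reference, $(param) resolves through getOrDefault: an explicitly set value wins, otherwise the default declared alongside the param in the trait is used. Attaching setDefault directly to the declaration therefore changes where the default lives, not how it is resolved. A tiny check:

    import org.apache.spark.ml.classification.NaiveBayes

    val nb = new NaiveBayes()
    assert(nb.getSmoothing == 1.0)                    // trait-level default
    assert(nb.setSmoothing(0.5).getSmoothing == 0.5)  // explicit value wins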
mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -293,9 +293,10 @@ object RandomForestClassificationModel extends MLReadable[RandomForestClassifica
     override protected def saveImpl(path: String): Unit = {
       // Note: numTrees is not currently used, but could be nice to store for fast querying.
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numClasses" -> instance.numClasses,
-        "numTrees" -> instance.getNumTrees)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numClasses" -> JInt(instance.numClasses),
+        "numTrees" -> JInt(instance.getNumTrees),
+        "impurity" -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -419,7 +419,8 @@ class GaussianMixture @Since("2.0.0") (
       new MultivariateGaussian(mean, cov)
     }
 
-    val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
+    val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists),
+      extra = ParamMap.empty, copyDefault = false).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
       $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
19 changes: 10 additions & 9 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -93,6 +93,14 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
   @Since("1.5.0")
   def getInitSteps: Int = $(initSteps)
 
+  setDefault(
+    k -> 2,
+    maxIter -> 20,
+    initMode -> MLlibKMeans.K_MEANS_PARALLEL,
+    initSteps -> 2,
+    tol -> 1e-4,
+    distanceMeasure -> DistanceMeasure.EUCLIDEAN)
+
   /**
    * Validates and transforms the input schema.
   * @param schema input schema
@@ -264,14 +272,6 @@ class KMeans @Since("1.5.0") (
     @Since("1.5.0") override val uid: String)
   extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
 
-  setDefault(
-    k -> 2,
-    maxIter -> 20,
-    initMode -> MLlibKMeans.K_MEANS_PARALLEL,
-    initSteps -> 2,
-    tol -> 1e-4,
-    distanceMeasure -> DistanceMeasure.EUCLIDEAN)
-
   @Since("1.5.0")
   override def copy(extra: ParamMap): KMeans = defaultCopy(extra)
 
@@ -339,7 +339,8 @@ class KMeans @Since("1.5.0") (
       .setEpsilon($(tol))
       .setDistanceMeasure($(distanceMeasure))
     val parentModel = algo.run(instances, Option(instr))
-    val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
+    val model = copyValues(new KMeansModel(uid, parentModel).setParent(this),
+      extra = ParamMap.empty, copyDefault = false)
     val summary = new KMeansSummary(
       model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 
13 changes: 6 additions & 7 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -195,8 +195,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
     " with estimates of the topic mixture distribution for each document (often called \"theta\"" +
     " in the literature). Returns a vector of zeros for an empty document.")
 
-  setDefault(topicDistributionCol -> "topicDistribution")
-
   /** @group getParam */
   @Since("1.6.0")
   def getTopicDistributionCol: String = $(topicDistributionCol)
@@ -311,6 +309,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
   @Since("2.0.0")
   def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint)
 
+  setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
+    learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
+    optimizeDocConcentration -> true, keepLastCheckpoint -> true,
+    topicDistributionCol -> "topicDistribution")
+
   /**
    * Validates and transforms the input schema.
    *
@@ -818,10 +821,6 @@ class LDA @Since("1.6.0") (
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("lda"))
 
-  setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
-    learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
-    optimizeDocConcentration -> true, keepLastCheckpoint -> true)
-
   /**
    * The features for LDA should be a `Vector` representing the word counts in a document.
    * The vector should be of length vocabSize, with counts for each term (word).
@@ -918,7 +917,7 @@ }
     }
 
     instr.logNumFeatures(newModel.vocabSize)
-    val model = copyValues(newModel).setParent(this)
+    val model = copyValues(newModel, extra = ParamMap.empty, copyDefault = false).setParent(this)
     instr.logSuccess(model)
     model
   }
10 changes: 7 additions & 3 deletions mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -865,7 +865,7 @@ trait Params extends Identifiable with Serializable {
   }
 
   /** Internal param map for user-supplied values. */
-  private val paramMap: ParamMap = ParamMap.empty
+  private[ml] val paramMap: ParamMap = ParamMap.empty
 
   /** Internal param map for default values. */
   private val defaultParamMap: ParamMap = ParamMap.empty
@@ -886,14 +886,18 @@
    *
    * @param to the target instance, which should work with the same set of default Params as this
    *           source instance
+   * @param copyDefault whether to copy default Params. Default is `true`.
    * @param extra extra params to be copied to the target's `paramMap`
    * @return the target instance with param values copied
    */
-  protected def copyValues[T <: Params](to: T, extra: ParamMap = ParamMap.empty): T = {
+  protected def copyValues[T <: Params](
+      to: T,
+      extra: ParamMap = ParamMap.empty,
+      copyDefault: Boolean = true): T = {
     val map = paramMap ++ extra
     params.foreach { param =>
       // copy default Params
-      if (defaultParamMap.contains(param) && to.hasParam(param.name)) {
+      if (copyDefault && defaultParamMap.contains(param) && to.hasParam(param.name)) {
         to.defaultParamMap.put(to.getParam(param.name), defaultParamMap(param))
       }
       // copy explicitly set Params
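
The flag's effect in isolation, as a minimal sketch with two hypothetical Params classes sharing a param name (Source, Target, and copyTo are demo names; copyTo merely exposes the protected copyValues; run in a Spark shell or script):

    import org.apache.spark.ml.param.{IntParam, ParamMap, Params}

    class Source(override val uid: String) extends Params {
      final val maxIter = new IntParam(this, "maxIter", "max number of iterations")
      setDefault(maxIter -> 100)
      override def copy(extra: ParamMap): Source = defaultCopy(extra)
      def copyTo[T <: Params](to: T, copyDefault: Boolean): T =
        copyValues(to, ParamMap.empty, copyDefault)
    }

    class Target(override val uid: String) extends Params {
      final val maxIter = new IntParam(this, "maxIter", "max number of iterations")
      override def copy(extra: ParamMap): Target = defaultCopy(extra)
    }

    val src = new Source("src")
    val t1 = src.copyTo(new Target("t1"), copyDefault = true)
    val t2 = src.copyTo(new Target("t2"), copyDefault = false)
    assert(t1.isDefined(t1.maxIter))   // default copied into the target
    assert(!t2.isDefined(t2.maxIter))  // target is left to declare its own defaults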
mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -54,6 +54,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * Common params for ALS and ALSModel.
  */
 private[recommendation] trait ALSModelParams extends Params with HasPredictionCol {
+
   /**
    * Param for the column name for user ids. Ids must be integers. Other
    * numeric types are supported for this column, but will be cast to integers as long as they
@@ -124,6 +125,8 @@ private[recommendation] trait ALSModelParams extends Params with HasPredictionCo
 
   /** @group expertGetParam */
   def getColdStartStrategy: String = $(coldStartStrategy).toLowerCase(Locale.ROOT)
+
+  setDefault(userCol -> "user", itemCol -> "item", coldStartStrategy -> "nan")
 }
 
 /**
@@ -235,10 +238,9 @@ private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter w
   def getFinalStorageLevel: String = $(finalStorageLevel)
 
   setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10,
-    implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item",
+    implicitPrefs -> false, alpha -> 1.0,
     ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10,
-    intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK",
-    coldStartStrategy -> "nan")
+    intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK")
 
   /**
    * Validates and transforms the input schema.
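
The userCol, itemCol, and coldStartStrategy defaults move up from ALSParams into ALSModelParams, presumably because ALSModel mixes in only the latter: the model can then resolve those three defaults on its own, while training-only defaults stay in ALSParams. A quick check on the estimator (an ALSModel resolves the first three the same way):

    import org.apache.spark.ml.recommendation.ALS

    val als = new ALS()
    // declared in ALSModelParams (shared with ALSModel):
    assert(als.getUserCol == "user" && als.getItemCol == "item")
    assert(als.getColdStartStrategy == "nan")
    // still declared in ALSParams (estimator-only):
    assert(als.getRank == 10 && als.getMaxIter == 10)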
mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.regression
 
 import org.apache.hadoop.fs.Path
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -261,7 +261,8 @@ object DecisionTreeRegressionModel extends MLReadable[DecisionTreeRegressionMode
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "impurity" -> JString(instance.getImpurity))
       DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
       val (nodeData, _) = NodeData.build(instance.rootNode, 0)
       val dataPath = new Path(path, "data").toString
mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.regression
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import org.json4s.{DefaultFormats, JObject}
+import org.json4s.{DefaultFormats, JInt, JObject, JString}
 import org.json4s.JsonDSL._
 
 import org.apache.spark.annotation.Since
@@ -287,8 +287,9 @@ object GBTRegressionModel extends MLReadable[GBTRegressionModel] {
 
     override protected def saveImpl(path: String): Unit = {
       val extraMetadata: JObject = Map(
-        "numFeatures" -> instance.numFeatures,
-        "numTrees" -> instance.getNumTrees)
+        "numFeatures" -> JInt(instance.numFeatures),
+        "numTrees" -> JInt(instance.getNumTrees),
+        "impurity" -> JString(instance.getImpurity))
       EnsembleModelReadWrite.saveImpl(instance, path, sparkSession, extraMetadata)
     }
   }
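
On the read side, the extra fields written above land in the model's metadata JSON, where loaders can extract them with json4s as usual. A hedged sketch of pulling "impurity" back out (the JSON literal is illustrative, not the full metadata schema):

    import org.json4s.{DefaultFormats, Formats, JValue}
    import org.json4s.jackson.JsonMethods.parse

    implicit val formats: Formats = DefaultFormats

    val metadataJson: JValue = parse("""{"numFeatures": 10, "numTrees": 3, "impurity": "variance"}""")
    val impurity = (metadataJson \ "impurity").extract[String]
    assert(impurity == "variance")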