From 05c11f49ef3f23f68ac9e2dae1c6fb6a2f438a6b Mon Sep 17 00:00:00 2001
From: Evan Chen
Date: Wed, 18 Nov 2015 19:54:57 -0800
Subject: [PATCH] [SPARK-10931] Copied parameters over from Estimator to
 Transformer

Estimator UID is being copied correctly to the Transformer model objects
and params now; working on Doctests.

Changed the way parameters are copied from the Estimator to Transformer.

Checkpoint, switching back to inheritance method.

Working on DocTests.

Implemented Doctests for Recommendation, Clustering, Classification
(except RandomForestClassifier), Evaluation, Tuning, Regression (except
RandomForestRegressor).

Ready for Code Review.

Code Review changeset #1.
---
 python/pyspark/ml/classification.py | 31 ++++++++++-----
 python/pyspark/ml/clustering.py     | 11 +++--
 python/pyspark/ml/feature.py        | 62 ++++++++++++++++++++++++-----
 python/pyspark/ml/param/__init__.py |  5 +++
 python/pyspark/ml/recommendation.py |  6 +--
 python/pyspark/ml/regression.py     | 36 ++++++++++++-----
 python/pyspark/ml/wrapper.py        |  9 ++---
 7 files changed, 116 insertions(+), 44 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index d9ff356b9403..3e81249be462 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -264,7 +264,12 @@ def getFamily(self):
         return self.getOrDefault(self.family)
 
 
-class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable):
+class LogisticRegressionModel(JavaModel, JavaClassificationModel, HasFeaturesCol,
+                              HasLabelCol, HasPredictionCol, HasMaxIter,
+                              HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol,
+                              HasElasticNetParam, HasFitIntercept, HasStandardization,
+                              HasThresholds, HasWeightCol, HasAggregationDepth,
+                              JavaMLWritable, JavaMLReadable):
     """
     Model fitted by LogisticRegression.
 
@@ -669,8 +674,11 @@ def _create_model(self, java_model):
 
 
 @inherit_doc
-class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel, JavaMLWritable,
-                                      JavaMLReadable):
+class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel, HasFeaturesCol,
+                                      HasLabelCol, HasPredictionCol, HasProbabilityCol,
+                                      HasRawPredictionCol, DecisionTreeParams,
+                                      TreeClassifierParams, HasCheckpointInterval, HasSeed,
+                                      JavaMLWritable, JavaMLReadable):
     """
     Model fitted by DecisionTreeClassifier.
 
@@ -798,8 +806,9 @@ def _create_model(self, java_model):
         return RandomForestClassificationModel(java_model)
 
 
-class RandomForestClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable,
-                                      JavaMLReadable):
+class RandomForestClassificationModel(TreeEnsembleModel, JavaClassificationModel, HasFeaturesCol,
+                                      HasLabelCol, HasPredictionCol, HasRawPredictionCol,
+                                      HasProbabilityCol, JavaMLWritable, JavaMLReadable):
     """
     Model fitted by RandomForestClassifier.
 
@@ -950,7 +959,8 @@ def getLossType(self):
         return self.getOrDefault(self.lossType)
 
 
-class GBTClassificationModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable,
+class GBTClassificationModel(TreeEnsembleModel, JavaPredictionModel, HasFeaturesCol,
+                             HasLabelCol, HasPredictionCol, JavaMLWritable,
                              JavaMLReadable):
     """
     Model fitted by GBTClassifier.
@@ -1105,7 +1115,9 @@ def getModelType(self):
         return self.getOrDefault(self.modelType)
 
 
-class NaiveBayesModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable):
+class NaiveBayesModel(JavaModel, JavaClassificationModel, HasFeaturesCol, HasLabelCol,
+                      HasPredictionCol, HasProbabilityCol, HasRawPredictionCol,
+                      JavaMLWritable, JavaMLReadable):
     """
     Model fitted by NaiveBayes.
 
@@ -1304,8 +1316,9 @@ def getInitialWeights(self):
         return self.getOrDefault(self.initialWeights)
 
 
-class MultilayerPerceptronClassificationModel(JavaModel, JavaPredictionModel, JavaMLWritable,
-                                              JavaMLReadable):
+class MultilayerPerceptronClassificationModel(JavaModel, JavaPredictionModel,
+                                              HasFeaturesCol, HasLabelCol, HasPredictionCol,
+                                              JavaMLWritable, JavaMLReadable):
     """
     .. note:: Experimental
 
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 7632f05c3b68..0b12573b247d 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -27,7 +27,8 @@
            'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']
 
 
-class GaussianMixtureModel(JavaModel, JavaMLWritable, JavaMLReadable):
+class GaussianMixtureModel(JavaModel, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
+                           HasProbabilityCol, JavaMLWritable, JavaMLReadable):
     """
     .. note:: Experimental
 
@@ -181,7 +182,8 @@ def getK(self):
         return self.getOrDefault(self.k)
 
 
-class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
+class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable, HasFeaturesCol,
+                  HasPredictionCol, HasMaxIter, HasTol, HasSeed):
     """
     Model fitted by KMeans.
 
@@ -324,7 +326,8 @@ def getInitSteps(self):
         return self.getOrDefault(self.initSteps)
 
 
-class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
+class BisectingKMeansModel(JavaModel, HasFeaturesCol, HasPredictionCol, HasMaxIter,
+                           HasSeed, JavaMLWritable, JavaMLReadable):
     """
     .. note:: Experimental
 
@@ -461,7 +464,7 @@ def _create_model(self, java_model):
 
 
 @inherit_doc
-class LDAModel(JavaModel):
+class LDAModel(JavaModel, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
     """
     .. note:: Experimental
 
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 94afe82a3647..089b5f126c87 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -340,7 +340,8 @@ def _create_model(self, java_model):
         return CountVectorizerModel(java_model)
 
 
-class CountVectorizerModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class CountVectorizerModel(JavaModel, HasInputCol, HasOutputCol,
+                           JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`CountVectorizer`.
 
@@ -635,7 +636,7 @@ def _create_model(self, java_model):
         return IDFModel(java_model)
 
 
-class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class IDFModel(JavaModel, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`IDF`.
 
@@ -713,7 +714,7 @@ def _create_model(self, java_model):
         return MaxAbsScalerModel(java_model)
 
 
-class MaxAbsScalerModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class MaxAbsScalerModel(JavaModel, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
@@ -837,7 +838,7 @@ def _create_model(self, java_model):
         return MinMaxScalerModel(java_model)
 
 
-class MinMaxScalerModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class MinMaxScalerModel(JavaModel, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`MinMaxScaler`.
@@ -1538,7 +1539,7 @@ def _create_model(self, java_model):
         return StandardScalerModel(java_model)
 
 
-class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class StandardScalerModel(JavaModel, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`StandardScaler`.
 
@@ -1626,7 +1627,8 @@ def _create_model(self, java_model):
         return StringIndexerModel(java_model)
 
 
-class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class StringIndexerModel(JavaModel, HasInputCol, HasOutputCol, HasHandleInvalid,
+                         JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`StringIndexer`.
 
@@ -1996,7 +1998,7 @@ def _create_model(self, java_model):
         return VectorIndexerModel(java_model)
 
 
-class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class VectorIndexerModel(JavaModel, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`VectorIndexer`.
 
@@ -2134,6 +2136,15 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
     >>> doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])
     >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model")
     >>> model = word2Vec.fit(doc)
+    >>> estimator_paramMap = word2Vec.extractParamMap()
+    >>> model_paramMap = model.extractParamMap()
+    >>> all([estimator_paramMap[getattr(word2Vec, param.name)] == value
+    ...     for param, value in model_paramMap.items()])
+    True
+    >>> all([param.parent == model.uid for param in model_paramMap])
+    True
+    >>> [param.name for param in model.params]
+    ['inputCol', 'maxIter', 'outputCol', 'seed', 'stepSize']
     >>> model.getVectors().show()
     +----+--------------------+
     |word|              vector|
@@ -2292,7 +2303,8 @@ def _create_model(self, java_model):
         return Word2VecModel(java_model)
 
 
-class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class Word2VecModel(JavaModel, HasStepSize, HasMaxIter, HasSeed, HasInputCol,
+                    HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`Word2Vec`.
 
@@ -2333,6 +2345,15 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
     >>> df = spark.createDataFrame(data,["features"])
     >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")
     >>> model = pca.fit(df)
+    >>> estimator_paramMap = pca.extractParamMap()
+    >>> model_paramMap = model.extractParamMap()
+    >>> all([estimator_paramMap[getattr(pca, param.name)] == value
+    ...     for param, value in model_paramMap.items()])
+    True
+    >>> all([param.parent == model.uid for param in model_paramMap])
+    True
+    >>> [param.name for param in model.params]
+    ['inputCol', 'outputCol']
     >>> model.transform(df).collect()[0].pca_features
     DenseVector([1.648..., -4.013...])
     >>> model.explainedVariance
@@ -2394,7 +2415,7 @@ def _create_model(self, java_model):
         return PCAModel(java_model)
 
 
-class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class PCAModel(JavaModel, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
     """
     Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space.
 
@@ -2437,6 +2458,15 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
     ... ], ["y", "x", "s"])
     >>> rf = RFormula(formula="y ~ x + s")
     >>> model = rf.fit(df)
+    >>> estimator_paramMap = rf.extractParamMap()
+    >>> model_paramMap = model.extractParamMap()
+    >>> all([estimator_paramMap[getattr(rf, param.name)] == value
+    ...     for param, value in model_paramMap.items()])
+    True
+    >>> all([param.parent == model.uid for param in model_paramMap])
+    True
+    >>> [param.name for param in model.params]
+    ['featuresCol', 'labelCol']
     >>> model.transform(df).show()
     +---+---+---+---------+-----+
     |  y|  x|  s| features|label|
@@ -2554,7 +2584,7 @@ def __str__(self):
         return "RFormula(%s) (uid=%s)" % (formulaStr, self.uid)
 
 
-class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class RFormulaModel(JavaModel, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
@@ -2586,6 +2616,15 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
     ...    ["features", "label"])
     >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
     >>> model = selector.fit(df)
+    >>> estimator_paramMap = selector.extractParamMap()
+    >>> model_paramMap = model.extractParamMap()
+    >>> all([estimator_paramMap[getattr(selector, param.name)] == value
+    ...     for param, value in model_paramMap.items()])
+    True
+    >>> all([param.parent == model.uid for param in model_paramMap])
+    True
+    >>> [param.name for param in model.params]
+    ['featuresCol', 'labelCol', 'outputCol']
     >>> model.transform(df).head().selectedFeatures
     DenseVector([18.0])
     >>> model.selectedFeatures
@@ -2710,7 +2749,8 @@ def _create_model(self, java_model):
         return ChiSqSelectorModel(java_model)
 
 
-class ChiSqSelectorModel(JavaModel, JavaMLReadable, JavaMLWritable):
+class ChiSqSelectorModel(JavaModel, HasFeaturesCol, HasOutputCol, HasLabelCol,
+                         JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index ade4864e1d78..98f0ac08c4d0 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -336,6 +336,11 @@ def hasParam(self, paramName):
             return isinstance(p, Param)
         else:
             raise TypeError("hasParam(): paramName must be a string")
+        try:
+            param = self._resolveParam(paramName)
+            return param in self.params
+        except:
+            return False
 
     @since("1.4.0")
     def getOrDefault(self, param):
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index e28d38bd19f8..42dc149093af 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -26,8 +26,8 @@
 
 
 @inherit_doc
-class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, HasRegParam, HasSeed,
-          JavaMLWritable, JavaMLReadable):
+class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol,
+          HasRegParam, HasSeed, JavaMLWritable, JavaMLReadable):
     """
     Alternating Least Squares (ALS) matrix factorization.
 
@@ -333,7 +333,7 @@ def getFinalStorageLevel(self):
         return self.getOrDefault(self.finalStorageLevel)
 
 
-class ALSModel(JavaModel, JavaMLWritable, JavaMLReadable):
+class ALSModel(JavaModel, HasPredictionCol, JavaMLWritable, JavaMLReadable):
     """
     Model fitted by ALS.
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 9233d2e7e1a7..2a410c261291 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -129,7 +129,10 @@ def _create_model(self, java_model):
         return LinearRegressionModel(java_model)
 
 
-class LinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable):
+class LinearRegressionModel(JavaModel, JavaPredictionModel, HasFeaturesCol, HasLabelCol,
+                            HasPredictionCol, HasMaxIter, HasRegParam, HasTol,
+                            HasElasticNetParam, HasFitIntercept, HasStandardization,
+                            HasSolver, JavaMLWritable, JavaMLReadable):
     """
     Model fitted by :class:`LinearRegression`.
 
@@ -502,7 +505,9 @@ def getFeatureIndex(self):
         return self.getOrDefault(self.featureIndex)
 
 
-class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
+class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable,
+                              HasFeaturesCol, HasLabelCol, HasPredictionCol,
+                              HasWeightCol):
     """
     Model fitted by :class:`IsotonicRegression`.
 
@@ -560,6 +565,7 @@ class TreeRegressorParams(Params):
     """
 
     supportedImpurities = ["variance"]
+    # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
                      "Supported options: " +
@@ -724,9 +730,9 @@ def _create_model(self, java_model):
 
 
 @inherit_doc
-class DecisionTreeModel(JavaModel, JavaPredictionModel):
-    """
-    Abstraction for Decision Tree models.
+class DecisionTreeModel(JavaModel, JavaPredictionModel,
+                        HasFeaturesCol, HasLabelCol, HasPredictionCol):
+    """Abstraction for Decision Tree models.
 
     .. versionadded:: 1.5.0
     """
@@ -916,8 +922,9 @@ def _create_model(self, java_model):
         return RandomForestRegressionModel(java_model)
 
 
-class RandomForestRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable,
-                                  JavaMLReadable):
+class RandomForestRegressionModel(TreeEnsembleModel, JavaPredictionModel, HasFeaturesCol,
+                                  HasLabelCol, HasPredictionCol,
+                                  JavaMLWritable, JavaMLReadable):
     """
     Model fitted by :class:`RandomForestRegressor`.
 
@@ -1057,7 +1064,10 @@ def getLossType(self):
         return self.getOrDefault(self.lossType)
 
 
-class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable):
+class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel,
+                         HasFeaturesCol, HasLabelCol,
+                         HasPredictionCol, JavaMLWritable, JavaMLReadable):
+
     """
     Model fitted by :class:`GBTRegressor`.
 
@@ -1231,7 +1241,9 @@ def getQuantilesCol(self):
         return self.getOrDefault(self.quantilesCol)
 
 
-class AFTSurvivalRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
+class AFTSurvivalRegressionModel(JavaModel, HasFeaturesCol, HasLabelCol,
+                                 HasPredictionCol, HasFitIntercept, HasMaxIter,
+                                 HasTol, JavaMLWritable, JavaMLReadable):
     """
     .. note:: Experimental
 
@@ -1425,8 +1437,10 @@ def getLink(self):
         return self.getOrDefault(self.link)
 
 
-class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable,
-                                       JavaMLReadable):
+class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, HasLabelCol, HasFeaturesCol,
+                                       HasPredictionCol, HasFitIntercept, HasMaxIter, HasTol,
+                                       HasRegParam, HasWeightCol, HasSolver,
+                                       JavaMLWritable, JavaMLReadable):
     """
     .. note:: Experimental
 
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 25c44b7533c7..4adfb3e99b96 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -19,8 +19,8 @@
 
 from pyspark import SparkContext
 from pyspark.sql import DataFrame
-from pyspark.ml import Estimator, Transformer, Model
 from pyspark.ml.param import Params
+from pyspark.ml import Estimator, Transformer, Model
 from pyspark.ml.util import _jvm
 from pyspark.ml.common import inherit_doc, _java2py, _py2java
 
@@ -138,9 +138,7 @@ def _to_java(self):
         """
         Transfer this instance's Params to the wrapped Java object, and return the Java object.
         Used for ML persistence.
-
         Meta-algorithms such as Pipeline should override this method.
-
         :return: Java object equivalent to this instance.
         """
         self._transfer_params_to_java()
@@ -151,7 +149,6 @@ def _from_java(java_stage):
         """
         Given a Java object, create and return a Python wrapper of it.
         Used for ML persistence.
-
         Meta-algorithms such as Pipeline should override this method as a classmethod.
         """
         def __get_class(clazz):
@@ -200,7 +197,6 @@ def _create_model(self, java_model):
     def _fit_java(self, dataset):
         """
         Fits a Java model to the input dataset.
-
         :param dataset: input dataset, which is an instance of
                        :py:class:`pyspark.sql.DataFrame`
         :param params: additional params (overwriting embedded values)
@@ -211,7 +207,8 @@ def _fit_java(self, dataset):
 
     def _fit(self, dataset):
         java_model = self._fit_java(dataset)
-        return self._create_model(java_model)
+        model = self._create_model(java_model)
+        return self._copyValues(model)
 
 
 @inherit_doc
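Reviewer note: the `_fit` change in wrapper.py above is where the copying
actually happens. A simplified sketch of the intended semantics follows; this
is not Spark's real `_copyValues` code, `copy_shared_params` is a made-up name,
and it pokes at the private `_paramMap` dict the way the Params class of this
era does:

    # Illustrative sketch only -- not the implementation in
    # python/pyspark/ml/param/__init__.py. Assumes the existing Params
    # helpers hasParam/getParam/isDefined/getOrDefault.
    def copy_shared_params(estimator, model):
        """Copy each param value from the estimator onto the model's
        same-named param, which is parented to model.uid."""
        for est_param in estimator.params:
            if model.hasParam(est_param.name) and estimator.isDefined(est_param):
                # The model's own Param instance, with parent == model.uid.
                model_param = model.getParam(est_param.name)
                model._paramMap[model_param] = estimator.getOrDefault(est_param)
        return model

This is why the model classes now inherit the same Has* mixins as their
estimators: the mixins declare the matching Param instances on the model, so
the copy step has somewhere to put each value.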
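As a quick end-to-end check of what the new doctests assert, here is a
standalone version of the PCA doctest above. It assumes a local build with
this patch applied and a Spark version where `pyspark.ml.linalg` exists:

    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import PCA

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 3.0]),),
                                (Vectors.dense([4.0, 5.0, 6.0]),)], ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(df)

    # The fitted model now declares the estimator's shared params ...
    print([p.name for p in model.params])  # ['inputCol', 'outputCol']
    # ... re-parented to the model's own UID ...
    assert all(p.parent == model.uid for p in model.extractParamMap())
    # ... carrying the values the estimator was configured with.
    est_map, model_map = pca.extractParamMap(), model.extractParamMap()
    assert all(est_map[getattr(pca, p.name)] == v for p, v in model_map.items())
    spark.stop()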