From a11558ea329b174531e4f3c3e4d95f875fbc5f5d Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 26 Nov 2015 22:23:16 +0800 Subject: [PATCH 01/13] add QuantileDiscretizer in Python --- python/pyspark/ml/feature.py | 88 +++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b02d41b52ab25..bb3d1c57afaf8 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -33,7 +33,7 @@ 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', - 'Word2Vec', 'Word2VecModel'] + 'Word2Vec', 'Word2VecModel', 'QuantileDiscretizer'] @inherit_doc @@ -2093,6 +2093,92 @@ class RFormulaModel(JavaModel): """ +@inherit_doc +class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol): + """ + .. note:: Experimental + + `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned + categorical features. The bin ranges are chosen by taking a sample of the data and dividing it + into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity, + covering all real values. This attempts to find numBuckets partitions based on a sample of data, + but it may find fewer depending on the data sample values. + + >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"]) + >>> discretizer = QuantileDiscretizer(inputCol="values", outputCol="buckets").setNumBuckets(3) + >>> bucketed = discretizer.fit(df).transform(df).collect() + >>> bucketed[0].buckets + 0.0 + >>> bucketed[1].buckets + 1.0 + >>> bucketed[2].buckets + 1.0 + >>> bucketed[3].buckets + 2.0 + + .. versionadded:: 1.7.0 + """ + + # a placeholder to make it appear in the generated doc + numBuckets = \ + Param(Params._dummy(), "numBuckets", + "Maximum number of buckets (quantiles, or categories) into which data points are " + + "grouped. Must be >= 2.") + + @keyword_only + def __init__(self, numBuckets=None, inputCol=None, outputCol=None): + """ + __init__(self, numBuckets=None, inputCol=None, outputCol=None) + """ + super(QuantileDiscretizer, self).__init__() + self._java_obj = \ + self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", self.uid) + # Maximum number of buckets (quantiles, or categories) into which data points are grouped. + # Must be >= 2. + # default: 2 + self.numBuckets = \ + Param(self, "numBuckets", + "Maximum number of buckets (quantiles, or categories) into which data points " + + "are grouped. Must be >= 2.") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + class QuantileDiscretizerModel(JavaModel): + def getSplits(self): + return self._call_java("getSplits") + + @keyword_only + @since("1.7.0") + def setParams(self, numBuckets=None, inputCol=None, outputCol=None): + """ + setParams(self, numBuckets=None, inputCol=None, outputCol=None) + Sets params for this QuantileDiscretizer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + @since("1.7.0") + def setNumBuckets(self, value): + """ + Sets the value of :py:attr:`numBuckets`. + """ + self._paramMap[self.numBuckets] = value + return self + + @since("1.7.0") + def getNumBuckets(self): + """ + Gets the value of numBuckets or its default value. + """ + return self.getOrDefault(self.numBuckets) + + def _create_model(self, java_model): + model = self.QuantileDiscretizerModel(java_model) + return Bucketizer(splits=model.getSplits(), + inputCol=self.getOrDefault("inputCol"), + outputCol=self.getOrDefault("outputCol")) + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext From 670821bd0adbb58483e453d1b220179659d254f3 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 26 Nov 2015 23:49:43 +0800 Subject: [PATCH 02/13] add ChiSqSelector in Python --- python/pyspark/ml/feature.py | 89 ++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index bb3d1c57afaf8..8afd1c64e2219 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2179,6 +2179,95 @@ def _create_model(self, java_model): outputCol=self.getOrDefault("outputCol")) +@inherit_doc +class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): + """ + .. note:: Experimental + + # Chi-Squared feature selection, which selects categorical features to use for predicting a + # categorical label. + + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame( + ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0), + ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0), + ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)], + ... ["features", "label"]) + >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures") + >>> model = selector.fit(df) + >>> model.transform(df).collect()[0].selectedFeatures + DenseVector([1.0]) + >>> model.transform(df).collect()[1].selectedFeatures + DenseVector([0.0]) + >>> model.transform(df).collect()[2].selectedFeatures + DenseVector([0.1]) + + .. versionadded:: 1.7.0 + """ + + # a placeholder to make it appear in the generated doc + numTopFeatures = \ + Param(Params._dummy(), "numTopFeatures", + "Number of features that selector will select, ordered by statistics value " + + "descending. If the number of features is < numTopFeatures, then this will select " + + "all features.") + + @keyword_only + def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"): + """ + __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label") + """ + super(ChiSqSelector, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) + self.numTopFeatures = \ + Param(self, "numTopFeatures", + "Number of features that selector will select, ordered by statistics value " + + "descending. If the number of features is < numTopFeatures, then this will " + + "select all features.") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + @since("1.7.0") + def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, + labelCol="labels"): + """ + setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, + labelCol="labels") + Sets params for this ChiSqSelector. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + @since("1.7.0") + def setNumTopFeatures(self, value): + """ + Sets the value of :py:attr:`numTopFeatures`. + """ + self._paramMap[self.numTopFeatures] = value + return self + + @since("1.7.0") + def getNumTopFeatures(self): + """ + Gets the value of numTopFeatures or its default value. + """ + return self.getOrDefault(self.numTopFeatures) + + def _create_model(self, java_model): + return ChiSqSelectorModel(java_model) + + +class ChiSqSelectorModel(JavaModel): + """ + .. note:: Experimental + + Model fitted by ChiSqSelector. + + .. versionadded:: 1.7.0 + """ + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext From 05f3eddcf38d9211f65f2af2227cb9b249955208 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 26 Nov 2015 23:54:02 +0800 Subject: [PATCH 03/13] add class exports --- python/pyspark/ml/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8afd1c64e2219..1905db20c5225 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -33,7 +33,8 @@ 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', - 'Word2Vec', 'Word2VecModel', 'QuantileDiscretizer'] + 'Word2Vec', 'Word2VecModel', 'QuantileDiscretizer', 'ChiSqSelector', + 'ChiSqSelectorModel'] @inherit_doc From 3a33327122ae94d59403d807255273180528d9a9 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 3 Dec 2015 18:33:45 +0800 Subject: [PATCH 04/13] add java competible --- .../scala/org/apache/spark/ml/feature/Bucketizer.scala | 9 ++++++++- python/pyspark/ml/feature.py | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 324353a96afb3..3fad316299083 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -19,8 +19,10 @@ package org.apache.spark.ml.feature import java.{util => ju} +import scala.collection.JavaConverters._ + import org.apache.spark.SparkException -import org.apache.spark.annotation.{Since, Experimental} +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Model import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ @@ -56,6 +58,11 @@ final class Bucketizer(override val uid: String) "otherwise, values outside the splits specified will be treated as errors.", Bucketizer.checkSplits) + /** + * Method for calling from Python code (PySpark). + */ + def getJavaSplits: java.util.List[Double] = $(splits).toSeq.asJava + /** @group getParam */ def getSplits: Array[Double] = $(splits) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 1905db20c5225..30c51e6b07bfd 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2144,9 +2144,11 @@ def __init__(self, numBuckets=None, inputCol=None, outputCol=None): kwargs = self.__init__._input_kwargs self.setParams(**kwargs) + # The inner class is used as an extractor that extracts splits from the JavaModel generated by + # QuantileDiscretizer, then constructs Bucketizer with the extracted splits. class QuantileDiscretizerModel(JavaModel): def getSplits(self): - return self._call_java("getSplits") + return self._call_java("getJavaSplits") @keyword_only @since("1.7.0") From a5e72ade18f25b64a016f156ca7950f8270364d9 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 8 Dec 2015 11:50:59 +0800 Subject: [PATCH 05/13] remove QuantileDiscretizer --- .../apache/spark/ml/feature/Bucketizer.scala | 9 +- python/pyspark/ml/feature.py | 91 +------------------ 2 files changed, 2 insertions(+), 98 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 3fad316299083..324353a96afb3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -19,10 +19,8 @@ package org.apache.spark.ml.feature import java.{util => ju} -import scala.collection.JavaConverters._ - import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Model import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ @@ -58,11 +56,6 @@ final class Bucketizer(override val uid: String) "otherwise, values outside the splits specified will be treated as errors.", Bucketizer.checkSplits) - /** - * Method for calling from Python code (PySpark). - */ - def getJavaSplits: java.util.List[Double] = $(splits).toSeq.asJava - /** @group getParam */ def getSplits: Array[Double] = $(splits) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 30c51e6b07bfd..c6ce5ab2315a0 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -33,8 +33,7 @@ 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', - 'Word2Vec', 'Word2VecModel', 'QuantileDiscretizer', 'ChiSqSelector', - 'ChiSqSelectorModel'] + 'Word2Vec', 'Word2VecModel', 'ChiSqSelector', 'ChiSqSelectorModel'] @inherit_doc @@ -2094,94 +2093,6 @@ class RFormulaModel(JavaModel): """ -@inherit_doc -class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol): - """ - .. note:: Experimental - - `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned - categorical features. The bin ranges are chosen by taking a sample of the data and dividing it - into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity, - covering all real values. This attempts to find numBuckets partitions based on a sample of data, - but it may find fewer depending on the data sample values. - - >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"]) - >>> discretizer = QuantileDiscretizer(inputCol="values", outputCol="buckets").setNumBuckets(3) - >>> bucketed = discretizer.fit(df).transform(df).collect() - >>> bucketed[0].buckets - 0.0 - >>> bucketed[1].buckets - 1.0 - >>> bucketed[2].buckets - 1.0 - >>> bucketed[3].buckets - 2.0 - - .. versionadded:: 1.7.0 - """ - - # a placeholder to make it appear in the generated doc - numBuckets = \ - Param(Params._dummy(), "numBuckets", - "Maximum number of buckets (quantiles, or categories) into which data points are " + - "grouped. Must be >= 2.") - - @keyword_only - def __init__(self, numBuckets=None, inputCol=None, outputCol=None): - """ - __init__(self, numBuckets=None, inputCol=None, outputCol=None) - """ - super(QuantileDiscretizer, self).__init__() - self._java_obj = \ - self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", self.uid) - # Maximum number of buckets (quantiles, or categories) into which data points are grouped. - # Must be >= 2. - # default: 2 - self.numBuckets = \ - Param(self, "numBuckets", - "Maximum number of buckets (quantiles, or categories) into which data points " + - "are grouped. Must be >= 2.") - kwargs = self.__init__._input_kwargs - self.setParams(**kwargs) - - # The inner class is used as an extractor that extracts splits from the JavaModel generated by - # QuantileDiscretizer, then constructs Bucketizer with the extracted splits. - class QuantileDiscretizerModel(JavaModel): - def getSplits(self): - return self._call_java("getJavaSplits") - - @keyword_only - @since("1.7.0") - def setParams(self, numBuckets=None, inputCol=None, outputCol=None): - """ - setParams(self, numBuckets=None, inputCol=None, outputCol=None) - Sets params for this QuantileDiscretizer. - """ - kwargs = self.setParams._input_kwargs - return self._set(**kwargs) - - @since("1.7.0") - def setNumBuckets(self, value): - """ - Sets the value of :py:attr:`numBuckets`. - """ - self._paramMap[self.numBuckets] = value - return self - - @since("1.7.0") - def getNumBuckets(self): - """ - Gets the value of numBuckets or its default value. - """ - return self.getOrDefault(self.numBuckets) - - def _create_model(self, java_model): - model = self.QuantileDiscretizerModel(java_model) - return Bucketizer(splits=model.getSplits(), - inputCol=self.getOrDefault("inputCol"), - outputCol=self.getOrDefault("outputCol")) - - @inherit_doc class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): """ From f49e231d4adad9c32fb07a49602ace4a46b13c20 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 8 Dec 2015 12:27:46 +0800 Subject: [PATCH 06/13] change tags from 1.7 to 1.6 --- python/pyspark/ml/feature.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index c6ce5ab2315a0..ec05aef590c7a 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2116,7 +2116,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): >>> model.transform(df).collect()[2].selectedFeatures DenseVector([0.1]) - .. versionadded:: 1.7.0 + .. versionadded:: 1.6.0 """ # a placeholder to make it appear in the generated doc @@ -2142,7 +2142,7 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, la self.setParams(**kwargs) @keyword_only - @since("1.7.0") + @since("1.6.0") def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="labels"): """ @@ -2153,7 +2153,7 @@ def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, kwargs = self.setParams._input_kwargs return self._set(**kwargs) - @since("1.7.0") + @since("1.6.0") def setNumTopFeatures(self, value): """ Sets the value of :py:attr:`numTopFeatures`. @@ -2161,7 +2161,7 @@ def setNumTopFeatures(self, value): self._paramMap[self.numTopFeatures] = value return self - @since("1.7.0") + @since("1.6.0") def getNumTopFeatures(self): """ Gets the value of numTopFeatures or its default value. @@ -2178,7 +2178,7 @@ class ChiSqSelectorModel(JavaModel): Model fitted by ChiSqSelector. - .. versionadded:: 1.7.0 + .. versionadded:: 1.6.0 """ From 657a0d4220092aec924c297942278c1a84103a03 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 10 Dec 2015 08:31:06 +0800 Subject: [PATCH 07/13] fix # and \ --- python/pyspark/ml/feature.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ec05aef590c7a..34772e1bbb678 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2098,8 +2098,8 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): """ .. note:: Experimental - # Chi-Squared feature selection, which selects categorical features to use for predicting a - # categorical label. + Chi-Squared feature selection, which selects categorical features to use for predicting a + categorical label. >>> from pyspark.mllib.linalg import Vectors >>> df = sqlContext.createDataFrame( @@ -2146,7 +2146,7 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, la def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="labels"): """ - setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, + setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,\ labelCol="labels") Sets params for this ChiSqSelector. """ From aa9d40f873fafb79d15bcaf235d92a8466fc6407 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Mon, 11 Jan 2016 19:18:55 +0800 Subject: [PATCH 08/13] merge with master --- python/pyspark/ml/feature.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 34772e1bbb678..5e343ee2b4478 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2116,7 +2116,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): >>> model.transform(df).collect()[2].selectedFeatures DenseVector([0.1]) - .. versionadded:: 1.6.0 + .. versionadded:: 2.2.0 """ # a placeholder to make it appear in the generated doc @@ -2142,7 +2142,7 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, la self.setParams(**kwargs) @keyword_only - @since("1.6.0") + @since("2.2.0") def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="labels"): """ @@ -2153,7 +2153,7 @@ def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, kwargs = self.setParams._input_kwargs return self._set(**kwargs) - @since("1.6.0") + @since("2.2.0") def setNumTopFeatures(self, value): """ Sets the value of :py:attr:`numTopFeatures`. @@ -2161,7 +2161,7 @@ def setNumTopFeatures(self, value): self._paramMap[self.numTopFeatures] = value return self - @since("1.6.0") + @since("2.2.0") def getNumTopFeatures(self): """ Gets the value of numTopFeatures or its default value. @@ -2178,7 +2178,7 @@ class ChiSqSelectorModel(JavaModel): Model fitted by ChiSqSelector. - .. versionadded:: 1.6.0 + .. versionadded:: 2.2.0 """ From e276440fc7a333f80b311fb1de475b336089b15b Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 12 Jan 2016 13:06:11 +0800 Subject: [PATCH 09/13] fix nits --- python/pyspark/ml/feature.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 5e343ee2b4478..76895bbe3bfe2 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2104,19 +2104,15 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): >>> from pyspark.mllib.linalg import Vectors >>> df = sqlContext.createDataFrame( ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0), - ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0), - ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)], + ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0), + ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)], ... ["features", "label"]) >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures") >>> model = selector.fit(df) - >>> model.transform(df).collect()[0].selectedFeatures + >>> model.transform(df).head().selectedFeatures DenseVector([1.0]) - >>> model.transform(df).collect()[1].selectedFeatures - DenseVector([0.0]) - >>> model.transform(df).collect()[2].selectedFeatures - DenseVector([0.1]) - .. versionadded:: 2.2.0 + .. versionadded:: 2.0.0 """ # a placeholder to make it appear in the generated doc @@ -2142,7 +2138,7 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, la self.setParams(**kwargs) @keyword_only - @since("2.2.0") + @since("2.0.0") def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="labels"): """ @@ -2153,7 +2149,7 @@ def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, kwargs = self.setParams._input_kwargs return self._set(**kwargs) - @since("2.2.0") + @since("2.0.0") def setNumTopFeatures(self, value): """ Sets the value of :py:attr:`numTopFeatures`. @@ -2161,7 +2157,7 @@ def setNumTopFeatures(self, value): self._paramMap[self.numTopFeatures] = value return self - @since("2.2.0") + @since("2.0.0") def getNumTopFeatures(self): """ Gets the value of numTopFeatures or its default value. @@ -2178,7 +2174,7 @@ class ChiSqSelectorModel(JavaModel): Model fitted by ChiSqSelector. - .. versionadded:: 2.2.0 + .. versionadded:: 2.0.0 """ From 0bd12717f2c88415c5312693651a2b7538fd2244 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 13 Jan 2016 01:12:53 +0800 Subject: [PATCH 10/13] add selectedFeatures --- .../org/apache/spark/ml/feature/ChiSqSelector.scala | 5 +++++ python/pyspark/ml/feature.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 7b565ef3ed922..58f0c64e379ee 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.feature +import scala.collection.JavaConverters._ + import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} @@ -119,6 +121,9 @@ final class ChiSqSelectorModel private[ml] ( /** list of indices to select (filter). Must be ordered asc */ val selectedFeatures: Array[Int] = chiSqSelector.selectedFeatures + /** Java-friendly version of [[selectedFeatures]]. */ + def javaSelectedFeatures: java.util.List[Int] = selectedFeatures.toSeq.asJava + /** @group setParam */ def setFeaturesCol(value: String): this.type = set(featuresCol, value) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 76895bbe3bfe2..252d8eb527c02 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2111,6 +2111,8 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): >>> model = selector.fit(df) >>> model.transform(df).head().selectedFeatures DenseVector([1.0]) + >>> model.selectedFeatures + [3] .. versionadded:: 2.0.0 """ @@ -2177,6 +2179,14 @@ class ChiSqSelectorModel(JavaModel): .. versionadded:: 2.0.0 """ + @property + @since("2.0.0") + def selectedFeatures(self): + """ + Standard deviation of the StandardScalerModel. + """ + return self._call_java("javaSelectedFeatures") + if __name__ == "__main__": import doctest From 32cdbb0e8013492092f5f7f35184beaccf3f4325 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 15 Jan 2016 17:00:12 +0800 Subject: [PATCH 11/13] change the calling --- .../scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 3 --- python/pyspark/ml/feature.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 58f0c64e379ee..6dbb08a136843 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -121,9 +121,6 @@ final class ChiSqSelectorModel private[ml] ( /** list of indices to select (filter). Must be ordered asc */ val selectedFeatures: Array[Int] = chiSqSelector.selectedFeatures - /** Java-friendly version of [[selectedFeatures]]. */ - def javaSelectedFeatures: java.util.List[Int] = selectedFeatures.toSeq.asJava - /** @group setParam */ def setFeaturesCol(value: String): this.type = set(featuresCol, value) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 252d8eb527c02..d7b3408f57007 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2183,9 +2183,9 @@ class ChiSqSelectorModel(JavaModel): @since("2.0.0") def selectedFeatures(self): """ - Standard deviation of the StandardScalerModel. + list of indices to select (filter). Must be ordered asc. """ - return self._call_java("javaSelectedFeatures") + return list(self._java_obj.selectedFeatures()) if __name__ == "__main__": From 223fdf4c41628fcf45cc52d8fdd13c164f53199a Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 15 Jan 2016 17:01:18 +0800 Subject: [PATCH 12/13] remove import --- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 6dbb08a136843..7b565ef3ed922 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -17,8 +17,6 @@ package org.apache.spark.ml.feature -import scala.collection.JavaConverters._ - import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} From 3fca95ed1ae22c6507a4be7a4f647e6972547064 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 26 Jan 2016 09:03:52 -0800 Subject: [PATCH 13/13] change to call_java --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 91b32abf83055..32f324685a9cf 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2328,9 +2328,9 @@ class ChiSqSelectorModel(JavaModel): @since("2.0.0") def selectedFeatures(self): """ - list of indices to select (filter). Must be ordered asc. + List of indices to select (filter). Must be ordered asc. """ - return list(self._java_obj.selectedFeatures()) + return self._call_java("selectedFeatures") if __name__ == "__main__":