diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 324353a96afb..3fad31629908 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -19,8 +19,10 @@ package org.apache.spark.ml.feature
 
 import java.{util => ju}
 
+import scala.collection.JavaConverters._
+
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.{Since, Experimental}
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.Model
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
@@ -56,6 +58,11 @@ final class Bucketizer(override val uid: String)
     "otherwise, values outside the splits specified will be treated as errors.",
     Bucketizer.checkSplits)
 
+  /**
+   * Method for calling from Python code (PySpark).
+   */
+  def getJavaSplits: java.util.List[Double] = $(splits).toSeq.asJava
+
   /** @group getParam */
   def getSplits: Array[Double] = $(splits)
 
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b02d41b52ab2..30c51e6b07bf 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -33,7 +33,8 @@
            'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
            'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
            'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
-           'Word2Vec', 'Word2VecModel']
+           'Word2Vec', 'Word2VecModel', 'QuantileDiscretizer', 'ChiSqSelector',
+           'ChiSqSelectorModel']
 
 
 @inherit_doc
@@ -2093,6 +2094,183 @@ class RFormulaModel(JavaModel):
     """
 
 
+@inherit_doc
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    `QuantileDiscretizer` takes a column with continuous features and outputs a column with
+    binned categorical features. The bin ranges are chosen by taking a sample of the data and
+    dividing it into roughly equal parts. The lower and upper bin bounds will be -Infinity and
+    +Infinity, covering all real values. This attempts to find numBuckets partitions based on a
+    sample of the data, but it may find fewer depending on the data sample values.
+
+    >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
+    >>> discretizer = QuantileDiscretizer(inputCol="values", outputCol="buckets").setNumBuckets(3)
+    >>> bucketed = discretizer.fit(df).transform(df).collect()
+    >>> bucketed[0].buckets
+    0.0
+    >>> bucketed[1].buckets
+    1.0
+    >>> bucketed[2].buckets
+    1.0
+    >>> bucketed[3].buckets
+    2.0
+
+    .. versionadded:: 1.7.0
+    """
+
+    # a placeholder to make it appear in the generated doc
+    numBuckets = \
+        Param(Params._dummy(), "numBuckets",
+              "Maximum number of buckets (quantiles, or categories) into which data points are " +
+              "grouped. Must be >= 2.")
+
+    @keyword_only
+    def __init__(self, numBuckets=None, inputCol=None, outputCol=None):
+        """
+        __init__(self, numBuckets=None, inputCol=None, outputCol=None)
+        """
+        super(QuantileDiscretizer, self).__init__()
+        self._java_obj = \
+            self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", self.uid)
+        # Maximum number of buckets (quantiles, or categories) into which data points are
+        # grouped. Must be >= 2.
+        # default: 2
+        self.numBuckets = \
+            Param(self, "numBuckets",
+                  "Maximum number of buckets (quantiles, or categories) into which data points " +
+                  "are grouped. Must be >= 2.")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    # The inner class is used as an extractor that pulls the splits out of the JavaModel
+    # produced by QuantileDiscretizer; a Bucketizer is then constructed from the extracted
+    # splits.
+    class QuantileDiscretizerModel(JavaModel):
+        def getSplits(self):
+            return self._call_java("getJavaSplits")
+
+    @keyword_only
+    @since("1.7.0")
+    def setParams(self, numBuckets=None, inputCol=None, outputCol=None):
+        """
+        setParams(self, numBuckets=None, inputCol=None, outputCol=None)
+        Sets params for this QuantileDiscretizer.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    @since("1.7.0")
+    def setNumBuckets(self, value):
+        """
+        Sets the value of :py:attr:`numBuckets`.
+        """
+        self._paramMap[self.numBuckets] = value
+        return self
+
+    @since("1.7.0")
+    def getNumBuckets(self):
+        """
+        Gets the value of numBuckets or its default value.
+        """
+        return self.getOrDefault(self.numBuckets)
+
+    def _create_model(self, java_model):
+        model = self.QuantileDiscretizerModel(java_model)
+        return Bucketizer(splits=model.getSplits(),
+                          inputCol=self.getOrDefault("inputCol"),
+                          outputCol=self.getOrDefault("outputCol"))
+
+
+@inherit_doc
+class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol):
+    """
+    .. note:: Experimental
+
+    Chi-Squared feature selection, which selects categorical features to use for predicting a
+    categorical label.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame(
+    ...    [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
+    ...     (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
+    ...     (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
+    ...    ["features", "label"])
+    >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
+    >>> model = selector.fit(df)
+    >>> model.transform(df).collect()[0].selectedFeatures
+    DenseVector([1.0])
+    >>> model.transform(df).collect()[1].selectedFeatures
+    DenseVector([0.0])
+    >>> model.transform(df).collect()[2].selectedFeatures
+    DenseVector([0.1])
+
+    .. versionadded:: 1.7.0
+    """
+
+    # a placeholder to make it appear in the generated doc
+    numTopFeatures = \
+        Param(Params._dummy(), "numTopFeatures",
+              "Number of features that selector will select, ordered by statistics value " +
+              "descending. If the number of features is < numTopFeatures, then this will select " +
+              "all features.")
+
+    @keyword_only
+    def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None,
+                 labelCol="label"):
+        """
+        __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
+                 labelCol="label")
+        """
+        super(ChiSqSelector, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
+        self.numTopFeatures = \
+            Param(self, "numTopFeatures",
+                  "Number of features that selector will select, ordered by statistics value " +
+                  "descending. If the number of features is < numTopFeatures, then this will " +
+                  "select all features.")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("1.7.0")
+    def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
+                  labelCol="label"):
+        """
+        setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
+                  labelCol="label")
+        Sets params for this ChiSqSelector.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    @since("1.7.0")
+    def setNumTopFeatures(self, value):
+        """
+        Sets the value of :py:attr:`numTopFeatures`.
+ """ + self._paramMap[self.numTopFeatures] = value + return self + + @since("1.7.0") + def getNumTopFeatures(self): + """ + Gets the value of numTopFeatures or its default value. + """ + return self.getOrDefault(self.numTopFeatures) + + def _create_model(self, java_model): + return ChiSqSelectorModel(java_model) + + +class ChiSqSelectorModel(JavaModel): + """ + .. note:: Experimental + + Model fitted by ChiSqSelector. + + .. versionadded:: 1.7.0 + """ + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext