mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -19,8 +19,10 @@ package org.apache.spark.ml.feature

import java.{util => ju}

import scala.collection.JavaConverters._

import org.apache.spark.SparkException
-import org.apache.spark.annotation.{Since, Experimental}
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.Model
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
@@ -56,6 +58,11 @@ final class Bucketizer(override val uid: String)
"otherwise, values outside the splits specified will be treated as errors.",
Bucketizer.checkSplits)

/**
* Method for calling from Python code (PySpark).
*/
def getJavaSplits: java.util.List[Double] = $(splits).toSeq.asJava

/** @group getParam */
def getSplits: Array[Double] = $(splits)
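For readers following the Python side of this change: `getJavaSplits` exists so that PySpark can pull the fitted splits across the Py4J bridge. Below is a minimal, self-contained sketch of the call pattern; the `Fake*` classes are hypothetical stand-ins (the real plumbing lives in py4j and `pyspark.ml.wrapper`), shown only to illustrate the mechanism.

```python
# Illustrative mock, not part of the diff: how PySpark-side code reaches
# getJavaSplits. No JVM required; the Fake* names are hypothetical.
class FakeJavaBucketizer(object):
    def getJavaSplits(self):
        # py4j would return a java.util.List[Double]; by the time the
        # caller sees it, it behaves like a Python list of floats.
        return [float("-inf"), 0.5, 1.3, float("inf")]

class FakeJavaModel(object):
    def __init__(self, java_obj):
        self._java_obj = java_obj

    def _call_java(self, name):
        # Simplified: the real JavaModel._call_java also converts
        # arguments and return values between Python and the JVM.
        return getattr(self._java_obj, name)()

model = FakeJavaModel(FakeJavaBucketizer())
print(model._call_java("getJavaSplits"))  # [-inf, 0.5, 1.3, inf]
```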

180 changes: 179 additions & 1 deletion python/pyspark/ml/feature.py
@@ -33,7 +33,8 @@
'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
-    'Word2Vec', 'Word2VecModel']
+    'Word2Vec', 'Word2VecModel', 'QuantileDiscretizer', 'ChiSqSelector',
+    'ChiSqSelectorModel']


@inherit_doc
@@ -2093,6 +2094,183 @@ class RFormulaModel(JavaModel):
"""


@inherit_doc
class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
"""
.. note:: Experimental

`QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
covering all real values. It attempts to find numBuckets partitions based on a sample of the
data, but may find fewer if the sample contains too few distinct values.

>>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> discretizer = QuantileDiscretizer(inputCol="values", outputCol="buckets").setNumBuckets(3)
>>> bucketed = discretizer.fit(df).transform(df).collect()
>>> bucketed[0].buckets
0.0
>>> bucketed[1].buckets
1.0
>>> bucketed[2].buckets
1.0
>>> bucketed[3].buckets
2.0

.. versionadded:: 1.7.0
"""

# a placeholder to make it appear in the generated doc
numBuckets = \
Param(Params._dummy(), "numBuckets",
"Maximum number of buckets (quantiles, or categories) into which data points are " +
"grouped. Must be >= 2.")

@keyword_only
def __init__(self, numBuckets=None, inputCol=None, outputCol=None):
"""
__init__(self, numBuckets=None, inputCol=None, outputCol=None)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = \
self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", self.uid)
# Maximum number of buckets (quantiles, or categories) into which data points are grouped.
# Must be >= 2.
# default: 2
self.numBuckets = \
Param(self, "numBuckets",
"Maximum number of buckets (quantiles, or categories) into which data points " +
"are grouped. Must be >= 2.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

# This inner class extracts the fitted splits from the JavaModel produced by
# QuantileDiscretizer; _create_model below then constructs a Bucketizer from them.
class QuantileDiscretizerModel(JavaModel):
def getSplits(self):
return self._call_java("getJavaSplits")

@keyword_only
@since("1.7.0")
def setParams(self, numBuckets=None, inputCol=None, outputCol=None):
"""
setParams(self, numBuckets=None, inputCol=None, outputCol=None)
Sets params for this QuantileDiscretizer.
"""
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

@since("1.7.0")
def setNumBuckets(self, value):
"""
Sets the value of :py:attr:`numBuckets`.
"""
self._paramMap[self.numBuckets] = value
return self

@since("1.7.0")
def getNumBuckets(self):
"""
Gets the value of numBuckets or its default value.
"""
return self.getOrDefault(self.numBuckets)

def _create_model(self, java_model):
model = self.QuantileDiscretizerModel(java_model)
return Bucketizer(splits=model.getSplits(),
inputCol=self.getOrDefault("inputCol"),
outputCol=self.getOrDefault("outputCol"))
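A short usage sketch of what `_create_model` above implies for callers: `fit()` hands back a plain `Bucketizer` rather than a dedicated model class, so the learned boundaries can be read back via `getSplits()`. This assumes a live `sqlContext` as in the doctests; the printed splits are illustrative, since they depend on the sampled data.

```python
df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
bucketizer = QuantileDiscretizer(numBuckets=3, inputCol="values",
                                 outputCol="buckets").fit(df)
# fit() returned a Bucketizer, so the learned bin edges are inspectable
# and the transformer can be reused on new data with the same schema.
print(type(bucketizer).__name__)  # Bucketizer
print(bucketizer.getSplits())     # e.g. [-inf, 0.4, 1.2, inf]; data dependent
```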


@inherit_doc
class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol):
"""
.. note:: Experimental

Chi-Squared feature selection, which selects categorical features to use for predicting a
categorical label.

>>> from pyspark.mllib.linalg import Vectors
>>> df = sqlContext.createDataFrame(
... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
... ["features", "label"])
>>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
>>> model = selector.fit(df)
>>> model.transform(df).collect()[0].selectedFeatures
DenseVector([1.0])
>>> model.transform(df).collect()[1].selectedFeatures
DenseVector([0.0])
>>> model.transform(df).collect()[2].selectedFeatures
DenseVector([0.1])

.. versionadded:: 1.7.0
"""

# a placeholder to make it appear in the generated doc
numTopFeatures = \
Param(Params._dummy(), "numTopFeatures",
"Number of features that selector will select, ordered by statistics value " +
"descending. If the number of features is < numTopFeatures, then this will select " +
"all features.")

@keyword_only
def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"):
"""
__init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label")
"""
super(ChiSqSelector, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
self.numTopFeatures = \
Param(self, "numTopFeatures",
"Number of features that selector will select, ordered by statistics value " +
"descending. If the number of features is < numTopFeatures, then this will " +
"select all features.")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("1.7.0")
def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
labelCol="label"):
"""
setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
labelCol="label")
Sets params for this ChiSqSelector.
"""
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

@since("1.7.0")
def setNumTopFeatures(self, value):
"""
Sets the value of :py:attr:`numTopFeatures`.
"""
self._paramMap[self.numTopFeatures] = value
return self

@since("1.7.0")
def getNumTopFeatures(self):
"""
Gets the value of numTopFeatures or its default value.
"""
return self.getOrDefault(self.numTopFeatures)

def _create_model(self, java_model):
return ChiSqSelectorModel(java_model)


class ChiSqSelectorModel(JavaModel):
"""
.. note:: Experimental

Model fitted by ChiSqSelector.

.. versionadded:: 1.7.0
"""


if __name__ == "__main__":
import doctest
from pyspark.context import SparkContext