docs/ml-features.md (4 changes: 3 additions & 1 deletion)
@@ -1188,7 +1188,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. It is possible
that the number of buckets used will be smaller than this value, for example, if there are too few
distinct values of the input to create enough distinct quantiles.

-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
will raise an error when it finds NaN values in the dataset, but the user can also choose to either
keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
Contributor: Actually, what does this mean exactly? When I train a QuantileDiscretizer to handle NaN values, I get back a Bucketizer that also handles them. The bucketizer does not raise an error when encountering NaNs. That seems contradictory to this statement.

Member: QuantileDiscretizer always drops NaNs during fitting, so it will not throw an error for a dataset with NaNs even if `handleInvalid = "error"`.

Contributor: Thanks for clarifying that!

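The behavior settled in this thread is easy to check end to end. A minimal sketch, assuming a SparkSession named `spark`; the data and column names are illustrative:

import org.apache.spark.ml.feature.QuantileDiscretizer

val df = spark.createDataFrame(
  Seq((0, 1.0), (1, 2.0), (2, 3.0), (3, Double.NaN))
).toDF("id", "hour")

val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(3)

// NaN rows are dropped while the quantile splits are computed, so fitting
// succeeds even though handleInvalid defaults to "error".
val bucketizer = discretizer.fit(df)

// handleInvalid matters at transform time: "error" (the default) raises on
// NaN, "skip" filters the row out, and "keep" places NaN in an extra bucket
// one past the last regular bucket.
bucketizer.setHandleInvalid("keep").transform(df).show()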
mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

@@ -307,7 +307,6 @@ class LogisticRegression @Since("1.2.0") (

  private var optInitialModel: Option[LogisticRegressionModel] = None

-  /** @group setParam */
  private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
    this.optInitialModel = Some(model)
    this

@@ -318,8 +317,9 @@ class LogisticRegression @Since("1.2.0") (
    train(dataset, handlePersistence)
  }

-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
    val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
    val instances: RDD[Instance] =
      dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala

@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType

/**
 * Params for Naive Bayes Classifiers.
 */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {

  /**
   * The smoothing parameter.
mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala

@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)

   * Default: "error"
   * @group param
   */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
  @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
    "invalid entries. Options are skip (filter out rows with invalid values), " +
    "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))

  /** @group getParam */
  @Since("2.1.0")

@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {

  private[feature] val SKIP_INVALID: String = "skip"
  private[feature] val ERROR_INVALID: String = "error"
  private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
    Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)

  /**
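For comparison, a `Bucketizer` built directly from explicit splits exercises the same `handleInvalid` options. A minimal sketch, again assuming a SparkSession named `spark`, with illustrative splits and data:

import org.apache.spark.ml.feature.Bucketizer

val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
val data = Seq(-999.9, -0.5, -0.3, 0.0, 0.2, Double.NaN)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketedFeatures")
  .setSplits(splits)
  .setHandleInvalid("skip")   // "error" (default) and "keep" are the others

// With four regular buckets (indices 0-3), "keep" would send the NaN row to
// bucket 4; "skip" simply drops it.
bucketizer.transform(df).show()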
mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala

@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params

   * Default value is 0.05.
   * @group param
   */
+  @Since("2.1.0")
  final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
    ParamValidators.inRange(0, 1))
  setDefault(fpr -> 0.05)

  /** @group getParam */
+  @Since("2.1.0")
  def getFpr: Double = $(fpr)

  /**
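A short usage sketch for the `fpr` selector type; the DataFrame `df` with "features" and "label" columns is assumed:

import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setSelectorType("fpr")   // keep every feature whose chi-squared test
                            // p-value is below the fpr threshold
  .setFpr(0.05)             // the default shown above
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val selected = selector.fit(df).transform(df)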
mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala

@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params

   * Default: "error"
   * @group param
   */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
  @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
    "invalid entries. Options are skip (filter out rows with invalid values), " +
    "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
  setDefault(handleInvalid, Bucketizer.ERROR_INVALID)

  /** @group getParam */

@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
 * possible that the number of buckets used will be smaller than this value, for example, if there
 * are too few distinct values of the input to create enough distinct quantiles.
 *
- * NaN handling: Note also that
- * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+ * NaN handling:
+ * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+ * produce a `Bucketizer` model for making predictions. During the transformation,
+ * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
 * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
 * If the user chooses to keep NaN values, they will be handled specially and placed into their own
 * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala

@@ -34,15 +34,15 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition

 * @param objectiveHistory Option containing the objective history when an optimization program is
 *                         used to solve the normal equations. None when an analytic solver is used.
 */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
    val coefficients: Array[Double],
    val aaInv: Option[Array[Double]],
    val objectiveHistory: Option[Array[Double]])

/**
 * Interface for classes that solve the normal equations locally.
 */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {

  /** Solve the normal equations from summary statistics. */
  def solve(

@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {

/**
 * A class that solves the normal equations directly, using Cholesky decomposition.
 */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {

  override def solve(
    bBar: Double,

@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {

/**
 * A class for solving the normal equations using Quasi-Newton optimization methods.
 */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
    fitIntercept: Boolean,
    maxIter: Int,
    tol: Double,
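These solver classes are internal to `ml.optim`; the user-facing way to reach the normal-equation path is the `solver` param on `LinearRegression`. A hedged sketch, assuming a training DataFrame `df` with "features" and "label" columns:

import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setSolver("normal")   // solve the normal equations locally (Cholesky or
                         // quasi-Newton, chosen internally) instead of
                         // running distributed iterative L-BFGS
  .setRegParam(0.1)

val model = lr.fit(df)
println(model.coefficients)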
mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala

@@ -391,13 +391,13 @@ class NaiveBayes private (
object NaiveBayes {

  /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"

  /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"

  /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)

  /**
   * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
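A minimal sketch of passing these model-type strings to the RDD-based API, assuming a SparkContext named `sc`; the Bernoulli variant expects 0/1 feature values:

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 1.0, 1.0))
))

// modelType must be one of the supported strings: "multinomial" (the
// default) or "bernoulli".
val model = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")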
mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala

@@ -262,7 +262,7 @@ private[spark] object ChiSqSelector {

  val Percentile: String = "percentile"

  /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"

  /** Set of selector types that ChiSqSelector supports. */
  val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala

@@ -131,17 +131,17 @@ class HashingTF(val numFeatures: Int) extends Serializable {

object HashingTF {

-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"

-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"

  private val seed = 42

  /**
   * Calculate a hash code value for the term object using the native Scala implementation.
   * This is the default hash algorithm used in Spark 1.6 and earlier.
   */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##

  /**
   * Calculate a hash code value for the term object using
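A minimal sketch of selecting the hash algorithm by its string name in the RDD-based API, assuming a SparkContext named `sc`:

import org.apache.spark.mllib.feature.HashingTF

// "murmur3" matches the Murmur3 constant above; "native" restores the
// Spark 1.6-and-earlier behavior based on Scala's ##.
val tf = new HashingTF(numFeatures = 1 << 20).setHashAlgorithm("murmur3")

val docs = sc.parallelize(Seq(Seq("a", "b", "a", "c")))
val vectors = tf.transform(docs)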