@@ -88,8 +88,8 @@ private[regression] trait AFTSurvivalRegressionParams extends Params
def getQuantilesCol: String = $(quantilesCol)

/** Checks whether the input has quantiles column name. */
protected[regression] def hasQuantilesCol: Boolean = {
isDefined(quantilesCol) && $(quantilesCol) != ""
private[regression] def hasQuantilesCol: Boolean = {
isDefined(quantilesCol) && $(quantilesCol).nonEmpty
}

/**
@@ -43,6 +43,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
with HasFitIntercept with HasMaxIter with HasTol with HasRegParam with HasWeightCol
with HasSolver with Logging {

import GeneralizedLinearRegression._

/**
* Param for the name of family which is a description of the error distribution
* to be used in the model.
@@ -54,8 +56,8 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
@Since("2.0.0")
final val family: Param[String] = new Param(this, "family",
"The name of family which is a description of the error distribution to be used in the " +
"model. Supported options: gaussian(default), binomial, poisson and gamma.",
ParamValidators.inArray[String](GeneralizedLinearRegression.supportedFamilyNames.toArray))
s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.",
ParamValidators.inArray[String](supportedFamilyNames.toArray))

/** @group getParam */
@Since("2.0.0")
@@ -71,29 +73,32 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
@Since("2.0.0")
final val link: Param[String] = new Param(this, "link", "The name of link function " +
"which provides the relationship between the linear predictor and the mean of the " +
"distribution function. Supported options: identity, log, inverse, logit, probit, " +
"cloglog and sqrt.",
ParamValidators.inArray[String](GeneralizedLinearRegression.supportedLinkNames.toArray))
s"distribution function. Supported options: ${supportedLinkNames.mkString(", ")}",
ParamValidators.inArray[String](supportedLinkNames.toArray))

/** @group getParam */
@Since("2.0.0")
def getLink: String = $(link)

/**
* Param for link prediction (linear predictor) column name.
* Default is empty, which means we do not output link prediction.
* Default is not set, which means we do not output link prediction.
*
* @group param
*/
@Since("2.0.0")
final val linkPredictionCol: Param[String] = new Param[String](this, "linkPredictionCol",
"link prediction (linear predictor) column name")
setDefault(linkPredictionCol, "")

/** @group getParam */
@Since("2.0.0")
def getLinkPredictionCol: String = $(linkPredictionCol)

/** Checks whether we should output link prediction. */
private[regression] def hasLinkPredictionCol: Boolean = {
isDefined(linkPredictionCol) && $(linkPredictionCol).nonEmpty
}

import GeneralizedLinearRegression._

@Since("2.0.0")
@@ -107,7 +112,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
s"with ${$(family)} family does not support ${$(link)} link function.")
}
val newSchema = super.validateAndTransformSchema(schema, fitting, featuresDataType)
if ($(linkPredictionCol).nonEmpty) {
if (hasLinkPredictionCol) {
SchemaUtils.appendColumn(newSchema, $(linkPredictionCol), DoubleType)
} else {
newSchema
@@ -205,7 +210,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
/**
* Sets the value of param [[weightCol]].
* If this is not set or empty, we treat all instance weights as 1.0.
* Default is empty, so all instances have weight one.
* Default is not set, so all instances have weight one.
*
* @group setParam
*/
@@ -214,7 +219,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val

/**
* Sets the solver algorithm used for optimization.
* Currently only support "irls" which is also the default solver.
* Currently only supports "irls" which is also the default solver.
*
* @group setParam
*/
@@ -239,10 +244,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
}
val familyAndLink = new FamilyAndLink(familyObj, linkObj)

val numFeatures = dataset.select(col($(featuresCol))).limit(1).rdd
.map { case Row(features: Vector) =>
features.size
}.first()
val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size
Contributor: can we not do dataset.select(col($(featuresCol))).as[Vector].first().size?

Contributor Author: You mean dataset.select(col($(featuresCol))).as[Vector].first().size? I think it's OK.

Contributor Author: It looks like Spark does not provide an encoder for Vector. If I change to use as[Vector], the compiler will complain:

Error:(244, 61) Unable to find encoder for type stored in a Dataset.  Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._  Support for serializing other types will be added in future releases.
    val numFeatures = dataset.select(col($(featuresCol))).as[Vector].first().size
                                                            ^

Contributor: Ah right, we would need to add an implicit encoder, implicit def encoder: Encoder[Vector] = ExpressionEncoder(), e.g. see #12718 (comment).

However, let's leave that change for #12718
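For reference, a minimal sketch of that suggestion (hypothetical helper object, not part of this PR; whether ExpressionEncoder() resolves for the Vector UDT depends on the Spark version, which is part of why the change was deferred to #12718):

import org.apache.spark.mllib.linalg.Vector  // ml.linalg.Vector in later Spark versions
import org.apache.spark.sql.{Dataset, Encoder}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.col

object VectorEncoderSketch {
  // The implicit encoder the reviewer refers to; Spark does not supply one for Vector here.
  implicit def vectorEncoder: Encoder[Vector] = ExpressionEncoder()

  // With the encoder in scope, the shorter form discussed in this thread would compile:
  def numFeatures(dataset: Dataset[_], featuresCol: String): Int =
    dataset.select(col(featuresCol)).as[Vector].first().size
}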

if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) {
val msg = "Currently, GeneralizedLinearRegression only supports number of features" +
s" <= ${WeightedLeastSquares.MAX_NUM_FEATURES}. Found $numFeatures in the input dataset."
@@ -294,25 +296,25 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def load(path: String): GeneralizedLinearRegression = super.load(path)

/** Set of family and link pairs that GeneralizedLinearRegression supports. */
private[ml] lazy val supportedFamilyAndLinkPairs = Set(
private[regression] lazy val supportedFamilyAndLinkPairs = Set(
Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse,
Binomial -> Logit, Binomial -> Probit, Binomial -> CLogLog,
Poisson -> Log, Poisson -> Identity, Poisson -> Sqrt,
Gamma -> Inverse, Gamma -> Identity, Gamma -> Log
)

/** Set of family names that GeneralizedLinearRegression supports. */
private[ml] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1.name)
private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1.name)

/** Set of link names that GeneralizedLinearRegression supports. */
private[ml] lazy val supportedLinkNames = supportedFamilyAndLinkPairs.map(_._2.name)
private[regression] lazy val supportedLinkNames = supportedFamilyAndLinkPairs.map(_._2.name)

private[ml] val epsilon: Double = 1E-16
private[regression] val epsilon: Double = 1E-16

/**
* Wrapper of family and link combination used in the model.
*/
private[ml] class FamilyAndLink(val family: Family, val link: Link) extends Serializable {
private[regression] class FamilyAndLink(val family: Family, val link: Link) extends Serializable {

/** Linear predictor based on given mu. */
def predict(mu: Double): Double = link.link(family.project(mu))
@@ -359,7 +361,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
*
* @param name the name of the family.
*/
private[ml] abstract class Family(val name: String) extends Serializable {
private[regression] abstract class Family(val name: String) extends Serializable {

/** The default link instance of this family. */
val defaultLink: Link
@@ -391,7 +393,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
def project(mu: Double): Double = mu
}

private[ml] object Family {
private[regression] object Family {

/**
* Gets the [[Family]] object from its name.
@@ -412,7 +414,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
* Gaussian exponential family distribution.
* The default link for the Gaussian family is the identity link.
*/
private[ml] object Gaussian extends Family("gaussian") {
private[regression] object Gaussian extends Family("gaussian") {

val defaultLink: Link = Identity

@@ -448,7 +450,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
* Binomial exponential family distribution.
* The default link for the Binomial family is the logit link.
*/
private[ml] object Binomial extends Family("binomial") {
private[regression] object Binomial extends Family("binomial") {

val defaultLink: Link = Logit

@@ -492,7 +494,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
* Poisson exponential family distribution.
* The default link for the Poisson family is the log link.
*/
private[ml] object Poisson extends Family("poisson") {
private[regression] object Poisson extends Family("poisson") {

val defaultLink: Link = Log

@@ -533,7 +535,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
* Gamma exponential family distribution.
* The default link for the Gamma family is the inverse link.
*/
private[ml] object Gamma extends Family("gamma") {
private[regression] object Gamma extends Family("gamma") {

val defaultLink: Link = Inverse

@@ -578,7 +580,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
*
* @param name the name of link function.
*/
private[ml] abstract class Link(val name: String) extends Serializable {
private[regression] abstract class Link(val name: String) extends Serializable {

/** The link function. */
def link(mu: Double): Double
@@ -590,7 +592,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
def unlink(eta: Double): Double
}

private[ml] object Link {
private[regression] object Link {

/**
* Gets the [[Link]] object from its name.
@@ -611,7 +613,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
}
}

private[ml] object Identity extends Link("identity") {
private[regression] object Identity extends Link("identity") {

override def link(mu: Double): Double = mu

@@ -620,7 +622,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def unlink(eta: Double): Double = eta
}

private[ml] object Logit extends Link("logit") {
private[regression] object Logit extends Link("logit") {

override def link(mu: Double): Double = math.log(mu / (1.0 - mu))

@@ -629,7 +631,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def unlink(eta: Double): Double = 1.0 / (1.0 + math.exp(-1.0 * eta))
}

private[ml] object Log extends Link("log") {
private[regression] object Log extends Link("log") {

override def link(mu: Double): Double = math.log(mu)

@@ -638,7 +640,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def unlink(eta: Double): Double = math.exp(eta)
}

private[ml] object Inverse extends Link("inverse") {
private[regression] object Inverse extends Link("inverse") {

override def link(mu: Double): Double = 1.0 / mu

@@ -647,7 +649,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def unlink(eta: Double): Double = 1.0 / eta
}

private[ml] object Probit extends Link("probit") {
private[regression] object Probit extends Link("probit") {

override def link(mu: Double): Double = dist.Gaussian(0.0, 1.0).icdf(mu)

@@ -658,7 +660,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def unlink(eta: Double): Double = dist.Gaussian(0.0, 1.0).cdf(eta)
}

private[ml] object CLogLog extends Link("cloglog") {
private[regression] object CLogLog extends Link("cloglog") {

override def link(mu: Double): Double = math.log(-1.0 * math.log(1 - mu))

@@ -667,7 +669,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta))
}

private[ml] object Sqrt extends Link("sqrt") {
private[regression] object Sqrt extends Link("sqrt") {

override def link(mu: Double): Double = math.sqrt(mu)

@@ -732,7 +734,7 @@ class GeneralizedLinearRegressionModel private[ml] (
if ($(predictionCol).nonEmpty) {
output = output.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
}
if ($(linkPredictionCol).nonEmpty) {
if (hasLinkPredictionCol) {
output = output.withColumn($(linkPredictionCol), predictLinkUDF(col($(featuresCol))))
}
output.toDF()
@@ -853,7 +855,7 @@ class GeneralizedLinearRegressionSummary private[regression] (
*/
@Since("2.0.0")
val predictionCol: String = {
if (origModel.isDefined(origModel.predictionCol) && origModel.getPredictionCol != "") {
if (origModel.isDefined(origModel.predictionCol) && origModel.getPredictionCol.nonEmpty) {
origModel.getPredictionCol
} else {
"prediction_" + java.util.UUID.randomUUID.toString
@@ -69,8 +69,8 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures
setDefault(isotonic -> true, featureIndex -> 0)

/** Checks whether the input has weight column. */
protected[ml] def hasWeightCol: Boolean = {
isDefined(weightCol) && $(weightCol) != ""
private[regression] def hasWeightCol: Boolean = {
isDefined(weightCol) && $(weightCol).nonEmpty
}

/**
@@ -159,9 +159,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String

override protected def train(dataset: Dataset[_]): LinearRegressionModel = {
// Extract the number of features before deciding optimization solver.
val numFeatures = dataset.select(col($(featuresCol))).limit(1).rdd.map {
case Row(features: Vector) => features.size
}.first()
val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size
Contributor: Same here, can we do as[Vector] instead?

val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))

if (($(solver) == "auto" && $(elasticNetParam) == 0.0 &&
@@ -240,7 +238,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
val coefficients = Vectors.sparse(numFeatures, Seq())
val intercept = yMean

val model = new LinearRegressionModel(uid, coefficients, intercept)
val model = copyValues(new LinearRegressionModel(uid, coefficients, intercept))
// Handle possible missing or invalid prediction columns
val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol()

@@ -252,7 +250,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
model,
Array(0D),
Array(0D))
return copyValues(model.setSummary(trainingSummary))
return model.setSummary(trainingSummary)
Contributor Author: This is a minor bug in LinearRegression: we should first copy values from the parent estimator and then call findSummaryModelAndPredictionCol, otherwise we always get an empty predictionCol (and other params) for the LinearRegressionModel.
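A condensed view of the ordering this fixes, taken from the hunk above (the rest of train() is elided, so this is a sketch rather than the full method):

// Old order: the model was created bare and copyValues was applied only on return,
// so findSummaryModelAndPredictionCol() saw the model's unset predictionCol.
// New order: copy the estimator's params onto the model first, then resolve the summary column.
val model = copyValues(new LinearRegressionModel(uid, coefficients, intercept))
val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol()
// ... build trainingSummary using summaryModel and predictionColName ...
return model.setSummary(trainingSummary)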

Contributor: So test cases didn't pick this up? We should look into why and amend the tests accordingly.

Contributor Author: Yes, this is because we don't have good test coverage here. I will add a test case after collecting comments.

Contributor Author: @MLnick I added a test case for this scenario and updated the other test cases to ensure the prediction column (and other params) is copied correctly in all situations.

Member: We also need to setParent.

Contributor Author: @jkbradley It is not necessary to setParent here, because it is already done in Predictor.fit:

override def fit(dataset: Dataset[_]): M = {
    // This handles a few items such as schema validation.
    // Developers only need to implement train().
    transformSchema(dataset.schema, logging = true)
    copyValues(train(dataset).setParent(this))
  }

} else {
require($(regParam) == 0.0, "The standard deviation of the label is zero. " +
"Model cannot be regularized.")
@@ -610,20 +610,31 @@ class LinearRegressionSuite
val model1 = new LinearRegression()
.setFitIntercept(fitIntercept)
.setWeightCol("weight")
.setPredictionCol("myPrediction")
.setSolver(solver)
.fit(datasetWithWeightConstantLabel)
val actual1 = Vectors.dense(model1.intercept, model1.coefficients(0),
model1.coefficients(1))
assert(actual1 ~== expected(idx) absTol 1e-4)

// Schema of summary.predictions should be a superset of the input dataset
assert((datasetWithWeightConstantLabel.schema.fieldNames.toSet + model1.getPredictionCol)
.subsetOf(model1.summary.predictions.schema.fieldNames.toSet))

val model2 = new LinearRegression()
.setFitIntercept(fitIntercept)
.setWeightCol("weight")
.setPredictionCol("myPrediction")
.setSolver(solver)
.fit(datasetWithWeightZeroLabel)
val actual2 = Vectors.dense(model2.intercept, model2.coefficients(0),
model2.coefficients(1))
assert(actual2 ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1e-4)

// Schema of summary.predictions should be a superset of the input dataset
assert((datasetWithWeightZeroLabel.schema.fieldNames.toSet + model2.getPredictionCol)
.subsetOf(model2.summary.predictions.schema.fieldNames.toSet))

idx += 1
}
}
@@ -672,7 +683,7 @@ class LinearRegressionSuite

test("linear regression model training summary") {
Seq("auto", "l-bfgs", "normal").foreach { solver =>
val trainer = new LinearRegression().setSolver(solver)
val trainer = new LinearRegression().setSolver(solver).setPredictionCol("myPrediction")
val model = trainer.fit(datasetWithDenseFeature)
val trainerNoPredictionCol = trainer.setPredictionCol("")
val modelNoPredictionCol = trainerNoPredictionCol.fit(datasetWithDenseFeature)
Expand All @@ -682,7 +693,7 @@ class LinearRegressionSuite
assert(modelNoPredictionCol.hasSummary)

// Schema should be a superset of the input dataset
assert((datasetWithDenseFeature.schema.fieldNames.toSet + "prediction").subsetOf(
assert((datasetWithDenseFeature.schema.fieldNames.toSet + model.getPredictionCol).subsetOf(
model.summary.predictions.schema.fieldNames.toSet))
// Validate that we re-insert a prediction column for evaluation
val modelNoPredictionColFieldNames