Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 5 additions & 15 deletions R/pkg/inst/tests/testthat/test_mllib.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,6 @@ test_that("spark.glm and predict", {
rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)

# binomial family
binomialTraining <- training[training$Species %in% c("versicolor", "virginica"), ]
model <- spark.glm(binomialTraining, Species ~ Sepal_Length + Sepal_Width,
family = binomial(link = "logit"))
prediction <- predict(model, binomialTraining)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
"versicolor", "virginica", "versicolor", "virginica", "versicolor")
expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)

# poisson family
model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
family = poisson(link = identity))
Expand Down Expand Up @@ -138,10 +128,10 @@ test_that("spark.glm summary", {
expect_equal(stats$aic, rStats$aic)

# Test spark.glm works with weighted dataset
a1 <- c(0, 1, 2, 3, 4)
a2 <- c(5, 2, 1, 3, 2)
w <- c(1, 2, 3, 4, 5)
b <- c(1, 0, 1, 0, 0)
a1 <- c(0, 1, 2, 3)
a2 <- c(5, 2, 1, 3)
w <- c(1, 2, 3, 4)
b <- c(1, 0, 1, 0)
data <- as.data.frame(cbind(a1, a2, w, b))
df <- createDataFrame(data)

Expand All @@ -168,7 +158,7 @@ test_that("spark.glm summary", {
data <- as.data.frame(cbind(a1, a2, b))
df <- suppressWarnings(createDataFrame(data))
regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result
expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result

# Test spark.glm works on collinear data
A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,12 @@ import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.regression._
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.regression._
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

private[r] class GeneralizedLinearRegressionWrapper private (
val pipeline: PipelineModel,
Expand All @@ -48,8 +43,6 @@ private[r] class GeneralizedLinearRegressionWrapper private (
val rNumIterations: Int,
val isLoaded: Boolean = false) extends MLWritable {

import GeneralizedLinearRegressionWrapper._

private val glm: GeneralizedLinearRegressionModel =
pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel]

Expand All @@ -60,16 +53,7 @@ private[r] class GeneralizedLinearRegressionWrapper private (
def residuals(residualsType: String): DataFrame = glm.summary.residuals(residualsType)

def transform(dataset: Dataset[_]): DataFrame = {
if (rFamily == "binomial") {
pipeline.transform(dataset)
.drop(PREDICTED_LABEL_PROB_COL)
.drop(PREDICTED_LABEL_INDEX_COL)
.drop(glm.getFeaturesCol)
.drop(glm.getLabelCol)
} else {
pipeline.transform(dataset)
.drop(glm.getFeaturesCol)
}
pipeline.transform(dataset).drop(glm.getFeaturesCol)
}

override def write: MLWriter =
Expand All @@ -79,10 +63,6 @@ private[r] class GeneralizedLinearRegressionWrapper private (
private[r] object GeneralizedLinearRegressionWrapper
extends MLReadable[GeneralizedLinearRegressionWrapper] {

val PREDICTED_LABEL_PROB_COL = "pred_label_prob"
val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
val PREDICTED_LABEL_COL = "prediction"

def fit(
formula: String,
data: DataFrame,
Expand All @@ -93,7 +73,6 @@ private[r] object GeneralizedLinearRegressionWrapper
weightCol: String,
regParam: Double): GeneralizedLinearRegressionWrapper = {
val rFormula = new RFormula().setFormula(formula)
if (family == "binomial") rFormula.setForceIndexLabel(true)
checkDataColumns(rFormula, data)
val rFormulaModel = rFormula.fit(data)
// get labels and feature names from output schema
Expand All @@ -111,28 +90,9 @@ private[r] object GeneralizedLinearRegressionWrapper
.setWeightCol(weightCol)
.setRegParam(regParam)
.setFeaturesCol(rFormula.getFeaturesCol)
.setLabelCol(rFormula.getLabelCol)
val pipeline = if (family == "binomial") {
// Convert prediction from probability to label index.
val probToPred = new ProbabilityToPrediction()
.setInputCol(PREDICTED_LABEL_PROB_COL)
.setOutputCol(PREDICTED_LABEL_INDEX_COL)
// Convert prediction from label index to original label.
val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol))
.asInstanceOf[NominalAttribute]
val labels = labelAttr.values.get
val idxToStr = new IndexToString()
.setInputCol(PREDICTED_LABEL_INDEX_COL)
.setOutputCol(PREDICTED_LABEL_COL)
.setLabels(labels)

new Pipeline()
.setStages(Array(rFormulaModel, glr.setPredictionCol(PREDICTED_LABEL_PROB_COL),
probToPred, idxToStr))
.fit(data)
} else {
new Pipeline().setStages(Array(rFormulaModel, glr)).fit(data)
}
val pipeline = new Pipeline()
.setStages(Array(rFormulaModel, glr))
.fit(data)

val glm: GeneralizedLinearRegressionModel =
pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel]
Expand Down Expand Up @@ -248,27 +208,3 @@ private[r] object GeneralizedLinearRegressionWrapper
}
}
}

/**
* This utility transformer converts the predicted value of GeneralizedLinearRegressionModel
* with "binomial" family from probability to prediction according to threshold 0.5.
*/
private[r] class ProbabilityToPrediction private[r] (override val uid: String)
extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {

def this() = this(Identifiable.randomUID("probToPred"))

def setInputCol(value: String): this.type = set(inputCol, value)

def setOutputCol(value: String): this.type = set(outputCol, value)

override def transformSchema(schema: StructType): StructType = {
StructType(schema.fields :+ StructField($(outputCol), DoubleType))
}

override def transform(dataset: Dataset[_]): DataFrame = {
dataset.withColumn($(outputCol), round(col($(inputCol))))
}

override def copy(extra: ParamMap): ProbabilityToPrediction = defaultCopy(extra)
}