Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.BiweightRobustRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater}

/**
 * An example app for Biweight (Tukey bisquare) Robust regression. Run with
 * {{{
 * bin/run-example org.apache.spark.examples.mllib.BiweightRobustRegression
 * }}}
 * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt`.
 * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
 */
object BiweightRobustRegression {

  object RegType extends Enumeration {
    type RegType = Value
    val NONE, L1, L2 = Value
  }

  import RegType._

  /**
   * Command-line parameters.
   *
   * @param input         path to labeled examples in LIBSVM format
   * @param numIterations number of SGD iterations
   * @param stepSize      initial SGD step size
   * @param regType       regularization type (NONE, L1 or L2)
   * @param regParam      regularization parameter
   */
  case class Params(
      input: String = null,
      numIterations: Int = 5000,
      stepSize: Double = 1.0,
      regType: RegType = L2,
      regParam: Double = 0.1)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("BiweightRobustRegression") {
      head("BiweightRobustRegression: an example app for Biweight M-Estimation Robust regression.")
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"initial step size, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[String]("regType")
        .text(s"regularization type (${RegType.values.mkString(",")}), " +
          s"default: ${defaultParams.regType}")
        .action((x, c) => c.copy(regType = RegType.withName(x)))
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        // BUG FIX: the parsed value was previously discarded because this option
        // had no action; --regParam now actually updates Params.
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>")
        .required()
        .text("input paths to labeled examples in LIBSVM format")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.BiweightRobustRegression \
        | examples/target/scala-*/spark-examples-*.jar \
        | data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  /** Trains the robust regression model described by `params` and reports test RMSE. */
  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"BiweightRobustRegression with $params")
    val sc = new SparkContext(conf)

    // Reduce log noise so the printed metrics are easy to see.
    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    // 80/20 random train/test split.
    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    // The split RDDs are cached above, so the parent is no longer needed.
    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new BiweightRobustRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    // Root mean squared error on the held-out split.
    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.HuberRobustRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater}

/**
 * An example app for Huber Robust regression. Run with
 * {{{
 * bin/run-example org.apache.spark.examples.mllib.HuberRobustRegression
 * }}}
 * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt`.
 * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
 */
object HuberRobustRegression {

  object RegType extends Enumeration {
    type RegType = Value
    val NONE, L1, L2 = Value
  }

  import RegType._

  /**
   * Command-line parameters.
   *
   * @param input         path to labeled examples in LIBSVM format
   * @param numIterations number of SGD iterations
   * @param stepSize      initial SGD step size
   * @param regType       regularization type (NONE, L1 or L2)
   * @param regParam      regularization parameter
   */
  case class Params(
      input: String = null,
      numIterations: Int = 100,
      stepSize: Double = 1.0,
      regType: RegType = L2,
      regParam: Double = 0.1)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("HuberRobustRegression") {
      head("HuberRobustRegression: an example app for Huber M-Estimation Robust regression.")
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"initial step size, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[String]("regType")
        .text(s"regularization type (${RegType.values.mkString(",")}), " +
          s"default: ${defaultParams.regType}")
        .action((x, c) => c.copy(regType = RegType.withName(x)))
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        // BUG FIX: the parsed value was previously discarded because this option
        // had no action; --regParam now actually updates Params.
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>")
        .required()
        .text("input paths to labeled examples in LIBSVM format")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.HuberRobustRegression \
        | examples/target/scala-*/spark-examples-*.jar \
        | data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  /** Trains the robust regression model described by `params` and reports test RMSE. */
  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"HuberRobustRegression with $params")
    val sc = new SparkContext(conf)

    // Reduce log noise so the printed metrics are easy to see.
    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    // 80/20 random train/test split.
    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    // The split RDDs are cached above, so the parent is no longer needed.
    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new HuberRobustRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    // Root mean squared error on the held-out split.
    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.mllib.optimization
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS.{axpy, dot, scal}
import scala.math.pow

/**
* :: DeveloperApi ::
Expand Down Expand Up @@ -118,6 +119,114 @@ class LeastSquaresGradient extends Gradient {
}
}

/**
 * :: DeveloperApi ::
 * Compute gradient and loss for the Huber objective function, as used in robust regression.
 * The Huber M-estimator corresponds to a probability distribution for the errors which is normal
 * in the centre but like a double exponential distribution in the tails (Hogg 1979: 109).
 * L = 1/2 ||A weights-y||^2        if |A weights-y| <= k
 * L = k |A weights-y| - 1/2 k^2    if |A weights-y| > k
 * where k = 1.345, which produces 95% efficiency when the errors are normal and
 * substantial resistance to outliers otherwise.
 * See also the documentation for the precise formulation.
 */
@DeveloperApi
class HuberRobustGradient extends Gradient {

  /**
   * Tuning constant. It is generally picked to give reasonably high efficiency in the
   * normal case. Smaller values produce more resistance to outliers at the expense of
   * lower efficiency when the errors are normally distributed.
   */
  private val k = 1.345

  /** psi(diff): derivative of the Huber loss, i.e. the residual clipped to [-k, k]. */
  private def clip(diff: Double): Double = math.max(-k, math.min(k, diff))

  /** Huber loss: quadratic within [-k, k], linear outside. */
  private def loss(diff: Double): Double = {
    if (math.abs(diff) <= k) {
      1.0 / 2.0 * diff * diff
    } else {
      // k*|diff| - k^2/2; covers both tails (for diff < -k this equals -k*diff - k^2/2).
      k * math.abs(diff) - 1.0 / 2.0 * k * k
    }
  }

  override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = {
    val diff = dot(data, weights) - label
    val gradient = data.copy
    scal(clip(diff), gradient)
    (gradient, loss(diff))
  }

  override def compute(
      data: Vector,
      label: Double,
      weights: Vector,
      cumGradient: Vector): Double = {
    val diff = dot(data, weights) - label
    // Accumulate psi(diff) * data into cumGradient without allocating a copy.
    axpy(clip(diff), data, cumGradient)
    loss(diff)
  }
}

/**
 * :: DeveloperApi ::
 * Compute gradient and loss for the Tukey bisquare (biweight) function, as used in robust
 * regression. The biweight function produces an M-estimator that is more resistant to
 * regression outliers than the Huber M-estimator (Andersen 2008: 19), especially on the
 * extreme tails.
 * L = k^2 / 6 * (1 - (1 - ||A weights-y||^2 / k^2)^3)   if |A weights-y| <= k
 * L = k^2 / 6                                           if |A weights-y| > k
 * where k = 4.685, which produces 95% efficiency when the errors are normal and
 * substantial resistance to outliers otherwise.
 * See also the documentation for the precise formulation.
 */
@DeveloperApi
class BiweightRobustGradient extends Gradient {

  /**
   * Tuning constant. It is generally picked to give reasonably high efficiency in the
   * normal case. Smaller values produce more resistance to outliers at the expense of
   * lower efficiency when the errors are normally distributed.
   */
  private val k = 4.685

  override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = {
    val diff = dot(data, weights) - label
    if (math.abs(diff) <= k) {
      // u = 1 - diff^2/k^2; psi(diff) = u^2 * diff, L = k^2/6 * (1 - u^3).
      val u = 1.0 - diff * diff / (k * k)
      val gradient = data.copy
      scal(u * u * diff, gradient)
      (gradient, k * k / 6.0 * (1.0 - u * u * u))
    } else {
      // Outside [-k, k] the psi-function is identically zero (zero gradient) and the
      // loss saturates at k^2 / 6.
      (Vectors.sparse(weights.size, Array.empty, Array.empty), k * k / 6.0)
    }
  }

  override def compute(
      data: Vector,
      label: Double,
      weights: Vector,
      cumGradient: Vector): Double = {
    val diff = dot(data, weights) - label
    if (math.abs(diff) <= k) {
      val u = 1.0 - diff * diff / (k * k)
      // Accumulate psi(diff) * data into cumGradient without allocating a copy.
      axpy(u * u * diff, data, cumGradient)
      k * k / 6.0 * (1.0 - u * u * u)
    } else {
      // Zero gradient contribution; constant saturated loss.
      k * k / 6.0
    }
  }
}

/**
* :: DeveloperApi ::
* Compute gradient and loss for a Hinge loss function, as used in SVM binary classification.
Expand Down
Loading