From c1abc2bd8043660cd927e73bab084df30b571720 Mon Sep 17 00:00:00 2001 From: Oleksiy Dyagilev Date: Tue, 23 Jun 2015 18:07:59 +0300 Subject: [PATCH 1/2] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position --- .../scala/org/apache/spark/mllib/util/NumericParser.scala | 4 ++++ .../apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +++++ .../org/apache/spark/mllib/util/NumericParserSuite.scala | 7 +++++++ 3 files changed, 16 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala index 308f7f3578e2..3446fa8f061f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala @@ -19,6 +19,8 @@ package org.apache.spark.mllib.util import java.util.StringTokenizer +import org.apache.commons.lang.StringUtils.isBlank + import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException @@ -98,6 +100,8 @@ private[mllib] object NumericParser { } } else if (token == ")") { parsing = false + } else if (isBlank(token)){ + // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala index d8364a06de4d..f8d0af8820e6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala @@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite { } } + test("parse labeled points with whitespaces") { + val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") + assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) + } + test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0))) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala index 8dcb9ba9be10..fa4f74d71b7e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala @@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite { } } } + + test("parser with whitespaces") { + val s = "(0.0, [1.0, 2.0])" + val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] + assert(parsed(0).asInstanceOf[Double] === 0.0) + assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) + } } From 0755b9dd55e364b0843fbc8c0e16b0b3c8cee0ef Mon Sep 17 00:00:00 2001 From: Oleksiy Dyagilev Date: Tue, 23 Jun 2015 19:09:23 +0300 Subject: [PATCH 2/2] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang --- .../scala/org/apache/spark/mllib/util/NumericParser.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala index 3446fa8f061f..a841c5caf014 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala @@ -19,8 +19,6 @@ package org.apache.spark.mllib.util import java.util.StringTokenizer -import org.apache.commons.lang.StringUtils.isBlank - import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException @@ -100,7 +98,7 @@ private[mllib] object NumericParser { } } else if (token == ")") { parsing = false - } else if (isBlank(token)){ + } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number