add headers and minor changes

mengxr · mengxr · commit c1885c13af4e · 2014-05-08T17:17:48.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -1,8 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.mllib.util
 
 import scala.collection.mutable.{ArrayBuffer, ListBuffer}
 
-object NumericTokenizer {
+private[mllib] object NumericTokenizer {
   val NUMBER = -1
   val END = -2
 }
@@ -16,7 +33,7 @@ import NumericTokenizer._
  *  - array: an array of numbers stored as `[v0,v1,...,vn]`
  *  - tuple: a list of numbers, arrays, or tuples stored as `(...)`
  *
-   @param s input string
+ * @param s input string
  * @param start start index
  * @param end end index
  */
@@ -45,12 +62,6 @@ private[mllib] class NumericTokenizer(s: String, start: Int, end: Int) {
   def next(): Int = {
     if (cur < end) {
       val c = s(cur)
-      if (c == ',' && allowComma) {
-        cur += 1
-        allowComma = false
-        return next()
-      }
-
       c match {
         case '(' | '[' =>
           allowComma = false
@@ -90,6 +101,9 @@ private[mllib] class NumericTokenizer(s: String, start: Int, end: Int) {
   }
 }
 
+/**
+ * Simple parser for tokens from [[org.apache.spark.mllib.util.NumericTokenizer]].
+ */
 private[mllib] object NumericParser {
 
   /** Parses a string into a Double, an Array[Double], or a Seq[Any]. */
@@ -119,7 +133,7 @@ private[mllib] object NumericParser {
     values.toArray
   }
 
-  private def parseTuple(tokenizer: NumericTokenizer): List[_] = {
+  private def parseTuple(tokenizer: NumericTokenizer): Seq[_] = {
     val items = ListBuffer.empty[Any]
     var token = tokenizer.next()
     while (token != ')' && token != END) {
@@ -134,6 +148,6 @@ private[mllib] object NumericParser {
       token = tokenizer.next()
     }
     require(token == ')')
-    items.toList
+    items.toSeq
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -1,12 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.mllib.util
 
-import org.scalatest.FunSuite
 import scala.collection.mutable.ListBuffer
 
+import org.scalatest.FunSuite
+
 class NumericParserSuite extends FunSuite {
 
   test("tokenizer") {
-    val s = "((1,2),4,[5,6],8)"
+    val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)"
     val tokenizer = new NumericTokenizer(s)
     var token = tokenizer.next()
     val tokens = ListBuffer.empty[Any]
@@ -19,10 +37,23 @@ class NumericParserSuite extends FunSuite {
       }
       token = tokenizer.next()
     }
-    val expected = Seq('(', '(', 1.0, 2.0, ')', 4.0, '[', 5.0, 6.0, ']', 8.0, ')')
+    val expected = Seq('(', '(', 1.0, 2e3, ')', -4.0, '[', 5e-6, 7e8, ']', 9.0, ')')
     assert(expected === tokens)
   }
 
+  test("tokenizer on malformatted strings") {
+    val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4")
+    malformatted.foreach { s =>
+      intercept[RuntimeException] {
+        val tokenizer = new NumericTokenizer(s)
+        while (tokenizer.next() != NumericTokenizer.END) {
+          // Drain tokens; a malformed string should make the tokenizer throw before END.
+        }
+        println(s"Didn't detect malformatted string $s.") // reached only if nothing was thrown
+      }
+    }
+  }
+
   test("parser") {
     val s = "((1,2),4,[5,6],8)"
     val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]