Fix text source tests and javadoc comments

MaxGekk · MaxGekk · commit bbff40206e68 · 2018-03-23T11:58:55.000+01:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -131,7 +131,7 @@ private[sql] class JSONOptions(
 
   def getTextOptions: Map[String, String] = {
     lineSeparatorInRead.map{ bytes =>
-      "lineSep" -> bytes.map("%02x".format(_)).mkString
+      "lineSep" -> bytes.map("x%02x".format(_)).mkString
     }.toMap
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -366,7 +366,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
    * <li>`multiLine` (default `false`): parse one record, which may span multiple lines,
    * per file</li>
-   * </ul>
    * <li>`lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator
    * that should be used for parsing.</li>
    * </ul>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -518,7 +518,6 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
    * indicates a timestamp format. Custom date formats follow the formats at
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
-   * </ul>
    * <li>`lineSep` (default `\n`): defines the line separator that should
    * be used for writing.</li>
    * </ul>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala
@@ -41,8 +41,16 @@ private[text] class TextOptions(@transient private val parameters: CaseInsensiti
    */
   val wholeText = parameters.getOrElse(WHOLETEXT, "false").toBoolean
 
-  val lineSeparator: Option[Array[Byte]] = parameters.get(LINE_SEPARATOR).map { hex =>
-    hex.sliding(2, 2).toArray.map(Integer.parseInt(_, 16).toByte)
+  val charset: Option[String] = Some("UTF-8")
+
+  val lineSeparator: Option[Array[Byte]] = parameters.get("lineSep").collect {
+    case hexs if hexs.startsWith("x") =>
+      hexs.replaceAll("[^0-9A-Fa-f]", "").sliding(2, 2).toArray
+        .map(Integer.parseInt(_, 16).toByte)
+    case reserved if reserved.startsWith("r") || reserved.startsWith("/") =>
+      throw new NotImplementedError(s"the $reserved selector has not supported yet")
+    case delim => delim.getBytes(charset.getOrElse(
+      throw new IllegalArgumentException("Please, set the charset option for the delimiter")))
   }
 
   // Note that the option 'lineSep' uses a different default value in read and write.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -268,7 +268,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
    * <li>`multiLine` (default `false`): parse one record, which may span multiple lines,
    * per file</li>
-   * </ul>
    * <li>`lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator
    * that should be used for parsing.</li>
    * </ul>

Original file line number	Diff line number	Diff line change
`@@ -131,7 +131,7 @@ private[sql] class JSONOptions(`
`131`	`131`
`132`	`132`	`def getTextOptions: Map[String, String] = {`
`133`	`133`	`lineSeparatorInRead.map{ bytes =>`
`134`		`- "lineSep" -> bytes.map("%02x".format(_)).mkString`
	`134`	`+ "lineSep" -> bytes.map("x%02x".format(_)).mkString`
`135`	`135`	`}.toMap`
`136`	`136`	`}`
`137`	`137`	`}`