[SPARK-23723] New charset option for json datasource #20849
python/pyspark/sql/readwriter.py:

```diff
@@ -176,7 +176,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
              allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,
              allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,
              mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,
-             multiLine=None, allowUnquotedControlChars=None):
+             multiLine=None, allowUnquotedControlChars=None, charset=None):
         """
         Loads JSON files and returns the results as a :class:`DataFrame`.
```
```diff
@@ -237,6 +237,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control
                                           characters (ASCII characters with value less than 32,
                                           including tab and line feed characters) or not.
+        :param charset: standard charset name, for example UTF-8, UTF-16 and UTF-32. If None is
+                        set, the charset of input json will be detected automatically.

         >>> df1 = spark.read.json('python/test_support/sql/people.json')
         >>> df1.dtypes
```

Member: Can we have another test case with an encoding Jackson doesn't automatically detect too?
```diff
@@ -254,7 +256,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
             allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,
             mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
             timestampFormat=timestampFormat, multiLine=multiLine,
-            allowUnquotedControlChars=allowUnquotedControlChars)
+            allowUnquotedControlChars=allowUnquotedControlChars, charset=charset)
         if isinstance(path, basestring):
             path = [path]
         if type(path) == list:
```
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala:

```diff
@@ -85,6 +85,12 @@ private[sql] class JSONOptions(

   val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)

+  /**
+   * Standard charset name. For example UTF-8, UTF-16 and UTF-32.
+   * If charset is not specified (None), it will be detected automatically.
+   */
+  val charset: Option[String] = parameters.get("charset")

   /** Sets config options on a Jackson [[JsonFactory]]. */
   def setJacksonOptions(factory: JsonFactory): Unit = {
     factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments)
```
Member: Does this detect the encoding for newlines too?

Member (author): Do you mean the encoding of the record/line delimiter? It depends on the mode. In multiline mode, Jackson is able to do that. In per-line mode, Hadoop's LineRecordReader can accept delimiters in any charset, but by default it splits the input by `\n`, `\r` and `\r\n`.

Member: Shall we fix that first in the Text datasource, since schema inference in JSON depends on the Text datasource? You are exposing an incomplete option now.

Member (author): Could you clarify this, please? It is not completely clear to me what you mean.

Member: See 8fb2a02. It uses the Text datasource to load lines when we infer the schema. If we want to fix encodings for newlines, the Text datasource needs to be fixed first, I believe.

Member: JSON's schema inference uses the Text datasource to separate the lines before we go through the Jackson parser, where the charset for newlines should be respected. Shouldn't we rather fix the Text datasource and Hadoop's line reader first?

Member (author): A fix in the Hadoop line reader and this PR solve two different problems. A fix in the Hadoop line reader will not fix the problem of wrong encoding detection, so I don't understand why this PR must depend on such a fix. I would say a custom record separator would solve the newline problem too (https://issues.apache.org/jira/browse/SPARK-23724). Could you tell me how this PR blocks solving the problem in Hadoop's LineReader?

Member (author): Because the exposed option is incomplete without it. Why don't we just fix that problem first if you plan to fix both eventually anyway?

Member (author): I would like to introduce changes step-by-step to avoid creating a "patch bomb". Regarding SPARK-23724, I am going to propose this PR: MaxGekk#1 /cc @hvanhovell

Member: Yea, step-by-step. I am not suggesting fixing it all here. I meant SPARK-23724 should be fixed first, in the Text datasource.
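To make the newline concern in this thread concrete: in UTF-16 a line feed is a two-byte code unit, so a line reader that scans for the bare byte `0x0a` splits the stream mid code unit and strands a `0x00` byte on a neighbouring record. A quick illustration in plain Python (no Spark required):

```python
line = u'{"a": 1}\n'

# UTF-8: '\n' is the single byte 0x0a, which a byte-oriented reader finds cleanly.
print(line.encode("utf-8").hex())
# 7b2261223a20317d0a

# UTF-16BE: every character is two bytes; '\n' is 00 0a, so splitting on the
# bare 0x0a leaves a stray 0x00 attached to the line content.
print(line.encode("utf-16-be").hex())
# 007b002200610022003a00200031007d000a
```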
sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala:

```diff
@@ -366,6 +366,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
    * <li>`multiLine` (default `false`): parse one record, which may span multiple lines,
    * per file</li>
+   * <li>`charset` (by default it is not set): allows to forcibly set one of standard basic
+   * or extended charsets for input jsons. For example UTF-8, UTF-16BE, UTF-32. If the charset
+   * is not specified (by default), the charset is detected automatically.</li>
    * </ul>
    *
    * @since 2.0.0
```

Member: Should we document it on the write side too?
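The option documented here is also reachable through the generic `option` API rather than the `charset` keyword shown earlier. A minimal usage sketch in Python (the input path and encoding are assumptions, and `spark` is an existing SparkSession; the Scala API takes the same options):

```python
df = (spark.read
      .option("charset", "UTF-16BE")  # skip auto-detection and force the charset
      .option("multiLine", True)      # multiline mode, where Jackson also handles newlines
      .json("/data/events_utf16be.json"))
```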
Member: Shall we use `encoding` to be consistent with CSV? `charset` had an alias `encoding` to look after Pandas and R.