
Commit ef5e2a0

brkyvz authored and cloud-fan committed
[SPARK-20549] java.io.CharConversionException: 'Invalid UTF-32' in JsonToStructs
## What changes were proposed in this pull request?

A fix for the same problem was made in #17693 but ignored `JsonToStructs`. This PR uses the same fix for `JsonToStructs`.

## How was this patch tested?

Regression test

Author: Burak Yavuz <[email protected]>

Closes #17826 from brkyvz/SPARK-20549.

(cherry picked from commit 86174ea)
Signed-off-by: Wenchen Fan <[email protected]>
1 parent b146481 commit ef5e2a0
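
For background, Jackson auto-detects the character encoding when it is handed raw bytes, and for some malformed inputs that detection itself fails with java.io.CharConversionException ("Invalid UTF-32") before Spark's existing JsonProcessingException handling can run. Wrapping the bytes in a Reader pins the encoding to UTF-8 up front. Below is a minimal sketch of the two parser-construction paths; the byte values are illustrative and are not the `badJson` fixture used by the test suite.

```scala
import java.io.{ByteArrayInputStream, InputStreamReader}
import com.fasterxml.jackson.core.JsonFactory

val factory = new JsonFactory()
// Illustrative malformed bytes; not valid UTF-8 JSON.
val bytes: Array[Byte] = Array(0x00, 0x00, 0x00, 0x2d, 0x01).map(_.toByte)

// Byte-based parser: Jackson sniffs the leading bytes to guess the encoding,
// which can fail with java.io.CharConversionException for inputs that look like UTF-32.
// factory.createParser(bytes)

// Reader-based parser: the encoding is fixed to UTF-8 here, so malformed content
// surfaces later as a JsonProcessingException, which the expressions already catch.
val parser = factory.createParser(
  new InputStreamReader(new ByteArrayInputStream(bytes), "UTF-8"))
```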

File tree

3 files changed: +15 -7 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala

Lines changed: 3 additions & 5 deletions

@@ -151,8 +151,7 @@ case class GetJsonObject(json: Expression, path: Expression)
     try {
       /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson
          detect character encoding which could fail for some malformed strings */
-      Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
-        new ByteArrayInputStream(jsonStr.getBytes), "UTF-8"))) { parser =>
+      Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, jsonStr)) { parser =>
         val output = new ByteArrayOutputStream()
         val matched = Utils.tryWithResource(
           jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { generator =>
@@ -398,9 +397,8 @@ case class JsonTuple(children: Seq[Expression])
     try {
       /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having Jackson
          detect character encoding which could fail for some malformed strings */
-      Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
-        new ByteArrayInputStream(json.getBytes), "UTF-8"))) {
-        parser => parseRow(parser, input)
+      Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, json)) { parser =>
+        parseRow(parser, input)
       }
     } catch {
       case _: JsonProcessingException =>
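
Both call sites lean on Spark's `Utils.tryWithResource` loan pattern so the Jackson parser is always closed, whether parsing succeeds or throws. Roughly, the pattern looks like the simplified sketch below (not the exact implementation in org.apache.spark.util.Utils, which may differ in signature and details):

```scala
// Simplified sketch of a tryWithResource-style loan pattern.
def tryWithResource[R <: AutoCloseable, T](createResource: => R)(f: R => T): T = {
  val resource = createResource
  try {
    f(resource)
  } finally {
    resource.close()
  }
}
```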

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala

Lines changed: 5 additions & 2 deletions

@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.json
 
-import java.io.InputStream
+import java.io.{ByteArrayInputStream, InputStream, InputStreamReader}
 
 import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
 import org.apache.hadoop.io.Text
@@ -33,7 +33,10 @@ private[sql] object CreateJacksonParser extends Serializable {
     val bb = record.getByteBuffer
     assert(bb.hasArray)
 
-    jsonFactory.createParser(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining())
+    val bain = new ByteArrayInputStream(
+      bb.array(), bb.arrayOffset() + bb.position(), bb.remaining())
+
+    jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
   }
 
   def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
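
With this change applied, the `utf8String` helper that the expressions now call presumably reads as below; the method signature is inferred from the call sites and the neighboring `text(...)` method, since the diff only shows the body lines.

```scala
// Assumed shape of the helper after this patch; only the body changes appear
// verbatim in the diff above, the signature is inferred from context.
def utf8String(jsonFactory: JsonFactory, record: UTF8String): JsonParser = {
  val bb = record.getByteBuffer
  assert(bb.hasArray)

  val bain = new ByteArrayInputStream(
    bb.array(), bb.arrayOffset() + bb.position(), bb.remaining())

  // A Reader fixes the charset to UTF-8, bypassing Jackson's encoding detection.
  jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
}
```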

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala

Lines changed: 7 additions & 0 deletions

@@ -453,6 +453,13 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     )
   }
 
+  test("SPARK-20549: from_json bad UTF-8") {
+    val schema = StructType(StructField("a", IntegerType) :: Nil)
+    checkEvaluation(
+      JsonToStructs(schema, Map.empty, Literal(badJson), gmtId),
+      null)
+  }
+
   test("from_json with timestamp") {
     val schema = StructType(StructField("t", TimestampType) :: Nil)
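
From an end-user's perspective, the behavior this regression test pins down is that malformed input yields a NULL struct rather than a failed task. A hedged usage sketch follows; `spark` (a SparkSession) and `df` (a DataFrame with a string column named `json`) are assumed and not part of the patch.

```scala
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

val schema = StructType(StructField("a", IntegerType) :: Nil)

// After this fix, rows whose bytes Jackson cannot decode parse to NULL instead of
// failing the query with java.io.CharConversionException.
val parsed = df.select(from_json(col("json"), schema).as("parsed"))
```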
