diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java
index 2f575c55ea36..9c4487dbd3f7 100644
--- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java
+++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java
@@ -542,12 +542,13 @@ private void buildJson(JsonParser parser) throws IOException {
   }
 
   // Choose the smallest unsigned integer type that can store `value`. It must be within
-  // `[0, U24_MAX]`.
+  // `[0, SIZE_LIMIT]`.
   private int getIntegerSize(int value) {
-    assert value >= 0 && value <= U24_MAX;
+    assert value >= 0 && value <= SIZE_LIMIT;
     if (value <= U8_MAX) return 1;
     if (value <= U16_MAX) return 2;
-    return U24_SIZE;
+    if (value <= U24_MAX) return 3;
+    return 4;
   }
 
   private void parseFloatingPoint(JsonParser parser) throws IOException {
diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java
index 4205b76a530a..63937f9dd217 100644
--- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java
+++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java
@@ -137,8 +137,8 @@ public class VariantUtil {
   public static final int U24_SIZE = 3;
   public static final int U32_SIZE = 4;
 
-  // Both variant value and variant metadata need to be no longer than 16MiB.
-  public static final int SIZE_LIMIT = U24_MAX + 1;
+  // Both variant value and variant metadata need to be no longer than 128MiB.
+  public static final int SIZE_LIMIT = 128 * 1024 * 1024;
 
   public static final int MAX_DECIMAL4_PRECISION = 9;
   public static final int MAX_DECIMAL8_PRECISION = 18;
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala
index f599fead4501..8f8256ff1f6e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala
@@ -133,10 +133,9 @@ class VariantExpressionEvalUtilsSuite extends SparkFunSuite {
       checkException(json, "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION",
         Map("badRecord" -> json, "failFastMode" -> "FAILFAST"))
     }
-    for (json <- Seq("\"" + "a" * (16 * 1024 * 1024) + "\"",
-      (0 to 4 * 1024 * 1024).mkString("[", ",", "]"))) {
+    for (json <- Seq((0 to 32 * 1024 * 1024).mkString("[", ",", "]"))) {
       checkException(json, "VARIANT_SIZE_LIMIT",
-        Map("sizeLimit" -> "16.0 MiB", "functionName" -> "`parse_json`"))
+        Map("sizeLimit" -> "128.0 MiB", "functionName" -> "`parse_json`"))
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
index 1cd8cc6228ef..5d61d09cfc4b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
@@ -93,21 +93,21 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
     check(Array(primitiveHeader(INT1), 0), Array[Byte](3, 0, 0))
     check(Array(primitiveHeader(INT1), 0), Array[Byte](2, 0, 0))
 
-    // Construct binary values that are over 1 << 24 bytes, but otherwise valid.
+    // Construct binary values that are over SIZE_LIMIT bytes, but otherwise valid.
     val bigVersion = Array[Byte]((VERSION | (3 << 6)).toByte)
-    val a = Array.fill(1 << 24)('a'.toByte)
+    val a = Array.fill(SIZE_LIMIT)('a'.toByte)
     val hugeMetadata = bigVersion ++ Array[Byte](2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1) ++
       a ++ Array[Byte]('b')
     check(Array(primitiveHeader(TRUE)), hugeMetadata, "VARIANT_CONSTRUCTOR_SIZE_LIMIT")
 
     // The keys are 'aaa....' and 'b'. Values are "yyy..." and 'true'.
-    val y = Array.fill(1 << 24)('y'.toByte)
+    val y = Array.fill(SIZE_LIMIT)('y'.toByte)
     val hugeObject = Array[Byte](objectHeader(true, 4, 4)) ++
       /* size */ padded(Array(2), 4) ++
       /* id list */ padded(Array(0, 1), 4) ++
-      // Second value starts at offset 5 + (1 << 24), which is `5001` little-endian. The last value
-      // is 1 byte, so the one-past-the-end value is `6001`
-      /* offset list */ Array[Byte](0, 0, 0, 0, 5, 0, 0, 1, 6, 0, 0, 1) ++
+      // Second value starts at offset 5 + (SIZE_LIMIT), which is `5008` little-endian. The last
+      // value is 1 byte, so the one-past-the-end value is `6008`
+      /* offset list */ Array[Byte](0, 0, 0, 0, 5, 0, 0, 8, 6, 0, 0, 8) ++
       /* field data */ Array[Byte](primitiveHeader(LONG_STR), 0, 0, 0, 1) ++ y ++ Array[Byte](
         primitiveHeader(TRUE)
       )
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala
index cf83ae30a679..620d00b81651 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala
@@ -135,7 +135,8 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession {
     check("{1:2}", null)
     check("{\"a\":1", null)
     check("{\"a\":[a,b,c]}", null)
-    check("\"" + "a" * (16 * 1024 * 1024) + "\"", null)
+    check("\"" + "a" * (16 * 1024 * 1024) + "\"")
+    check("\"" + "a" * (128 * 1024 * 1024) + "\"", null)
   }
 
   test("to_json with nested variant") {
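
The new four-byte branch in getIntegerSize is what the rest of the patch hinges on: SIZE_LIMIT is now 128 * 1024 * 1024 = 2^27, which exceeds U24_MAX = 2^24 - 1, so sizes and offsets no longer always fit in three bytes. A minimal standalone sketch of the selection logic, with the VariantUtil constants inlined (the class name below is illustrative, not Spark code):

public class IntegerSizeSketch {
  // Constants mirroring VariantUtil: max unsigned 8/16/24-bit values and the new limit.
  static final int U8_MAX = 0xFF;
  static final int U16_MAX = 0xFFFF;
  static final int U24_MAX = 0xFFFFFF;
  static final int SIZE_LIMIT = 128 * 1024 * 1024; // 2^27 > U24_MAX, so 4 bytes may be needed

  // Same selection logic as the patched getIntegerSize.
  static int getIntegerSize(int value) {
    assert value >= 0 && value <= SIZE_LIMIT;
    if (value <= U8_MAX) return 1;
    if (value <= U16_MAX) return 2;
    if (value <= U24_MAX) return 3;
    return 4;
  }

  public static void main(String[] args) {
    // Boundary values: 255 -> 1 byte, 65535 -> 2, 16777215 -> 3, anything larger -> 4.
    System.out.println(getIntegerSize(U8_MAX));      // 1
    System.out.println(getIntegerSize(U16_MAX));     // 2
    System.out.println(getIntegerSize(U24_MAX));     // 3
    System.out.println(getIntegerSize(U24_MAX + 1)); // 4
  }
}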
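
The changed offset-list bytes in VariantExpressionSuite follow from the variant format's little-endian offsets: the second field value starts at 5 + SIZE_LIMIT = 0x08000005, which is the byte sequence 5, 0, 0, 8 (the `5008` in the comment), and the one-past-the-end offset 0x08000006 is 6, 0, 0, 8. A quick check of that arithmetic, assuming a hypothetical littleEndian helper (not a Spark API):

public class OffsetSketch {
  static final int SIZE_LIMIT = 128 * 1024 * 1024; // 0x08000000

  // Hypothetical helper: encode `value` as `size` little-endian bytes, the way the
  // variant binary format lays out its offset list.
  static byte[] littleEndian(int value, int size) {
    byte[] out = new byte[size];
    for (int i = 0; i < size; i++) {
      out[i] = (byte) (value >>> (8 * i));
    }
    return out;
  }

  public static void main(String[] args) {
    int secondValueStart = 5 + SIZE_LIMIT;  // 0x08000005
    int onePastEnd = secondValueStart + 1;  // the second value (a TRUE header) is 1 byte
    System.out.println(java.util.Arrays.toString(littleEndian(secondValueStart, 4))); // [5, 0, 0, 8]
    System.out.println(java.util.Arrays.toString(littleEndian(onePastEnd, 4)));       // [6, 0, 0, 8]
  }
}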