[SPARK-5950][SQL] Insert array into a metastore table saved as parquet should work when using datasource api #4826
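For context, the scenario in the title looks roughly like the sketch below. This is an illustrative reproduction against the Spark 1.3-era DataFrame API, not the regression test added by the PR; the table name, column name, and the "parquet" source string are assumptions made for the example.

  import org.apache.spark.sql.hive.HiveContext

  // Assumes an existing SparkContext `sc` and a working Hive metastore.
  val sqlContext = new HiveContext(sc)
  import sqlContext.implicits._

  // A DataFrame with an array column. Schema inference for Seq[Int] can mark
  // the array elements as non-nullable (containsNull = false).
  val df = sc.parallelize(Seq(Tuple1(Seq(1, 2, 3)), Tuple1(Seq(4, 5)))).toDF("arr")

  // Save it as a metastore table through the data source API, backed by Parquet.
  // ("parquet" as a short source name is an assumption; a fully qualified
  // provider class name could be used instead.)
  df.saveAsTable("arr_table", "parquet")

  // Before this patch, inserting could fail because the nullability flags of the
  // incoming data did not match the schema recorded for the Parquet-backed table;
  // with the patch, nullability differences of this kind are treated as compatible.
  df.insertInto("arr_table")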
Changes from all commits
4ec17fd
8f19fe5
9a26611
0a703e7
bf50d73
8bd008b
b2c06f8
e4f397c
f6ed813
0eb5578
8a3f237
8360817
d3747d1
486ed08
3cec464
0cb7ea2
587d88b
80e487e
3b61a04
@@ -115,4 +115,87 @@ class DataTypeSuite extends FunSuite {
   checkDefaultSize(MapType(IntegerType, StringType, true), 410000)
   checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 80400)
   checkDefaultSize(structType, 812)
+
+  def checkEqualsIgnoreCompatibleNullability(
+      from: DataType,
+      to: DataType,
+      expected: Boolean): Unit = {
+    val testName =
+      s"equalsIgnoreCompatibleNullability: (from: ${from}, to: ${to})"
+    test(testName) {
+      assert(DataType.equalsIgnoreCompatibleNullability(from, to) === expected)
+    }
+  }
+
+  checkEqualsIgnoreCompatibleNullability(
+    from = ArrayType(DoubleType, containsNull = true),
+    to = ArrayType(DoubleType, containsNull = true),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = ArrayType(DoubleType, containsNull = false),
+    to = ArrayType(DoubleType, containsNull = false),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = ArrayType(DoubleType, containsNull = false),
+    to = ArrayType(DoubleType, containsNull = true),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = ArrayType(DoubleType, containsNull = true),
+    to = ArrayType(DoubleType, containsNull = false),
+    expected = false)
+  checkEqualsIgnoreCompatibleNullability(
+    from = ArrayType(DoubleType, containsNull = false),
+    to = ArrayType(StringType, containsNull = false),
+    expected = false)
+
+  checkEqualsIgnoreCompatibleNullability(
+    from = MapType(StringType, DoubleType, valueContainsNull = true),
+    to = MapType(StringType, DoubleType, valueContainsNull = true),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = MapType(StringType, DoubleType, valueContainsNull = false),
+    to = MapType(StringType, DoubleType, valueContainsNull = false),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = MapType(StringType, DoubleType, valueContainsNull = false),
+    to = MapType(StringType, DoubleType, valueContainsNull = true),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = MapType(StringType, DoubleType, valueContainsNull = true),
+    to = MapType(StringType, DoubleType, valueContainsNull = false),
+    expected = false)
+  checkEqualsIgnoreCompatibleNullability(
+    from = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true),
+    to = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true),
+    expected = false)
Contributor: Would be good to add another test case to show the nested case:

  checkEqualsIgnoreCompatibleNullability(
    from = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true),
    to = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true),
    expected = true)

Author: Done (added a test after this one).
+
+  checkEqualsIgnoreCompatibleNullability(
+    from = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true),
+    to = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true),
+    expected = true)
+
+  checkEqualsIgnoreCompatibleNullability(
+    from = StructType(StructField("a", StringType, nullable = true) :: Nil),
+    to = StructType(StructField("a", StringType, nullable = true) :: Nil),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = StructType(StructField("a", StringType, nullable = false) :: Nil),
+    to = StructType(StructField("a", StringType, nullable = false) :: Nil),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = StructType(StructField("a", StringType, nullable = false) :: Nil),
+    to = StructType(StructField("a", StringType, nullable = true) :: Nil),
+    expected = true)
+  checkEqualsIgnoreCompatibleNullability(
+    from = StructType(StructField("a", StringType, nullable = true) :: Nil),
+    to = StructType(StructField("a", StringType, nullable = false) :: Nil),
+    expected = false)
+  checkEqualsIgnoreCompatibleNullability(
+    from = StructType(
+      StructField("a", StringType, nullable = false) ::
+      StructField("b", StringType, nullable = true) :: Nil),
+    to = StructType(
+      StructField("a", StringType, nullable = false) ::
+      StructField("b", StringType, nullable = false) :: Nil),
+    expected = false)
 }
@@ -23,6 +23,7 @@ import java.util.logging.Level
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.fs.permission.FsAction
+import org.apache.spark.sql.types.{StructType, DataType}
 import parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
 import parquet.hadoop.metadata.CompressionCodecName
 import parquet.schema.MessageType

@@ -172,9 +173,13 @@ private[sql] object ParquetRelation {
       sqlContext.conf.parquetCompressionCodec.toUpperCase, CompressionCodecName.UNCOMPRESSED)
       .name())
     ParquetRelation.enableLogForwarding()
-    ParquetTypesConverter.writeMetaData(attributes, path, conf)
+    // This is a hack. We always set nullable/containsNull/valueContainsNull to true
+    // for the schema of a parquet data.
+    val schema = StructType.fromAttributes(attributes).asNullable
+    val newAttributes = schema.toAttributes
+    ParquetTypesConverter.writeMetaData(newAttributes, path, conf)
     new ParquetRelation(path.toString, Some(conf), sqlContext) {
-      override val output = attributes
+      override val output = newAttributes
Author: @liancheng @marmbrus I am also changing the nullability for our old parquet write path to make the behavior consistent with our new write path. Let me know if there is any potential compatibility issue and whether we should revert this change.

Contributor: Should we also make data types of

Contributor: Never mind, since we always write nullable data, it should be OK to leave

Contributor: Verified that when merging schemas, the official Parquet implementation handles nullability (repetition level) properly. So our change should be safe for interoperation with other systems that support Parquet schema evolution.
     }
   }
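The asNullable conversion above is what the in-code comment calls a hack: every nullable, containsNull, and valueContainsNull flag in the written schema is forced to true. The sketch below illustrates that effect with a hypothetical helper; makeNullable and the example schema are made up for illustration, and Spark's internal asNullable may differ in detail.

  import org.apache.spark.sql.types._

  // Illustrative stand-in for the "set everything nullable" conversion.
  def makeNullable(dt: DataType): DataType = dt match {
    case ArrayType(elementType, _) =>
      ArrayType(makeNullable(elementType), containsNull = true)
    case MapType(keyType, valueType, _) =>
      MapType(makeNullable(keyType), makeNullable(valueType), valueContainsNull = true)
    case StructType(fields) =>
      StructType(fields.map(f => f.copy(dataType = makeNullable(f.dataType), nullable = true)))
    case other => other
  }

  // Example: a schema whose array column forbids null elements ...
  val strict = StructType(
    StructField("id", IntegerType, nullable = false) ::
    StructField("scores", ArrayType(DoubleType, containsNull = false), nullable = false) :: Nil)

  // ... is recorded in the Parquet metadata as a fully nullable schema:
  // both fields become nullable and the array's containsNull becomes true.
  val relaxed = makeNullable(strict)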
Contributor: We can introduce a method to the class of DataType based on this one later (I am not sure what will be a good name. I thought about compatibleWith, but I feel it is not very accurate).
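A rough sketch of what such a method might check, mirroring the semantics the DataTypeSuite cases above pin down: a value written with a stricter schema (nulls not allowed) is compatible with a more permissive one, but not the reverse. The name compatibleIgnoringNullability and its placement as a standalone function are hypothetical; this is not the patch's implementation of DataType.equalsIgnoreCompatibleNullability.

  import org.apache.spark.sql.types._

  // `from` may be written into `to` if `to` is at least as permissive about
  // nulls at every nesting level and the element/key/value/field types match.
  def compatibleIgnoringNullability(from: DataType, to: DataType): Boolean = (from, to) match {
    case (ArrayType(fromElement, fromContainsNull), ArrayType(toElement, toContainsNull)) =>
      (toContainsNull || !fromContainsNull) &&
        compatibleIgnoringNullability(fromElement, toElement)
    case (MapType(fromKey, fromValue, fromValueNull), MapType(toKey, toValue, toValueNull)) =>
      (toValueNull || !fromValueNull) &&
        compatibleIgnoringNullability(fromKey, toKey) &&
        compatibleIgnoringNullability(fromValue, toValue)
    case (StructType(fromFields), StructType(toFields)) =>
      fromFields.length == toFields.length &&
        fromFields.zip(toFields).forall { case (f, t) =>
          f.name == t.name &&
            (t.nullable || !f.nullable) &&
            compatibleIgnoringNullability(f.dataType, t.dataType)
        }
    case (fromType, toType) => fromType == toType
  }

  // For example, an array written without nulls fits a nullable slot,
  // but not the other way around:
  assert(compatibleIgnoringNullability(
    ArrayType(DoubleType, containsNull = false), ArrayType(DoubleType, containsNull = true)))
  assert(!compatibleIgnoringNullability(
    ArrayType(DoubleType, containsNull = true), ArrayType(DoubleType, containsNull = false)))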