[SPARK-16552] [SQL] Store the Inferred Schemas into External Catalog Tables when Creating Tables #14207
Changes from all commits
@@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.internal.HiveSerDe
-import org.apache.spark.sql.sources.InsertableRelation
+import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation}
 import org.apache.spark.sql.types._

 /**
@@ -52,7 +52,7 @@ case class CreateDataSourceTableCommand(
     userSpecifiedSchema: Option[StructType],
     provider: String,
     options: Map[String, String],
-    partitionColumns: Array[String],
+    userSpecifiedPartitionColumns: Array[String],
     bucketSpec: Option[BucketSpec],
     ignoreIfExists: Boolean,
     managedIfNoPath: Boolean)
@@ -95,17 +95,39 @@ case class CreateDataSourceTableCommand(
     }

     // Create the relation to validate the arguments before writing the metadata to the metastore.
-    DataSource(
-      sparkSession = sparkSession,
-      userSpecifiedSchema = userSpecifiedSchema,
-      className = provider,
-      bucketSpec = None,
-      options = optionsWithPath).resolveRelation(checkPathExist = false)
+    val dataSource: BaseRelation =
+      DataSource(
+        sparkSession = sparkSession,
+        userSpecifiedSchema = userSpecifiedSchema,
+        className = provider,
+        bucketSpec = None,
+        options = optionsWithPath).resolveRelation(checkPathExist = false)
+
+    val partitionColumns = if (userSpecifiedSchema.nonEmpty) {
+      userSpecifiedPartitionColumns
+    } else {
+      val res = dataSource match {
+        case r: HadoopFsRelation => r.partitionSchema.fieldNames
+        case _ => Array.empty[String]
+      }
+      if (userSpecifiedPartitionColumns.length > 0) {
+        // The table does not have a specified schema, which means that the schema will be inferred
+        // when we load the table. So, we are not expecting partition columns and we will discover
+        // partitions when we load the table. However, if there are specified partition columns,
+        // we simply ignore them and provide a warning message.
+        logWarning(
+          s"Specified partition columns (${userSpecifiedPartitionColumns.mkString(",")}) will be " +
+          s"ignored. The schema and partition columns of table $tableIdent are inferred. " +
+          s"Schema: ${dataSource.schema.simpleString}; " +
+          s"Partition columns: ${res.mkString("(", ", ", ")")}")
+      }
+      res
+    }
+
     CreateDataSourceTableUtils.createDataSourceTable(
       sparkSession = sparkSession,
       tableIdent = tableIdent,
-      userSpecifiedSchema = userSpecifiedSchema,
+      schema = dataSource.schema,
Contributor:
Seems we should still use the user-specified schema, right?

Contributor:
I think from the code, it is not very clear that ...

Member (Author):
Actually, after re-checking the code, I found the schema might be adjusted a little even if users specify the schema. For example, the nullability could be changed: see spark/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala, line 407 (commit 64529b1). I think we should make such a change, but maybe we should test and log it?
       partitionColumns = partitionColumns,
       bucketSpec = bucketSpec,
       provider = provider,
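The nullability concern raised in the thread above can be seen in a small standalone sketch (the field names below are made up for illustration): a column the user declares as non-nullable may come back as nullable once the relation is resolved, so `dataSource.schema` is not always identical to `userSpecifiedSchema`.

```scala
import org.apache.spark.sql.types._

// Schema as the user declared it in the DDL.
val userSpecified = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType)))

// File-based relations commonly report every column as nullable after resolution,
// so the resolved schema can differ from the user-specified one in nullability alone.
val resolved = StructType(userSpecified.map(_.copy(nullable = true)))

assert(resolved != userSpecified)
```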
@@ -213,7 +235,7 @@ case class CreateDataSourceTableAsSelectCommand(
           }
           existingSchema = Some(l.schema)
         case s: SimpleCatalogRelation if DDLUtils.isDatasourceTable(s.metadata) =>
-          existingSchema = DDLUtils.getSchemaFromTableProperties(s.metadata)
+          existingSchema = Some(DDLUtils.getSchemaFromTableProperties(s.metadata))
         case o =>
           throw new AnalysisException(s"Saving data in ${o.toString} is not supported.")
       }
@@ -256,7 +278,7 @@ case class CreateDataSourceTableAsSelectCommand(
       CreateDataSourceTableUtils.createDataSourceTable(
         sparkSession = sparkSession,
         tableIdent = tableIdent,
-        userSpecifiedSchema = Some(result.schema),
+        schema = result.schema,
         partitionColumns = partitionColumns,
         bucketSpec = bucketSpec,
         provider = provider,
@@ -306,7 +328,7 @@ object CreateDataSourceTableUtils extends Logging {
   def createDataSourceTable(
       sparkSession: SparkSession,
       tableIdent: TableIdentifier,
-      userSpecifiedSchema: Option[StructType],
+      schema: StructType,
       partitionColumns: Array[String],
       bucketSpec: Option[BucketSpec],
       provider: String,
@@ -315,28 +337,26 @@ object CreateDataSourceTableUtils extends Logging {
     val tableProperties = new mutable.HashMap[String, String]
     tableProperties.put(DATASOURCE_PROVIDER, provider)

-    // Saves optional user specified schema. Serialized JSON schema string may be too long to be
-    // stored into a single metastore SerDe property. In this case, we split the JSON string and
-    // store each part as a separate SerDe property.
-    userSpecifiedSchema.foreach { schema =>
-      val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
-      val schemaJsonString = schema.json
-      // Split the JSON string.
-      val parts = schemaJsonString.grouped(threshold).toSeq
-      tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
-      parts.zipWithIndex.foreach { case (part, index) =>
-        tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
-      }
+    // Serialized JSON schema string may be too long to be stored into a single metastore table
+    // property. In this case, we split the JSON string and store each part as a separate table
+    // property.
+    val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
+    val schemaJsonString = schema.json
+    // Split the JSON string.
+    val parts = schemaJsonString.grouped(threshold).toSeq
+    tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
+    parts.zipWithIndex.foreach { case (part, index) =>
+      tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
     }

-    if (userSpecifiedSchema.isDefined && partitionColumns.length > 0) {
+    if (partitionColumns.length > 0) {
       tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
       partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
         tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
       }
     }

-    if (userSpecifiedSchema.isDefined && bucketSpec.isDefined) {
+    if (bucketSpec.isDefined) {
       val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get

       tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString)
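For reference, a minimal sketch of the property-splitting scheme used above, with the threshold hard-coded and the property keys written out literally (stand-ins for `schemaStringLengthThreshold` and the `DATASOURCE_SCHEMA_*` constants):

```scala
import scala.collection.mutable
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val schema = StructType(Seq(StructField("a", StringType), StructField("b", StringType)))
val threshold = 4000  // stand-in for the configured schema string length threshold

// Chop the schema JSON into pieces no longer than the threshold and store each piece
// under a numbered table property, plus one property recording how many pieces exist.
val tableProperties = new mutable.HashMap[String, String]
val parts = schema.json.grouped(threshold).toSeq
tableProperties.put("spark.sql.sources.schema.numParts", parts.size.toString)
parts.zipWithIndex.foreach { case (part, index) =>
  tableProperties.put(s"spark.sql.sources.schema.part.$index", part)
}
```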
@@ -353,16 +373,6 @@ object CreateDataSourceTableUtils extends Logging {
       }
     }

-    if (userSpecifiedSchema.isEmpty && partitionColumns.length > 0) {
-      // The table does not have a specified schema, which means that the schema will be inferred
-      // when we load the table. So, we are not expecting partition columns and we will discover
-      // partitions when we load the table. However, if there are specified partition columns,
-      // we simply ignore them and provide a warning message.
-      logWarning(
-        s"The schema and partitions of table $tableIdent will be inferred when it is loaded. " +
-        s"Specified partition columns (${partitionColumns.mkString(",")}) will be ignored.")
-    }
-
     val tableType = if (isExternal) {
       tableProperties.put("EXTERNAL", "TRUE")
       CatalogTableType.EXTERNAL
@@ -375,7 +385,7 @@ object CreateDataSourceTableUtils extends Logging {
     val dataSource =
       DataSource(
         sparkSession,
-        userSpecifiedSchema = userSpecifiedSchema,
+        userSpecifiedSchema = Some(schema),
        partitionColumns = partitionColumns,
        bucketSpec = bucketSpec,
        className = provider,
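To make the behavioral change concrete, a hypothetical session (table name and path are made up) in which no schema is specified in the DDL: before this patch the catalog entry carried no schema and it was re-inferred on every access; with this patch the inferred schema and the discovered partition columns are persisted into the table properties at creation time.

```scala
// Assumes a SparkSession `spark` and Parquet files partitioned by `dt` under /tmp/events.
spark.sql(
  """
    |CREATE TABLE events
    |USING parquet
    |OPTIONS (path '/tmp/events')
  """.stripMargin)

// The schema (including the discovered partition column `dt`) is now read back from
// the catalog instead of being re-inferred from the files.
spark.table("events").printSchema()
```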
@@ -521,31 +521,29 @@ object DDLUtils {
     table.partitionColumns.nonEmpty || table.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS)
   }

-  // A persisted data source table may not store its schema in the catalog. In this case, its schema
-  // will be inferred at runtime when the table is referenced.
-  def getSchemaFromTableProperties(metadata: CatalogTable): Option[StructType] = {
+  // A persisted data source table always store its schema in the catalog.
+  def getSchemaFromTableProperties(metadata: CatalogTable): StructType = {
     require(isDatasourceTable(metadata))
+    val msgSchemaCorrupted = "Could not read schema from the metastore because it is corrupted."
     val props = metadata.properties
-    if (props.isDefinedAt(DATASOURCE_SCHEMA)) {
+    props.get(DATASOURCE_SCHEMA).map { schema =>
       // Originally, we used spark.sql.sources.schema to store the schema of a data source table.
       // After SPARK-6024, we removed this flag.
       // Although we are not using spark.sql.sources.schema any more, we need to still support.
-      props.get(DATASOURCE_SCHEMA).map(DataType.fromJson(_).asInstanceOf[StructType])
-    } else {
-      metadata.properties.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts =>
+      DataType.fromJson(schema).asInstanceOf[StructType]
+    } getOrElse {
Contributor:
I am not sure if ...
+      props.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts =>
         val parts = (0 until numParts.toInt).map { index =>
           val part = metadata.properties.get(s"$DATASOURCE_SCHEMA_PART_PREFIX$index").orNull
           if (part == null) {
-            throw new AnalysisException(
-              "Could not read schema from the metastore because it is corrupted " +
-              s"(missing part $index of the schema, $numParts parts are expected).")
+            throw new AnalysisException(msgSchemaCorrupted +
+              s" (missing part $index of the schema, $numParts parts are expected).")
           }
           part
         }
         // Stick all parts back to a single schema string.
         DataType.fromJson(parts.mkString).asInstanceOf[StructType]
-      }
+      } getOrElse(throw new AnalysisException(msgSchemaCorrupted))
Contributor:
Ah, this ...

Member (Author):
: )
     }
   }
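A standalone sketch of the lookup order implemented by the new `getSchemaFromTableProperties`, using a plain `Map` in place of the catalog properties and `sys.error` in place of `AnalysisException`, with the property keys written out literally for illustration: the legacy single-property layout is tried first, then the split layout, and anything else is treated as corrupted.

```scala
import org.apache.spark.sql.types.{DataType, StructType}

def schemaFromProps(props: Map[String, String]): StructType = {
  val corrupted = "Could not read schema from the metastore because it is corrupted."
  props.get("spark.sql.sources.schema").map { json =>
    // Legacy layout: the whole schema JSON stored in a single property.
    DataType.fromJson(json).asInstanceOf[StructType]
  } getOrElse {
    // Split layout: numbered parts that are concatenated back into one JSON string.
    props.get("spark.sql.sources.schema.numParts").map { numParts =>
      val parts = (0 until numParts.toInt).map { i =>
        props.getOrElse(s"spark.sql.sources.schema.part.$i",
          sys.error(corrupted + s" (missing part $i of the schema)."))
      }
      DataType.fromJson(parts.mkString).asInstanceOf[StructType]
    } getOrElse sys.error(corrupted)
  }
}
```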
Should we throw an exception for this case?

Here, I just keep the existing behavior. To be honest, I think we should throw an exception whenever it makes sense; it sounds like the job log is not read by most users. Will submit a follow-up PR to make this change. Thanks!