[SPARK-19611][SQL] Introduce configurable table schema inference #16944
SQLConf.scala

@@ -296,6 +296,25 @@ object SQLConf {
       .longConf
       .createWithDefault(250 * 1024 * 1024)
 
+  object HiveCaseSensitiveInferenceMode extends Enumeration {
+    val INFER_AND_SAVE, INFER_ONLY, NEVER_INFER = Value
+  }
+
+  val HIVE_CASE_SENSITIVE_INFERENCE = buildConf("spark.sql.hive.caseSensitiveInferenceMode")
+    .doc("Sets the action to take when a case-sensitive schema cannot be read from a Hive " +
+      "table's properties. Although Spark SQL itself is not case-sensitive, Hive compatible file " +
+      "formats such as Parquet are. Spark SQL must use a case-preserving schema when querying " +
+      "any table backed by files containing case-sensitive field names or queries may not return " +
+      "accurate results. Valid options include INFER_AND_SAVE (the default mode-- infer the " +
+      "case-sensitive schema from the underlying data files and write it back to the table " +
+      "properties), INFER_ONLY (infer the schema but don't attempt to write it to the table " +
+      "properties) and NEVER_INFER (fallback to using the case-insensitive metastore schema " +
+      "instead of inferring).")
+    .stringConf
+    .transform(_.toUpperCase())
+    .checkValues(HiveCaseSensitiveInferenceMode.values.map(_.toString))
+    .createWithDefault(HiveCaseSensitiveInferenceMode.INFER_AND_SAVE.toString)
+
   val OPTIMIZER_METADATA_ONLY = buildConf("spark.sql.optimizer.metadataOnly")
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
       "to produce the partition columns instead of table scans. It applies when all the columns " +
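For context, a sketch of how an end user might select the new mode once this change is in. This is hypothetical application code, not part of the PR; the app name is made up and any SparkSession with Hive support would do.

import org.apache.spark.sql.SparkSession

// Pick the inference mode at session creation time. Valid values, per the
// .checkValues call above: INFER_AND_SAVE, INFER_ONLY, NEVER_INFER.
val spark = SparkSession.builder()
  .appName("case-sensitive-inference-demo")
  .enableHiveSupport()
  .config("spark.sql.hive.caseSensitiveInferenceMode", "INFER_ONLY")
  .getOrCreate()

// The .transform(_.toUpperCase()) above means lower-case input is also accepted.
spark.conf.set("spark.sql.hive.caseSensitiveInferenceMode", "never_infer")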
@@ -792,6 +811,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE)
 
+  def caseSensitiveInferenceMode: HiveCaseSensitiveInferenceMode.Value =
+    HiveCaseSensitiveInferenceMode.withName(getConf(HIVE_CASE_SENSITIVE_INFERENCE))
+
   def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT)
 
   def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY)
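A minimal sketch of how internal code could branch on the new caseSensitiveInferenceMode accessor. This is not the PR's actual call site (that logic lives in HiveMetastoreCatalog, which is not shown in this excerpt), and since SQLConf is private[sql] such code would have to sit inside Spark's own org.apache.spark.sql package; the helper name is made up.

import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode.{INFER_AND_SAVE, INFER_ONLY, NEVER_INFER}

// Hypothetical helper: decide whether a case-sensitive schema should be inferred
// from the data files when none could be recovered from the table properties.
def shouldInferSchema(conf: SQLConf): Boolean = conf.caseSensitiveInferenceMode match {
  case INFER_AND_SAVE | INFER_ONLY => true
  case NEVER_INFER => false
}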
HiveExternalCatalog.scala

@@ -597,6 +597,25 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
     }
   }
 
+  override def alterTableSchema(db: String, table: String, schema: StructType): Unit = withClient {
+    requireTableExists(db, table)
+    val rawTable = getRawTable(db, table)
+    val withNewSchema = rawTable.copy(schema = schema)
+    // Add table metadata such as table schema, partition columns, etc. to table properties.
+    val updatedTable = withNewSchema.copy(
+      properties = withNewSchema.properties ++ tableMetaToTableProps(withNewSchema))
+    try {
+      client.alterTable(updatedTable)
+    } catch {
+      case NonFatal(e) =>
+        val warningMessage =
+          s"Could not alter schema of table ${rawTable.identifier.quotedString} in a Hive " +
+            "compatible way. Updating Hive metastore in Spark SQL specific format."
+        logWarning(warningMessage, e)
+        client.alterTable(updatedTable.copy(schema = updatedTable.partitionSchema))
+    }
+  }
+
   override def getTable(db: String, table: String): CatalogTable = withClient {
     restoreTableMetadata(getRawTable(db, table))
   }

@@ -690,10 +709,10 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
           "different from the schema when this table was created by Spark SQL" +
           s"(${schemaFromTableProps.simpleString}). We have to fall back to the table schema " +
           "from Hive metastore which is not case preserving.")
-        hiveTable
+        hiveTable.copy(schemaPreservesCase = false)
       }
     } else {
-      hiveTable
+      hiveTable.copy(schemaPreservesCase = false)
     }
   }
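To tie the two files together, here is a hypothetical sketch of how the pieces are intended to interact: schemaPreservesCase (set to false above when a case-sensitive schema cannot be recovered from the table properties) gates whether inference happens at all, the configured mode decides whether an inferred schema is written back, and alterTableSchema is the write-back path. The function and parameter names below are made up; the real logic lives in HiveMetastoreCatalog, which is not part of this diff.

import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode
import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode.{INFER_AND_SAVE, INFER_ONLY, NEVER_INFER}
import org.apache.spark.sql.types.StructType

// Hypothetical sketch, not the PR's code. `inferFromFiles` stands in for the
// file-format-specific schema inference (e.g. merging Parquet footers) and
// `saveSchema` for a call that ends up in externalCatalog.alterTableSchema.
def resolveTableSchema(
    table: CatalogTable,
    mode: HiveCaseSensitiveInferenceMode.Value,
    inferFromFiles: () => StructType,
    saveSchema: StructType => Unit): StructType = {
  if (table.schemaPreservesCase) {
    // A case-sensitive schema was recovered from the table properties; use it as-is.
    table.schema
  } else {
    mode match {
      case INFER_AND_SAVE =>
        val inferred = inferFromFiles()
        saveSchema(inferred) // write the case-sensitive schema back to the metastore
        inferred
      case INFER_ONLY =>
        inferFromFiles() // use the inferred schema for this query only
      case NEVER_INFER =>
        table.schema // fall back to the case-insensitive metastore schema
    }
  }
}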
Shall we create a special table property? object CatalogTable defines some specific properties for views and we can follow it. If we keep adding more parameters, we may blow up CatalogTable one day...
and when we fix the schema and try to write it back, remember to remove this property first.
I considered taking this approach, but I think adding this as a parameter to CatalogTable itself is more explicit and less flaky. I share your concern that adding more and more parameters to CatalogTable could make it less usable, especially since params like schemaPreservesCase really only matter when dealing with Hive tables.

However, I don't think dumping more and more parameters into properties is a great solution either. As you've pointed out, we would need to filter out the properties used only internally by Spark before writing them to the catalog. HiveExternalCatalog already filters out Spark SQL-specific properties from the CatalogTable returned by HiveClient. Adding additional internal properties would leave properties holding a mix of user-defined table properties, Spark SQL-specific properties that do get written to the metastore, and values Spark uses only internally. That isn't even to mention that we'd have to serialize/deserialize this value to and from a (String, String) pair just to pass information between HiveExternalCatalog and HiveMetastoreCatalog.

I think that if CatalogTable ends up with too many datasource-specific internal parameters then maybe it makes more sense to introduce a new Map element, e.g. internalProperties, so these don't get mixed in with the table properties.
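Since the thread weighs an explicit CatalogTable field against a special table property, here is a hypothetical side-by-side sketch of the two options; neither snippet is real Spark code and all names are made up.

import org.apache.spark.sql.types.StructType

// Option 1: an explicit field on the table metadata (the approach this PR takes).
// The flag is typed, has a default, and never needs to be filtered out of the
// properties written to the Hive metastore.
case class TableMetaSketch(
    schema: StructType,
    properties: Map[String, String],
    schemaPreservesCase: Boolean = true)

// Option 2: a special table property (the approach suggested in the review).
// The flag rides along inside `properties` as a string, so it has to be parsed
// on read and stripped before the table is ever written back to the metastore.
object SchemaCasePropertySketch {
  val Key = "spark.sql.schema.preservesCase" // hypothetical property key

  def preservesCase(props: Map[String, String]): Boolean =
    props.get(Key).forall(_.toBoolean)

  // Per the reviewer's note: remove the property before writing the fixed schema back.
  def strip(props: Map[String, String]): Map[String, String] = props - Key
}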