[SPARK-27237][SS] Introduce State schema validation among query restart #24173
Changes from all commits
@@ -1294,6 +1294,14 @@ object SQLConf {
       .createWithDefault(
         "org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider")
 
+  val STATE_SCHEMA_CHECK_ENABLED =
+    buildConf("spark.sql.streaming.stateStore.stateSchemaCheck")
+      .doc("When true, Spark will validate the state schema against the schema of the " +
+        "existing state and fail the query if they are incompatible.")
+      .version("3.1.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT =
     buildConf("spark.sql.streaming.stateStore.minDeltasForSnapshot")
       .internal()
@@ -3064,6 +3072,8 @@ class SQLConf extends Serializable with Logging {
 
   def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS)
 
+  def isStateSchemaCheckEnabled: Boolean = getConf(STATE_SCHEMA_CHECK_ENABLED)
+
   def stateStoreMinDeltasForSnapshot: Int = getConf(STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT)
 
   def stateStoreFormatValidationEnabled: Boolean = getConf(STATE_STORE_FORMAT_VALIDATION_ENABLED)
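For readers who want to bypass the new check on restart, the flag is set like any other SQL conf. A minimal sketch (the session wiring is illustrative; only the config key comes from this PR):

```scala
import org.apache.spark.sql.SparkSession

// Illustrative only: restart a streaming query without state schema validation.
// Disabling the check can lead to non-deterministic behavior if the schema truly changed.
val spark = SparkSession.builder()
  .appName("restart-without-state-schema-check")
  .config("spark.sql.streaming.stateStore.stateSchemaCheck", "false")
  .getOrCreate()
```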
@@ -0,0 +1,51 @@ (new file)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.streaming

object MetadataVersionUtil {
  /**
   * Parse the log version from the given `text` -- will throw an exception when the parsed
   * version exceeds `maxSupportedVersion`, or when `text` is malformed (such as "xyz", "v",
   * "v-1", "v123xyz" etc.)
   */
  def validateVersion(text: String, maxSupportedVersion: Int): Int = {
    if (text.length > 0 && text(0) == 'v') {
      val version =
        try {
          text.substring(1, text.length).toInt
        } catch {
          case _: NumberFormatException =>
            throw new IllegalStateException(s"Log file was malformed: failed to read correct " +
              s"log version from $text.")
        }
      if (version > 0) {
        if (version > maxSupportedVersion) {
          throw new IllegalStateException(s"UnsupportedLogVersion: maximum supported log " +
            s"version is v$maxSupportedVersion, but encountered v$version. The log file was " +
            s"produced by a newer version of Spark and cannot be read by this version. " +
            s"Please upgrade.")
        } else {
          return version
        }
      }
    }

    // reaching here means we failed to read the correct log version
    throw new IllegalStateException(s"Log file was malformed: failed to read correct log " +
      s"version from $text.")
  }
}
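The contract of `validateVersion` at a glance (the behavior follows directly from the code above; the call sites are illustrative):

```scala
// Well-formed and supported: returns the parsed version.
MetadataVersionUtil.validateVersion("v1", maxSupportedVersion = 2)   // returns 1

// Newer than this build supports: throws IllegalStateException ("UnsupportedLogVersion ...").
// MetadataVersionUtil.validateVersion("v3", maxSupportedVersion = 2)

// Malformed inputs: throw IllegalStateException ("Log file was malformed ...").
// MetadataVersionUtil.validateVersion("xyz", maxSupportedVersion = 2)
// MetadataVersionUtil.validateVersion("v-1", maxSupportedVersion = 2)
```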
@@ -0,0 +1,118 @@ (new file)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.streaming.state

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, MetadataVersionUtil}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DataType, StructType}

case class StateSchemaNotCompatible(message: String) extends Exception(message)

class StateSchemaCompatibilityChecker(
    providerId: StateStoreProviderId,
    hadoopConf: Configuration) extends Logging {

  private val storeCpLocation = providerId.storeId.storeCheckpointLocation()
  private val fm = CheckpointFileManager.create(storeCpLocation, hadoopConf)
  private val schemaFileLocation = schemaFile(storeCpLocation)

  fm.mkdirs(schemaFileLocation.getParent)

  def check(keySchema: StructType, valueSchema: StructType): Unit = {
    if (fm.exists(schemaFileLocation)) {
      logDebug(s"Schema file for provider $providerId exists. Comparing with provided schema.")
      val (storedKeySchema, storedValueSchema) = readSchemaFile()
      if (storedKeySchema.equals(keySchema) && storedValueSchema.equals(valueSchema)) {
        // schema is exactly the same - no further check needed
      } else if (!schemasCompatible(storedKeySchema, keySchema) ||
        !schemasCompatible(storedValueSchema, valueSchema)) {
        val errorMsg = "Provided schema doesn't match the schema of the existing state! " +
          "Note that Spark allows field names to differ: check the number of fields and the " +
          "data type of each field.\n" +
          s"- Provided key schema: $keySchema\n" +
          s"- Provided value schema: $valueSchema\n" +
          s"- Existing key schema: $storedKeySchema\n" +
          s"- Existing value schema: $storedValueSchema\n" +
          s"If you want to force the query to run without schema validation, please set " +
          s"${SQLConf.STATE_SCHEMA_CHECK_ENABLED.key} to false.\n" +
          "Note that running the query with an incompatible schema could cause " +
          "non-deterministic behavior."
        logError(errorMsg)
        throw StateSchemaNotCompatible(errorMsg)
      } else {
        logInfo("Detected schema change which is compatible. Allowing to put rows.")
      }
    } else {
      // schema doesn't exist, create one now
      logDebug(s"Schema file for provider $providerId doesn't exist. Creating one.")
      createSchemaFile(keySchema, valueSchema)
    }
  }

  private def schemasCompatible(storedSchema: StructType, schema: StructType): Boolean =
    DataType.equalsIgnoreNameAndCompatibleNullability(storedSchema, schema)

  private def readSchemaFile(): (StructType, StructType) = {
    val inStream = fm.open(schemaFileLocation)
    try {
      val versionStr = inStream.readUTF()
      // Currently we only support version 1, so we can keep the version validation and the
      // parsing logic simple.
      val version = MetadataVersionUtil.validateVersion(versionStr,
        StateSchemaCompatibilityChecker.VERSION)
      require(version == 1)

      val keySchemaStr = inStream.readUTF()
      val valueSchemaStr = inStream.readUTF()

      (StructType.fromString(keySchemaStr), StructType.fromString(valueSchemaStr))
    } catch {
      case e: Throwable =>
        logError(s"Failed to read schema file from $schemaFileLocation", e)
        throw e
    } finally {
      inStream.close()
    }
  }

  private def createSchemaFile(keySchema: StructType, valueSchema: StructType): Unit = {
    val outStream = fm.createAtomic(schemaFileLocation, overwriteIfPossible = false)
    try {
      outStream.writeUTF(s"v${StateSchemaCompatibilityChecker.VERSION}")
      outStream.writeUTF(keySchema.json)
      outStream.writeUTF(valueSchema.json)
      outStream.close()
    } catch {
      case e: Throwable =>
        logError(s"Failed to write schema file to $schemaFileLocation", e)
        outStream.cancel()
        throw e
    }
  }

  private def schemaFile(storeCpLocation: Path): Path =
    new Path(new Path(storeCpLocation, "_metadata"), "schema")
}

object StateSchemaCompatibilityChecker {
  val VERSION = 1
}
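To make the compatibility rule concrete: `schemasCompatible` delegates to `DataType.equalsIgnoreNameAndCompatibleNullability`, so renamed fields pass while type changes fail. A hedged sketch (the schemas are made up; the helper is `private[sql]`, so outside Spark's own packages treat this as pseudocode):

```scala
import org.apache.spark.sql.types._

val stored = new StructType().add("key", LongType).add("count", LongType)

// Renamed fields, same types and nullability: compatible, check() lets the query run.
val renamed = new StructType().add("k", LongType).add("cnt", LongType)
DataType.equalsIgnoreNameAndCompatibleNullability(stored, renamed)  // true

// Changed field type: incompatible, check() throws StateSchemaNotCompatible.
val retyped = new StructType().add("key", LongType).add("count", StringType)
DataType.equalsIgnoreNameAndCompatibleNullability(stored, retyped)  // false
```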
@@ -22,6 +22,7 @@ import java.util.concurrent.{ScheduledFuture, TimeUnit}
 import javax.annotation.concurrent.GuardedBy
 
 import scala.collection.mutable
+import scala.util.Try
 import scala.util.control.NonFatal
 
 import org.apache.hadoop.conf.Configuration
@@ -280,14 +281,14 @@ object StateStoreProvider {
    * Return an instance of the required provider, initialized with the given configurations.
    */
   def createAndInit(
-      stateStoreId: StateStoreId,
+      providerId: StateStoreProviderId,
       keySchema: StructType,
       valueSchema: StructType,
       indexOrdinal: Option[Int], // for sorting the data
       storeConf: StateStoreConf,
       hadoopConf: Configuration): StateStoreProvider = {
     val provider = create(storeConf.providerClass)
-    provider.init(stateStoreId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf)
+    provider.init(providerId.storeId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf)
     provider
   }
@@ -386,10 +387,14 @@
 
   val MAINTENANCE_INTERVAL_CONFIG = "spark.sql.streaming.stateStore.maintenanceInterval"
   val MAINTENANCE_INTERVAL_DEFAULT_SECS = 60
+  val PARTITION_ID_TO_CHECK_SCHEMA = 0
 
   @GuardedBy("loadedProviders")
   private val loadedProviders = new mutable.HashMap[StateStoreProviderId, StateStoreProvider]()
 
+  @GuardedBy("loadedProviders")
+  private val schemaValidated = new mutable.HashMap[StateStoreProviderId, Option[Throwable]]()
+
   /**
    * Runs the `task` periodically and automatically cancels it if there is an exception. `onError`
    * will be called when an exception happens.
@@ -467,10 +472,29 @@
       hadoopConf: Configuration): StateStoreProvider = {
     loadedProviders.synchronized {
       startMaintenanceIfNeeded()
+
+      if (storeProviderId.storeId.partitionId == PARTITION_ID_TO_CHECK_SCHEMA) {
+        val result = schemaValidated.getOrElseUpdate(storeProviderId, {
+          val checker = new StateSchemaCompatibilityChecker(storeProviderId, hadoopConf)
+          // Regardless of configuration, run the compatibility check so that the schema file
+          // is written if it doesn't exist yet.
+          val ret = Try(checker.check(keySchema, valueSchema)).toEither.fold(Some(_), _ => None)
+          if (storeConf.stateSchemaCheckEnabled) {
+            ret
+          } else {
+            None
+          }
+        })
+
+        if (result.isDefined) {
+          throw result.get
+        }
+      }
+
       val provider = loadedProviders.getOrElseUpdate(
         storeProviderId,
         StateStoreProvider.createAndInit(
-          storeProviderId.storeId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf)
+          storeProviderId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf)
       )
       reportActiveStoreInstance(storeProviderId)
       provider
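The schema check above runs only for partition 0 and only once per provider: the outcome is memoized in `schemaValidated` as an `Option[Throwable]`, and the `Try(...).toEither.fold(Some(_), _ => None)` idiom is what turns "did the check throw?" into a cacheable value. A self-contained sketch of that idiom (the `check` stub is illustrative, not this PR's checker):

```scala
import scala.util.Try

// Stand-in for StateSchemaCompatibilityChecker.check: throws when incompatible.
def check(schemaOk: Boolean): Unit =
  if (!schemaOk) throw new IllegalStateException("schema mismatch")

// Success folds to None, failure folds to Some(exception) - ready to cache and rethrow later.
val ok: Option[Throwable] = Try(check(schemaOk = true)).toEither.fold(Some(_), _ => None)
val bad: Option[Throwable] = Try(check(schemaOk = false)).toEither.fold(Some(_), _ => None)

assert(ok.isEmpty)
assert(bad.exists(_.getMessage == "schema mismatch"))
```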
@@ -482,6 +506,12 @@
     loadedProviders.remove(storeProviderId).foreach(_.close())
   }
 
+  /** Unload all state store providers: unit test purpose */
+  private[sql] def unloadAll(): Unit = loadedProviders.synchronized {
+    loadedProviders.keySet.foreach { key => unload(key) }
+    loadedProviders.clear()
+  }
+
   /** Whether a state store provider is loaded or not */
   def isLoaded(storeProviderId: StateStoreProviderId): Boolean = loadedProviders.synchronized {
     loadedProviders.contains(storeProviderId)
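Since `unloadAll` exists purely for tests, a hedged teardown sketch (the ScalaTest wiring and suite name are assumptions; `unloadAll` is `private[sql]`, so a real suite must live under Spark's own packages):

```scala
import org.scalatest.BeforeAndAfterEach
import org.scalatest.funsuite.AnyFunSuite

import org.apache.spark.sql.execution.streaming.state.StateStore

class StateStoreSchemaCheckSuite extends AnyFunSuite with BeforeAndAfterEach {
  override def afterEach(): Unit = {
    try {
      // Drop every cached provider so the next test starts from a clean slate.
      StateStore.unloadAll()
    } finally {
      super.afterEach()
    }
  }
}
```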
Review comment: .version("3.1.0")?

Author reply: Yeah, the versioning didn't exist before, but it's needed now. Thanks for the pointer.