
Commit 7dd973d

huanliwang-db authored and anishshri-db committed
[SPARK-54118][SS] Improve the put/merge operation in ListState when there are multiple values
In SS TWS, the `put(array)` operation in ListState puts the first element and then merges the remaining elements one by one. So putting an array with 100 elements costs 1 put + 99 merges, which can perform worse than a single put of the entire array. `merge(array)` has the same issue.

Ran the benchmark with inputRate = 1M keys/second and 1M key cardinality; here are the batch duration results with TWS in SS.

Before:

```
Batch Duration (ms)
p50   666.00
p90   899.70
p95   969.35
p99  1081.94
```

After:

```
Batch Duration (ms)
p50  488
p90  576
p95  609
p99  713
```

### What changes were proposed in this pull request?

Improve the existing `put(array)` and `merge(array)` implementation to reduce the number of RocksDB operations.

### Why are the changes needed?

Performance improvement.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing UT and new UT.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#52820 from huanliwang-db/huanliwang-db/improve-list-state.

Authored-by: huanliwang-db <[email protected]>
Signed-off-by: Anish Shrigondekar <[email protected]>
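To make the improvement concrete, here is a minimal sketch of the two write paths. The `SimpleStore` trait and method signatures are illustrative stand-ins, not Spark's actual API:

```scala
object ListWriteSketch {
  // Illustrative stand-in for a state store that supports single-value and
  // multi-value writes; not Spark's StateStore interface.
  trait SimpleStore {
    def put(key: Array[Byte], value: Array[Byte]): Unit
    def merge(key: Array[Byte], value: Array[Byte]): Unit
    def putList(key: Array[Byte], values: Seq[Array[Byte]]): Unit
  }

  // Old path: 1 put + (N - 1) merges for an N-element list.
  def writeAllOld(store: SimpleStore, key: Array[Byte], values: Seq[Array[Byte]]): Unit =
    values.zipWithIndex.foreach { case (v, i) =>
      if (i == 0) store.put(key, v) else store.merge(key, v)
    }

  // New path: one putList call covers the whole list with a single store write.
  def writeAllNew(store: SimpleStore, key: Array[Byte], values: Seq[Array[Byte]]): Unit =
    store.putList(key, values)
}
```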
1 parent ac717dd commit 7dd973d

8 files changed, +255 -12 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/operators/stateful/transformwithstate/statevariables/ListStateImpl.scala

Lines changed: 6 additions & 12 deletions
@@ -88,21 +88,15 @@ class ListStateImpl[S](
     validateNewState(newState)
 
     val encodedKey = stateTypesEncoder.encodeGroupingKey()
-    var isFirst = true
     var entryCount = 0L
     TWSMetricsUtils.resetMetric(metrics, "numUpdatedStateRows")
 
-    newState.foreach { v =>
-      val encodedValue = stateTypesEncoder.encodeValue(v)
-      if (isFirst) {
-        store.put(encodedKey, encodedValue, stateName)
-        isFirst = false
-      } else {
-        store.merge(encodedKey, encodedValue, stateName)
-      }
+    val encodedValues = newState.map { v =>
       entryCount += 1
       TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows")
+      stateTypesEncoder.encodeValue(v).copy()
     }
+    store.putList(encodedKey, encodedValues, stateName)
     updateEntryCount(encodedKey, entryCount)
   }

@@ -123,12 +117,12 @@ class ListStateImpl[S](
 
     val encodedKey = stateTypesEncoder.encodeGroupingKey()
     var entryCount = getEntryCount(encodedKey)
-    newState.foreach { v =>
-      val encodedValue = stateTypesEncoder.encodeValue(v)
-      store.merge(encodedKey, encodedValue, stateName)
+    val encodedValues = newState.map { v =>
       entryCount += 1
       TWSMetricsUtils.incrementMetric(metrics, "numUpdatedStateRows")
+      stateTypesEncoder.encodeValue(v).copy()
     }
+    store.mergeList(encodedKey, encodedValues, stateName)
     updateEntryCount(encodedKey, entryCount)
   }
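A note on the `.copy()` calls added above: the values are now collected into an array before a single `putList`/`mergeList` call, rather than being written to the store one at a time. A minimal sketch of why copying matters, assuming the encoder reuses a single UnsafeRow buffer across `encodeValue` calls (typical for projection-based encoders); `collectEncoded` is an illustrative helper, not part of the actual change:

```scala
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

object CopyBeforeBatching {
  // Collect encoded values for a later batched write.
  def collectEncoded[S](values: Array[S], encodeValue: S => UnsafeRow): Array[UnsafeRow] =
    // Without .copy(), every slot could point at the same reused buffer and
    // hold only the last encoded value by the time putList runs.
    values.map(v => encodeValue(v).copy())
}
```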

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala

Lines changed: 9 additions & 0 deletions
@@ -292,11 +292,20 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
       throw StateStoreErrors.unsupportedOperationException("multipleValuesPerKey", providerName)
     }
 
+    override def putList(key: UnsafeRow, values: Array[UnsafeRow], colFamilyName: String): Unit = {
+      throw StateStoreErrors.unsupportedOperationException("putList", providerName)
+    }
+
     override def merge(key: UnsafeRow,
         value: UnsafeRow,
         colFamilyName: String): Unit = {
       throw StateStoreErrors.unsupportedOperationException("merge", providerName)
     }
+
+    override def mergeList(
+        key: UnsafeRow, values: Array[UnsafeRow], colFamilyName: String): Unit = {
+      throw StateStoreErrors.unsupportedOperationException("mergeList", providerName)
+    }
   }
 
   def getMetricsForProvider(): Map[String, Long] = synchronized {

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala

Lines changed: 97 additions & 0 deletions
@@ -40,6 +40,7 @@ import org.apache.spark.TaskContext
 import org.apache.spark.internal.{LogEntry, Logging, LogKeys}
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.unsafe.Platform
 import org.apache.spark.util.{NextIterator, Utils}
 
 // RocksDB operations that could acquire/release the instance lock

@@ -1050,6 +1051,69 @@
     changelogWriter.foreach(_.put(keyWithPrefix, valueWithChecksum))
   }
 
+  /**
+   * Convert the given list of value row bytes into a single byte array. The returned array
+   * bytes supports additional values to be later merged to it.
+   */
+  private def getListValuesInArrayByte(values: List[Array[Byte]]): Array[Byte] = {
+    // Delimit each value row bytes with a single byte delimiter, the last
+    // value row won't have a delimiter at the end.
+    val delimiterNum = values.length - 1
+    // The bytes in values already include the bytes length prefix
+    val totalSize = values.map(_.length).sum +
+      delimiterNum // for each delimiter
+
+    val result = new Array[Byte](totalSize)
+    var pos = Platform.BYTE_ARRAY_OFFSET
+
+    values.zipWithIndex.foreach { case (rowBytes, idx) =>
+      // Write the data
+      Platform.copyMemory(rowBytes, Platform.BYTE_ARRAY_OFFSET, result, pos, rowBytes.length)
+      pos += rowBytes.length
+
+      // Add the delimiter - we are using "," as the delimiter
+      if (idx < delimiterNum) {
+        result(pos - Platform.BYTE_ARRAY_OFFSET) = 44.toByte
+      }
+      // Move the position for delimiter
+      pos += 1
+    }
+    result
+  }
+
+  /**
+   * Put the given list of values for the given key.
+   * @note
+   *   This update is not committed to disk until commit() is called.
+   */
+  def putList(
+      key: Array[Byte],
+      values: List[Array[Byte]],
+      cfName: String = StateStore.DEFAULT_COL_FAMILY_NAME,
+      includesPrefix: Boolean = false,
+      deriveCfName: Boolean = false): Unit = {
+    updateMemoryUsageIfNeeded()
+    val keyWithPrefix = if (useColumnFamilies && !includesPrefix) {
+      encodeStateRowWithPrefix(key, cfName)
+    } else {
+      key
+    }
+
+    val valuesInArrayByte = getListValuesInArrayByte(values)
+
+    val columnFamilyName = if (deriveCfName && useColumnFamilies) {
+      val (_, cfName) = decodeStateRowWithPrefix(keyWithPrefix)
+      cfName
+    } else {
+      cfName
+    }
+
+    handleMetricsUpdate(keyWithPrefix, columnFamilyName, isPutOrMerge = true)
+    db.put(writeOptions, keyWithPrefix, valuesInArrayByte)
+    changelogWriter.foreach(_.put(keyWithPrefix, valuesInArrayByte))
+  }
+
   /**
    * Merge the given value for the given key. This is equivalent to the Atomic
    * Read-Modify-Write operation in RocksDB, known as the "Merge" operation. The

@@ -1094,6 +1158,39 @@
     changelogWriter.foreach(_.merge(keyWithPrefix, valueWithChecksum))
   }
 
+  /**
+   * Merge the given list of values for the given key.
+   *
+   * This is similar to the merge() function, but allows merging multiple values at once. The
+   * provided values will be appended to the current list of values for the given key.
+   */
+  def mergeList(
+      key: Array[Byte],
+      values: List[Array[Byte]],
+      cfName: String = StateStore.DEFAULT_COL_FAMILY_NAME,
+      includesPrefix: Boolean = false,
+      deriveCfName: Boolean = false): Unit = {
+    updateMemoryUsageIfNeeded()
+    val keyWithPrefix = if (useColumnFamilies && !includesPrefix) {
+      encodeStateRowWithPrefix(key, cfName)
+    } else {
+      key
+    }
+
+    val columnFamilyName = if (deriveCfName && useColumnFamilies) {
+      val (_, cfName) = decodeStateRowWithPrefix(keyWithPrefix)
+      cfName
+    } else {
+      cfName
+    }
+
+    val valueInArrayByte = getListValuesInArrayByte(values)
+
+    handleMetricsUpdate(keyWithPrefix, columnFamilyName, isPutOrMerge = true)
+    db.merge(writeOptions, keyWithPrefix, valueInArrayByte)
+    changelogWriter.foreach(_.merge(keyWithPrefix, valueInArrayByte))
+  }
+
   /**
    * Remove the key if present.
    * @note This update is not committed to disk until commit() is called.
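The key idea in `getListValuesInArrayByte` is to pre-join the value rows with a single `','` (0x2C) byte, the same delimiter an append-style merge produces for this key (see the `"1,2,3,4"` assertions in the suite below), so that a later `merge`/`mergeList` can keep appending to the blob written by `putList`. A simplified re-expression of that layout without the `Platform` API, as a sketch only (not the committed code):

```scala
import java.io.ByteArrayOutputStream

object ListValueLayoutSketch {
  // Join value byte arrays with a single ',' delimiter, no trailing delimiter.
  def joinWithCommaDelimiter(values: List[Array[Byte]]): Array[Byte] = {
    val out = new ByteArrayOutputStream()
    values.zipWithIndex.foreach { case (bytes, idx) =>
      out.write(bytes)
      if (idx < values.length - 1) out.write(','.toInt) // 0x2C delimiter
    }
    out.toByteArray
  }

  // e.g. joinWithCommaDelimiter(List("7", "8", "9").map(_.getBytes))
  //      produces the same bytes as "7,8,9".getBytes
}
```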

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala

Lines changed: 47 additions & 0 deletions
@@ -300,6 +300,31 @@ private[sql] class RocksDBStateStoreProvider
     rocksDB.merge(keyEncoder.encodeKey(key), valueEncoder.encodeValue(value), colFamilyName)
   }
 
+  override def mergeList(
+      key: UnsafeRow,
+      values: Array[UnsafeRow],
+      colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit = {
+    validateAndTransitionState(UPDATE)
+    verify(state == UPDATING, "Cannot merge after already committed or aborted")
+    verifyColFamilyOperations("merge", colFamilyName)
+
+    val kvEncoder = keyValueEncoderMap.get(colFamilyName)
+    val keyEncoder = kvEncoder._1
+    val valueEncoder = kvEncoder._2
+    verify(
+      valueEncoder.supportsMultipleValuesPerKey,
+      "Merge operation requires an encoder" +
+        " which supports multiple values for a single key")
+    verify(key != null, "Key cannot be null")
+    require(values != null, "Cannot merge a null value")
+    values.foreach(v => require(v != null, "Cannot merge a null value in the array"))
+
+    rocksDB.mergeList(
+      keyEncoder.encodeKey(key),
+      values.map(valueEncoder.encodeValue).toList,
+      colFamilyName)
+  }
+
   override def put(key: UnsafeRow, value: UnsafeRow, colFamilyName: String): Unit = {
     validateAndTransitionState(UPDATE)
     verify(state == UPDATING, "Cannot put after already committed or aborted")

@@ -311,6 +336,28 @@ private[sql] class RocksDBStateStoreProvider
     rocksDB.put(kvEncoder._1.encodeKey(key), kvEncoder._2.encodeValue(value), colFamilyName)
   }
 
+  override def putList(
+      key: UnsafeRow,
+      values: Array[UnsafeRow],
+      colFamilyName: String): Unit = {
+    validateAndTransitionState(UPDATE)
+    verify(state == UPDATING, "Cannot put after already committed or aborted")
+    verify(key != null, "Key cannot be null")
+    require(values != null, "Cannot put a null value")
+    values.foreach(v => require(v != null, "Cannot put a null value in the array"))
+    verifyColFamilyOperations("put", colFamilyName)
+
+    val kvEncoder = keyValueEncoderMap.get(colFamilyName)
+    verify(
+      kvEncoder._2.supportsMultipleValuesPerKey,
+      "Multi-value put operation requires an encoder" +
+        " which supports multiple values for a single key")
+    rocksDB.putList(
+      kvEncoder._1.encodeKey(key),
+      values.map(kvEncoder._2.encodeValue).toList,
+      colFamilyName)
+  }
+
   override def remove(key: UnsafeRow, colFamilyName: String): Unit = {
     validateAndTransitionState(UPDATE)
     verify(state == UPDATING, "Cannot remove after already committed or aborted")

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala

Lines changed: 22 additions & 0 deletions
@@ -208,6 +208,16 @@ trait StateStore extends ReadStateStore {
       value: UnsafeRow,
       colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit
 
+  /**
+   * Put a new list of non-null values for a non-null key. Implementations must be aware that the
+   * UnsafeRows in the params can be reused, and must make copies of the data as needed for
+   * persistence.
+   */
+  def putList(
+      key: UnsafeRow,
+      values: Array[UnsafeRow],
+      colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit
+
   /**
    * Remove a single non-null key.
    */

@@ -225,6 +235,18 @@ trait StateStore extends ReadStateStore {
   def merge(key: UnsafeRow, value: UnsafeRow,
       colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit
 
+  /**
+   * Merges the provided list of values with existing values of a non-null key. If a existing
+   * value does not exist, this operation behaves as [[StateStore.putArray()]].
+   *
+   * It is expected to throw exception if Spark calls this method without setting
+   * multipleValuesPerKey as true for the column family.
+   */
+  def mergeList(
+      key: UnsafeRow,
+      values: Array[UnsafeRow],
+      colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit
+
   /**
    * Commit all the updates that have been made to the store, and return the new version.
    * Implementations should ensure that no more updates (puts, removes) can be after a commit in
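A toy model of the `mergeList` contract documented above (append to the existing values for the key; behave like a fresh put when the key is absent). This is not a `StateStore` implementation and skips UnsafeRow encoding entirely; `ToyListStore` is illustrative only:

```scala
import scala.collection.mutable

class ToyListStore {
  private val data = mutable.Map.empty[String, Vector[String]]

  // putList replaces whatever was stored for the key.
  def putList(key: String, values: Seq[String]): Unit =
    data(key) = values.toVector

  // mergeList appends; when the key is absent it degenerates to putList.
  def mergeList(key: String, values: Seq[String]): Unit =
    data(key) = data.getOrElse(key, Vector.empty) ++ values

  def get(key: String): Seq[String] = data.getOrElse(key, Vector.empty)
}
```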

sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/MemoryStateStore.scala

Lines changed: 8 additions & 0 deletions
@@ -51,6 +51,10 @@ class MemoryStateStore extends StateStore() {
   override def put(key: UnsafeRow, newValue: UnsafeRow, colFamilyName: String): Unit =
     map.put(key.copy(), newValue.copy())
 
+  override def putList(key: UnsafeRow, newValues: Array[UnsafeRow], colFamilyName: String): Unit = {
+    throw new UnsupportedOperationException("Doesn't support put multiple values put")
+  }
+
   override def remove(key: UnsafeRow, colFamilyName: String): Unit = map.remove(key)
 
   override def commit(): Long = version + 1

@@ -78,6 +82,10 @@ class MemoryStateStore extends StateStore() {
     throw new UnsupportedOperationException("Doesn't support multiple values per key")
   }
 
+  override def mergeList(key: UnsafeRow, values: Array[UnsafeRow], colFamilyName: String): Unit = {
+    throw new UnsupportedOperationException("Doesn't support multiple values merge")
+  }
+
   override def valuesIterator(key: UnsafeRow, colFamilyName: String): Iterator[UnsafeRow] = {
     throw new UnsupportedOperationException("Doesn't support multiple values per key")
   }

sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreCheckpointFormatV2Suite.scala

Lines changed: 14 additions & 0 deletions
@@ -123,6 +123,13 @@ case class CkptIdCollectingStateStoreWrapper(innerStore: StateStore) extends Sta
     innerStore.put(key, value, colFamilyName)
   }
 
+  override def putList(
+      key: UnsafeRow,
+      values: Array[UnsafeRow],
+      colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit = {
+    innerStore.putList(key, values, colFamilyName)
+  }
+
   override def remove(
       key: UnsafeRow,
       colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit = {

@@ -136,6 +143,13 @@ case class CkptIdCollectingStateStoreWrapper(innerStore: StateStore) extends Sta
     innerStore.merge(key, value, colFamilyName)
   }
 
+  override def mergeList(
+      key: UnsafeRow,
+      values: Array[UnsafeRow],
+      colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Unit = {
+    innerStore.mergeList(key, values, colFamilyName)
+  }
+
   override def commit(): Long = innerStore.commit()
   override def metrics: StateStoreMetrics = innerStore.metrics
   override def getStateStoreCheckpointInfo(): StateStoreCheckpointInfo = {

sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala

Lines changed: 52 additions & 0 deletions
@@ -2016,6 +2016,58 @@ class RocksDBSuite extends AlsoTestWithRocksDBFeatures with SharedSparkSession
     }
   }
 
+  test("RocksDB: ensure putList / mergeList operation correctness") {
+    withTempDir { dir =>
+      val remoteDir = Utils.createTempDir().toString
+      // minDeltasForSnapshot being 5 ensures that only changelog files are created
+      // for the 3 commits below
+      val conf = dbConf.copy(minDeltasForSnapshot = 5, compactOnCommit = false)
+      new File(remoteDir).delete() // to make sure that the directory gets created
+      withDB(remoteDir, conf = conf, useColumnFamilies = true) { db =>
+        db.load(0)
+        db.put("a", "1".getBytes)
+        db.mergeList("a", Seq("2", "3", "4").map(_.getBytes).toList)
+        db.commit()
+
+        db.load(1)
+        db.mergeList("a", Seq("5", "6").map(_.getBytes).toList)
+        db.commit()
+
+        db.load(2)
+        db.remove("a")
+        db.commit()
+
+        db.load(3)
+        db.putList("a", Seq("7", "8", "9").map(_.getBytes).toList)
+        db.commit()
+
+        db.load(4)
+        db.putList("a", Seq("10", "11").map(_.getBytes).toList)
+        db.commit()
+
+        db.load(1)
+        assert(new String(db.get("a")) === "1,2,3,4")
+        assert(db.iterator().map(toStr).toSet === Set(("a", "1,2,3,4")))
+
+        db.load(2)
+        assert(new String(db.get("a")) === "1,2,3,4,5,6")
+        assert(db.iterator().map(toStr).toSet === Set(("a", "1,2,3,4,5,6")))
+
+        db.load(3)
+        assert(db.get("a") === null)
+        assert(db.iterator().isEmpty)
+
+        db.load(4)
+        assert(new String(db.get("a")) === "7,8,9")
+        assert(db.iterator().map(toStr).toSet === Set(("a", "7,8,9")))
+
+        db.load(5)
+        assert(new String(db.get("a")) === "10,11")
+        assert(db.iterator().map(toStr).toSet === Set(("a", "10,11")))
+      }
+    }
+  }
+
   testWithStateStoreCheckpointIdsAndColumnFamilies("RocksDBFileManager: delete orphan files",
     TestWithBothChangelogCheckpointingEnabledAndDisabled) {
     case (enableStateStoreCheckpointIds, colFamiliesEnabled) =>
