From e8c27a54e1c33be49439639729526339b22c37cb Mon Sep 17 00:00:00 2001 From: Jeremy Dahlgren Date: Fri, 23 May 2025 12:27:25 -0400 Subject: [PATCH 1/7] Allow missing shard stats for restarted nodes for _snapshot/_status Returns an empty shard stats for shard entries where stats were unavailable in the case where a node has been restarted or left the cluster. The change adds a 'description' field to the SnapshotIndexShardStatus class that is used to include a message indicating why the stats are empty. This change was motivated by a desire to reduce latency for getting the stats for currently running snapshots. The stats can still be loaded from the repository via a _snapshot//snapshot/_status call. Closes ES-10982 --- .../snapshots/SnapshotStatusApisIT.java | 8 +- .../org/elasticsearch/TransportVersions.java | 1 + .../status/SnapshotIndexShardStatus.java | 37 ++++- .../TransportSnapshotsStatusAction.java | 31 ++-- .../status/SnapshotIndexShardStatusTests.java | 9 +- .../TransportSnapshotsStatusActionTests.java | 142 ++++++++++++++++++ 6 files changed, 206 insertions(+), 22 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java index 9261e7e206662..3dc626bab60ca 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java @@ -212,7 +212,8 @@ public void testGetSnapshotsWithoutIndices() throws Exception { * 1. Start snapshot of two shards (both located on separate data nodes). * 2. Have one of the shards snapshot completely and the other block * 3. Restart the data node that completed its shard snapshot - * 4. Make sure that snapshot status APIs show correct file-counts and -sizes + * 4. Make sure that snapshot status APIs show correct file-counts and -sizes for non-restarted nodes + * 5. Make sure the description string is set for shard snapshots on restarted nodes. * * @throws Exception on failure */ @@ -261,8 +262,9 @@ public void testCorrectCountsForDoneShards() throws Exception { indexTwo ); assertThat(snapshotShardStateAfterNodeRestart.getStage(), is(SnapshotIndexShardStage.DONE)); - assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalFileCount(), equalTo(totalFiles)); - assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalSize(), equalTo(totalFileSize)); + assertNotNull("expected a description string for missing stats", snapshotShardStateAfterNodeRestart.getDescription()); + assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalFileCount(), equalTo(0)); + assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalSize(), equalTo(0L)); unblockAllDataNodes(repoName); assertThat(responseSnapshotOne.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS)); diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index 30739fbb992b2..944cd276518f1 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -259,6 +259,7 @@ static TransportVersion def(int id) { public static final TransportVersion ESQL_TIME_SERIES_SOURCE_STATUS = def(9_076_0_00); public static final TransportVersion ESQL_HASH_OPERATOR_STATUS_OUTPUT_TIME = def(9_077_0_00); public static final TransportVersion ML_INFERENCE_HUGGING_FACE_CHAT_COMPLETION_ADDED = def(9_078_0_00); + public static final TransportVersion SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED = def(9_079_0_00); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java index e868daf0ca46e..fe9a055715964 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.util.Objects; +import static org.elasticsearch.TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED; + public class SnapshotIndexShardStatus extends BroadcastShardResponse implements ToXContentFragment { private final SnapshotIndexShardStage stage; @@ -31,12 +33,17 @@ public class SnapshotIndexShardStatus extends BroadcastShardResponse implements private String failure; + private String description; + public SnapshotIndexShardStatus(StreamInput in) throws IOException { super(in); stage = SnapshotIndexShardStage.fromValue(in.readByte()); stats = new SnapshotStats(in); nodeId = in.readOptionalString(); failure = in.readOptionalString(); + if (in.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED)) { + description = in.readOptionalString(); + } } SnapshotIndexShardStatus(ShardId shardId, SnapshotIndexShardStage stage) { @@ -74,11 +81,23 @@ public SnapshotIndexShardStatus(StreamInput in) throws IOException { } SnapshotIndexShardStatus(ShardId shardId, SnapshotIndexShardStage stage, SnapshotStats stats, String nodeId, String failure) { + this(shardId, stage, stats, nodeId, failure, null); + } + + SnapshotIndexShardStatus( + ShardId shardId, + SnapshotIndexShardStage stage, + SnapshotStats stats, + String nodeId, + String failure, + String description + ) { super(shardId); this.stage = stage; this.stats = stats; this.nodeId = nodeId; this.failure = failure; + this.description = description; } /** @@ -109,6 +128,13 @@ public String getFailure() { return failure; } + /** + * Returns the optional description of the data values contained in the {@code stats} field. + */ + public String getDescription() { + return description; + } + @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); @@ -116,12 +142,16 @@ public void writeTo(StreamOutput out) throws IOException { stats.writeTo(out); out.writeOptionalString(nodeId); out.writeOptionalString(failure); + if (out.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED)) { + out.writeOptionalString(description); + } } static final class Fields { static final String STAGE = "stage"; static final String REASON = "reason"; static final String NODE = "node"; + static final String DESCRIPTION = "description"; } @Override @@ -135,6 +165,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (getFailure() != null) { builder.field(Fields.REASON, getFailure()); } + if (getDescription() != null) { + builder.field(Fields.DESCRIPTION, getDescription()); + } builder.endObject(); return builder; } @@ -151,7 +184,8 @@ public boolean equals(Object o) { return stage == that.stage && Objects.equals(stats, that.stats) && Objects.equals(nodeId, that.nodeId) - && Objects.equals(failure, that.failure); + && Objects.equals(failure, that.failure) + && Objects.equals(description, that.description); } @Override @@ -160,6 +194,7 @@ public int hashCode() { result = 31 * result + (stats != null ? stats.hashCode() : 0); result = 31 * result + (nodeId != null ? nodeId.hashCode() : 0); result = 31 * result + (failure != null ? failure.hashCode() : 0); + result = 31 * result + (description != null ? description.hashCode() : 0); return result; } diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java index f99cd1f593511..e86d59cda2dd9 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java @@ -196,13 +196,13 @@ void buildResponse( } SnapshotsInProgress.ShardSnapshotStatus status = shardEntry.getValue(); if (status.nodeId() != null) { - // We should have information about this shard from the shard: + // We should have information about this shard from the node: TransportNodesSnapshotsStatus.NodeSnapshotStatus nodeStatus = nodeSnapshotStatusMap.get(status.nodeId()); if (nodeStatus != null) { - Map shardStatues = nodeStatus.status().get(entry.snapshot()); - if (shardStatues != null) { + Map shardStatuses = nodeStatus.status().get(entry.snapshot()); + if (shardStatuses != null) { final ShardId sid = entry.shardId(shardEntry.getKey()); - SnapshotIndexShardStatus shardStatus = shardStatues.get(sid); + SnapshotIndexShardStatus shardStatus = shardStatuses.get(sid); if (shardStatus != null) { // We have full information about this shard if (shardStatus.getStage() == SnapshotIndexShardStage.DONE @@ -228,8 +228,6 @@ void buildResponse( } // We failed to find the status of the shard from the responses we received from data nodes. // This can happen if nodes drop out of the cluster completely or restart during the snapshot. - // We rebuild the information they would have provided from their in memory state from the cluster - // state and the repository contents in the below logic final SnapshotIndexShardStage stage = switch (shardEntry.getValue().state()) { case FAILED, ABORTED, MISSING -> SnapshotIndexShardStage.FAILURE; case INIT, WAITING, PAUSED_FOR_NODE_REMOVAL, QUEUED -> SnapshotIndexShardStage.STARTED; @@ -237,17 +235,18 @@ void buildResponse( }; final SnapshotIndexShardStatus shardStatus; if (stage == SnapshotIndexShardStage.DONE) { - // Shard snapshot completed successfully so we should be able to load the exact statistics for this - // shard from the repository already. - final ShardId shardId = entry.shardId(shardEntry.getKey()); + // When processing currently running snapshots, instead of reading the statistics from the repository, which can be + // expensive, we choose instead to provide a message to the caller explaining why the stats are missing and the API + // that can be used to load them. shardStatus = new SnapshotIndexShardStatus( - shardId, - repositoriesService.repository(entry.repository()) - .getShardSnapshotStatus( - entry.snapshot().getSnapshotId(), - entry.indices().get(shardId.getIndexName()), - shardId - ) + entry.shardId(shardEntry.getKey()), + stage, + new SnapshotStats(), + null, + null, + """ + Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \ + completing the snapshot; use /_snapshot///_status to load from the repository.""" ); } else { shardStatus = new SnapshotIndexShardStatus(entry.shardId(shardEntry.getKey()), stage); diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java index f79bd5f6ad864..777a8611ce4f6 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java @@ -39,10 +39,13 @@ protected SnapshotIndexShardStatus createForIndex(String indexName) { SnapshotStats stats = new SnapshotStatsTests().createTestInstance(); String nodeId = randomAlphaOfLength(20); String failure = null; + String description = null; if (rarely()) { failure = randomAlphaOfLength(200); + } else if (rarely()) { + description = randomAlphaOfLength(200); } - return new SnapshotIndexShardStatus(shardId, stage, stats, nodeId, failure); + return new SnapshotIndexShardStatus(shardId, stage, stats, nodeId, failure, description); } @Override @@ -76,6 +79,7 @@ protected boolean supportsUnknownFields() { String rawStage = (String) parsedObjects[i++]; String nodeId = (String) parsedObjects[i++]; String failure = (String) parsedObjects[i++]; + String description = (String) parsedObjects[i++]; SnapshotStats stats = (SnapshotStats) parsedObjects[i]; SnapshotIndexShardStage stage; @@ -89,12 +93,13 @@ protected boolean supportsUnknownFields() { rawStage ); } - return new SnapshotIndexShardStatus(shard, stage, stats, nodeId, failure); + return new SnapshotIndexShardStatus(shard, stage, stats, nodeId, failure, description); } ); innerParser.declareString(constructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.STAGE)); innerParser.declareString(optionalConstructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.NODE)); innerParser.declareString(optionalConstructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.REASON)); + innerParser.declareString(optionalConstructorArg(), new ParseField(SnapshotIndexShardStatus.Fields.DESCRIPTION)); innerParser.declareObject(constructorArg(), (p, c) -> SnapshotStats.fromXContent(p), new ParseField(SnapshotStats.Fields.STATS)); PARSER = (p, indexId, shardName) -> { // Combine the index name in the context with the shard name passed in for the named object parser diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java index 07c7fbd9cd234..4adaa44928b36 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java @@ -14,12 +14,16 @@ import org.elasticsearch.client.internal.node.NodeClient; import org.elasticsearch.cluster.SnapshotsInProgress; import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.common.transport.TransportAddress; +import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.repositories.IndexId; import org.elasticsearch.repositories.RepositoriesService; import org.elasticsearch.repositories.ShardGeneration; +import org.elasticsearch.repositories.ShardSnapshotResult; import org.elasticsearch.snapshots.Snapshot; import org.elasticsearch.snapshots.SnapshotId; import org.elasticsearch.tasks.CancellableTask; @@ -200,4 +204,142 @@ public void onFailure(Exception e) { ); assertTrue("Expected listener to be invoked", listenerInvoked.get()); } + + public void testShardSnapshotMissingDataFromNodeWhenNodeHasBeenRestarted() { + final var snapshot = new Snapshot(ProjectId.DEFAULT, "test-repo", new SnapshotId("snapshot", "uuid")); + final var indexName = "test-index-name"; + final var indexUuid = "test-index-uuid"; + final var shardGeneration = new ShardGeneration("gen"); + final var shardId2 = new ShardId(indexName, indexUuid, 2); + final var nowMsecs = System.currentTimeMillis(); + final var eightKb = ByteSizeValue.ofKb(8).getBytes(); + + final var currentSnapshotEntries = List.of( + SnapshotsInProgress.Entry.snapshot( + snapshot, + randomBoolean(), + randomBoolean(), + SnapshotsInProgress.State.STARTED, + Map.of(indexName, new IndexId(indexName, indexUuid)), + List.of(), + List.of(), + randomNonNegativeLong(), + randomNonNegativeLong(), + Map.of( + new ShardId(indexName, indexUuid, 0), + SnapshotsInProgress.ShardSnapshotStatus.success( + "nodeId0", + new ShardSnapshotResult(shardGeneration, ByteSizeValue.ofKb(5), 1) + ), + new ShardId(indexName, indexUuid, 1), + new SnapshotsInProgress.ShardSnapshotStatus("nodeId1", shardGeneration), + shardId2, + SnapshotsInProgress.ShardSnapshotStatus.success( + "nodeId2", + new ShardSnapshotResult(shardGeneration, ByteSizeValue.ofKb(8), 1) + ) + ), + null, + Map.of(), + IndexVersion.current() + ) + ); + final var nodeSnapshotStatuses = new TransportNodesSnapshotsStatus.NodesSnapshotStatus( + clusterService.getClusterName(), + List.of( + new TransportNodesSnapshotsStatus.NodeSnapshotStatus( + new DiscoveryNode( + "nodeName0", + "nodeId0", + new TransportAddress(TransportAddress.META_ADDRESS, 9000), + Map.of(), + Set.of(), + null + ), + // Here we are missing the snapshot data for the shard on this node. + Map.of() + ), + new TransportNodesSnapshotsStatus.NodeSnapshotStatus( + new DiscoveryNode( + "nodeName2", + "nodeId2", + new TransportAddress(TransportAddress.META_ADDRESS, 9002), + Map.of(), + Set.of(), + null + ), + Map.of( + snapshot, + Map.of( + shardId2, + new SnapshotIndexShardStatus( + new ShardId(indexName, indexUuid, 2), + SnapshotIndexShardStage.DONE, + new SnapshotStats(nowMsecs, 0, 1, 1, 1, eightKb, eightKb, eightKb), + "nodeId2", + null + ) + ) + ) + ) + ), + List.of() + ); + + final Consumer verifyResponse = rsp -> { + assertNotNull(rsp); + final var snapshotStatuses = rsp.getSnapshots(); + assertNotNull(snapshotStatuses); + assertEquals( + "expected 1 snapshot status, got " + snapshotStatuses.size() + ": " + snapshotStatuses, + 1, + snapshotStatuses.size() + ); + final var snapshotStatus = snapshotStatuses.getFirst(); + assertEquals(SnapshotsInProgress.State.STARTED, snapshotStatus.getState()); + final var shardStats = snapshotStatus.getShardsStats(); + assertNotNull("expected non-null shard stats for SnapshotStatus: " + snapshotStatus, shardStats); + assertEquals(new SnapshotShardsStats(0, 1 /* started */, 0, 2 /* done */, 0, 3 /* total */), shardStats); + final var totalStats = snapshotStatus.getStats(); + assertNotNull("expected non-null total stats for SnapshotStatus: " + snapshotStatus, snapshotStatus); + assertEquals("expected total file count to be 1 in the stats: " + totalStats, 1, totalStats.getTotalFileCount()); + assertEquals("expected total size to be " + eightKb + " in the stats: " + totalStats, eightKb, totalStats.getTotalSize()); + final var snapshotStatusIndices = snapshotStatus.getIndices(); + assertNotNull("expected a non-null map from getIndices() from SnapshotStatus: " + snapshotStatus, snapshotStatusIndices); + final var snapshotIndexStatus = snapshotStatusIndices.get(indexName); + assertNotNull( + "no entry for indexName [" + indexName + "] found in snapshotStatusIndices: " + snapshotStatusIndices, + snapshotIndexStatus + ); + final var shardMap = snapshotIndexStatus.getShards(); + assertNotNull("expected a non-null shard map for SnapshotIndexStatus: " + snapshotIndexStatus, shardMap); + final var shard0Entry = shardMap.get(0); + assertNotNull("no entry for shard 0 found in indexName [" + indexName + "] shardMap: " + shardMap, shard0Entry); + assertNotNull("expected a description string for shard 0 with missing stats from node0", shard0Entry.getDescription()); + }; + + final var listener = new ActionListener() { + @Override + public void onResponse(SnapshotsStatusResponse rsp) { + verifyResponse.accept(rsp); + } + + @Override + public void onFailure(Exception e) { + fail("expected onResponse() instead of onFailure(" + e + ")"); + } + }; + + final var listenerInvoked = new AtomicBoolean(false); + + action.buildResponse( + SnapshotsInProgress.EMPTY, + new SnapshotsStatusRequest(TEST_REQUEST_TIMEOUT), + currentSnapshotEntries, + nodeSnapshotStatuses, + new CancellableTask(randomLong(), "type", "action", "desc", null, Map.of()), + ActionListener.runAfter(listener, () -> listenerInvoked.set(true)) + ); + assertTrue("Expected listener to be invoked", listenerInvoked.get()); + } } From 52909c7d433f3a430dbeab40e800f624124f889d Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 23 May 2025 16:46:44 +0000 Subject: [PATCH 2/7] [CI] Auto commit changes from spotless --- .../snapshots/status/TransportSnapshotsStatusAction.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java index e86d59cda2dd9..2919b2d73267d 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java @@ -245,8 +245,8 @@ void buildResponse( null, null, """ - Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \ - completing the snapshot; use /_snapshot///_status to load from the repository.""" + Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \ + completing the snapshot; use /_snapshot///_status to load from the repository.""" ); } else { shardStatus = new SnapshotIndexShardStatus(entry.shardId(shardEntry.getKey()), stage); From 83bd1a976da16ddaccb30722bb2d019e829f67c4 Mon Sep 17 00:00:00 2001 From: Jeremy Dahlgren Date: Fri, 30 May 2025 15:11:50 -0400 Subject: [PATCH 3/7] Use -1 for missing stats values, address code review comments --- .../snapshots/SnapshotStatusApisIT.java | 24 +++++----- .../org/elasticsearch/TransportVersions.java | 2 +- .../status/SnapshotIndexShardStatus.java | 20 ++++++-- .../snapshots/status/SnapshotStats.java | 48 +++++++++++++++++++ .../TransportSnapshotsStatusAction.java | 36 +++++++++----- .../snapshots/status/SnapshotStatsTests.java | 48 +++++++++++++++++++ .../TransportSnapshotsStatusActionTests.java | 29 +++++++++-- 7 files changed, 176 insertions(+), 31 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java index 3dc626bab60ca..4002b569915ba 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java @@ -13,7 +13,6 @@ import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse; import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotIndexShardStage; import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotIndexShardStatus; -import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStats; import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus; import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse; import org.elasticsearch.action.support.GroupedActionListener; @@ -249,22 +248,23 @@ public void testCorrectCountsForDoneShards() throws Exception { assertThat(snapshotShardState.getStage(), is(SnapshotIndexShardStage.DONE)); assertThat(snapshotShardState.getStats().getTotalFileCount(), greaterThan(0)); assertThat(snapshotShardState.getStats().getTotalSize(), greaterThan(0L)); + assertNull("expected a null description for snapshot shard status: " + snapshotShardState, snapshotShardState.getDescription()); }, 30L, TimeUnit.SECONDS); - final SnapshotStats snapshotShardStats = stateFirstShard(getSnapshotStatus(repoName, snapshotOne), indexTwo).getStats(); - final int totalFiles = snapshotShardStats.getTotalFileCount(); - final long totalFileSize = snapshotShardStats.getTotalSize(); - internalCluster().restartNode(dataNodeTwo); - final SnapshotIndexShardStatus snapshotShardStateAfterNodeRestart = stateFirstShard( - getSnapshotStatus(repoName, snapshotOne), - indexTwo - ); + final var snapshotStatusAfterRestart = getSnapshotStatus(repoName, snapshotOne); + final var snapshotShardStateAfterNodeRestart = stateFirstShard(snapshotStatusAfterRestart, indexTwo); assertThat(snapshotShardStateAfterNodeRestart.getStage(), is(SnapshotIndexShardStage.DONE)); - assertNotNull("expected a description string for missing stats", snapshotShardStateAfterNodeRestart.getDescription()); - assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalFileCount(), equalTo(0)); - assertThat(snapshotShardStateAfterNodeRestart.getStats().getTotalSize(), equalTo(0L)); + assertNotNull("expected a non-null description string for missing stats", snapshotShardStateAfterNodeRestart.getDescription()); + final var missingStats = snapshotShardStateAfterNodeRestart.getStats(); + assertThat(missingStats.getTotalFileCount(), equalTo(-1)); + assertThat(missingStats.getTotalSize(), equalTo(-1L)); + + final var snapshotShardStateIndexOne = stateFirstShard(snapshotStatusAfterRestart, indexOne); + assertNull("expected a null description string for available stats", snapshotShardStateIndexOne.getDescription()); + assertThat(snapshotShardStateIndexOne.getStats().getTotalFileCount(), greaterThan(0)); + assertThat(snapshotShardStateIndexOne.getStats().getTotalSize(), greaterThan(0L)); unblockAllDataNodes(repoName); assertThat(responseSnapshotOne.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS)); diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index 7505501258b6d..a375bf64d8c02 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -273,7 +273,7 @@ static TransportVersion def(int id) { public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED = def(9_084_0_00); public static final TransportVersion ESQL_LIMIT_ROW_SIZE = def(9_085_0_00); public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_086_0_00); - public static final TransportVersion SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED = def(9_087_0_00); + public static final TransportVersion SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS = def(9_087_0_00); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java index fe9a055715964..e6f5d4092af61 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java @@ -21,7 +21,7 @@ import java.io.IOException; import java.util.Objects; -import static org.elasticsearch.TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED; +import static org.elasticsearch.TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS; public class SnapshotIndexShardStatus extends BroadcastShardResponse implements ToXContentFragment { @@ -41,7 +41,7 @@ public SnapshotIndexShardStatus(StreamInput in) throws IOException { stats = new SnapshotStats(in); nodeId = in.readOptionalString(); failure = in.readOptionalString(); - if (in.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED)) { + if (in.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { description = in.readOptionalString(); } } @@ -100,6 +100,20 @@ public SnapshotIndexShardStatus(StreamInput in) throws IOException { this.description = description; } + /** + * Creates an instance for scenarios where the snapshot stats are unavailable, with a non-null description of why the stats are missing. + */ + public static SnapshotIndexShardStatus forMissingStats(ShardId shardId, SnapshotIndexShardStage stage, String description) { + return new SnapshotIndexShardStatus( + shardId, + stage, + SnapshotStats.forMissingStats(), + null, + null, + Objects.requireNonNull(description) + ); + } + /** * Returns snapshot stage */ @@ -142,7 +156,7 @@ public void writeTo(StreamOutput out) throws IOException { stats.writeTo(out); out.writeOptionalString(nodeId); out.writeOptionalString(failure); - if (out.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_DESCRIPTION_ADDED)) { + if (out.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { out.writeOptionalString(description); } } diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java index 03b01f88a3a20..f7fb5806b5d08 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java @@ -23,6 +23,8 @@ import java.io.IOException; +import static org.elasticsearch.TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS; + public class SnapshotStats implements Writeable, ToXContentObject { private long startTime; @@ -37,6 +39,18 @@ public class SnapshotStats implements Writeable, ToXContentObject { SnapshotStats() {} SnapshotStats(StreamInput in) throws IOException { + if (in.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS) && in.readBoolean() == false) { + startTime = 0L; + time = 0L; + incrementalFileCount = -1; + processedFileCount = -1; + incrementalSize = -1L; + processedSize = -1L; + totalFileCount = -1; + totalSize = -1L; + return; + } + startTime = in.readVLong(); time = in.readVLong(); @@ -71,6 +85,20 @@ public class SnapshotStats implements Writeable, ToXContentObject { this.processedSize = processedSize; } + /** + * Returns a stats instance with -1 for all non-time values, for use in situations where the snapshot stats are unavailable. + */ + public static SnapshotStats forMissingStats() { + return new SnapshotStats(0L, 0L, -1, -1, -1, -1L, -1L, -1L); + } + + /** + * Returns true if this instance is for a shard snapshot with unavailable stats. + */ + public boolean isMissingStats() { + return incrementalFileCount == -1; + } + /** * Returns time when snapshot started */ @@ -129,6 +157,22 @@ public long getProcessedSize() { @Override public void writeTo(StreamOutput out) throws IOException { + if (out.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { + if (isMissingStats()) { + out.writeBoolean(false); + return; + } + out.writeBoolean(true); + } else if (isMissingStats()) { + throw new IllegalStateException( + "cannot serialize empty stats for transport version [" + + out.getTransportVersion() + + "] less than [" + + SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS + + "]" + ); + } + out.writeVLong(startTime); out.writeVLong(time); @@ -300,6 +344,10 @@ public static SnapshotStats fromXContent(XContentParser parser) throws IOExcepti * @param updateTimestamps Whether or not start time and duration should be updated */ void add(SnapshotStats stats, boolean updateTimestamps) { + if (stats.isMissingStats()) { + return; + } + incrementalFileCount += stats.incrementalFileCount; totalFileCount += stats.totalFileCount; processedFileCount += stats.processedFileCount; diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java index 2919b2d73267d..8b24968e99462 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java @@ -11,6 +11,8 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.elasticsearch.TransportVersion; +import org.elasticsearch.TransportVersions; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionType; import org.elasticsearch.action.support.ActionFilters; @@ -119,7 +121,7 @@ protected void masterOperation( Arrays.asList(request.snapshots()) ); if (currentSnapshots.isEmpty()) { - buildResponse(snapshotsInProgress, request, currentSnapshots, null, cancellableTask, listener); + buildResponse(snapshotsInProgress, request, currentSnapshots, null, state.getMinTransportVersion(), cancellableTask, listener); return; } @@ -152,6 +154,7 @@ protected void masterOperation( request, currentSnapshots, nodeSnapshotStatuses, + state.getMinTransportVersion(), cancellableTask, l ) @@ -160,7 +163,7 @@ protected void masterOperation( ); } else { // We don't have any in-progress shards, just return current stats - buildResponse(snapshotsInProgress, request, currentSnapshots, null, cancellableTask, listener); + buildResponse(snapshotsInProgress, request, currentSnapshots, null, state.getMinTransportVersion(), cancellableTask, listener); } } @@ -171,6 +174,7 @@ void buildResponse( SnapshotsStatusRequest request, List currentSnapshotEntries, TransportNodesSnapshotsStatus.NodesSnapshotStatus nodeSnapshotStatuses, + TransportVersion minTransportVersion, CancellableTask task, ActionListener listener ) { @@ -235,19 +239,27 @@ void buildResponse( }; final SnapshotIndexShardStatus shardStatus; if (stage == SnapshotIndexShardStage.DONE) { + final ShardId shardId = entry.shardId(shardEntry.getKey()); // When processing currently running snapshots, instead of reading the statistics from the repository, which can be // expensive, we choose instead to provide a message to the caller explaining why the stats are missing and the API - // that can be used to load them. - shardStatus = new SnapshotIndexShardStatus( - entry.shardId(shardEntry.getKey()), - stage, - new SnapshotStats(), - null, - null, - """ + // that can be used to load them once the snapshot has completed. + if (minTransportVersion.onOrAfter(TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { + shardStatus = SnapshotIndexShardStatus.forMissingStats(shardId, stage, """ Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \ - completing the snapshot; use /_snapshot///_status to load from the repository.""" - ); + completing the shard snapshot; use /_snapshot///_status to load from the repository \ + once the snapshot has completed."""); + } else { + // BWC behavior, load the stats directly from the repository. + shardStatus = new SnapshotIndexShardStatus( + shardId, + repositoriesService.repository(entry.repository()) + .getShardSnapshotStatus( + entry.snapshot().getSnapshotId(), + entry.indices().get(shardId.getIndexName()), + shardId + ) + ); + } } else { shardStatus = new SnapshotIndexShardStatus(entry.shardId(shardEntry.getKey()), stage); } diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java index e0411254e1280..783d4e1ff6628 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java @@ -9,10 +9,16 @@ package org.elasticsearch.action.admin.cluster.snapshots.status; +import org.elasticsearch.TransportVersion; +import org.elasticsearch.TransportVersions; +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; +import org.elasticsearch.common.io.stream.OutputStreamStreamOutput; import org.elasticsearch.test.AbstractXContentTestCase; import org.elasticsearch.xcontent.XContentParser; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.List; public class SnapshotStatsTests extends AbstractXContentTestCase { @@ -48,4 +54,46 @@ protected SnapshotStats doParseInstance(XContentParser parser) throws IOExceptio protected boolean supportsUnknownFields() { return true; } + + public void testMissingStats() throws IOException { + final var populatedStats = createTestInstance(); + final var missingStats = SnapshotStats.forMissingStats(); + assertEquals(0L, missingStats.getStartTime()); + assertEquals(0L, missingStats.getTime()); + assertEquals(-1, missingStats.getTotalFileCount()); + assertEquals(-1, missingStats.getIncrementalFileCount()); + assertEquals(-1, missingStats.getProcessedFileCount()); + assertEquals(-1L, missingStats.getTotalSize()); + assertEquals(-1L, missingStats.getIncrementalSize()); + assertEquals(-1L, missingStats.getProcessedSize()); + + // Verify round trip serialization. + for (var transportVersion : List.of( + TransportVersions.MINIMUM_COMPATIBLE, + TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS, + TransportVersion.current() + )) { + + for (var stats : List.of(populatedStats, missingStats)) { + final var bytesOut = new ByteArrayOutputStream(); + + try (var streamOut = new OutputStreamStreamOutput(bytesOut)) { + streamOut.setTransportVersion(transportVersion); + + if (transportVersion.onOrAfter(TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS) || stats != missingStats) { + stats.writeTo(streamOut); + } else { + assertThrows(IllegalStateException.class, () -> stats.writeTo(streamOut)); + continue; + } + } + + try (var streamIn = new ByteArrayStreamInput(bytesOut.toByteArray())) { + streamIn.setTransportVersion(transportVersion); + final var statsRead = new SnapshotStats(streamIn); + assertEquals(stats, statsRead); + } + } + } + } } diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java index 4adaa44928b36..525cb293516a1 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusActionTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.action.admin.cluster.snapshots.status; +import org.elasticsearch.TransportVersion; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.support.ActionFilters; import org.elasticsearch.client.internal.node.NodeClient; @@ -199,6 +200,7 @@ public void onFailure(Exception e) { new SnapshotsStatusRequest(TEST_REQUEST_TIMEOUT), currentSnapshotEntries, nodeSnapshotStatuses, + TransportVersion.current(), cancellableTask, ActionListener.runAfter(listener, () -> listenerInvoked.set(true)) ); @@ -213,6 +215,7 @@ public void testShardSnapshotMissingDataFromNodeWhenNodeHasBeenRestarted() { final var shardId2 = new ShardId(indexName, indexUuid, 2); final var nowMsecs = System.currentTimeMillis(); final var eightKb = ByteSizeValue.ofKb(8).getBytes(); + final var shard2SnapshotStats = new SnapshotStats(nowMsecs, 0, 1, 1, 1, eightKb, eightKb, eightKb); final var currentSnapshotEntries = List.of( SnapshotsInProgress.Entry.snapshot( @@ -275,7 +278,7 @@ public void testShardSnapshotMissingDataFromNodeWhenNodeHasBeenRestarted() { new SnapshotIndexShardStatus( new ShardId(indexName, indexUuid, 2), SnapshotIndexShardStage.DONE, - new SnapshotStats(nowMsecs, 0, 1, 1, 1, eightKb, eightKb, eightKb), + shard2SnapshotStats, "nodeId2", null ) @@ -301,7 +304,7 @@ public void testShardSnapshotMissingDataFromNodeWhenNodeHasBeenRestarted() { assertNotNull("expected non-null shard stats for SnapshotStatus: " + snapshotStatus, shardStats); assertEquals(new SnapshotShardsStats(0, 1 /* started */, 0, 2 /* done */, 0, 3 /* total */), shardStats); final var totalStats = snapshotStatus.getStats(); - assertNotNull("expected non-null total stats for SnapshotStatus: " + snapshotStatus, snapshotStatus); + assertNotNull("expected non-null total stats for SnapshotStatus: " + snapshotStatus, totalStats); assertEquals("expected total file count to be 1 in the stats: " + totalStats, 1, totalStats.getTotalFileCount()); assertEquals("expected total size to be " + eightKb + " in the stats: " + totalStats, eightKb, totalStats.getTotalSize()); final var snapshotStatusIndices = snapshotStatus.getIndices(); @@ -313,9 +316,28 @@ public void testShardSnapshotMissingDataFromNodeWhenNodeHasBeenRestarted() { ); final var shardMap = snapshotIndexStatus.getShards(); assertNotNull("expected a non-null shard map for SnapshotIndexStatus: " + snapshotIndexStatus, shardMap); + + // Verify data for the shard 0 entry, which is missing the stats. final var shard0Entry = shardMap.get(0); assertNotNull("no entry for shard 0 found in indexName [" + indexName + "] shardMap: " + shardMap, shard0Entry); - assertNotNull("expected a description string for shard 0 with missing stats from node0", shard0Entry.getDescription()); + assertEquals(SnapshotIndexShardStage.DONE, shard0Entry.getStage()); + final var description = shard0Entry.getDescription(); + assertNotNull("expected a non-null description string for shard 0 with missing stats from node0: " + shard0Entry, description); + assertTrue( + "unexpected description string text: " + description, + description.contains("shard stats missing from a currently running snapshot due to") + ); + assertEquals(SnapshotStats.forMissingStats(), shard0Entry.getStats()); + + // Verify data for the shard 2 entry, which is DONE and has stats present. + final var shard2Entry = shardMap.get(2); + assertNotNull("no entry for shard 2 found in indexName [" + indexName + "] shardMap: " + shardMap, shard2Entry); + assertEquals(SnapshotIndexShardStage.DONE, shard2Entry.getStage()); + assertNull( + "expected a null description string for shard 2 that has stats data present: " + shard2Entry, + shard2Entry.getDescription() + ); + assertEquals("unexpected stats for shard 2: " + shard2Entry, shard2SnapshotStats, shard2Entry.getStats()); }; final var listener = new ActionListener() { @@ -337,6 +359,7 @@ public void onFailure(Exception e) { new SnapshotsStatusRequest(TEST_REQUEST_TIMEOUT), currentSnapshotEntries, nodeSnapshotStatuses, + TransportVersion.current(), new CancellableTask(randomLong(), "type", "action", "desc", null, Map.of()), ActionListener.runAfter(listener, () -> listenerInvoked.set(true)) ); From f3464b579104d96e7ce223ad73e53c899ad1aecb Mon Sep 17 00:00:00 2001 From: Jeremy Dahlgren Date: Fri, 30 May 2025 16:12:47 -0400 Subject: [PATCH 4/7] Update docs/changelog/128399.yaml --- docs/changelog/128399.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/128399.yaml diff --git a/docs/changelog/128399.yaml b/docs/changelog/128399.yaml new file mode 100644 index 0000000000000..042c1b9153f72 --- /dev/null +++ b/docs/changelog/128399.yaml @@ -0,0 +1,5 @@ +pr: 128399 +summary: Allow missing shard stats for restarted nodes for `_snapshot/_status` +area: Snapshot/Restore +type: enhancement +issues: [] From 64217d47ba061be0a332f3cd2c79ef077abab217 Mon Sep 17 00:00:00 2001 From: Jeremy Dahlgren Date: Tue, 3 Jun 2025 12:41:21 -0400 Subject: [PATCH 5/7] Improve serialization tests, address code review comments --- .../elasticsearch/snapshots/SnapshotStatusApisIT.java | 9 +++++---- .../snapshots/status/SnapshotIndexShardStatus.java | 7 ++++--- .../admin/cluster/snapshots/status/SnapshotStats.java | 7 +++++++ .../snapshots/status/TransportSnapshotsStatusAction.java | 2 +- .../snapshots/status/SnapshotIndexShardStatusTests.java | 4 ++++ .../cluster/snapshots/status/SnapshotStatsTests.java | 5 ++++- .../org/elasticsearch/test/AbstractXContentTestCase.java | 9 ++++++++- 7 files changed, 33 insertions(+), 10 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java index 4002b569915ba..96f99447e7b07 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java @@ -254,10 +254,11 @@ public void testCorrectCountsForDoneShards() throws Exception { internalCluster().restartNode(dataNodeTwo); final var snapshotStatusAfterRestart = getSnapshotStatus(repoName, snapshotOne); - final var snapshotShardStateAfterNodeRestart = stateFirstShard(snapshotStatusAfterRestart, indexTwo); - assertThat(snapshotShardStateAfterNodeRestart.getStage(), is(SnapshotIndexShardStage.DONE)); - assertNotNull("expected a non-null description string for missing stats", snapshotShardStateAfterNodeRestart.getDescription()); - final var missingStats = snapshotShardStateAfterNodeRestart.getStats(); + + final var snapshotShardStateIndexTwo = stateFirstShard(snapshotStatusAfterRestart, indexTwo); + assertThat(snapshotShardStateIndexTwo.getStage(), is(SnapshotIndexShardStage.DONE)); + assertNotNull("expected a non-null description string for missing stats", snapshotShardStateIndexTwo.getDescription()); + final var missingStats = snapshotShardStateIndexTwo.getStats(); assertThat(missingStats.getTotalFileCount(), equalTo(-1)); assertThat(missingStats.getTotalSize(), equalTo(-1L)); diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java index e6f5d4092af61..2e4e63cda39cc 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java @@ -101,12 +101,13 @@ public SnapshotIndexShardStatus(StreamInput in) throws IOException { } /** - * Creates an instance for scenarios where the snapshot stats are unavailable, with a non-null description of why the stats are missing. + * Creates an instance for scenarios where the snapshot is {@link SnapshotIndexShardStage#DONE} but the stats are unavailable, with a + * non-null description of why the stats are missing. */ - public static SnapshotIndexShardStatus forMissingStats(ShardId shardId, SnapshotIndexShardStage stage, String description) { + public static SnapshotIndexShardStatus forDoneButMissingStats(ShardId shardId, String description) { return new SnapshotIndexShardStatus( shardId, - stage, + SnapshotIndexShardStage.DONE, SnapshotStats.forMissingStats(), null, null, diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java index f7fb5806b5d08..6fc5a27b8af2e 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java @@ -39,6 +39,7 @@ public class SnapshotStats implements Writeable, ToXContentObject { SnapshotStats() {} SnapshotStats(StreamInput in) throws IOException { + // We use a boolean to indicate if the stats are present (true) or missing (false), to skip writing all the values if missing. if (in.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS) && in.readBoolean() == false) { startTime = 0L; time = 0L; @@ -158,6 +159,7 @@ public long getProcessedSize() { @Override public void writeTo(StreamOutput out) throws IOException { if (out.getTransportVersion().onOrAfter(SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { + // We use a boolean to indicate if the stats are present (true) or missing (false), to skip writing all the values if missing. if (isMissingStats()) { out.writeBoolean(false); return; @@ -326,6 +328,11 @@ public static SnapshotStats fromXContent(XContentParser parser) throws IOExcepti } } } + // For missing stats incrementalFileCount will be -1, and we expect processedFileCount and processedSize to be omitted (still zero). + if (incrementalFileCount == -1) { + assert processedFileCount == 0 && processedSize == 0L && incrementalSize == -1L && totalFileCount == -1 && totalSize == -1L; + return SnapshotStats.forMissingStats(); + } return new SnapshotStats( startTime, time, diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java index 8b24968e99462..e9381066efad6 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java @@ -244,7 +244,7 @@ void buildResponse( // expensive, we choose instead to provide a message to the caller explaining why the stats are missing and the API // that can be used to load them once the snapshot has completed. if (minTransportVersion.onOrAfter(TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { - shardStatus = SnapshotIndexShardStatus.forMissingStats(shardId, stage, """ + shardStatus = SnapshotIndexShardStatus.forDoneButMissingStats(shardId, """ Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \ completing the shard snapshot; use /_snapshot///_status to load from the repository \ once the snapshot has completed."""); diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java index 777a8611ce4f6..412329cc546e9 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatusTests.java @@ -123,4 +123,8 @@ public static SnapshotIndexShardStatus fromXContent(XContentParser parser, Strin XContentParserUtils.ensureExpectedToken(XContentParser.Token.FIELD_NAME, parser.currentToken(), parser); return PARSER.parse(parser, indexId, parser.currentName()); } + + public void testForDoneButMissingStatsXContentSerialization() throws IOException { + testFromXContent(() -> SnapshotIndexShardStatus.forDoneButMissingStats(createTestInstance().getShardId(), randomAlphaOfLength(16))); + } } diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java index 783d4e1ff6628..6e9111c938b3e 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java @@ -67,7 +67,7 @@ public void testMissingStats() throws IOException { assertEquals(-1L, missingStats.getIncrementalSize()); assertEquals(-1L, missingStats.getProcessedSize()); - // Verify round trip serialization. + // Verify round trip Transport serialization. for (var transportVersion : List.of( TransportVersions.MINIMUM_COMPATIBLE, TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS, @@ -95,5 +95,8 @@ public void testMissingStats() throws IOException { } } } + + // Verify round trip XContent serialization. + testFromXContent(SnapshotStats::forMissingStats); } } diff --git a/test/framework/src/main/java/org/elasticsearch/test/AbstractXContentTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/AbstractXContentTestCase.java index 39b0f2b60662e..bd39f11541ac5 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/AbstractXContentTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/test/AbstractXContentTestCase.java @@ -284,9 +284,16 @@ public static void testFromXContent( * both for equality and asserts equality on the two queries. */ public final void testFromXContent() throws IOException { + testFromXContent(this::createTestInstance); + } + + /** + * Generic test that creates a new instance using the given supplier and verifies XContent round trip serialization. + */ + public final void testFromXContent(Supplier testInstanceSupplier) throws IOException { testFromXContent( NUMBER_OF_TEST_RUNS, - this::createTestInstance, + testInstanceSupplier, supportsUnknownFields(), getShuffleFieldsExceptions(), getRandomFieldsExcludeFilter(), From b2f4bf38d4349cfc610e008a735a6baf978b6ac4 Mon Sep 17 00:00:00 2001 From: Jeremy Dahlgren Date: Tue, 3 Jun 2025 16:11:21 -0400 Subject: [PATCH 6/7] Update server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java Adjust forMissingStats() javadoc Co-authored-by: Dianna Hohensee --- .../action/admin/cluster/snapshots/status/SnapshotStats.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java index 6fc5a27b8af2e..acdf009d7bc7a 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java @@ -87,7 +87,7 @@ public class SnapshotStats implements Writeable, ToXContentObject { } /** - * Returns a stats instance with -1 for all non-time values, for use in situations where the snapshot stats are unavailable. + * Returns a stats instance with invalid field values for use in situations where the snapshot stats are unavailable. */ public static SnapshotStats forMissingStats() { return new SnapshotStats(0L, 0L, -1, -1, -1, -1L, -1L, -1L); From b1fc83cb9457952099324ad85a7dc8260058c256 Mon Sep 17 00:00:00 2001 From: Jeremy Dahlgren Date: Tue, 3 Jun 2025 16:51:34 -0400 Subject: [PATCH 7/7] Fix fromXContent() bug, add @Nullable, update description string --- .../status/SnapshotIndexShardStatus.java | 4 +++- .../snapshots/status/SnapshotStats.java | 13 ++++++------ .../TransportSnapshotsStatusAction.java | 4 ++-- .../snapshots/status/SnapshotStatsTests.java | 20 +++++++++++++++++++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java index 2e4e63cda39cc..c745199627026 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotIndexShardStatus.java @@ -13,6 +13,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.core.Nullable; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus; import org.elasticsearch.xcontent.ToXContentFragment; @@ -90,7 +91,7 @@ public SnapshotIndexShardStatus(StreamInput in) throws IOException { SnapshotStats stats, String nodeId, String failure, - String description + @Nullable String description ) { super(shardId); this.stage = stage; @@ -146,6 +147,7 @@ public String getFailure() { /** * Returns the optional description of the data values contained in the {@code stats} field. */ + @Nullable public String getDescription() { return description; } diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java index acdf009d7bc7a..5588e564eab11 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStats.java @@ -250,10 +250,10 @@ public static SnapshotStats fromXContent(XContentParser parser) throws IOExcepti long time = 0; int incrementalFileCount = 0; int totalFileCount = 0; - int processedFileCount = 0; + int processedFileCount = Integer.MIN_VALUE; long incrementalSize = 0; long totalSize = 0; - long processedSize = 0; + long processedSize = Long.MIN_VALUE; while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { XContentParserUtils.ensureExpectedToken(XContentParser.Token.FIELD_NAME, token, parser); String currentName = parser.currentName(); @@ -328,10 +328,11 @@ public static SnapshotStats fromXContent(XContentParser parser) throws IOExcepti } } } - // For missing stats incrementalFileCount will be -1, and we expect processedFileCount and processedSize to be omitted (still zero). - if (incrementalFileCount == -1) { - assert processedFileCount == 0 && processedSize == 0L && incrementalSize == -1L && totalFileCount == -1 && totalSize == -1L; - return SnapshotStats.forMissingStats(); + // Handle the case where the "processed" sub-object is omitted in toXContent() when processedFileCount == incrementalFileCount. + if (processedFileCount == Integer.MIN_VALUE) { + assert processedSize == Long.MIN_VALUE; + processedFileCount = incrementalFileCount; + processedSize = incrementalSize; } return new SnapshotStats( startTime, diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java index e9381066efad6..b501d64b2cd84 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/status/TransportSnapshotsStatusAction.java @@ -246,8 +246,8 @@ void buildResponse( if (minTransportVersion.onOrAfter(TransportVersions.SNAPSHOT_INDEX_SHARD_STATUS_MISSING_STATS)) { shardStatus = SnapshotIndexShardStatus.forDoneButMissingStats(shardId, """ Snapshot shard stats missing from a currently running snapshot due to a node leaving the cluster after \ - completing the shard snapshot; use /_snapshot///_status to load from the repository \ - once the snapshot has completed."""); + completing the shard snapshot; retry once the snapshot has completed to load all shard stats from the \ + repository."""); } else { // BWC behavior, load the stats directly from the repository. shardStatus = new SnapshotIndexShardStatus( diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java index 6e9111c938b3e..ed794f4e9d0aa 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/snapshots/status/SnapshotStatsTests.java @@ -55,6 +55,26 @@ protected boolean supportsUnknownFields() { return true; } + public void testXContentSerializationWhenProcessedFileCountEqualsIncrementalFileCount() throws IOException { + final var instance = createTestInstance(); + final var incrementalSameAsProcessed = new SnapshotStats( + instance.getStartTime(), + instance.getTime(), + instance.getIncrementalFileCount(), + instance.getTotalFileCount(), + instance.getIncrementalFileCount(), // processedFileCount + instance.getIncrementalSize(), + instance.getTotalSize(), + instance.getIncrementalSize() // processedSize + ); + // toXContent() omits the "processed" sub-object in this case, make sure the processed values are set as expected in fromXContent(). + testFromXContent(() -> incrementalSameAsProcessed); + } + + public void testXContentSerializationForEmptyStats() throws IOException { + testFromXContent(SnapshotStats::new); + } + public void testMissingStats() throws IOException { final var populatedStats = createTestInstance(); final var missingStats = SnapshotStats.forMissingStats();