Skip to content

Commit 1f93339

Browse files
Less Verbose Serialization of Snapshot Failure in SLM Metadata (#80942)
We should not serialize the full exception including cause(s) and stacktraces here. This can be a string of multiple MBs for a very large cluster that has a large subset of indices/shards failing to snapshot. We can get the full details of what failed for each shard in detail from the repository as well as from logs anyway. If we fail to finalize the snapshot we still get the rough reason for that failure with this change and can look at the logs for more details.
1 parent 85b3435 commit 1f93339

File tree

5 files changed

+23
-52
lines changed

5 files changed

+23
-52
lines changed

server/src/main/java/org/elasticsearch/ElasticsearchException.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ public class ElasticsearchException extends RuntimeException implements ToXConte
5858
* to control if the {@code caused_by} element should render. Unlike most parameters to {@code toXContent} methods this parameter is
5959
* internal only and not available as a URL parameter.
6060
*/
61-
private static final String REST_EXCEPTION_SKIP_CAUSE = "rest.exception.cause.skip";
61+
public static final String REST_EXCEPTION_SKIP_CAUSE = "rest.exception.cause.skip";
6262
/**
6363
* Passed in the {@link Params} of {@link #generateThrowableXContent(XContentBuilder, Params, Throwable)}
6464
* to control if the {@code stack_trace} element should render. Unlike most parameters to {@code toXContent} methods this parameter is

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ public class SnapshotInvocationRecord extends AbstractDiffable<SnapshotInvocatio
3737
static final ParseField TIMESTAMP = new ParseField("time");
3838
static final ParseField DETAILS = new ParseField("details");
3939

40-
private String snapshotName;
41-
private Long snapshotStartTimestamp;
42-
private long snapshotFinishTimestamp;
43-
private String details;
40+
private final String snapshotName;
41+
private final Long snapshotStartTimestamp;
42+
private final long snapshotFinishTimestamp;
43+
private final String details;
4444

4545
public static final ConstructingObjectParser<SnapshotInvocationRecord, String> PARSER = new ConstructingObjectParser<>(
4646
"snapshot_policy_invocation_record",

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,10 @@
2020
import org.elasticsearch.cluster.metadata.Metadata;
2121
import org.elasticsearch.cluster.service.ClusterService;
2222
import org.elasticsearch.common.Strings;
23-
import org.elasticsearch.common.bytes.BytesReference;
2423
import org.elasticsearch.core.TimeValue;
2524
import org.elasticsearch.snapshots.SnapshotException;
2625
import org.elasticsearch.snapshots.SnapshotInfo;
2726
import org.elasticsearch.xcontent.ToXContent;
28-
import org.elasticsearch.xcontent.XContentBuilder;
29-
import org.elasticsearch.xcontent.json.JsonXContent;
3027
import org.elasticsearch.xpack.core.ClientHelper;
3128
import org.elasticsearch.xpack.core.scheduler.SchedulerEngine;
3229
import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
@@ -39,13 +36,10 @@
3936

4037
import java.io.IOException;
4138
import java.time.Instant;
42-
import java.util.Collections;
4339
import java.util.HashMap;
4440
import java.util.Map;
4541
import java.util.Optional;
4642

47-
import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE;
48-
4943
public class SnapshotLifecycleTask implements SchedulerEngine.Listener {
5044

5145
private static final Logger logger = LogManager.getLogger(SnapshotLifecycleTask.class);
@@ -135,11 +129,6 @@ public void onResponse(CreateSnapshotResponse createSnapshotResponse) {
135129
request.snapshot(),
136130
"failed to create snapshot successfully, " + failures + " out of " + total + " total shards failed"
137131
);
138-
// Add each failed shard's exception as suppressed, the exception contains
139-
// information about which shard failed
140-
// TODO: this seems wrong, investigate whether we actually need all the shard level exception here given that we
141-
// could be dealing with tens of thousands of them at a time
142-
snapInfo.shardFailures().forEach(e::addSuppressed);
143132
// Call the failure handler to register this as a failure and persist it
144133
onFailure(e);
145134
}
@@ -194,13 +183,17 @@ static Optional<SnapshotLifecyclePolicyMetadata> getSnapPolicyMetadata(final Str
194183
);
195184
}
196185

186+
public static String exceptionToString(Exception ex) {
187+
return Strings.toString((builder, params) -> {
188+
ElasticsearchException.generateThrowableXContent(builder, params, ex);
189+
return builder;
190+
}, ToXContent.EMPTY_PARAMS);
191+
}
192+
197193
/**
198194
* A cluster state update task to write the result of a snapshot job to the cluster metadata for the associated policy.
199195
*/
200196
private static class WriteJobStatus extends ClusterStateUpdateTask {
201-
private static final ToXContent.Params STACKTRACE_PARAMS = new ToXContent.MapParams(
202-
Collections.singletonMap(REST_EXCEPTION_SKIP_STACK_TRACE, "false")
203-
);
204197

205198
private final String policyName;
206199
private final String snapshotName;
@@ -230,18 +223,6 @@ static WriteJobStatus failure(String policyId, String snapshotName, long timesta
230223
return new WriteJobStatus(policyId, snapshotName, timestamp, timestamp, Optional.of(exception));
231224
}
232225

233-
private String exceptionToString() throws IOException {
234-
if (exception.isPresent()) {
235-
try (XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder()) {
236-
causeXContentBuilder.startObject();
237-
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, exception.get());
238-
causeXContentBuilder.endObject();
239-
return BytesReference.bytes(causeXContentBuilder).utf8ToString();
240-
}
241-
}
242-
return null;
243-
}
244-
245226
@Override
246227
public ClusterState execute(ClusterState currentState) throws Exception {
247228
SnapshotLifecycleMetadata snapMeta = currentState.metadata().custom(SnapshotLifecycleMetadata.TYPE);
@@ -274,7 +255,14 @@ public ClusterState execute(ClusterState currentState) throws Exception {
274255

275256
if (exception.isPresent()) {
276257
stats.snapshotFailed(policyName);
277-
newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, null, snapshotFinishTime, exceptionToString()));
258+
newPolicyMetadata.setLastFailure(
259+
new SnapshotInvocationRecord(
260+
snapshotName,
261+
null,
262+
snapshotFinishTime,
263+
exception.map(SnapshotLifecycleTask::exceptionToString).orElse(null)
264+
)
265+
);
278266
} else {
279267
stats.snapshotTaken(policyName);
280268
newPolicyMetadata.setLastSuccess(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, snapshotFinishTime, null));

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/history/SnapshotHistoryItem.java

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@
77

88
package org.elasticsearch.xpack.slm.history;
99

10-
import org.elasticsearch.ElasticsearchException;
1110
import org.elasticsearch.common.Strings;
12-
import org.elasticsearch.common.bytes.BytesReference;
1311
import org.elasticsearch.common.io.stream.StreamInput;
1412
import org.elasticsearch.common.io.stream.StreamOutput;
1513
import org.elasticsearch.common.io.stream.Writeable;
@@ -19,16 +17,13 @@
1917
import org.elasticsearch.xcontent.ToXContentObject;
2018
import org.elasticsearch.xcontent.XContentBuilder;
2119
import org.elasticsearch.xcontent.XContentParser;
22-
import org.elasticsearch.xcontent.json.JsonXContent;
2320
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
21+
import org.elasticsearch.xpack.slm.SnapshotLifecycleTask;
2422

2523
import java.io.IOException;
26-
import java.util.Collections;
2724
import java.util.Map;
2825
import java.util.Objects;
2926

30-
import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE;
31-
3227
/**
3328
* Represents the record of a Snapshot Lifecycle Management action, so that it
3429
* can be indexed in a history index or recorded to a log in a structured way
@@ -138,7 +133,7 @@ public static SnapshotHistoryItem creationFailureRecord(
138133
String snapshotName,
139134
Exception exception
140135
) throws IOException {
141-
String exceptionString = exceptionToString(exception);
136+
String exceptionString = SnapshotLifecycleTask.exceptionToString(exception);
142137
return new SnapshotHistoryItem(
143138
timeStamp,
144139
policy.getId(),
@@ -162,7 +157,7 @@ public static SnapshotHistoryItem deletionFailureRecord(
162157
String repository,
163158
Exception exception
164159
) throws IOException {
165-
String exceptionString = exceptionToString(exception);
160+
String exceptionString = SnapshotLifecycleTask.exceptionToString(exception);
166161
return new SnapshotHistoryItem(timestamp, policyId, repository, snapshotName, DELETE_OPERATION, false, null, exceptionString);
167162
}
168163

@@ -273,15 +268,4 @@ public String toString() {
273268
return Strings.toString(this);
274269
}
275270

276-
private static String exceptionToString(Exception exception) throws IOException {
277-
Params stacktraceParams = new MapParams(Collections.singletonMap(REST_EXCEPTION_SKIP_STACK_TRACE, "false"));
278-
String exceptionString;
279-
try (XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder()) {
280-
causeXContentBuilder.startObject();
281-
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, stacktraceParams, exception);
282-
causeXContentBuilder.endObject();
283-
exceptionString = BytesReference.bytes(causeXContentBuilder).utf8ToString();
284-
}
285-
return exceptionString;
286-
}
287271
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTaskTests.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,6 @@ public void testPartialFailureSnapshot() throws Exception {
288288
item.getErrorDetails(),
289289
containsString("failed to create snapshot successfully, 1 out of 3 total shards failed")
290290
);
291-
assertThat(item.getErrorDetails(), containsString("forced failure"));
292291
});
293292

294293
SnapshotLifecycleTask task = new SnapshotLifecycleTask(client, clusterService, historyStore);

0 commit comments

Comments
 (0)