Skip to content

Commit 8127a06

Browse files
committed
Recover broken IndexMetaData as closed
Today, if something is wrong with the IndexMetaData, we detect it very late; most of the time when that happens we have already allocated the index and get endless loops and full log files on data nodes. This change tries to verify IndexService creation during initial state recovery on the master, and if the recovery fails the index is imported as `closed` and won't be allocated at all. Closes #17187
1 parent 7f16a1d commit 8127a06

File tree

8 files changed

+238
-33
lines changed

8 files changed

+238
-33
lines changed

buildSrc/src/main/resources/checkstyle_suppressions.xml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,6 @@
419419
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]env[/\\]NodeEnvironment.java" checks="LineLength" />
420420
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gateway[/\\]AsyncShardFetch.java" checks="LineLength" />
421421
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gateway[/\\]DanglingIndicesState.java" checks="LineLength" />
422-
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gateway[/\\]Gateway.java" checks="LineLength" />
423422
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gateway[/\\]GatewayAllocator.java" checks="LineLength" />
424423
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gateway[/\\]GatewayMetaState.java" checks="LineLength" />
425424
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gateway[/\\]GatewayService.java" checks="LineLength" />

core/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexStateService.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.cluster.metadata;
2121

22+
import org.elasticsearch.ElasticsearchException;
2223
import org.elasticsearch.action.ActionListener;
2324
import org.elasticsearch.action.admin.indices.close.CloseIndexClusterStateUpdateRequest;
2425
import org.elasticsearch.action.admin.indices.open.OpenIndexClusterStateUpdateRequest;
@@ -37,6 +38,8 @@
3738
import org.elasticsearch.common.inject.Inject;
3839
import org.elasticsearch.common.settings.Settings;
3940
import org.elasticsearch.index.Index;
41+
import org.elasticsearch.index.NodeServicesProvider;
42+
import org.elasticsearch.indices.IndicesService;
4043
import org.elasticsearch.rest.RestStatus;
4144
import org.elasticsearch.snapshots.RestoreService;
4245
import org.elasticsearch.snapshots.SnapshotsService;
@@ -59,10 +62,16 @@ public class MetaDataIndexStateService extends AbstractComponent {
5962
private final AllocationService allocationService;
6063

6164
private final MetaDataIndexUpgradeService metaDataIndexUpgradeService;
65+
private final NodeServicesProvider nodeServiceProvider;
66+
private final IndicesService indicesService;
6267

6368
@Inject
64-
public MetaDataIndexStateService(Settings settings, ClusterService clusterService, AllocationService allocationService, MetaDataIndexUpgradeService metaDataIndexUpgradeService) {
69+
public MetaDataIndexStateService(Settings settings, ClusterService clusterService, AllocationService allocationService,
70+
MetaDataIndexUpgradeService metaDataIndexUpgradeService,
71+
NodeServicesProvider nodeServicesProvider, IndicesService indicesService) {
6572
super(settings);
73+
this.nodeServiceProvider = nodeServicesProvider;
74+
this.indicesService = indicesService;
6675
this.clusterService = clusterService;
6776
this.allocationService = allocationService;
6877
this.metaDataIndexUpgradeService = metaDataIndexUpgradeService;
@@ -162,6 +171,12 @@ public ClusterState execute(ClusterState currentState) {
162171
// The index might be closed because we couldn't import it due to old incompatible version
163172
// We need to check that this index can be upgraded to the current version
164173
indexMetaData = metaDataIndexUpgradeService.upgradeIndexMetaData(indexMetaData);
174+
try {
175+
indicesService.verifyIndexMetadata(nodeServiceProvider, indexMetaData);
176+
} catch (Exception e) {
177+
throw new ElasticsearchException("Failed to verify index " + indexMetaData.getIndex(), e);
178+
}
179+
165180
mdBuilder.put(indexMetaData, true);
166181
blocksBuilder.removeIndexBlock(indexName, INDEX_CLOSED_BLOCK);
167182
}

core/src/main/java/org/elasticsearch/gateway/Gateway.java

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,14 @@
3535
import org.elasticsearch.discovery.Discovery;
3636
import org.elasticsearch.env.NodeEnvironment;
3737
import org.elasticsearch.index.Index;
38+
import org.elasticsearch.index.IndexService;
39+
import org.elasticsearch.index.NodeServicesProvider;
40+
import org.elasticsearch.indices.IndicesService;
3841

42+
import java.io.IOException;
3943
import java.nio.file.Path;
44+
import java.util.Arrays;
45+
import java.util.Collections;
4046
import java.util.function.Supplier;
4147

4248
/**
@@ -53,10 +59,15 @@ public class Gateway extends AbstractComponent implements ClusterStateListener {
5359
private final TransportNodesListGatewayMetaState listGatewayMetaState;
5460

5561
private final Supplier<Integer> minimumMasterNodesProvider;
62+
private final IndicesService indicesService;
63+
private final NodeServicesProvider nodeServicesProvider;
5664

5765
public Gateway(Settings settings, ClusterService clusterService, NodeEnvironment nodeEnv, GatewayMetaState metaState,
58-
TransportNodesListGatewayMetaState listGatewayMetaState, Discovery discovery) {
66+
TransportNodesListGatewayMetaState listGatewayMetaState, Discovery discovery,
67+
NodeServicesProvider nodeServicesProvider, IndicesService indicesService) {
5968
super(settings);
69+
this.nodeServicesProvider = nodeServicesProvider;
70+
this.indicesService = indicesService;
6071
this.clusterService = clusterService;
6172
this.nodeEnv = nodeEnv;
6273
this.metaState = metaState;
@@ -66,9 +77,9 @@ public Gateway(Settings settings, ClusterService clusterService, NodeEnvironment
6677
}
6778

6879
public void performStateRecovery(final GatewayStateRecoveredListener listener) throws GatewayException {
69-
ObjectHashSet<String> nodesIds = new ObjectHashSet<>(clusterService.state().nodes().masterNodes().keys());
70-
logger.trace("performing state recovery from {}", nodesIds);
71-
TransportNodesListGatewayMetaState.NodesGatewayMetaState nodesState = listGatewayMetaState.list(nodesIds.toArray(String.class), null).actionGet();
80+
String[] nodesIds = clusterService.state().nodes().masterNodes().keys().toArray(String.class);
81+
logger.trace("performing state recovery from {}", Arrays.toString(nodesIds));
82+
TransportNodesListGatewayMetaState.NodesGatewayMetaState nodesState = listGatewayMetaState.list(nodesIds, null).actionGet();
7283

7384

7485
int requiredAllocation = Math.max(1, minimumMasterNodesProvider.get());
@@ -129,7 +140,17 @@ public void performStateRecovery(final GatewayStateRecoveredListener listener) t
129140
if (electedIndexMetaData != null) {
130141
if (indexMetaDataCount < requiredAllocation) {
131142
logger.debug("[{}] found [{}], required [{}], not adding", index, indexMetaDataCount, requiredAllocation);
143+
} // TODO if this logging statement is correct then we are missing an else here
144+
try {
145+
if (electedIndexMetaData.getState() == IndexMetaData.State.OPEN) {
146+
// verify that we can actually create this index - if not we recover it as closed with lots of warn logs
147+
indicesService.verifyIndexMetadata(nodeServicesProvider, electedIndexMetaData);
148+
}
149+
} catch (Exception e) {
150+
logger.warn("recovering index {} failed - recovering as closed", e, electedIndexMetaData.getIndex());
151+
electedIndexMetaData = IndexMetaData.builder(electedIndexMetaData).state(IndexMetaData.State.CLOSE).build();
132152
}
153+
133154
metaDataBuilder.put(electedIndexMetaData, false);
134155
}
135156
}

core/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ public GatewayMetaState(Settings settings, NodeEnvironment nodeEnv, MetaStateSer
8686
if (DiscoveryNode.masterNode(settings) || DiscoveryNode.dataNode(settings)) {
8787
try {
8888
ensureNoPre019State();
89-
pre20Upgrade();
9089
IndexFolderUpgrader.upgradeIndicesIfNeeded(settings, nodeEnv);
90+
upgradeMetaData();
9191
long startNS = System.nanoTime();
9292
metaStateService.loadFullState();
9393
logger.debug("took {} to load state", TimeValue.timeValueMillis(TimeValue.nsecToMSec(System.nanoTime() - startNS)));
@@ -222,7 +222,7 @@ private void ensureNoPre019State() throws Exception {
222222
* MetaDataIndexUpgradeService might also update obsolete settings if needed. When this happens we rewrite
223223
* index metadata with new settings.
224224
*/
225-
private void pre20Upgrade() throws Exception {
225+
private void upgradeMetaData() throws Exception {
226226
MetaData metaData = loadMetaState();
227227
List<IndexMetaData> updateIndexMetaData = new ArrayList<>();
228228
for (IndexMetaData indexMetaData : metaData) {
@@ -235,7 +235,7 @@ private void pre20Upgrade() throws Exception {
235235
// means the upgrade can continue. Now it's safe to overwrite index metadata with the new version.
236236
for (IndexMetaData indexMetaData : updateIndexMetaData) {
237237
// the index folders have already been upgraded at this point, so we write the index state into the upgraded (UUID-based) folder
238-
metaStateService.writeIndex("upgrade", indexMetaData, nodeEnv.resolveIndexFolder(indexMetaData.getIndex().getName()));
238+
metaStateService.writeIndex("upgrade", indexMetaData, nodeEnv.resolveIndexFolder(indexMetaData.getIndex().getUUID()));
239239
}
240240
}
241241

core/src/main/java/org/elasticsearch/gateway/GatewayService.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
4444
import org.elasticsearch.discovery.Discovery;
4545
import org.elasticsearch.env.NodeEnvironment;
46+
import org.elasticsearch.index.NodeServicesProvider;
47+
import org.elasticsearch.indices.IndicesService;
4648
import org.elasticsearch.rest.RestStatus;
4749
import org.elasticsearch.threadpool.ThreadPool;
4850

@@ -95,9 +97,11 @@ public class GatewayService extends AbstractLifecycleComponent<GatewayService> i
9597
@Inject
9698
public GatewayService(Settings settings, AllocationService allocationService, ClusterService clusterService,
9799
ThreadPool threadPool, NodeEnvironment nodeEnvironment, GatewayMetaState metaState,
98-
TransportNodesListGatewayMetaState listGatewayMetaState, Discovery discovery) {
100+
TransportNodesListGatewayMetaState listGatewayMetaState, Discovery discovery,
101+
NodeServicesProvider nodeServicesProvider, IndicesService indicesService) {
99102
super(settings);
100-
this.gateway = new Gateway(settings, clusterService, nodeEnvironment, metaState, listGatewayMetaState, discovery);
103+
this.gateway = new Gateway(settings, clusterService, nodeEnvironment, metaState, listGatewayMetaState, discovery,
104+
nodeServicesProvider, indicesService);
101105
this.allocationService = allocationService;
102106
this.clusterService = clusterService;
103107
this.threadPool = threadPool;

core/src/main/java/org/elasticsearch/indices/IndicesService.java

Lines changed: 59 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.indices;
2121

22+
import com.carrotsearch.hppc.cursors.ObjectCursor;
2223
import org.apache.lucene.index.DirectoryReader;
2324
import org.apache.lucene.store.LockObtainFailedException;
2425
import org.apache.lucene.util.CollectionUtil;
@@ -33,6 +34,7 @@
3334
import org.elasticsearch.cluster.ClusterState;
3435
import org.elasticsearch.cluster.metadata.IndexMetaData;
3536
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
37+
import org.elasticsearch.cluster.metadata.MappingMetaData;
3638
import org.elasticsearch.cluster.service.ClusterService;
3739
import org.elasticsearch.common.Nullable;
3840
import org.elasticsearch.common.breaker.CircuitBreaker;
@@ -66,6 +68,7 @@
6668
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
6769
import org.elasticsearch.index.flush.FlushStats;
6870
import org.elasticsearch.index.get.GetStats;
71+
import org.elasticsearch.index.mapper.MapperService;
6972
import org.elasticsearch.index.merge.MergeStats;
7073
import org.elasticsearch.index.recovery.RecoveryStats;
7174
import org.elasticsearch.index.refresh.RefreshStats;
@@ -74,6 +77,7 @@
7477
import org.elasticsearch.index.shard.IndexEventListener;
7578
import org.elasticsearch.index.shard.IndexShard;
7679
import org.elasticsearch.index.shard.IndexShardState;
80+
import org.elasticsearch.index.shard.IndexingOperationListener;
7781
import org.elasticsearch.index.shard.IndexingStats;
7882
import org.elasticsearch.index.shard.ShardId;
7983
import org.elasticsearch.index.store.IndexStoreConfig;
@@ -88,9 +92,11 @@
8892
import org.elasticsearch.search.query.QuerySearchResult;
8993
import org.elasticsearch.threadpool.ThreadPool;
9094

95+
import java.io.Closeable;
9196
import java.io.IOException;
9297
import java.nio.file.Files;
9398
import java.util.ArrayList;
99+
import java.util.Collections;
94100
import java.util.EnumSet;
95101
import java.util.HashMap;
96102
import java.util.Iterator;
@@ -324,44 +330,30 @@ public IndexService indexServiceSafe(Index index) {
324330
* @throws IndexAlreadyExistsException if the index already exists.
325331
*/
326332
public synchronized IndexService createIndex(final NodeServicesProvider nodeServicesProvider, IndexMetaData indexMetaData, List<IndexEventListener> builtInListeners) throws IOException {
333+
327334
if (!lifecycle.started()) {
328335
throw new IllegalStateException("Can't create an index [" + indexMetaData.getIndex() + "], node is closed");
329336
}
330337
if (indexMetaData.getIndexUUID().equals(IndexMetaData.INDEX_UUID_NA_VALUE)) {
331338
throw new IllegalArgumentException("index must have a real UUID found value: [" + indexMetaData.getIndexUUID() + "]");
332339
}
333340
final Index index = indexMetaData.getIndex();
334-
final Predicate<String> indexNameMatcher = (indexExpression) -> indexNameExpressionResolver.matchesIndex(index.getName(), indexExpression, clusterService.state());
335-
final IndexSettings idxSettings = new IndexSettings(indexMetaData, this.settings, indexNameMatcher, indexScopeSetting);
336341
if (hasIndex(index)) {
337342
throw new IndexAlreadyExistsException(index);
338343
}
339-
logger.debug("creating Index [{}], shards [{}]/[{}{}]",
340-
indexMetaData.getIndex(),
341-
idxSettings.getNumberOfShards(),
342-
idxSettings.getNumberOfReplicas(),
343-
idxSettings.isShadowReplicaIndex() ? "s" : "");
344-
345-
final IndexModule indexModule = new IndexModule(idxSettings, indexStoreConfig, analysisRegistry);
346-
pluginsService.onIndexModule(indexModule);
347-
for (IndexEventListener listener : builtInListeners) {
348-
indexModule.addIndexEventListener(listener);
349-
}
344+
List<IndexEventListener> finalListeners = new ArrayList<>(builtInListeners);
350345
final IndexEventListener onStoreClose = new IndexEventListener() {
351346
@Override
352347
public void onStoreClosed(ShardId shardId) {
353348
indicesQueryCache.onClose(shardId);
354349
}
355350
};
356-
indexModule.addIndexEventListener(onStoreClose);
357-
indexModule.addIndexEventListener(oldShardsStats);
358-
final IndexEventListener listener = indexModule.freeze();
359-
listener.beforeIndexCreated(index, idxSettings.getSettings());
360-
final IndexService indexService = indexModule.newIndexService(nodeEnv, this, nodeServicesProvider, indicesQueryCache, mapperRegistry, indicesFieldDataCache, indexingMemoryController);
351+
finalListeners.add(onStoreClose);
352+
finalListeners.add(oldShardsStats);
353+
final IndexService indexService = createIndexService("create index", nodeServicesProvider, indexMetaData, indicesQueryCache, indicesFieldDataCache, finalListeners, indexingMemoryController);
361354
boolean success = false;
362355
try {
363-
assert indexService.getIndexEventListener() == listener;
364-
listener.afterIndexCreated(indexService);
356+
indexService.getIndexEventListener().afterIndexCreated(indexService);
365357
indices = newMapBuilder(indices).put(index.getUUID(), indexService).immutableMap();
366358
success = true;
367359
return indexService;
@@ -370,7 +362,54 @@ public void onStoreClosed(ShardId shardId) {
370362
indexService.close("plugins_failed", true);
371363
}
372364
}
365+
}
373366

367+
/**
368+
* This creates a new IndexService without registering it
369+
*/
370+
private synchronized IndexService createIndexService(final String reason, final NodeServicesProvider nodeServicesProvider, IndexMetaData indexMetaData, IndicesQueryCache indicesQueryCache, IndicesFieldDataCache indicesFieldDataCache, List<IndexEventListener> builtInListeners, IndexingOperationListener... indexingOperationListeners) throws IOException {
371+
final Index index = indexMetaData.getIndex();
372+
final Predicate<String> indexNameMatcher = (indexExpression) -> indexNameExpressionResolver.matchesIndex(index.getName(), indexExpression, clusterService.state());
373+
final IndexSettings idxSettings = new IndexSettings(indexMetaData, this.settings, indexNameMatcher, indexScopeSetting);
374+
logger.debug("creating Index [{}], shards [{}]/[{}{}] - reason [{}]",
375+
indexMetaData.getIndex(),
376+
idxSettings.getNumberOfShards(),
377+
idxSettings.getNumberOfReplicas(),
378+
idxSettings.isShadowReplicaIndex() ? "s" : "", reason);
379+
380+
final IndexModule indexModule = new IndexModule(idxSettings, indexStoreConfig, analysisRegistry);
381+
pluginsService.onIndexModule(indexModule);
382+
for (IndexEventListener listener : builtInListeners) {
383+
indexModule.addIndexEventListener(listener);
384+
}
385+
final IndexEventListener listener = indexModule.freeze();
386+
listener.beforeIndexCreated(index, idxSettings.getSettings());
387+
return indexModule.newIndexService(nodeEnv, this, nodeServicesProvider, indicesQueryCache, mapperRegistry, indicesFieldDataCache, indexingOperationListeners);
388+
}
389+
390+
/**
391+
* This method verifies that the given {@link IndexMetaData} holds sane values to create an {@link IndexService}. This method will throw an
392+
* exception if the creation fails. The created {@link IndexService} will not be registered and will be closed immediately.
393+
*/
394+
public synchronized void verifyIndexMetadata(final NodeServicesProvider nodeServicesProvider, IndexMetaData metaData) throws IOException {
395+
final List<Closeable> closeables = new ArrayList<>();
396+
try {
397+
IndicesFieldDataCache indicesFieldDataCache = new IndicesFieldDataCache(settings, new IndexFieldDataCache.Listener() {});
398+
closeables.add(indicesFieldDataCache);
399+
IndicesQueryCache indicesQueryCache = new IndicesQueryCache(settings);
400+
closeables.add(indicesQueryCache);
401+
// this will also fail if some plugin fails etc. which is nice since we can verify that early
402+
final IndexService service = createIndexService("metadata verification", nodeServicesProvider,
403+
metaData, indicesQueryCache, indicesFieldDataCache, Collections.emptyList());
404+
for (ObjectCursor<MappingMetaData> typeMapping : metaData.getMappings().values()) {
405+
// don't apply the default mapping, it has been applied when the mapping was created
406+
service.mapperService().merge(typeMapping.value.type(), typeMapping.value.source(),
407+
MapperService.MergeReason.MAPPING_RECOVERY, true);
408+
}
409+
closeables.add(() -> service.close("metadata verification", false));
410+
} finally {
411+
IOUtils.close(closeables);
412+
}
374413
}
375414

376415
/**

0 commit comments

Comments
 (0)