Skip to content

Commit fe1c8be

Browse files
committed
Avoid self-deadlock in the translog (#29520)
Today when reading an operation from the current generation fails tragically we attempt to close the translog. However, by invoking close before releasing the read lock we end up in self-deadlock because closing tries to acquire the write lock and the read lock can not be upgraded to a write lock. To avoid this, we move the close invocation outside of the try-with-resources that acquired the read lock. As an extra guard against this, we document the problem and add an assertion that we are not trying to invoke close while holding the read lock.
1 parent 991da98 commit fe1c8be

File tree

2 files changed

+20
-10
lines changed

2 files changed

+20
-10
lines changed

server/src/main/java/org/elasticsearch/index/translog/Translog.java

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -585,12 +585,7 @@ public Operation readOperation(Location location) throws IOException {
585585
if (current.generation == location.generation) {
586586
// no need to fsync here the read operation will ensure that buffers are written to disk
587587
// if they are still in RAM and we are reading onto that position
588-
try {
589-
return current.read(location);
590-
} catch (final Exception ex) {
591-
closeOnTragicEvent(ex);
592-
throw ex;
593-
}
588+
return current.read(location);
594589
} else {
595590
// read backwards - it's likely we need to read on that is recent
596591
for (int i = readers.size() - 1; i >= 0; i--) {
@@ -600,6 +595,9 @@ public Operation readOperation(Location location) throws IOException {
600595
}
601596
}
602597
}
598+
} catch (final Exception ex) {
599+
closeOnTragicEvent(ex);
600+
throw ex;
603601
}
604602
return null;
605603
}
@@ -737,15 +735,28 @@ public boolean ensureSynced(Stream<Location> locations) throws IOException {
737735
}
738736
}
739737

738+
/**
739+
* Closes the translog if the current translog writer experienced a tragic exception.
740+
*
741+
* Note that in case this thread closes the translog it must not already be holding a read lock on the translog as it will acquire a
742+
* write lock in the course of closing the translog
743+
*
744+
* @param ex if an exception occurs closing the translog, it will be suppressed into the provided exception
745+
*/
740746
private void closeOnTragicEvent(final Exception ex) {
747+
// we can not hold a read lock here because closing will attempt to obtain a write lock and that would result in self-deadlock
748+
assert readLock.isHeldByCurrentThread() == false : Thread.currentThread().getName();
741749
if (current.getTragicException() != null) {
742750
try {
743751
close();
744752
} catch (final AlreadyClosedException inner) {
745-
// don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as suppressed because
746-
// will contain the Exception ex as cause. See also https://github.com/elastic/elasticsearch/issues/15941
753+
/*
754+
* Don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as
755+
* suppressed because it will contain the provided exception as its cause. See also
756+
* https://github.com/elastic/elasticsearch/issues/15941.
757+
*/
747758
} catch (final Exception inner) {
748-
assert (ex != inner.getCause());
759+
assert ex != inner.getCause();
749760
ex.addSuppressed(inner);
750761
}
751762
}

server/src/test/java/org/elasticsearch/index/translog/TranslogTests.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1812,7 +1812,6 @@ public void testTragicEventCanBeAnyException() throws IOException {
18121812
assertTrue(translog.getTragicException() instanceof UnknownException);
18131813
}
18141814

1815-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/29509")
18161815
public void testFatalIOExceptionsWhileWritingConcurrently() throws IOException, InterruptedException {
18171816
Path tempDir = createTempDir();
18181817
final FailSwitch fail = new FailSwitch();

0 commit comments

Comments
 (0)