|
8 | 8 |
|
9 | 9 | package org.opensearch.index.translog; |
10 | 10 |
|
| 11 | +import org.apache.logging.log4j.LogManager; |
11 | 12 | import org.apache.logging.log4j.Logger; |
| 13 | +import org.opensearch.action.LatchedActionListener; |
12 | 14 | import org.opensearch.cluster.service.ClusterService; |
13 | 15 | import org.opensearch.common.blobstore.BlobMetadata; |
14 | 16 | import org.opensearch.common.collect.Tuple; |
|
33 | 35 | import java.util.Optional; |
34 | 36 | import java.util.Set; |
35 | 37 | import java.util.TreeSet; |
| 38 | +import java.util.concurrent.CountDownLatch; |
| 39 | +import java.util.concurrent.TimeUnit; |
| 40 | +import java.util.concurrent.atomic.AtomicLong; |
36 | 41 | import java.util.function.BooleanSupplier; |
37 | 42 | import java.util.function.LongConsumer; |
38 | 43 | import java.util.function.LongSupplier; |
|
52 | 57 | */ |
53 | 58 | public class RemoteFsTimestampAwareTranslog extends RemoteFsTranslog { |
54 | 59 |
|
| 60 | + private static Logger staticLogger = LogManager.getLogger(RemoteFsTimestampAwareTranslog.class); |
55 | 61 | private final Logger logger; |
56 | 62 | private final Map<Long, String> metadataFilePinnedTimestampMap; |
57 | 63 | // For metadata files, with no min generation in the name, we cache generation data to avoid multiple reads. |
58 | 64 | private final Map<String, Tuple<Long, Long>> oldFormatMetadataFileGenerationMap; |
| 65 | + private final Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap; |
| 66 | + private final AtomicLong minPrimaryTermInRemote = new AtomicLong(Long.MAX_VALUE); |
59 | 67 |
|
60 | 68 | public RemoteFsTimestampAwareTranslog( |
61 | 69 | TranslogConfig config, |
@@ -86,6 +94,7 @@ public RemoteFsTimestampAwareTranslog( |
86 | 94 | logger = Loggers.getLogger(getClass(), shardId); |
87 | 95 | this.metadataFilePinnedTimestampMap = new HashMap<>(); |
88 | 96 | this.oldFormatMetadataFileGenerationMap = new HashMap<>(); |
| 97 | + this.oldFormatMetadataFilePrimaryTermMap = new HashMap<>(); |
89 | 98 | } |
90 | 99 |
|
91 | 100 | @Override |
@@ -165,7 +174,11 @@ public void onResponse(List<BlobMetadata> blobMetadata) { |
165 | 174 | return; |
166 | 175 | } |
167 | 176 |
|
168 | | - List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles); |
| 177 | + List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted( |
| 178 | + metadataFiles, |
| 179 | + metadataFilePinnedTimestampMap, |
| 180 | + logger |
| 181 | + ); |
169 | 182 |
|
170 | 183 | // If index is not deleted, make sure to keep latest metadata file |
171 | 184 | if (indexDeleted == false) { |
@@ -209,7 +222,7 @@ public void onResponse(List<BlobMetadata> blobMetadata) { |
209 | 222 | oldFormatMetadataFileGenerationMap.keySet().retainAll(metadataFilesNotToBeDeleted); |
210 | 223 |
|
211 | 224 | // Delete stale primary terms |
212 | | - deleteStaleRemotePrimaryTerms(metadataFiles); |
| 225 | + deleteStaleRemotePrimaryTerms(metadataFilesNotToBeDeleted); |
213 | 226 | } else { |
214 | 227 | remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS); |
215 | 228 | } |
@@ -259,8 +272,16 @@ protected Set<Long> getGenerationsToBeDeleted( |
259 | 272 | return generationsToBeDeleted; |
260 | 273 | } |
261 | 274 |
|
262 | | - // Visible for testing |
263 | 275 | protected List<String> getMetadataFilesToBeDeleted(List<String> metadataFiles) { |
| 276 | + return getMetadataFilesToBeDeleted(metadataFiles, metadataFilePinnedTimestampMap, logger); |
| 277 | + } |
| 278 | + |
| 279 | + // Visible for testing |
| 280 | + protected static List<String> getMetadataFilesToBeDeleted( |
| 281 | + List<String> metadataFiles, |
| 282 | + Map<Long, String> metadataFilePinnedTimestampMap, |
| 283 | + Logger logger |
| 284 | + ) { |
264 | 285 | Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(); |
265 | 286 |
|
266 | 287 | // Keep files since last successful run of scheduler |
@@ -351,27 +372,167 @@ protected Tuple<Long, Long> getMinMaxTranslogGenerationFromMetadataFile( |
351 | 372 | } |
352 | 373 | } |
353 | 374 |
|
| 375 | + private void deleteStaleRemotePrimaryTerms(List<String> metadataFiles) { |
| 376 | + deleteStaleRemotePrimaryTerms( |
| 377 | + metadataFiles, |
| 378 | + translogTransferManager, |
| 379 | + oldFormatMetadataFilePrimaryTermMap, |
| 380 | + minPrimaryTermInRemote, |
| 381 | + logger |
| 382 | + ); |
| 383 | + } |
| 384 | + |
354 | 385 | /** |
355 | 386 | * This method must be called only after there are valid generations to delete in trimUnreferencedReaders as it ensures |
356 | 387 | * implicitly that minimum primary term in latest translog metadata in remote store is the current primary term. |
357 | 388 | * <br> |
358 | 389 | * This will also delete all stale translog metadata files from remote except the latest basis the metadata file comparator. |
359 | 390 | */ |
360 | | - private void deleteStaleRemotePrimaryTerms(List<String> metadataFiles) { |
| 391 | + private static void deleteStaleRemotePrimaryTerms( |
| 392 | + List<String> metadataFiles, |
| 393 | + TranslogTransferManager translogTransferManager, |
| 394 | + Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap, |
| 395 | + AtomicLong minPrimaryTermInRemote, |
| 396 | + Logger logger |
| 397 | + ) { |
361 | 398 | // The deletion of older translog files in remote store is on best-effort basis, there is a possibility that there |
362 | 399 | // are older files that are no longer needed and should be cleaned up. In here, we delete all files that are part |
363 | 400 | // of older primary term. |
364 | | - if (olderPrimaryCleaned.trySet(Boolean.TRUE)) { |
365 | | - if (metadataFiles.isEmpty()) { |
366 | | - logger.trace("No metadata is uploaded yet, returning from deleteStaleRemotePrimaryTerms"); |
367 | | - return; |
| 401 | + if (metadataFiles.isEmpty()) { |
| 402 | + logger.trace("No metadata is uploaded yet, returning from deleteStaleRemotePrimaryTerms"); |
| 403 | + return; |
| 404 | + } |
| 405 | + Optional<Long> minPrimaryTermFromMetadataFiles = metadataFiles.stream().map(file -> { |
| 406 | + try { |
| 407 | + return getMinMaxPrimaryTermFromMetadataFile(file, translogTransferManager, oldFormatMetadataFilePrimaryTermMap).v1(); |
| 408 | + } catch (IOException e) { |
| 409 | + return Long.MAX_VALUE; |
368 | 410 | } |
369 | | - Optional<Long> minPrimaryTerm = metadataFiles.stream() |
370 | | - .map(file -> RemoteStoreUtils.invertLong(file.split(METADATA_SEPARATOR)[1])) |
371 | | - .min(Long::compareTo); |
372 | | - // First we delete all stale primary terms folders from remote store |
373 | | - long minimumReferencedPrimaryTerm = minPrimaryTerm.get() - 1; |
| 411 | + }).min(Long::compareTo); |
| 412 | + // First we delete all stale primary terms folders from remote store |
| 413 | + long minimumReferencedPrimaryTerm = minPrimaryTermFromMetadataFiles.get() - 1; |
| 414 | + Long minPrimaryTerm = getMinPrimaryTermInRemote(minPrimaryTermInRemote, translogTransferManager, logger); |
| 415 | + if (minimumReferencedPrimaryTerm > minPrimaryTerm) { |
374 | 416 | translogTransferManager.deletePrimaryTermsAsync(minimumReferencedPrimaryTerm); |
| 417 | + minPrimaryTermInRemote.set(minimumReferencedPrimaryTerm); |
| 418 | + } else { |
| 419 | + logger.debug( |
| 420 | + "Skipping primary term cleanup. minimumReferencedPrimaryTerm = {}, minPrimaryTermInRemote = {}", |
| 421 | + minimumReferencedPrimaryTerm, |
| 422 | + minPrimaryTermInRemote |
| 423 | + ); |
375 | 424 | } |
376 | 425 | } |
| 426 | + |
| 427 | + private static Long getMinPrimaryTermInRemote( |
| 428 | + AtomicLong minPrimaryTermInRemote, |
| 429 | + TranslogTransferManager translogTransferManager, |
| 430 | + Logger logger |
| 431 | + ) { |
| 432 | + if (minPrimaryTermInRemote.get() == Long.MAX_VALUE) { |
| 433 | + CountDownLatch latch = new CountDownLatch(1); |
| 434 | + translogTransferManager.listPrimaryTermsInRemoteAsync(new LatchedActionListener<>(new ActionListener<>() { |
| 435 | + @Override |
| 436 | + public void onResponse(Set<Long> primaryTermsInRemote) { |
| 437 | + Optional<Long> minPrimaryTerm = primaryTermsInRemote.stream().min(Long::compareTo); |
| 438 | + minPrimaryTerm.ifPresent(minPrimaryTermInRemote::set); |
| 439 | + } |
| 440 | + |
| 441 | + @Override |
| 442 | + public void onFailure(Exception e) { |
| 443 | + logger.error("Exception while fetching min primary term from remote translog", e); |
| 444 | + } |
| 445 | + }, latch)); |
| 446 | + |
| 447 | + try { |
| 448 | + if (latch.await(5, TimeUnit.MINUTES) == false) { |
| 449 | + logger.error("Timeout while fetching min primary term from remote translog"); |
| 450 | + } |
| 451 | + } catch (InterruptedException e) { |
| 452 | + logger.error("Exception while fetching min primary term from remote translog", e); |
| 453 | + } |
| 454 | + } |
| 455 | + return minPrimaryTermInRemote.get(); |
| 456 | + } |
| 457 | + |
| 458 | + protected static Tuple<Long, Long> getMinMaxPrimaryTermFromMetadataFile( |
| 459 | + String metadataFile, |
| 460 | + TranslogTransferManager translogTransferManager, |
| 461 | + Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap |
| 462 | + ) throws IOException { |
| 463 | + Tuple<Long, Long> minMaxPrimaryTermFromFileName = TranslogTransferMetadata.getMinMaxPrimaryTermFromFilename(metadataFile); |
| 464 | + if (minMaxPrimaryTermFromFileName != null) { |
| 465 | + return minMaxPrimaryTermFromFileName; |
| 466 | + } else { |
| 467 | + if (oldFormatMetadataFilePrimaryTermMap.containsKey(metadataFile)) { |
| 468 | + return oldFormatMetadataFilePrimaryTermMap.get(metadataFile); |
| 469 | + } else { |
| 470 | + TranslogTransferMetadata metadata = translogTransferManager.readMetadata(metadataFile); |
| 471 | + long maxPrimaryTem = TranslogTransferMetadata.getPrimaryTermFromFileName(metadataFile); |
| 472 | + long minPrimaryTem = -1; |
| 473 | + if (metadata.getGenerationToPrimaryTermMapper() != null |
| 474 | + && metadata.getGenerationToPrimaryTermMapper().values().isEmpty() == false) { |
| 475 | + Optional<Long> primaryTerm = metadata.getGenerationToPrimaryTermMapper() |
| 476 | + .values() |
| 477 | + .stream() |
| 478 | + .map(s -> Long.parseLong(s)) |
| 479 | + .min(Long::compareTo); |
| 480 | + if (primaryTerm.isPresent()) { |
| 481 | + minPrimaryTem = primaryTerm.get(); |
| 482 | + } |
| 483 | + } |
| 484 | + Tuple<Long, Long> minMaxPrimaryTermTuple = new Tuple<>(minPrimaryTem, maxPrimaryTem); |
| 485 | + oldFormatMetadataFilePrimaryTermMap.put(metadataFile, minMaxPrimaryTermTuple); |
| 486 | + return minMaxPrimaryTermTuple; |
| 487 | + } |
| 488 | + } |
| 489 | + } |
| 490 | + |
| 491 | + public static void cleanup(TranslogTransferManager translogTransferManager) throws IOException { |
| 492 | + ActionListener<List<BlobMetadata>> listMetadataFilesListener = new ActionListener<>() { |
| 493 | + @Override |
| 494 | + public void onResponse(List<BlobMetadata> blobMetadata) { |
| 495 | + List<String> metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList()); |
| 496 | + |
| 497 | + try { |
| 498 | + if (metadataFiles.isEmpty()) { |
| 499 | + staticLogger.debug("No stale translog metadata files found"); |
| 500 | + return; |
| 501 | + } |
| 502 | + List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles, new HashMap<>(), staticLogger); |
| 503 | + if (metadataFilesToBeDeleted.isEmpty()) { |
| 504 | + staticLogger.debug("No metadata files to delete"); |
| 505 | + return; |
| 506 | + } |
| 507 | + staticLogger.debug(() -> "metadataFilesToBeDeleted = " + metadataFilesToBeDeleted); |
| 508 | + |
| 509 | + // For all the files that we are keeping, fetch min and max generations |
| 510 | + List<String> metadataFilesNotToBeDeleted = new ArrayList<>(metadataFiles); |
| 511 | + metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted); |
| 512 | + staticLogger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted); |
| 513 | + |
| 514 | + // Delete stale metadata files |
| 515 | + translogTransferManager.deleteMetadataFilesAsync( |
| 516 | + metadataFilesToBeDeleted, |
| 517 | + // Delete stale primary terms |
| 518 | + () -> deleteStaleRemotePrimaryTerms( |
| 519 | + metadataFilesNotToBeDeleted, |
| 520 | + translogTransferManager, |
| 521 | + new HashMap<>(), |
| 522 | + new AtomicLong(Long.MAX_VALUE), |
| 523 | + staticLogger |
| 524 | + ) |
| 525 | + ); |
| 526 | + } catch (Exception e) { |
| 527 | + staticLogger.error("Exception while cleaning up metadata and primary terms", e); |
| 528 | + } |
| 529 | + } |
| 530 | + |
| 531 | + @Override |
| 532 | + public void onFailure(Exception e) { |
| 533 | + staticLogger.error("Exception while cleaning up metadata and primary terms", e); |
| 534 | + } |
| 535 | + }; |
| 536 | + translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener); |
| 537 | + } |
377 | 538 | } |
0 commit comments