-
Notifications
You must be signed in to change notification settings - Fork 973
Fix Kyuubi OOM bug when multiple batch jobs are submitted at once in large amount #7227
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
1cbeb7b
de98cf8
783954d
b952a15
eb34301
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -181,36 +181,51 @@ class KyuubiRestFrontendService(override val serverable: Serverable) | |
| @VisibleForTesting | ||
| private[kyuubi] def recoverBatchSessions(): Unit = withBatchRecoveryLockRequired { | ||
| val recoveryNumThreads = conf.get(METADATA_RECOVERY_THREADS) | ||
| val recoveryBatchSize: Int = conf.get(BATCH_SESSIONS_RECOVERY_SIZE) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about directly use
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. its default values is only 10 by defaults
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we want greater concurrency, we should be able to increase this value. |
||
| val batchRecoveryExecutor = | ||
| ThreadUtils.newDaemonFixedThreadPool(recoveryNumThreads, "batch-recovery-executor") | ||
| try { | ||
| val batchSessionsToRecover = sessionManager.getBatchSessionsToRecover(connectionUrl) | ||
| val pendingRecoveryTasksCount = new AtomicInteger(0) | ||
| val tasks = batchSessionsToRecover.flatMap { batchSession => | ||
| val batchId = batchSession.batchJobSubmissionOp.batchId | ||
| try { | ||
| val task: Future[Unit] = batchRecoveryExecutor.submit(() => | ||
| Utils.tryLogNonFatalError(sessionManager.openBatchSession(batchSession))) | ||
| Some(task -> batchId) | ||
| } catch { | ||
| case e: Throwable => | ||
| error(s"Error while submitting batch[$batchId] for recovery", e) | ||
| None | ||
| } | ||
| } | ||
| val offset: Int = 0 | ||
| val shouldFetchRemainingBatchSessions: Boolean = true | ||
| val totalBatchRecovered: Int = 0 | ||
| while(shouldFetchRemainingBatchSessions) { | ||
| val batchSessionsToRecover = sessionManager.getBatchSessionsToRecover(connectionUrl, offset, recoveryBatchSize) | ||
| if(batchSessionsToRecover.length > 0){ | ||
| val pendingRecoveryTasksCount = new AtomicInteger(0) | ||
| val tasks = batchSessionsToRecover.flatMap { batchSession => | ||
| val batchId = batchSession.batchJobSubmissionOp.batchId | ||
| try { | ||
| val task: Future[Unit] = batchRecoveryExecutor.submit(() => | ||
| Utils.tryLogNonFatalError(sessionManager.openBatchSession(batchSession))) | ||
| Some(task -> batchId) | ||
| } catch { | ||
| case e: Throwable => | ||
| error(s"Error while submitting batch[$batchId] for recovery", e) | ||
| None | ||
| } | ||
| } | ||
|
|
||
| pendingRecoveryTasksCount.addAndGet(tasks.size) | ||
| pendingRecoveryTasksCount.addAndGet(tasks.size) | ||
|
|
||
| tasks.foreach { case (task, batchId) => | ||
| try { | ||
| task.get() | ||
| } catch { | ||
| case e: Throwable => | ||
| error(s"Error while recovering batch[$batchId]", e) | ||
| } finally { | ||
| val pendingTasks = pendingRecoveryTasksCount.decrementAndGet() | ||
| info(s"Batch[$batchId] recovery task terminated, current pending tasks $pendingTasks") | ||
| tasks.foreach { case (task, batchId) => | ||
| try { | ||
| task.get() | ||
| } catch { | ||
| case e: Throwable => | ||
| error(s"Error while recovering batch[$batchId]", e) | ||
| } finally { | ||
| val pendingTasks = pendingRecoveryTasksCount.decrementAndGet() | ||
| info(s"Batch[$batchId] recovery task terminated, current pending tasks $pendingTasks") | ||
| } | ||
| } | ||
| totalBatchRecovered += batchSessionsToRecover.length | ||
| offset += batchSessionsToRecover.length | ||
| } | ||
| else { | ||
| shouldFetchRemainingBatchSessions = false | ||
| info(s"No more batches left to recover from metadata store.") | ||
| } | ||
| info(s"Recovered $totalBatchRecovered batches total so far successfully from metadata store.") | ||
| } | ||
| } finally { | ||
| ThreadUtils.shutdown(batchRecoveryExecutor) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Grammatical error in test description. Should be 'reaches timeout' instead of 'reach timeout', and 'should have the associated' instead of 'should have associated'.