From 90aba4076dca561ea3be2b00c3b0521a7d9042e8 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Fri, 6 Dec 2024 15:50:04 +0100 Subject: [PATCH 1/2] Enable the SSHManager tests on 32-bit platforms LibSSH.jl should now work on 32-bit. --- test/runtests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 17f2b4f..5eea288 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,8 +8,8 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) cmd = `$test_exename $test_exeflags` -# LibSSH.jl currently only works on 64bit unixes -if Sys.isunix() && Sys.WORD_SIZE == 64 +# LibSSH.jl currently only works on unixes +if Sys.isunix() # Run the SSH tests with a single thread because LibSSH.jl is not thread-safe sshtestfile = joinpath(@__DIR__, "sshmanager.jl") run(addenv(`$cmd $sshtestfile`, "JULIA_NUM_THREADS" => "1")) From 3134c5aebaf16ce2884a958abee8f0d840243b58 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Fri, 6 Dec 2024 22:12:27 +0100 Subject: [PATCH 2/2] Replace a timeout task with timedwait() According to a stacktrace from a hung CI job this task was causing the process to hang before exiting: ```julia InterruptException() _jl_mutex_unlock at C:/workdir/src\threading.c:1012 jl_mutex_unlock at C:/workdir/src\julia_locks.h:80 [inlined] ijl_task_get_next at C:/workdir/src\scheduler.c:458 poptask at .\task.jl:1163 wait at .\task.jl:1172 task_done_hook at .\task.jl:839 jfptr_task_done_hook_98752.1 at C:\hostedtoolcache\windows\julia\nightly\x64\lib\julia\sys.dll (unknown line) jl_apply at C:/workdir/src\julia.h:2233 [inlined] jl_finish_task at C:/workdir/src\task.c:338 start_task at C:/workdir/src\task.c:1274 From worker 82: fatal: error thrown and no exception handler available.Unhandled Task ERROR: InterruptException: Stacktrace: [1] poptask(W::Base.IntrusiveLinkedListSynchronized{Task}) @ Base .\task.jl:1163 [2] wait() @ Base .\task.jl:1172 [3] wait(c::Base.GenericCondition{ReentrantLock}; first::Bool) @ Base .\condition.jl:141 [4] wait @ .\condition.jl:136 [inlined] [5] put_buffered(c::Channel{Any}, v::Int64) @ Base .\channels.jl:420 [6] put!(c::Channel{Any}, v::Int64) @ Base .\channels.jl:398 [7] put!(rv::DistributedNext.RemoteValue, args::Int64) @ DistributedNext D:\a\DistributedNext.jl\DistributedNext.jl\src\remotecall.jl:703 [8] (::DistributedNext.var"#create_worker##11#create_worker##12"{DistributedNext.RemoteValue, Float64})() @ DistributedNext D:\a\DistributedNext.jl\DistributedNext.jl\src\cluster.jl:721 ``` Replaced it with a call to `timedwait()`, which has the advantage of being a lot simpler than an extra task. --- docs/src/_changelog.md | 5 +++++ src/cluster.jl | 12 ++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md index 75c9ee1..56d586e 100644 --- a/docs/src/_changelog.md +++ b/docs/src/_changelog.md @@ -7,6 +7,11 @@ CurrentModule = DistributedNext This documents notable changes in DistributedNext.jl. The format is based on [Keep a Changelog](https://keepachangelog.com). +## Unreleased + +### Fixed +- Fixed a cause of potential hangs when exiting the process ([#16]). + ## [v1.0.0] - 2024-12-02 ### Added diff --git a/src/cluster.jl b/src/cluster.jl index 958dc01..0a74dcc 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -712,17 +712,9 @@ function create_worker(manager, wconfig) send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message) errormonitor(@async manage(w.manager, w.id, w.config, :register)) + # wait for rr_ntfy_join with timeout - timedout = false - errormonitor( - @async begin - sleep($timeout) - timedout = true - put!(rr_ntfy_join, 1) - end - ) - wait(rr_ntfy_join) - if timedout + if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out error("worker did not connect within $timeout seconds") end lock(client_refs) do