- 
                Notifications
    You must be signed in to change notification settings 
- Fork 8
Closed
Description
I am trying to run the following script:
jobscript
#!/bin/bash
#SBATCH --job-name=julia-demo
#SBATCH --time=00:01:00
#SBATCH --nodes=2
#SBATCH --output=log.out
#SBATCH --error=log.err
cd $SCRATCH/temp
julia=$SCRATCH/julia/julia-1.7.0-rc2/bin/julia
srun $julia script.jl
script.jl
using Distributed, SlurmClusterManager
addprocs(SlurmManager())
using Distributed
@show workers()
# Define what id() is
@everywhere id() = (myid(), gethostname())
# Run id() on all nodes
ids = [id(), [@fetchfrom i id() for i in workers()]...]
# Print
println.(ids)
rmprocs.(workers())
This results in the following error:
$ cat log.err
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:322 [inlined]
 [2] addprocs_locked(ERROR: manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
   @ Distributed LoadError: /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:504
 [3] addprocs_locked
   @ /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:454 [inlined]
 [4] addprocs(manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
   @ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:447
 [5] addprocs(manager::SlurmManager)
   @ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:441
 [6] top-level scope
   @ /scratch/jb6888/temp/script.jl:2
    nested task error: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:322 [inlined]
 [2] TaskFailedException
    Stacktrace:
     [1] wait
       @ ./task.jl:322 [inlined]
     [2] launch(manager::SlurmManager, params::Dict{Symbol, Any}, instances_arr::Vector{WorkerConfig}, c::Condition)
       @ SlurmClusterManager /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:75
     [3] (::Distributed.var"#39#42"{SlurmManager, Condition, Vector{WorkerConfig}, Dict{Symbol, Any}})()
       @ Distributed ./task.jl:411
    
        nested task error: could not parse 9196#10.0.3.111
        Stacktrace:
         [1] error(s::String)
           @ Base ./error.jl:33
         [2] macro expansion
           @ /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:60 [inlined]
         [3] (::SlurmClusterManager.var"#3#8"{SlurmManager, Vector{WorkerConfig}, Condition})()
           @ SlurmClusterManager ./task.jl:411
in expression starting at /scratch/jb6888/temp/script.jl:2
addprocs_locked(manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
   @ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:504
 [3] addprocs_locked
   @ /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:454 [inlined]
 [4] addprocs(manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
   @ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:447
 [5] addprocs(manager::SlurmManager)
   @ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:441
 [6] top-level scope
   @ /scratch/jb6888/temp/script.jl:2
    nested task error: TaskFailedException
    Stacktrace:
     [1] wait
       @ ./task.jl:322 [inlined]
     [2] launch(manager::SlurmManager, params::Dict{Symbol, Any}, instances_arr::Vector{WorkerConfig}, c::Condition)
       @ SlurmClusterManager /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:75
     [3] (::Distributed.var"#39#42"{SlurmManager, Condition, Vector{WorkerConfig}, Dict{Symbol, Any}})()
       @ Distributed ./task.jl:411
    
        nested task error: could not parse 9340#10.0.3.111
        Stacktrace:
         [1] error(s::String)
           @ Base ./error.jl:33
         [2] macro expansion
           @ /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:60 [inlined]
         [3] (::SlurmClusterManager.var"#3#8"{SlurmManager, Vector{WorkerConfig}, Condition})()
           @ SlurmClusterManager ./task.jl:411
in expression starting at /scratch/jb6888/temp/script.jl:2
srun: error: compute-25-12: task 1: Exited with exit code 1
srun: Terminating job step 2742997.0
slurmstepd: *** STEP 2742997.0 ON compute-25-11 CANCELLED AT 2021-10-28T10:59:29 ***
srun: Job step aborted: Waiting up to 17 seconds for job step to finish.
srun: error: compute-25-11: task 0: Killed
It's possible that I'm misunderstanding the instructions for launching jobs, and I would appreciate some help with this.
Metadata
Metadata
Assignees
Labels
No labels