From 2b8fe1cdba39a9fe9874417e536f0a7b8c1e0cf7 Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 11:16:36 +0100 Subject: [PATCH 1/9] Improve Slurm support --- src/Scans.jl | 73 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/src/Scans.jl b/src/Scans.jl index a03d7a63..1b56494d 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -80,9 +80,18 @@ struct CondorExec <: AbstractExec end """ - SlurmExec(scriptfile, ncores) + SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1) -Execution mode which submits a scan to an slurm queue system claiming `ncores` cores. +Execution mode which submits a scan to a Slurm queue system claiming `ncores` cores. + +# Keyword arguments +- `memory::String`: Memory per task, e.g. `"24G"`. Sets `#SBATCH --mem` and automatically + derives `--heap-size-hint` (at 80% of `memory`) for the Julia process. +- `project::String`: Path to a Julia project environment. Defaults to the currently active + project (`dirname(Base.active_project())`). Pass `""` to omit the `--project` flag. +- `nthreads::Int`: Number of threads per task (default `1`). Sets `#SBATCH --cpus-per-task` + and exports `JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, `OPENBLAS_NUM_THREADS`, and + `MKL_NUM_THREADS` in the job script. !!! note `scriptfile` must **always** be `@__FILE__` @@ -90,6 +99,13 @@ Execution mode which submits a scan to an slurm queue system claiming `ncores` c struct SlurmExec <: AbstractExec scriptfile::String ncores::Int + memory::String + project::String + nthreads::Int +end + +function SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1) + SlurmExec(scriptfile, ncores, memory, project, nthreads) end """ @@ -440,25 +456,70 @@ function _runscan(f, scan::Scan{QueueExec}) end end +""" + _parse_memory_for_heap_hint(memory::String) -> String + +Derive a `--heap-size-hint` value at ~80% of the Slurm `--mem` value. 
+Supports suffixes `K`, `M`, `G`, `T` (case-insensitive). Returns an empty +string if `memory` cannot be parsed. +""" +function _parse_memory_for_heap_hint(memory::String) + m = match(r"^(\d+)\s*([KMGT]?)$"i, strip(memory)) + isnothing(m) && return "" + value = parse(Int, m.captures[1]) + unit = uppercase(m.captures[2]) + hint = floor(Int, value * 0.8) + hint < 1 && return "" + return "$(hint)$(unit)" +end + function runscan(f, scan::Scan{SlurmExec}) # make submission file for slurm + cmd = split(string(Base.julia_cmd()))[1] + julia = strip(cmd, ['`', '\'']) script = scan.exec.scriptfile dir = dirname(script) cores = scan.exec.ncores + nthreads = scan.exec.nthreads name = scan.name @info "Submitting slurm job for $script running on $cores cores." - # Adding the --queue command-line argument below means that when running the Condor job, + # Adding the --queue command-line argument below means that when running the Slurm job, # the SlurmExec is ignored even if explicitly defined inside the script. lines = [ "#!/bin/bash", "#SBATCH --ntasks=1", - "#SBATCH --cpus-per-task=1", + "#SBATCH --cpus-per-task=$nthreads", "#SBATCH -o %x_%a.stdout", "#SBATCH -e %x_%a.stderr", "#SBATCH --array=1-$cores", - "#SBATCH --chdir $dir", - "julia $(basename(script)) --queue" + "#SBATCH --chdir \"$dir\"", ] + if !isempty(scan.exec.memory) + push!(lines, "#SBATCH --mem=$(scan.exec.memory)") + end + # Prevent Julia startup crash from restrictive system ulimit on virtual memory. + # This does NOT bypass Slurm's cgroup --mem limit (which tracks RSS, not VIRT). 
+ push!(lines, "ulimit -v unlimited") + # Pin threads: prevent over-subscription when running concurrent array tasks + append!(lines, [ + "export JULIA_NUM_THREADS=$nthreads", + "export OMP_NUM_THREADS=$nthreads", + "export OPENBLAS_NUM_THREADS=$nthreads", + "export MKL_NUM_THREADS=$nthreads", + ]) + # Build Julia command (quote paths to handle spaces) + juliacmd = "\"$julia\"" + if !isempty(scan.exec.memory) + heaphint = _parse_memory_for_heap_hint(scan.exec.memory) + if !isempty(heaphint) + juliacmd *= " --heap-size-hint=$heaphint" + end + end + if !isempty(scan.exec.project) + juliacmd *= " --project=\"$(scan.exec.project)\"" + end + juliacmd *= " $(basename(script)) --queue" + push!(lines, juliacmd) subfile = joinpath(dir, "$name.sh") @info "Writing job file to $subfile..." open(subfile, "w") do file From 1e2dac1c011426f072d724598f54d3419bf2e51d Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 11:23:11 +0100 Subject: [PATCH 2/9] improve slurm docs --- docs/src/scans.md | 114 ++++++++++++++++++++++++++++++++++++++++++++++ src/Scans.jl | 39 +++++++++++++--- 2 files changed, 147 insertions(+), 6 deletions(-) diff --git a/docs/src/scans.md b/docs/src/scans.md index 0b7d6ff2..75c37bd4 100644 --- a/docs/src/scans.md +++ b/docs/src/scans.md @@ -68,6 +68,7 @@ Scans can be executed in several ways, which are defined via the various subtype - [`BatchExec`](@ref Scans.BatchExec): divide the scan into batches and run a specific batch (can be used to balance load between processes) - [`QueueExec`](@ref Scans.QueueExec): create a "queue file" which is used to balance load between several processes. This can be executed from multiple processes simultaneously. Alternatively, `QueueExec` can be made to spawn several subprocesses on the local machine which then use the queueing system to balance load between them. 
- [`CondorExec`](@ref Scans.CondorExec): create a submission file (aka job file) for an HTCondor batch system running on the current machine and submit it, claiming a specified number of nodes, to execute the scan using a `QueueExec`. +- [`SlurmExec`](@ref Scans.SlurmExec): create and submit a Slurm array job. Supports per-task memory limits (`--mem`), automatic `--heap-size-hint`, thread pinning, Julia project auto-detection, and `ulimit -v unlimited` for safe Julia startup. See [Execution on Slurm](#execution-on-slurm) for details. - [`SSHExec`](@ref Scans.SSHExec): use one of the other `AbstractExec` types but first transfer the file to a remote host via SSH and then execute it. (**Note**: the remote machine must have Julia and Luna available with the same versions of both, and Julia must be available in a shell via the `julia` command.) For more details on how to set up execution over SSH, see [below](#execution-over-ssh). ### Command-line arguments @@ -156,6 +157,119 @@ julia> HDF5.h5open("pressure_energy_example_collected.h5", "r") do fi ``` Importantly, in our example here this file is less than one megabyte in size, whereas the `scanoutput` folder totals over 600 megabytes. To store the statistics as well, `stats` can be given as a special keyword argument to `scansave`. Because the arrays are not always the same size (see above), in the file these are stored in an array which is large enough to fit the longest and padded with `NaN`s. The number of actual statisics points available for each simulation is then stored in a special dataset `valid_length`. +## Execution on Slurm +[`SlurmExec`](@ref Scans.SlurmExec) creates and submits a Slurm array job, where each array task processes scan points via a file-based queue (internally using [`QueueExec`](@ref Scans.QueueExec)). This means the array tasks automatically balance load among themselves -- if one simulation finishes early, that task picks up the next unprocessed scan point. 
+ +### Basic usage +```julia +using Luna + +scan = Scan("energy_scan", Scans.SlurmExec(@__FILE__, 8); energy=energies) +addvariable!(scan, :pressure, pressures) + +outputdir = joinpath(@__DIR__, "scanoutput") +runscan(scan) do scanidx, energy, pressure + prop_capillary(125e-6, 3, :He, pressure; λ0=800e-9, τfwhm=10e-15, energy, + scan, scanidx, filepath=outputdir) +end +``` +Here, `8` is the number of Slurm array tasks (not the total number of scan points). The queue system ensures all scan points are processed even if there are more points than tasks. + +### Memory management +When running many concurrent simulations, memory can be a concern. `SlurmExec` provides several features to help: + +```julia +Scans.SlurmExec(@__FILE__, 8; memory="24G") +``` + +Setting `memory` does three things: +1. Adds `#SBATCH --mem=24G` to the job script, so Slurm enforces a hard memory limit per task via cgroups. +2. Automatically sets Julia's `--heap-size-hint=19G` (80% of `--mem`), which tells the garbage collector to be more aggressive before reaching the limit. +3. The generated script also includes `ulimit -v unlimited`, which prevents Julia from crashing at startup due to restrictive virtual memory limits. This is safe because it only affects the virtual address space limit (which Julia needs to be large), **not** the physical RAM limit enforced by Slurm's cgroups. + +The `memory` string supports `K`, `M`, `G`, and `T` suffixes, matching Slurm's `--mem` format. + +### Thread pinning +By default, `SlurmExec` sets `nthreads=1` and exports the following environment variables in the job script: +```bash +export JULIA_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +export MKL_NUM_THREADS=1 +``` +This prevents over-subscription when many array tasks run concurrently on the same node. With `JULIA_NUM_THREADS=1`, FFTW also automatically uses a single thread (via Luna's `Utils.FFTWthreads()`). 
+ +If your simulations benefit from multi-threading, increase `nthreads`: +```julia +Scans.SlurmExec(@__FILE__, 8; nthreads=4, memory="24G") +``` +This sets `#SBATCH --cpus-per-task=4` and all thread environment variables to `4`. + +### Julia project environment +By default, `SlurmExec` automatically detects the active Julia project environment (via `Base.active_project()`) and passes `--project=` to the Julia command in the job script. This ensures that Slurm workers use the same package versions as the submission script. + +```julia +# Uses current project automatically (the default): +Scans.SlurmExec(@__FILE__, 8) + +# Explicit project path: +Scans.SlurmExec(@__FILE__, 8; project="/home/user/MyProject") + +# Omit --project flag (use default Julia environment): +Scans.SlurmExec(@__FILE__, 8; project="") +``` + +### Julia binary path +The generated job script uses the full path to the currently running Julia binary (obtained from `Base.julia_cmd()`), rather than relying on `julia` being on `PATH`. This ensures the same Julia version is used on compute nodes. All paths (Julia binary, working directory, project path) are quoted to handle spaces. 
+ +### Full example +A complete example with all options: +```julia +using Luna + +energies = collect(range(50e-6, 200e-6; length=64)) +pressures = collect(0.6:0.4:1.4) + +exec = Scans.SlurmExec(@__FILE__, 16; + memory="24G", # 24 GB per task, GC hint at ~19 GB + nthreads=1, # single-threaded (default) + project=".") # use current directory as project + +scan = Scan("pressure_energy", exec; energy=energies) +addvariable!(scan, :pressure, pressures) + +runscan(scan) do scanidx, energy, pressure + prop_capillary(125e-6, 3, :He, pressure; λ0=800e-9, τfwhm=10e-15, energy, + scan, scanidx, filepath=joinpath(@__DIR__, "scanoutput")) +end +``` + +The generated Slurm job script will look like: +```bash +#!/bin/bash +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH -o %x_%a.stdout +#SBATCH -e %x_%a.stderr +#SBATCH --array=1-16 +#SBATCH --chdir "/path/to/script/directory" +#SBATCH --mem=24G +ulimit -v unlimited +export JULIA_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +export MKL_NUM_THREADS=1 +"/path/to/julia" --heap-size-hint=19G --project="." script.jl --queue +``` + +### Combining with SSHExec +`SlurmExec` can be wrapped in [`SSHExec`](@ref Scans.SSHExec) to transfer the script to a remote Slurm cluster and submit it there: +```julia +exec = Scans.SlurmExec(@__FILE__, 16; memory="24G") +ssh_exec = Scans.SSHExec(exec, "cluster.example.com", "scans") +scan = Scan("remote_scan", ssh_exec; energy=energies) +``` + ## Execution over SSH Setup steps required: - On the remote machine, add Julia to your path upon loading even over SSH: add `export PATH=/opt/julia-1.5.1/bin:$PATH` or similar to your `.bashrc` file **above** the usual check for interactive running. 
diff --git a/src/Scans.jl b/src/Scans.jl index 1b56494d..366f0211 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -82,19 +82,46 @@ end """ SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1) -Execution mode which submits a scan to a Slurm queue system claiming `ncores` cores. +Execution mode which submits a scan to a Slurm queue system as an array job with `ncores` +array tasks. # Keyword arguments - `memory::String`: Memory per task, e.g. `"24G"`. Sets `#SBATCH --mem` and automatically - derives `--heap-size-hint` (at 80% of `memory`) for the Julia process. + derives `--heap-size-hint` (at 80% of `memory`) for the Julia process. Supports suffixes + `K`, `M`, `G`, `T`. - `project::String`: Path to a Julia project environment. Defaults to the currently active - project (`dirname(Base.active_project())`). Pass `""` to omit the `--project` flag. -- `nthreads::Int`: Number of threads per task (default `1`). Sets `#SBATCH --cpus-per-task` - and exports `JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, `OPENBLAS_NUM_THREADS`, and - `MKL_NUM_THREADS` in the job script. + project (`dirname(Base.active_project())`), so Slurm workers use the same environment as + the submission script. Pass `""` to omit the `--project` flag. +- `nthreads::Int`: Number of threads per array task (default `1`). Sets + `#SBATCH --cpus-per-task` and exports `JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, + `OPENBLAS_NUM_THREADS`, and `MKL_NUM_THREADS` in the job script. The default of `1` + prevents over-subscription when many array tasks run concurrently on a shared node. + +# Generated job script +The generated SBATCH script includes: +- `ulimit -v unlimited` to prevent Julia startup crashes from restrictive virtual memory + limits (this does **not** bypass Slurm's cgroup `--mem` enforcement on physical RAM). +- Thread-pinning environment variable exports (`JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, + `OPENBLAS_NUM_THREADS`, `MKL_NUM_THREADS`). 
+- The full path to the current Julia binary (from `Base.julia_cmd()`) rather than bare + `julia`, ensuring the same Julia version is used on compute nodes. +- All paths (Julia binary, `--chdir` directory, `--project` path) are quoted to handle + spaces in paths. +- Each array task runs the script with `--queue`, so internally a [`QueueExec`](@ref) is + used for file-based load balancing across array tasks. !!! note `scriptfile` must **always** be `@__FILE__` + +# Examples +```julia +# Minimal: uses current project, 1 thread per task, no memory limit +scan = Scan("my_scan", SlurmExec(@__FILE__, 8); energy=energies) + +# Full: 24 GB per task, custom project, 2 threads +scan = Scan("my_scan", SlurmExec(@__FILE__, 8; memory="24G", project="/path/to/env", nthreads=2); + energy=energies) +``` """ struct SlurmExec <: AbstractExec scriptfile::String From 63e6134611325f5baa0d77f0607d427b91148c07 Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 11:52:36 +0100 Subject: [PATCH 3/9] add Slurm tests (just script) --- src/Scans.jl | 43 ++++++++++++------- test/test_scans.jl | 101 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 16 deletions(-) diff --git a/src/Scans.jl b/src/Scans.jl index 366f0211..52193f1c 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -500,18 +500,19 @@ function _parse_memory_for_heap_hint(memory::String) return "$(hint)$(unit)" end -function runscan(f, scan::Scan{SlurmExec}) - # make submission file for slurm +""" + _slurm_script_lines(exec::SlurmExec) -> Vector{String} + +Build the lines of the SBATCH job script for a `SlurmExec`. This is separated from +`runscan` to allow testing the generated script content without a Slurm installation. 
+""" +function _slurm_script_lines(exec::SlurmExec) cmd = split(string(Base.julia_cmd()))[1] julia = strip(cmd, ['`', '\'']) - script = scan.exec.scriptfile + script = exec.scriptfile dir = dirname(script) - cores = scan.exec.ncores - nthreads = scan.exec.nthreads - name = scan.name - @info "Submitting slurm job for $script running on $cores cores." - # Adding the --queue command-line argument below means that when running the Slurm job, - # the SlurmExec is ignored even if explicitly defined inside the script. + cores = exec.ncores + nthreads = exec.nthreads lines = [ "#!/bin/bash", "#SBATCH --ntasks=1", @@ -521,8 +522,8 @@ function runscan(f, scan::Scan{SlurmExec}) "#SBATCH --array=1-$cores", "#SBATCH --chdir \"$dir\"", ] - if !isempty(scan.exec.memory) - push!(lines, "#SBATCH --mem=$(scan.exec.memory)") + if !isempty(exec.memory) + push!(lines, "#SBATCH --mem=$(exec.memory)") end # Prevent Julia startup crash from restrictive system ulimit on virtual memory. # This does NOT bypass Slurm's cgroup --mem limit (which tracks RSS, not VIRT). @@ -536,18 +537,28 @@ function runscan(f, scan::Scan{SlurmExec}) ]) # Build Julia command (quote paths to handle spaces) juliacmd = "\"$julia\"" - if !isempty(scan.exec.memory) - heaphint = _parse_memory_for_heap_hint(scan.exec.memory) + if !isempty(exec.memory) + heaphint = _parse_memory_for_heap_hint(exec.memory) if !isempty(heaphint) juliacmd *= " --heap-size-hint=$heaphint" end end - if !isempty(scan.exec.project) - juliacmd *= " --project=\"$(scan.exec.project)\"" + if !isempty(exec.project) + juliacmd *= " --project=\"$(exec.project)\"" end juliacmd *= " $(basename(script)) --queue" push!(lines, juliacmd) - subfile = joinpath(dir, "$name.sh") + return lines +end + +function runscan(f, scan::Scan{SlurmExec}) + script = scan.exec.scriptfile + name = scan.name + @info "Submitting slurm job for $script running on $(scan.exec.ncores) cores." 
+ # Adding the --queue command-line argument below means that when running the Slurm job, + # the SlurmExec is ignored even if explicitly defined inside the script. + lines = _slurm_script_lines(scan.exec) + subfile = joinpath(dirname(script), "$name.sh") @info "Writing job file to $subfile..." open(subfile, "w") do file for l in lines diff --git a/test/test_scans.jl b/test/test_scans.jl index c4583c5a..0cbe88af 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -277,4 +277,105 @@ end @test length(readdir(td)) == length(energies) @test all(startswith.(readdir(td), "newname")) end +end + +## +@testset "SlurmExec construction" begin + # Backward compatibility: positional args only + ex = Scans.SlurmExec("/tmp/test.jl", 4) + @test ex.scriptfile == "/tmp/test.jl" + @test ex.ncores == 4 + @test ex.memory == "" + @test ex.nthreads == 1 + # project defaults to current active project + @test ex.project == dirname(Base.active_project()) + + # Full keyword args + ex2 = Scans.SlurmExec("/tmp/test.jl", 8; memory="24G", project="/custom/env", nthreads=2) + @test ex2.memory == "24G" + @test ex2.project == "/custom/env" + @test ex2.nthreads == 2 + + # Opt out of project + ex3 = Scans.SlurmExec("/tmp/test.jl", 4; project="") + @test ex3.project == "" + + # SSHExec wrapping + ssh = Scans.SSHExec(ex2, "myhost", "scans") + @test ssh.localexec === ex2 +end + +## +@testset "heap-size-hint parsing" begin + @test Scans._parse_memory_for_heap_hint("24G") == "19G" + @test Scans._parse_memory_for_heap_hint("10M") == "8M" + @test Scans._parse_memory_for_heap_hint("100G") == "80G" + @test Scans._parse_memory_for_heap_hint("2T") == "1T" + @test Scans._parse_memory_for_heap_hint("500") == "400" + @test Scans._parse_memory_for_heap_hint("500K") == "400K" + # Edge cases + @test Scans._parse_memory_for_heap_hint("1T") == "" # floor(0.8) = 0 < 1 + @test Scans._parse_memory_for_heap_hint("") == "" + @test Scans._parse_memory_for_heap_hint("abc") == "" + @test 
Scans._parse_memory_for_heap_hint("12.5G") == "" # no float support + # Case insensitive + @test Scans._parse_memory_for_heap_hint("24g") == "19G" + @test Scans._parse_memory_for_heap_hint("10m") == "8M" +end + +## +@testset "Slurm script generation" begin + # Full options: memory, project, nthreads + ex = Scans.SlurmExec("/path/with spaces/test.jl", 16; + memory="24G", project="/my project/env", nthreads=2) + lines = Scans._slurm_script_lines(ex) + script = join(lines, "\n") + + # Shebang + @test lines[1] == "#!/bin/bash" + # SBATCH directives + @test any(l -> l == "#SBATCH --ntasks=1", lines) + @test any(l -> l == "#SBATCH --cpus-per-task=2", lines) + @test any(l -> l == "#SBATCH --array=1-16", lines) + @test any(l -> l == "#SBATCH --mem=24G", lines) + # Quoted chdir for spaces + @test any(l -> l == "#SBATCH --chdir \"/path/with spaces\"", lines) + # ulimit + @test any(l -> l == "ulimit -v unlimited", lines) + # Thread pinning exports + @test any(l -> l == "export JULIA_NUM_THREADS=2", lines) + @test any(l -> l == "export OMP_NUM_THREADS=2", lines) + @test any(l -> l == "export OPENBLAS_NUM_THREADS=2", lines) + @test any(l -> l == "export MKL_NUM_THREADS=2", lines) + # Julia command (last line) + juliacmd = lines[end] + @test startswith(juliacmd, "\"") # Julia binary is quoted + @test occursin("--heap-size-hint=19G", juliacmd) + @test occursin("--project=\"/my project/env\"", juliacmd) + @test endswith(juliacmd, "test.jl --queue") + + # Minimal options: no memory, empty project + ex_min = Scans.SlurmExec("/tmp/simple.jl", 4; memory="", project="") + lines_min = Scans._slurm_script_lines(ex_min) + script_min = join(lines_min, "\n") + # No --mem line + @test !any(l -> startswith(l, "#SBATCH --mem"), lines_min) + # No --heap-size-hint + @test !occursin("--heap-size-hint", lines_min[end]) + # No --project + @test !occursin("--project", lines_min[end]) + # cpus-per-task defaults to 1 + @test any(l -> l == "#SBATCH --cpus-per-task=1", lines_min) + # Thread exports 
default to 1 + @test any(l -> l == "export JULIA_NUM_THREADS=1", lines_min) + # ulimit is always present + @test any(l -> l == "ulimit -v unlimited", lines_min) + # Julia binary is still quoted + @test startswith(lines_min[end], "\"") + @test endswith(lines_min[end], "simple.jl --queue") + + # ulimit comes before exports (correct ordering) + idx_ulimit = findfirst(l -> l == "ulimit -v unlimited", lines) + idx_export = findfirst(l -> startswith(l, "export JULIA_NUM_THREADS"), lines) + @test idx_ulimit < idx_export end \ No newline at end of file From 7076caede2037d5ec12437ce32c614160bf3b694 Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 12:04:00 +0100 Subject: [PATCH 4/9] Add working dir option for slurm --- docs/src/scans.md | 17 ++++++++++++++-- src/Scans.jl | 49 ++++++++++++++++++++++++++++++++++------------ test/test_scans.jl | 31 ++++++++++++++++++++++++----- 3 files changed, 77 insertions(+), 20 deletions(-) diff --git a/docs/src/scans.md b/docs/src/scans.md index 75c37bd4..5a751a66 100644 --- a/docs/src/scans.md +++ b/docs/src/scans.md @@ -222,6 +222,19 @@ Scans.SlurmExec(@__FILE__, 8; project="") ### Julia binary path The generated job script uses the full path to the currently running Julia binary (obtained from `Base.julia_cmd()`), rather than relying on `julia` being on `PATH`. This ensures the same Julia version is used on compute nodes. All paths (Julia binary, working directory, project path) are quoted to handle spaces. +### Working directory +By default, `SlurmExec` creates a subdirectory `<scanname>_slurm` inside the script's directory and places all Slurm-related files there: the generated `.sh` job script, stdout/stderr logs, and the queue file. This keeps the script directory clean when running large scans.
+ +```julia +# Default: job files go into <script_dir>/my_scan_slurm/ +Scans.SlurmExec(@__FILE__, 8) + +# Explicit working directory: +Scans.SlurmExec(@__FILE__, 8; workdir="/tmp/my_slurm_run") +``` + +The `workdir` is automatically created if it does not exist. + ### Full example A complete example with all options: ```julia @@ -244,7 +257,7 @@ runscan(scan) do scanidx, energy, pressure end ``` -The generated Slurm job script will look like: +The generated Slurm job script (written to `pressure_energy_slurm/pressure_energy.sh`) will look like: ```bash #!/bin/bash #SBATCH --ntasks=1 @@ -252,7 +265,7 @@ The generated Slurm job script will look like: #SBATCH -o %x_%a.stdout #SBATCH -e %x_%a.stderr #SBATCH --array=1-16 -#SBATCH --chdir "/path/to/script/directory" +#SBATCH --chdir "/path/to/script/directory/pressure_energy_slurm" #SBATCH --mem=24G ulimit -v unlimited export JULIA_NUM_THREADS=1 diff --git a/src/Scans.jl b/src/Scans.jl index 52193f1c..f3d8b4c0 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -80,7 +80,7 @@ struct CondorExec <: AbstractExec end """ - SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1) + SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1, workdir="") Execution mode which submits a scan to a Slurm queue system as an array job with `ncores` array tasks. @@ -96,6 +96,10 @@ array tasks. `#SBATCH --cpus-per-task` and exports `JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, `OPENBLAS_NUM_THREADS`, and `MKL_NUM_THREADS` in the job script. The default of `1` prevents over-subscription when many array tasks run concurrently on a shared node. +- `workdir::String`: Working directory for the Slurm job. The generated `.sh` script, + stdout/stderr files, and queue file are all placed here. If `""` (the default), a + subdirectory `<scanname>_slurm` is automatically created inside the script's directory. + Pass an explicit path to use a custom directory.
# Generated job script The generated SBATCH script includes: @@ -116,10 +120,12 @@ The generated SBATCH script includes: # Examples ```julia # Minimal: uses current project, 1 thread per task, no memory limit +# Job files go into <script_dir>/my_scan_slurm/ scan = Scan("my_scan", SlurmExec(@__FILE__, 8); energy=energies) -# Full: 24 GB per task, custom project, 2 threads -scan = Scan("my_scan", SlurmExec(@__FILE__, 8; memory="24G", project="/path/to/env", nthreads=2); +# Full: 24 GB per task, custom project, 2 threads, custom workdir +scan = Scan("my_scan", SlurmExec(@__FILE__, 8; memory="24G", project="/path/to/env", + nthreads=2, workdir="/tmp/my_slurm_run"); energy=energies) ``` """ @@ -129,10 +135,11 @@ struct SlurmExec <: AbstractExec memory::String project::String nthreads::Int + workdir::String end -function SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1) - SlurmExec(scriptfile, ncores, memory, project, nthreads) +function SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1, workdir="") + SlurmExec(scriptfile, ncores, memory, project, nthreads, workdir) end """ @@ -501,16 +508,16 @@ function _parse_memory_for_heap_hint(memory::String) end """ - _slurm_script_lines(exec::SlurmExec) -> Vector{String} + _slurm_script_lines(exec::SlurmExec, workdir::String) -> Vector{String} -Build the lines of the SBATCH job script for a `SlurmExec`. This is separated from -`runscan` to allow testing the generated script content without a Slurm installation. +Build the lines of the SBATCH job script for a `SlurmExec` with the given resolved +`workdir`. This is separated from `runscan` to allow testing the generated script content +without a Slurm installation.
""" -function _slurm_script_lines(exec::SlurmExec) +function _slurm_script_lines(exec::SlurmExec, workdir::String) cmd = split(string(Base.julia_cmd()))[1] julia = strip(cmd, ['`', '\'']) script = exec.scriptfile - dir = dirname(script) cores = exec.ncores nthreads = exec.nthreads lines = [ @@ -520,7 +527,7 @@ function _slurm_script_lines(exec::SlurmExec) "#SBATCH -o %x_%a.stdout", "#SBATCH -e %x_%a.stderr", "#SBATCH --array=1-$cores", - "#SBATCH --chdir \"$dir\"", + "#SBATCH --chdir \"$workdir\"", ] if !isempty(exec.memory) push!(lines, "#SBATCH --mem=$(exec.memory)") @@ -551,14 +558,30 @@ function _slurm_script_lines(exec::SlurmExec) return lines end +""" + _resolve_slurm_workdir(exec::SlurmExec, scanname::String) -> String + +Resolve the working directory for a Slurm scan. If `exec.workdir` is empty, returns +`joinpath(dirname(exec.scriptfile), "\$(scanname)_slurm")`; otherwise returns `exec.workdir`. +""" +function _resolve_slurm_workdir(exec::SlurmExec, scanname::String) + if isempty(exec.workdir) + joinpath(dirname(exec.scriptfile), "$(scanname)_slurm") + else + exec.workdir + end +end + function runscan(f, scan::Scan{SlurmExec}) script = scan.exec.scriptfile name = scan.name + workdir = _resolve_slurm_workdir(scan.exec, name) + mkpath(workdir) @info "Submitting slurm job for $script running on $(scan.exec.ncores) cores." # Adding the --queue command-line argument below means that when running the Slurm job, # the SlurmExec is ignored even if explicitly defined inside the script. - lines = _slurm_script_lines(scan.exec) - subfile = joinpath(dirname(script), "$name.sh") + lines = _slurm_script_lines(scan.exec, workdir) + subfile = joinpath(workdir, "$name.sh") @info "Writing job file to $subfile..." 
open(subfile, "w") do file for l in lines diff --git a/test/test_scans.jl b/test/test_scans.jl index 0cbe88af..610ce342 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -287,14 +287,17 @@ end @test ex.ncores == 4 @test ex.memory == "" @test ex.nthreads == 1 + @test ex.workdir == "" # project defaults to current active project @test ex.project == dirname(Base.active_project()) # Full keyword args - ex2 = Scans.SlurmExec("/tmp/test.jl", 8; memory="24G", project="/custom/env", nthreads=2) + ex2 = Scans.SlurmExec("/tmp/test.jl", 8; memory="24G", project="/custom/env", + nthreads=2, workdir="/tmp/mywork") @test ex2.memory == "24G" @test ex2.project == "/custom/env" @test ex2.nthreads == 2 + @test ex2.workdir == "/tmp/mywork" # Opt out of project ex3 = Scans.SlurmExec("/tmp/test.jl", 4; project="") @@ -323,12 +326,28 @@ end @test Scans._parse_memory_for_heap_hint("10m") == "8M" end +## +@testset "Slurm workdir resolution" begin + # Default: auto-subdirectory based on scan name + ex = Scans.SlurmExec("/home/user/scripts/run.jl", 4) + @test Scans._resolve_slurm_workdir(ex, "my_scan") == "/home/user/scripts/my_scan_slurm" + + # Explicit workdir + ex2 = Scans.SlurmExec("/home/user/scripts/run.jl", 4; workdir="/tmp/custom") + @test Scans._resolve_slurm_workdir(ex2, "my_scan") == "/tmp/custom" + + # Spaces in script path + ex3 = Scans.SlurmExec("/path/with spaces/run.jl", 4) + @test Scans._resolve_slurm_workdir(ex3, "test") == "/path/with spaces/test_slurm" +end + ## @testset "Slurm script generation" begin # Full options: memory, project, nthreads ex = Scans.SlurmExec("/path/with spaces/test.jl", 16; memory="24G", project="/my project/env", nthreads=2) - lines = Scans._slurm_script_lines(ex) + workdir = "/path/with spaces/myscan_slurm" + lines = Scans._slurm_script_lines(ex, workdir) script = join(lines, "\n") # Shebang @@ -338,8 +357,8 @@ end @test any(l -> l == "#SBATCH --cpus-per-task=2", lines) @test any(l -> l == "#SBATCH --array=1-16", lines) @test any(l -> l 
== "#SBATCH --mem=24G", lines) - # Quoted chdir for spaces - @test any(l -> l == "#SBATCH --chdir \"/path/with spaces\"", lines) + # Quoted chdir uses workdir, not script dir + @test any(l -> l == "#SBATCH --chdir \"/path/with spaces/myscan_slurm\"", lines) # ulimit @test any(l -> l == "ulimit -v unlimited", lines) # Thread pinning exports @@ -356,7 +375,7 @@ end # Minimal options: no memory, empty project ex_min = Scans.SlurmExec("/tmp/simple.jl", 4; memory="", project="") - lines_min = Scans._slurm_script_lines(ex_min) + lines_min = Scans._slurm_script_lines(ex_min, "/tmp/workdir") script_min = join(lines_min, "\n") # No --mem line @test !any(l -> startswith(l, "#SBATCH --mem"), lines_min) @@ -373,6 +392,8 @@ end # Julia binary is still quoted @test startswith(lines_min[end], "\"") @test endswith(lines_min[end], "simple.jl --queue") + # chdir points to workdir + @test any(l -> l == "#SBATCH --chdir \"/tmp/workdir\"", lines_min) # ulimit comes before exports (correct ordering) idx_ulimit = findfirst(l -> l == "ulimit -v unlimited", lines) From 120cdac786f582b4dd0523783d46f872d1a0e7dc Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 12:15:53 +0100 Subject: [PATCH 5/9] Address copilot review comments --- docs/src/scans.md | 6 +++--- src/Scans.jl | 31 ++++++++++++++++++++++--------- test/test_scans.jl | 29 +++++++++++++++++++++++------ 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/docs/src/scans.md b/docs/src/scans.md index 5a751a66..3897cba6 100644 --- a/docs/src/scans.md +++ b/docs/src/scans.md @@ -187,7 +187,7 @@ Setting `memory` does three things: 2. Automatically sets Julia's `--heap-size-hint=19G` (80% of `--mem`), which tells the garbage collector to be more aggressive before reaching the limit. 3. The generated script also includes `ulimit -v unlimited`, which prevents Julia from crashing at startup due to restrictive virtual memory limits. 
This is safe because it only affects the virtual address space limit (which Julia needs to be large), **not** the physical RAM limit enforced by Slurm's cgroups. -The `memory` string supports `K`, `M`, `G`, and `T` suffixes, matching Slurm's `--mem` format. +The `memory` string supports `K`, `M`, `G`, and `T` suffixes, matching Slurm's `--mem` format. A bare number (e.g. `"24000"`) is treated as megabytes, matching Slurm's default convention. Invalid values (e.g. `"bad"`, `"12.5G"`) will raise an `ArgumentError` at construction time. ### Thread pinning By default, `SlurmExec` sets `nthreads=1` and exports the following environment variables in the job script: @@ -206,7 +206,7 @@ Scans.SlurmExec(@__FILE__, 8; nthreads=4, memory="24G") This sets `#SBATCH --cpus-per-task=4` and all thread environment variables to `4`. ### Julia project environment -By default, `SlurmExec` automatically detects the active Julia project environment (via `Base.active_project()`) and passes `--project=` to the Julia command in the job script. This ensures that Slurm workers use the same package versions as the submission script. +By default, `SlurmExec` automatically detects the active Julia project environment (via `Base.active_project()`) and passes `--project=` to the Julia command in the job script. This ensures that Slurm workers use the same package versions as the submission script. If no project is active (`Base.active_project()` returns `nothing`), the default is `""` and `--project` is omitted. ```julia # Uses current project automatically (the default): @@ -272,7 +272,7 @@ export JULIA_NUM_THREADS=1 export OMP_NUM_THREADS=1 export OPENBLAS_NUM_THREADS=1 export MKL_NUM_THREADS=1 -"/path/to/julia" --heap-size-hint=19G --project="." script.jl --queue +"/path/to/julia" --heap-size-hint=19G --project="." 
"script.jl" --queue ``` ### Combining with SSHExec diff --git a/src/Scans.jl b/src/Scans.jl index f3d8b4c0..0ffdb16f 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -80,7 +80,7 @@ struct CondorExec <: AbstractExec end """ - SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1, workdir="") + SlurmExec(scriptfile, ncores; memory="", project=, nthreads=1, workdir="") Execution mode which submits a scan to a Slurm queue system as an array job with `ncores` array tasks. @@ -88,11 +88,13 @@ array tasks. # Keyword arguments - `memory::String`: Memory per task, e.g. `"24G"`. Sets `#SBATCH --mem` and automatically derives `--heap-size-hint` (at 80% of `memory`) for the Julia process. Supports suffixes - `K`, `M`, `G`, `T`. + `K`, `M`, `G`, `T`. A bare number (e.g. `"24000"`) is treated as megabytes, matching + Slurm's default convention. Must match the format `digits[K|M|G|T]` when non-empty; an + `ArgumentError` is thrown otherwise. - `project::String`: Path to a Julia project environment. Defaults to the currently active - project (`dirname(Base.active_project())`), so Slurm workers use the same environment as - the submission script. Pass `""` to omit the `--project` flag. -- `nthreads::Int`: Number of threads per array task (default `1`). Sets + project (`dirname(Base.active_project())`), or `""` if no project is active. Pass `""` + to omit the `--project` flag. +- `nthreads::Int`: Number of threads per array task (default `1`, must be ≥ 1). Sets `#SBATCH --cpus-per-task` and exports `JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, `OPENBLAS_NUM_THREADS`, and `MKL_NUM_THREADS` in the job script. The default of `1` prevents over-subscription when many array tasks run concurrently on a shared node. 
@@ -138,7 +140,17 @@ struct SlurmExec <: AbstractExec workdir::String end -function SlurmExec(scriptfile, ncores; memory="", project=dirname(Base.active_project()), nthreads=1, workdir="") +function SlurmExec(scriptfile, ncores; + memory="", + project=let ap = Base.active_project() + isnothing(ap) ? "" : dirname(ap) + end, + nthreads=1, + workdir="") + nthreads >= 1 || throw(ArgumentError("`nthreads` must be ≥ 1, got $nthreads")) + if !isempty(memory) && !occursin(r"^\d+\s*[KMGT]?$"i, strip(memory)) + throw(ArgumentError("`memory` must match format \"[K|M|G|T]\", got \"$memory\"")) + end SlurmExec(scriptfile, ncores, memory, project, nthreads, workdir) end @@ -502,6 +514,8 @@ function _parse_memory_for_heap_hint(memory::String) isnothing(m) && return "" value = parse(Int, m.captures[1]) unit = uppercase(m.captures[2]) + # Slurm treats bare numbers as megabytes; mirror that for the heap hint + isempty(unit) && (unit = "M") hint = floor(Int, value * 0.8) hint < 1 && return "" return "$(hint)$(unit)" @@ -515,8 +529,7 @@ Build the lines of the SBATCH job script for a `SlurmExec` with the given resolv without a Slurm installation. 
""" function _slurm_script_lines(exec::SlurmExec, workdir::String) - cmd = split(string(Base.julia_cmd()))[1] - julia = strip(cmd, ['`', '\'']) + julia = first(Base.julia_cmd().exec) script = exec.scriptfile cores = exec.ncores nthreads = exec.nthreads @@ -553,7 +566,7 @@ function _slurm_script_lines(exec::SlurmExec, workdir::String) if !isempty(exec.project) juliacmd *= " --project=\"$(exec.project)\"" end - juliacmd *= " $(basename(script)) --queue" + juliacmd *= " \"$(basename(script))\" --queue" push!(lines, juliacmd) return lines end diff --git a/test/test_scans.jl b/test/test_scans.jl index 610ce342..2935d2d7 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -288,8 +288,10 @@ end @test ex.memory == "" @test ex.nthreads == 1 @test ex.workdir == "" - # project defaults to current active project - @test ex.project == dirname(Base.active_project()) + # project defaults to current active project (or "" if nothing) + ap = Base.active_project() + expected_project = isnothing(ap) ? 
"" : dirname(ap) + @test ex.project == expected_project # Full keyword args ex2 = Scans.SlurmExec("/tmp/test.jl", 8; memory="24G", project="/custom/env", @@ -306,6 +308,19 @@ end # SSHExec wrapping ssh = Scans.SSHExec(ex2, "myhost", "scans") @test ssh.localexec === ex2 + + # Validation: nthreads must be >= 1 + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; nthreads=0) + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; nthreads=-1) + + # Validation: memory format + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="bad") + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="24 G B") + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="12.5G") + # Valid memory formats should not throw + Scans.SlurmExec("/tmp/test.jl", 4; memory="24G") + Scans.SlurmExec("/tmp/test.jl", 4; memory="500") + Scans.SlurmExec("/tmp/test.jl", 4; memory="100M") end ## @@ -314,7 +329,7 @@ end @test Scans._parse_memory_for_heap_hint("10M") == "8M" @test Scans._parse_memory_for_heap_hint("100G") == "80G" @test Scans._parse_memory_for_heap_hint("2T") == "1T" - @test Scans._parse_memory_for_heap_hint("500") == "400" + @test Scans._parse_memory_for_heap_hint("500") == "400M" @test Scans._parse_memory_for_heap_hint("500K") == "400K" # Edge cases @test Scans._parse_memory_for_heap_hint("1T") == "" # floor(0.8) = 0 < 1 @@ -368,10 +383,12 @@ end @test any(l -> l == "export MKL_NUM_THREADS=2", lines) # Julia command (last line) juliacmd = lines[end] - @test startswith(juliacmd, "\"") # Julia binary is quoted + # Julia command uses actual julia executable path (no backticks or stray quotes) + julia_path = first(Base.julia_cmd().exec) + @test occursin(julia_path, juliacmd) @test occursin("--heap-size-hint=19G", juliacmd) @test occursin("--project=\"/my project/env\"", juliacmd) - @test endswith(juliacmd, "test.jl --queue") + @test endswith(juliacmd, "\"test.jl\" --queue") # Minimal options: no memory, empty project ex_min 
= Scans.SlurmExec("/tmp/simple.jl", 4; memory="", project="") @@ -391,7 +408,7 @@ end @test any(l -> l == "ulimit -v unlimited", lines_min) # Julia binary is still quoted @test startswith(lines_min[end], "\"") - @test endswith(lines_min[end], "simple.jl --queue") + @test endswith(lines_min[end], "\"simple.jl\" --queue") # chdir points to workdir @test any(l -> l == "#SBATCH --chdir \"/tmp/workdir\"", lines_min) From b5f5ed46530b7f52c3705c2123b681b4bae7980d Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 12:27:25 +0100 Subject: [PATCH 6/9] Fix working dir problem --- docs/src/scans.md | 2 +- src/Scans.jl | 2 +- test/test_scans.jl | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/src/scans.md b/docs/src/scans.md index 3897cba6..ee74c0b8 100644 --- a/docs/src/scans.md +++ b/docs/src/scans.md @@ -272,7 +272,7 @@ export JULIA_NUM_THREADS=1 export OMP_NUM_THREADS=1 export OPENBLAS_NUM_THREADS=1 export MKL_NUM_THREADS=1 -"/path/to/julia" --heap-size-hint=19G --project="." "script.jl" --queue +"/path/to/julia" --heap-size-hint=19G --project="." 
"/path/to/script/directory/script.jl" --queue ``` ### Combining with SSHExec diff --git a/src/Scans.jl b/src/Scans.jl index 0ffdb16f..b07a7c9a 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -566,7 +566,7 @@ function _slurm_script_lines(exec::SlurmExec, workdir::String) if !isempty(exec.project) juliacmd *= " --project=\"$(exec.project)\"" end - juliacmd *= " \"$(basename(script))\" --queue" + juliacmd *= " \"$(abspath(script))\" --queue" push!(lines, juliacmd) return lines end diff --git a/test/test_scans.jl b/test/test_scans.jl index 2935d2d7..0be596d8 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -388,7 +388,8 @@ end @test occursin(julia_path, juliacmd) @test occursin("--heap-size-hint=19G", juliacmd) @test occursin("--project=\"/my project/env\"", juliacmd) - @test endswith(juliacmd, "\"test.jl\" --queue") + # Script path is absolute (so it can be found from the workdir) + @test endswith(juliacmd, "\"/path/with spaces/test.jl\" --queue") # Minimal options: no memory, empty project ex_min = Scans.SlurmExec("/tmp/simple.jl", 4; memory="", project="") @@ -408,7 +409,7 @@ end @test any(l -> l == "ulimit -v unlimited", lines_min) # Julia binary is still quoted @test startswith(lines_min[end], "\"") - @test endswith(lines_min[end], "\"simple.jl\" --queue") + @test endswith(lines_min[end], "\"/tmp/simple.jl\" --queue") # chdir points to workdir @test any(l -> l == "#SBATCH --chdir \"/tmp/workdir\"", lines_min) From 452b82089e624b6398a7187ce2ba0c0a1c59596c Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 15:04:51 +0100 Subject: [PATCH 7/9] add batch mode for slurm --- docs/src/scans.md | 39 +++++++++++++++++++++++++++++++-------- src/Scans.jl | 33 +++++++++++++++++++++++++-------- test/test_scans.jl | 21 +++++++++++++++++++++ 3 files changed, 77 insertions(+), 16 deletions(-) diff --git a/docs/src/scans.md b/docs/src/scans.md index ee74c0b8..877b9d41 100644 --- a/docs/src/scans.md +++ b/docs/src/scans.md @@ -158,7 +158,7 @@ julia> 
HDF5.h5open("pressure_energy_example_collected.h5", "r") do fi Importantly, in our example here this file is less than one megabyte in size, whereas the `scanoutput` folder totals over 600 megabytes. To store the statistics as well, `stats` can be given as a special keyword argument to `scansave`. Because the arrays are not always the same size (see above), in the file these are stored in an array which is large enough to fit the longest and padded with `NaN`s. The number of actual statisics points available for each simulation is then stored in a special dataset `valid_length`. ## Execution on Slurm -[`SlurmExec`](@ref Scans.SlurmExec) creates and submits a Slurm array job, where each array task processes scan points via a file-based queue (internally using [`QueueExec`](@ref Scans.QueueExec)). This means the array tasks automatically balance load among themselves -- if one simulation finishes early, that task picks up the next unprocessed scan point. +[`SlurmExec`](@ref Scans.SlurmExec) creates and submits a Slurm array job. By default, array tasks process scan points via a file-based queue (`:queue` mode, internally using [`QueueExec`](@ref Scans.QueueExec)). Alternatively, `:batch` mode pre-assigns scan points to array tasks, giving each task its own fixed chunk with no shared state. ### Basic usage ```julia @@ -235,18 +235,40 @@ Scans.SlurmExec(@__FILE__, 8; workdir="/tmp/my_slurm_run") The `workdir` is automatically created if it does not exist. +### Array mode +The `arraymode` keyword controls how scan points are distributed across Slurm array tasks: + +- **`:queue`** (default): Array tasks dynamically pick up work from a shared file-based queue. This provides automatic load balancing — if one simulation finishes early, that task picks up the next unprocessed scan point. Uses [`QueueExec`](@ref Scans.QueueExec) internally. + +- **`:batch`**: Each array task gets a pre-assigned chunk of scan points. 
With `ncores == length(scan)`, each task runs exactly one scan point. No queue file or file locking is needed, giving complete memory isolation between tasks. Uses [`BatchExec`](@ref Scans.BatchExec) internally. + +`:batch` mode is particularly useful when: +- You want strict memory isolation (each scan point in its own process). +- Simulations have similar run times, so load balancing is not critical. +- You are running on a shared filesystem where file locking can be slow. + +```julia +# Queue mode (default): 8 tasks share the workload dynamically +Scans.SlurmExec(@__FILE__, 8) + +# Batch mode: one task per scan point, complete isolation +Scans.SlurmExec(@__FILE__, length(energies); arraymode=:batch, memory="24G") +``` + ### Full example -A complete example with all options: +A complete example with all options, using `:batch` mode for one task per scan point: ```julia using Luna energies = collect(range(50e-6, 200e-6; length=64)) pressures = collect(0.6:0.4:1.4) -exec = Scans.SlurmExec(@__FILE__, 16; - memory="24G", # 24 GB per task, GC hint at ~19 GB - nthreads=1, # single-threaded (default) - project=".") # use current directory as project +N = length(energies) * length(pressures) # total number of scan points +exec = Scans.SlurmExec(@__FILE__, N; + memory="24G", # 24 GB per task, GC hint at ~19 GB + nthreads=1, # single-threaded (default) + project=".", # use current directory as project + arraymode=:batch) # one task per scan point scan = Scan("pressure_energy", exec; energy=energies) addvariable!(scan, :pressure, pressures) @@ -264,7 +286,7 @@ The generated Slurm job script (written to `pressure_energy_slurm/pressure_energ #SBATCH --cpus-per-task=1 #SBATCH -o %x_%a.stdout #SBATCH -e %x_%a.stderr -#SBATCH --array=1-16 +#SBATCH --array=1-192 #SBATCH --chdir "/path/to/script/directory/pressure_energy_slurm" #SBATCH --mem=24G ulimit -v unlimited @@ -272,8 +294,9 @@ export JULIA_NUM_THREADS=1 export OMP_NUM_THREADS=1 export OPENBLAS_NUM_THREADS=1 export 
MKL_NUM_THREADS=1 -"/path/to/julia" --heap-size-hint=19G --project="." "/path/to/script/directory/script.jl" --queue +"/path/to/julia" --heap-size-hint=19G --project="." "/path/to/script/directory/script.jl" --batch 192,$SLURM_ARRAY_TASK_ID ``` +With `arraymode=:queue` (the default), the last line would instead end with `--queue`. ### Combining with SSHExec `SlurmExec` can be wrapped in [`SSHExec`](@ref Scans.SSHExec) to transfer the script to a remote Slurm cluster and submit it there: diff --git a/src/Scans.jl b/src/Scans.jl index b07a7c9a..01c8ed68 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -80,7 +80,7 @@ struct CondorExec <: AbstractExec end """ - SlurmExec(scriptfile, ncores; memory="", project=, nthreads=1, workdir="") + SlurmExec(scriptfile, ncores; memory="", project=, nthreads=1, workdir="", arraymode=:queue) Execution mode which submits a scan to a Slurm queue system as an array job with `ncores` array tasks. @@ -102,6 +102,12 @@ array tasks. stdout/stderr files, and queue file are all placed here. If `""` (the default), a subdirectory `_slurm` is automatically created inside the script's directory. Pass an explicit path to use a custom directory. +- `arraymode::Symbol`: How scan points are distributed across array tasks (default `:queue`). + - `:queue`: Array tasks dynamically pick up work from a shared file-based queue + ([`QueueExec`](@ref)). Good when tasks have varying run times. + - `:batch`: Each array task gets a pre-assigned chunk of scan points (via `--batch`). + With `ncores == length(scan)`, each task runs exactly one scan point. No queue file + or file locking is needed, giving complete memory isolation between tasks. # Generated job script The generated SBATCH script includes: @@ -113,18 +119,21 @@ The generated SBATCH script includes: `julia`, ensuring the same Julia version is used on compute nodes. - All paths (Julia binary, `--chdir` directory, `--project` path) are quoted to handle spaces in paths. 
-- Each array task runs the script with `--queue`, so internally a [`QueueExec`](@ref) is - used for file-based load balancing across array tasks. +- Each array task runs the script with `--queue` or `--batch` depending on `arraymode`, + using the corresponding execution mode internally. !!! note `scriptfile` must **always** be `@__FILE__` # Examples ```julia -# Minimal: uses current project, 1 thread per task, no memory limit -# Job files go into /my_scan_slurm/ +# Minimal: uses current project, 1 thread per task, queue mode (default) scan = Scan("my_scan", SlurmExec(@__FILE__, 8); energy=energies) +# Batch mode: one array task per scan point, complete memory isolation +scan = Scan("my_scan", SlurmExec(@__FILE__, length(energies); arraymode=:batch, + memory="24G"); energy=energies) + # Full: 24 GB per task, custom project, 2 threads, custom workdir scan = Scan("my_scan", SlurmExec(@__FILE__, 8; memory="24G", project="/path/to/env", nthreads=2, workdir="/tmp/my_slurm_run"); @@ -138,6 +147,7 @@ struct SlurmExec <: AbstractExec project::String nthreads::Int workdir::String + arraymode::Symbol end function SlurmExec(scriptfile, ncores; @@ -146,12 +156,14 @@ function SlurmExec(scriptfile, ncores; isnothing(ap) ? 
"" : dirname(ap) end, nthreads=1, - workdir="") + workdir="", + arraymode=:queue) nthreads >= 1 || throw(ArgumentError("`nthreads` must be ≥ 1, got $nthreads")) if !isempty(memory) && !occursin(r"^\d+\s*[KMGT]?$"i, strip(memory)) throw(ArgumentError("`memory` must match format \"[K|M|G|T]\", got \"$memory\"")) end - SlurmExec(scriptfile, ncores, memory, project, nthreads, workdir) + arraymode in (:queue, :batch) || throw(ArgumentError("`arraymode` must be :queue or :batch, got :$arraymode")) + SlurmExec(scriptfile, ncores, memory, project, nthreads, workdir, arraymode) end """ @@ -566,7 +578,12 @@ function _slurm_script_lines(exec::SlurmExec, workdir::String) if !isempty(exec.project) juliacmd *= " --project=\"$(exec.project)\"" end - juliacmd *= " \"$(abspath(script))\" --queue" + juliacmd *= " \"$(abspath(script))\"" + if exec.arraymode == :batch + juliacmd *= " --batch $cores,\$SLURM_ARRAY_TASK_ID" + else + juliacmd *= " --queue" + end push!(lines, juliacmd) return lines end diff --git a/test/test_scans.jl b/test/test_scans.jl index 0be596d8..481fd79f 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -321,6 +321,14 @@ end Scans.SlurmExec("/tmp/test.jl", 4; memory="24G") Scans.SlurmExec("/tmp/test.jl", 4; memory="500") Scans.SlurmExec("/tmp/test.jl", 4; memory="100M") + + # arraymode defaults to :queue + @test ex.arraymode == :queue + # Explicit batch mode + ex_batch = Scans.SlurmExec("/tmp/test.jl", 64; arraymode=:batch) + @test ex_batch.arraymode == :batch + # Invalid arraymode + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; arraymode=:invalid) end ## @@ -417,4 +425,17 @@ end idx_ulimit = findfirst(l -> l == "ulimit -v unlimited", lines) idx_export = findfirst(l -> startswith(l, "export JULIA_NUM_THREADS"), lines) @test idx_ulimit < idx_export + + # Batch mode: uses --batch instead of --queue + ex_batch = Scans.SlurmExec("/tmp/run.jl", 64; arraymode=:batch, project="") + lines_batch = Scans._slurm_script_lines(ex_batch, "/tmp/work") + 
juliacmd_batch = lines_batch[end] + @test occursin("--batch 64,\$SLURM_ARRAY_TASK_ID", juliacmd_batch) + @test !occursin("--queue", juliacmd_batch) + + # Queue mode (default): uses --queue + ex_queue = Scans.SlurmExec("/tmp/run.jl", 8; project="") + lines_queue = Scans._slurm_script_lines(ex_queue, "/tmp/work") + @test endswith(lines_queue[end], "--queue") + @test !occursin("--batch", lines_queue[end]) end \ No newline at end of file From 8194d281ef05c518682a7991aa72dc63dfafb603 Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 15:15:28 +0100 Subject: [PATCH 8/9] Address PR review comments --- src/Scans.jl | 47 +++++++++++++++++++++++++++++++++++----------- test/test_scans.jl | 35 ++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/src/Scans.jl b/src/Scans.jl index 01c8ed68..b924ee08 100644 --- a/src/Scans.jl +++ b/src/Scans.jl @@ -83,17 +83,19 @@ end SlurmExec(scriptfile, ncores; memory="", project=, nthreads=1, workdir="", arraymode=:queue) Execution mode which submits a scan to a Slurm queue system as an array job with `ncores` -array tasks. +array tasks (`ncores` must be ≥ 1). # Keyword arguments - `memory::String`: Memory per task, e.g. `"24G"`. Sets `#SBATCH --mem` and automatically derives `--heap-size-hint` (at 80% of `memory`) for the Julia process. Supports suffixes `K`, `M`, `G`, `T`. A bare number (e.g. `"24000"`) is treated as megabytes, matching - Slurm's default convention. Must match the format `digits[K|M|G|T]` when non-empty; an - `ArgumentError` is thrown otherwise. + Slurm's default convention. Must match the strict format `[K|M|G|T]` when + non-empty (no internal whitespace); the value is normalized at construction time + (stripped and uppercased). An `ArgumentError` is thrown for invalid values. - `project::String`: Path to a Julia project environment. Defaults to the currently active project (`dirname(Base.active_project())`), or `""` if no project is active. 
Pass `""` - to omit the `--project` flag. + to omit the `--project` flag. Relative paths are resolved against `dirname(scriptfile)` + when generating the job script. Must not contain double quotes or newlines. - `nthreads::Int`: Number of threads per array task (default `1`, must be ≥ 1). Sets `#SBATCH --cpus-per-task` and exports `JULIA_NUM_THREADS`, `OMP_NUM_THREADS`, `OPENBLAS_NUM_THREADS`, and `MKL_NUM_THREADS` in the job script. The default of `1` @@ -101,7 +103,8 @@ array tasks. - `workdir::String`: Working directory for the Slurm job. The generated `.sh` script, stdout/stderr files, and queue file are all placed here. If `""` (the default), a subdirectory `_slurm` is automatically created inside the script's directory. - Pass an explicit path to use a custom directory. + Pass an explicit path to use a custom directory. Must not contain double quotes or + newlines. - `arraymode::Symbol`: How scan points are distributed across array tasks (default `:queue`). - `:queue`: Array tasks dynamically pick up work from a shared file-based queue ([`QueueExec`](@ref)). Good when tasks have varying run times. 
@@ -158,9 +161,21 @@ function SlurmExec(scriptfile, ncores; nthreads=1, workdir="", arraymode=:queue) + ncores >= 1 || throw(ArgumentError("`ncores` must be ≥ 1, got $ncores")) nthreads >= 1 || throw(ArgumentError("`nthreads` must be ≥ 1, got $nthreads")) - if !isempty(memory) && !occursin(r"^\d+\s*[KMGT]?$"i, strip(memory)) - throw(ArgumentError("`memory` must match format \"[K|M|G|T]\", got \"$memory\"")) + # Normalize and validate memory: strip whitespace, enforce strict format + memory = strip(memory) + if !isempty(memory) + m = match(r"^(\d+)([KMGT]?)$"i, memory) + isnothing(m) && throw(ArgumentError( + "`memory` must match format \"[K|M|G|T]\", got \"$memory\"")) + memory = m.captures[1] * uppercase(m.captures[2]) + end + # Validate project and workdir: no quotes or newlines (shell injection prevention) + for (name, val) in [("project", project), ("workdir", workdir)] + if occursin(r"[\"\n\r]", val) + throw(ArgumentError("`$name` must not contain quotes or newlines, got \"$val\"")) + end end arraymode in (:queue, :batch) || throw(ArgumentError("`arraymode` must be :queue or :batch, got :$arraymode")) SlurmExec(scriptfile, ncores, memory, project, nthreads, workdir, arraymode) @@ -528,9 +543,17 @@ function _parse_memory_for_heap_hint(memory::String) unit = uppercase(m.captures[2]) # Slurm treats bare numbers as megabytes; mirror that for the heap hint isempty(unit) && (unit = "M") - hint = floor(Int, value * 0.8) - hint < 1 && return "" - return "$(hint)$(unit)" + units = ["K", "M", "G", "T"] + uidx = findfirst(==(unit), units) + isnothing(uidx) && return "" + # Compute 80% hint; if < 1 in current unit, downscale (e.g. 
1T -> 819G) + while true + hint = floor(Int, value * 0.8) + hint >= 1 && return "$(hint)$(units[uidx])" + uidx == 1 && return "" + value *= 1024 + uidx -= 1 + end end """ @@ -576,7 +599,9 @@ function _slurm_script_lines(exec::SlurmExec, workdir::String) end end if !isempty(exec.project) - juliacmd *= " --project=\"$(exec.project)\"" + # Resolve relative project paths against the script directory, not the workdir + project = isabspath(exec.project) ? exec.project : abspath(joinpath(dirname(script), exec.project)) + juliacmd *= " --project=\"$project\"" end juliacmd *= " \"$(abspath(script))\"" if exec.arraymode == :batch diff --git a/test/test_scans.jl b/test/test_scans.jl index 481fd79f..25802a7c 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -279,6 +279,9 @@ end end end +# Slurm tests only run on Unix (Slurm doesn't exist on Windows and paths differ) +if !Sys.iswindows() + ## @testset "SlurmExec construction" begin # Backward compatibility: positional args only @@ -309,18 +312,29 @@ end ssh = Scans.SSHExec(ex2, "myhost", "scans") @test ssh.localexec === ex2 + # Validation: ncores must be >= 1 + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 0) + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", -1) + # Validation: nthreads must be >= 1 @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; nthreads=0) @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; nthreads=-1) - # Validation: memory format + # Validation: memory format (no internal whitespace allowed) @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="bad") - @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="24 G B") + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="24 G") @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; memory="12.5G") # Valid memory formats should not throw Scans.SlurmExec("/tmp/test.jl", 4; memory="24G") Scans.SlurmExec("/tmp/test.jl", 4; memory="500") 
Scans.SlurmExec("/tmp/test.jl", 4; memory="100M") + # Memory is normalized: lowercase -> uppercase, whitespace stripped + @test Scans.SlurmExec("/tmp/test.jl", 4; memory="24g").memory == "24G" + @test Scans.SlurmExec("/tmp/test.jl", 4; memory=" 24G ").memory == "24G" + + # Validation: project and workdir must not contain quotes or newlines + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; project="path\"bad") + @test_throws ArgumentError Scans.SlurmExec("/tmp/test.jl", 4; workdir="path\nbad") # arraymode defaults to :queue @test ex.arraymode == :queue @@ -339,8 +353,11 @@ end @test Scans._parse_memory_for_heap_hint("2T") == "1T" @test Scans._parse_memory_for_heap_hint("500") == "400M" @test Scans._parse_memory_for_heap_hint("500K") == "400K" + # Unit downscaling: 1T -> 819G, 1G -> 819M, 1M -> 819K + @test Scans._parse_memory_for_heap_hint("1T") == "819G" + @test Scans._parse_memory_for_heap_hint("1G") == "819M" + @test Scans._parse_memory_for_heap_hint("1M") == "819K" # Edge cases - @test Scans._parse_memory_for_heap_hint("1T") == "" # floor(0.8) = 0 < 1 @test Scans._parse_memory_for_heap_hint("") == "" @test Scans._parse_memory_for_heap_hint("abc") == "" @test Scans._parse_memory_for_heap_hint("12.5G") == "" # no float support @@ -395,10 +412,18 @@ end julia_path = first(Base.julia_cmd().exec) @test occursin(julia_path, juliacmd) @test occursin("--heap-size-hint=19G", juliacmd) + # Absolute project path is passed through as-is @test occursin("--project=\"/my project/env\"", juliacmd) # Script path is absolute (so it can be found from the workdir) @test endswith(juliacmd, "\"/path/with spaces/test.jl\" --queue") + # Relative project path is resolved against script directory + ex_rel = Scans.SlurmExec("/home/user/scripts/test.jl", 4; + project=".", memory="", nthreads=1) + lines_rel = Scans._slurm_script_lines(ex_rel, "/home/user/scripts/test_slurm") + juliacmd_rel = lines_rel[end] + @test occursin("--project=\"/home/user/scripts\"", juliacmd_rel) + # 
Minimal options: no memory, empty project ex_min = Scans.SlurmExec("/tmp/simple.jl", 4; memory="", project="") lines_min = Scans._slurm_script_lines(ex_min, "/tmp/workdir") @@ -438,4 +463,6 @@ end lines_queue = Scans._slurm_script_lines(ex_queue, "/tmp/work") @test endswith(lines_queue[end], "--queue") @test !occursin("--batch", lines_queue[end]) -end \ No newline at end of file +end + +end # !Sys.iswindows() \ No newline at end of file From e4da034b90cf2d90eeb895c2693c68a45a3b43a8 Mon Sep 17 00:00:00 2001 From: John Travers Date: Sun, 5 Apr 2026 15:17:22 +0100 Subject: [PATCH 9/9] fix tests --- test/test_scans.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_scans.jl b/test/test_scans.jl index 25802a7c..3a829561 100644 --- a/test/test_scans.jl +++ b/test/test_scans.jl @@ -422,7 +422,8 @@ end project=".", memory="", nthreads=1) lines_rel = Scans._slurm_script_lines(ex_rel, "/home/user/scripts/test_slurm") juliacmd_rel = lines_rel[end] - @test occursin("--project=\"/home/user/scripts\"", juliacmd_rel) + resolved_project = abspath(joinpath("/home/user/scripts", ".")) + @test occursin("--project=\"$resolved_project\"", juliacmd_rel) # Minimal options: no memory, empty project ex_min = Scans.SlurmExec("/tmp/simple.jl", 4; memory="", project="")