Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 28 additions & 28 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,43 @@ SHELL := /bin/bash
# `all` simply depends on `install`, so `make all` performs the full install.
# NOTE(review): presumably this is the default goal — confirm no earlier target precedes it.
all: install

# Install uv with the installer script from astral.sh unless `uv` is already
# on PATH. The recipe is one backslash-continued shell command so the
# if/else/fi is evaluated by a single shell invocation.
install_uv:
	source ~/.bashrc; \
	if ! command -v uv &> /dev/null; then \
		echo "Installing uv ..."; \
		"${SHELL}" <(curl -fsSL https://astral.sh/uv/install.sh); \
	else \
		echo "uv is already installed."; \
	fi

# Install pixi with the installer script from pixi.sh unless `pixi` is already
# on PATH. PIXI_VERSION is exported before the installer runs so that the
# installer sees the pinned version (v0.54.0).
install_pixi:
	source ~/.bashrc; \
	if ! command -v pixi &> /dev/null; then \
		echo "Installing pixi ..."; \
		export PIXI_VERSION=v0.54.0; \
		"${SHELL}" <(curl -fsSL https://pixi.sh/install.sh); \
	else \
		echo "Pixi is already installed."; \
	fi

# Install pipx through `pixi global install` unless `pipx` is already on PATH.
# pixi is resolved from PATH first and only then from ~/.pixi/bin, so the
# recipe works both when pixi was installed elsewhere and when it was just
# installed by install_pixi but PATH has not picked it up yet.
install_pipx: install_pixi
	source ~/.bashrc; \
	if ! command -v pipx &> /dev/null; then \
		echo "Installing pipx..."; \
		"$$(command -v pixi || echo "$$HOME/.pixi/bin/pixi")" global install pipx; \
	else \
		echo "pipx is already installed."; \
	fi

# Install fd (pixi package `fd-find`) through `pixi global install` unless
# `fd` is already on PATH.
# pixi is resolved from PATH first and only then from ~/.pixi/bin: a pixi that
# install_pixi has just installed is not necessarily on PATH in this shell.
install_fd: install_pixi
	source ~/.bashrc; \
	if ! command -v fd &> /dev/null; then \
		echo "Installing fd ..."; \
		"$$(command -v pixi || echo "$$HOME/.pixi/bin/pixi")" global install fd-find; \
	else \
		echo "fd is already installed."; \
	fi

# Install this project with `pipx install -f .` after all tool prerequisites.
# pipx is resolved from PATH first and only then from ~/.pixi/bin (where
# `pixi global install` exposes it). The success message is chained with `&&`
# so a failed pipx install fails the target instead of being reported as done.
install: install_pixi install_uv install_pipx install_fd
	source ~/.bashrc; \
	"$$(command -v pipx || echo "$$HOME/.pixi/bin/pipx")" install -f . && \
	echo "slurm_run is installed globally"
15 changes: 15 additions & 0 deletions slurm_run/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .submit import (
slurm_run,
get_image_and_sbatch_dest,
dump_code_image,
mkimg,
job_name_active,
rerun_job_by_id,
rerun_jobs_by_tag,
rerun_jobs_by_name,
SubmissionConfig,
SlurmSubmitContext,
)
from .jobs import list_jobs_for_user
from .backend import SlurmBackend, LocalBackend, SSHBackend
from .tailscale import ensure_tailscale_ready, backend_for_cluster
102 changes: 102 additions & 0 deletions slurm_run/backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Backend abstraction for SLURM command execution.

Two concrete implementations:
- LocalBackend: calls sbatch/sacct as local subprocesses (current/default behavior).
- SSHBackend: SSHes into a login node; uses system ssh/scp binaries (no new deps).
"""

import shlex
import shutil
import subprocess
from abc import ABC, abstractmethod
from getpass import getuser
from pathlib import Path


class SlurmBackend(ABC):
    """Abstract interface for the host on which SLURM commands are executed.

    Concrete subclasses must implement all four operations below; the class
    itself cannot be instantiated.
    """

    @abstractmethod
    def run_command(self, cmd: list[str]) -> str:
        """Run ``cmd`` (an argv-style list) on the backend host.

        Returns:
            The command's standard output as text.
        """

    @abstractmethod
    def upload_file(self, local: Path, remote: Path) -> None:
        """Copy the file at ``local`` to ``remote`` on the backend host."""

    @abstractmethod
    def get_home_dir(self) -> Path:
        """Return the user's home directory on the backend host."""

    @abstractmethod
    def makedirs(self, path: Path) -> None:
        """Create ``path`` on the backend host, including missing parents."""


class LocalBackend(SlurmBackend):
    """Runs commands locally — the current default behavior.

    Every operation acts directly on the machine running this process:
    commands are local subprocesses and "uploads" are plain file copies.
    """

    def run_command(self, cmd: list[str]) -> str:
        """Run ``cmd`` as a local subprocess and return its stdout as text.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        return subprocess.check_output(cmd, text=True)

    def upload_file(self, local: Path, remote: Path) -> None:
        """Copy ``local`` to ``remote`` with ``shutil.copy2``.

        The parent directory of ``remote`` must already exist (see
        ``makedirs``).
        """
        try:
            shutil.copy2(local, remote)
        except shutil.SameFileError:
            # On a local backend the "remote" path can be the very file we
            # were asked to upload; it is already in place, so there is
            # nothing to copy.
            pass

    def get_home_dir(self) -> Path:
        """Return the current user's home directory on this machine."""
        return Path.home()

    def makedirs(self, path: Path) -> None:
        """Create ``path`` and any missing parents; no error if it exists."""
        path.mkdir(parents=True, exist_ok=True)


class SSHBackend(SlurmBackend):
    """Runs commands on a remote login node via SSH/SCP.

    Uses the system ssh/scp binaries and the user's existing SSH keys/config.
    No additional Python dependencies required.

    Both ssh and scp are invoked with ``BatchMode=yes`` so that a missing key
    or unknown host fails immediately instead of blocking on a prompt.
    """

    def __init__(
        self,
        host: str,
        user: str | None = None,
        port: int = 22,
        key_file: str | None = None,
    ):
        """Store the connection parameters; no connection is opened here.

        Args:
            host: Hostname of the login node.
            user: Remote username; defaults to the local user name.
            port: SSH port.
            key_file: Optional private key path passed to ssh/scp via ``-i``.
        """
        # NOTE(review): the target is always "user@host" with an explicit
        # port, which takes precedence over any User/Port configured for this
        # host in ~/.ssh/config — confirm that is intended.
        self.host = host
        self.user = user or getuser()
        self.port = port
        self.key_file = key_file
        # Filled in lazily by get_home_dir().
        self._home_dir: Path | None = None

    def _connection_options(self, port_flag: str) -> list[str]:
        """Return the options shared by ssh and scp.

        Args:
            port_flag: ``"-p"`` for ssh, ``"-P"`` for scp (the two tools spell
                the port option differently).
        """
        options = [port_flag, str(self.port), "-o", "BatchMode=yes"]
        if self.key_file:
            options += ["-i", self.key_file]
        return options

    def _ssh_base_args(self) -> list[str]:
        """Return the ssh argv up to and including ``user@host``."""
        return ["ssh", *self._connection_options("-p"), f"{self.user}@{self.host}"]

    def run_command(self, cmd: list[str]) -> str:
        """Run ``cmd`` on the remote host and return its stdout as text.

        Raises:
            ValueError: if ``cmd`` is empty.
            subprocess.CalledProcessError: if ssh or the remote command exits
                non-zero.
        """
        if not cmd:
            # An empty remote command would make ssh start a remote shell
            # reading from our stdin instead of running anything.
            raise ValueError("cmd must contain at least one argument")
        # shlex.join produces a properly shell-quoted command string so that
        # paths with spaces survive the remote shell invocation.
        ssh_cmd = self._ssh_base_args() + [shlex.join(cmd)]
        return subprocess.check_output(ssh_cmd, text=True)

    def upload_file(self, local: Path, remote: Path) -> None:
        """Copy ``local`` to ``remote`` on the remote host with scp.

        Raises:
            subprocess.CalledProcessError: if scp exits non-zero.
        """
        dest = f"{self.user}@{self.host}:{remote}"
        scp_args = ["scp", *self._connection_options("-P"), str(local), dest]
        subprocess.check_call(scp_args)

    def get_home_dir(self) -> Path:
        """Return the remote home directory, querying it once and caching it."""
        if self._home_dir is None:
            # The quoting applied by run_command keeps "$HOME" literal, so it
            # is the remote `sh` that expands it.
            result = self.run_command(["sh", "-c", "echo $HOME"])
            self._home_dir = Path(result.strip())
        return self._home_dir

    def makedirs(self, path: Path) -> None:
        """Create ``path`` and any missing parents on the remote host."""
        self.run_command(["mkdir", "-p", str(path)])
61 changes: 60 additions & 1 deletion slurm_run/cli.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,76 @@
import os

import click

from .backend import LocalBackend, SSHBackend
from .submit import submit
from .jobs import jobs
from .dev import dev
from .tailscale import backend_for_cluster, ensure_tailscale_ready
from gcflow_api.auth import ensure_authenticated


@click.group()
@click.option(
    "--ssh-host",
    envvar="SLURM_SSH_HOST",
    default=None,
    help="Login node hostname for remote submission (or set SLURM_SSH_HOST).",
)
@click.option(
    "--ssh-user",
    envvar="SLURM_SSH_USER",
    default=None,
    help="SSH username (default: current user; or set SLURM_SSH_USER).",
)
@click.option(
    "--ssh-port",
    envvar="SLURM_SSH_PORT",
    default=22,
    # Reject out-of-range ports at parse time with a clear usage error rather
    # than letting ssh fail later.
    type=click.IntRange(1, 65535),
    help="SSH port (default: 22; or set SLURM_SSH_PORT).",
)
@click.option(
    "--ssh-key",
    envvar="SLURM_SSH_KEY",
    default=None,
    help="Path to SSH private key (or set SLURM_SSH_KEY).",
)
@click.pass_context
def main(ctx, ssh_host, ssh_user, ssh_port, ssh_key):
    """Select the execution backend: local by default, SSH with --ssh-host."""
    # Runs before every subcommand of this group.
    ensure_authenticated()
    ctx.ensure_object(dict)

    # The chosen backend is published on the click context under "backend".
    # The other --ssh-* options only take effect when a host is given.
    if ssh_host:
        ctx.obj["backend"] = SSHBackend(
            host=ssh_host,
            user=ssh_user,
            port=ssh_port,
            key_file=ssh_key,
        )
    else:
        ctx.obj["backend"] = LocalBackend()


@main.command()
@click.option(
    "--cluster",
    required=True,
    type=click.Choice(["reef", "kelp"]),
    help="Cluster to log into. Opens an interactive SSH session to login-<cluster>-<username>.",
)
def login(cluster):
    """Open an interactive SSH session to the cluster login node."""
    # Imported here because this command is the only user of getuser in
    # this block.
    from getpass import getuser

    # Login node name: login-<cluster>-<local username>.
    login_node = "-".join(("login", cluster, getuser()))
    ensure_tailscale_ready(login_node)

    # execvp replaces the current Python process with ssh, so nothing after
    # this call runs.
    ssh_argv = ["ssh", login_node]
    os.execvp(ssh_argv[0], ssh_argv)


# Register the command objects imported from the sibling modules (.submit,
# .jobs, .dev) as subcommands of the top-level `main` group.
main.add_command(submit, name="submit")
main.add_command(jobs, name="jobs")
main.add_command(dev, name="dev")


if __name__ == "__main__":
Expand Down
Loading
Loading