From 584303936a1b9f380c60aadabc84410eaf0ddb6f Mon Sep 17 00:00:00 2001 From: LanderOtto Date: Sat, 28 Mar 2026 17:03:25 +0100 Subject: [PATCH 1/3] Fix node names in the server + Add wait munge.key creation in the execution --- openpbs/execution/Dockerfile | 4 ++++ openpbs/execution/config/supervisord.conf | 2 +- openpbs/execution/scripts/run-munge | 11 +++++++++++ openpbs/execution/scripts/run-openpbs | 6 +++++- openpbs/server/Dockerfile | 2 ++ openpbs/server/scripts/run-munge | 9 ++++++++- openpbs/server/scripts/run-openpbs | 12 +++++++++--- 7 files changed, 40 insertions(+), 6 deletions(-) create mode 100755 openpbs/execution/scripts/run-munge diff --git a/openpbs/execution/Dockerfile b/openpbs/execution/Dockerfile index 546954d..fab72d1 100644 --- a/openpbs/execution/Dockerfile +++ b/openpbs/execution/Dockerfile @@ -47,8 +47,12 @@ ENV PATH="${PATH}:/opt/pbs/bin" COPY config/supervisord.conf \ /etc/supervisor/conf.d/ +COPY config/pbs.conf \ + /etc/pbs.conf + COPY scripts/healthcheck \ scripts/run-sshd \ + scripts/run-munge \ scripts/run-openpbs \ /bin/ diff --git a/openpbs/execution/config/supervisord.conf b/openpbs/execution/config/supervisord.conf index 545edfe..a3223e1 100644 --- a/openpbs/execution/config/supervisord.conf +++ b/openpbs/execution/config/supervisord.conf @@ -15,7 +15,7 @@ supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface serverurl=unix:///var/run/supervisor.sock [program:munge] -command=gosu munge /usr/sbin/munged --foreground +command=run-munge autostart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 diff --git a/openpbs/execution/scripts/run-munge b/openpbs/execution/scripts/run-munge new file mode 100755 index 0000000..e9711de --- /dev/null +++ b/openpbs/execution/scripts/run-munge @@ -0,0 +1,11 @@ +#!/bin/bash -e + +MUNGE_KEY=/etc/munge/munge.key +while [[ ! 
-s "${MUNGE_KEY}" || \ + "$(stat -c "%U" "${MUNGE_KEY}")" != "munge" \ + "$(stat -c "%a" "${MUNGE_KEY}")" != "400" ]]; do + echo "Waiting for munge.key..." + sleep 2 +done + +gosu munge /usr/sbin/munged --foreground \ No newline at end of file diff --git a/openpbs/execution/scripts/run-openpbs b/openpbs/execution/scripts/run-openpbs index c05682c..862a90a 100755 --- a/openpbs/execution/scripts/run-openpbs +++ b/openpbs/execution/scripts/run-openpbs @@ -1,10 +1,14 @@ -#!/bin/bash +#!/bin/bash -e if [ -z "${PBS_SERVER_HOST_NAME}" ]; then >&2 echo "Missing environment variable PBS_SERVER_HOST_NAME containing the OpenPBS server hostname" exit 1 fi +if [ ! -f "/etc/pbs.conf" ]; then + >&2 echo "Missing /etc/pbs.conf file" + exit 1 +fi sed -i "s/__PBS_SERVER_HOST_NAME__/${PBS_SERVER_HOST_NAME}/g" /etc/pbs.conf source /etc/pbs.conf diff --git a/openpbs/server/Dockerfile b/openpbs/server/Dockerfile index ce0890d..0dee28d 100644 --- a/openpbs/server/Dockerfile +++ b/openpbs/server/Dockerfile @@ -42,6 +42,8 @@ RUN curl -fsSL -O https://vcdn.altair.com/rl/OpenPBS/openpbs_23.06.06.rockylinux && ln -s /usr/lib64/libmunge.so.2 /usr/lib64/libmunge.so \ && adduser hpcuser +RUN ln -s /usr/bin/pg_resetwal /usr/bin/pg_resetxlog + ENV PATH="${PATH}:/opt/pbs/bin" COPY config/pbs.conf \ diff --git a/openpbs/server/scripts/run-munge b/openpbs/server/scripts/run-munge index b4c9e9f..eb3c54a 100755 --- a/openpbs/server/scripts/run-munge +++ b/openpbs/server/scripts/run-munge @@ -1,5 +1,12 @@ -#!/bin/bash +#!/bin/bash -e + +if [ -f "/etc/munge/munge.key" ]; then + >&2 echo "File /etc/munge/munge.key already exists" + exit 1 +fi create-munge-key +chown munge:munge /etc/munge/munge.key +chmod 400 /etc/munge/munge.key gosu munge /usr/sbin/munged --foreground \ No newline at end of file diff --git a/openpbs/server/scripts/run-openpbs b/openpbs/server/scripts/run-openpbs index 27f7eae..9518204 100755 --- a/openpbs/server/scripts/run-openpbs +++ b/openpbs/server/scripts/run-openpbs @@ -1,4 +1,4 
@@ -#!/bin/bash +#!/bin/bash -e set -m @@ -14,6 +14,12 @@ if [ -z "${PBS_EXECUTION_NODES}" ]; then exit 1 fi +echo "PBS_SERVER_HOST_NAME=${PBS_SERVER_HOST_NAME}" >> /var/spool/pbs/pbs_environment + +if [ ! -f "/etc/pbs.conf" ]; then + >&2 echo "Missing /etc/pbs.conf file" + exit 1 +fi sed -i "s/__PBS_SERVER_HOST_NAME__/${PBS_SERVER_HOST_NAME}/g" /etc/pbs.conf source /etc/pbs.conf @@ -33,9 +39,9 @@ ${PBS_EXEC}/bin/qmgr -c "set server job_history_enable = True" ${PBS_EXEC}/bin/qmgr -c "set server job_history_duration = 00:05:00" echo "Configure OpenPBS scheduler" -${PBS_EXEC}/bin/qmgr -c "set sched sched_host = localhost" +${PBS_EXEC}/bin/qmgr -c "set sched sched_host = ${PBS_SERVER_HOST_NAME}" -for NODE in $(seq 0 $((PBS_EXECUTION_NODES-1))); do +for NODE in $(seq 1 ${PBS_EXECUTION_NODES}); do echo "Create node ${PBS_EXECUTION_HOST_NAME_PREFIX}-${NODE}" ${PBS_EXEC}/bin/qmgr -c "create node ${PBS_EXECUTION_HOST_NAME_PREFIX}-${NODE} queue=workq" done From 91eb1175d4952db34ea351efd4ad62773ab00059 Mon Sep 17 00:00:00 2001 From: LanderOtto Date: Wed, 20 May 2026 12:25:47 +0200 Subject: [PATCH 2/3] Fix long FQDN failures and improve startup robustness - Add FQDN length check on server startup using getconf HOST_NAME_MAX - Add run-sched wrapper that waits for pbs.conf.ready before starting - Set autorestart=false for pbs_server to prevent crash loop - Fix munge.key ownership/permission wait conditions - Add munge wait loop before pbs_mom starts - Add supervisord priority to ensure munge starts first - Update README with FQDN warning and fix typos --- openpbs/README.md | 8 +++++--- openpbs/execution/config/supervisord.conf | 2 ++ openpbs/execution/scripts/run-munge | 6 +++--- openpbs/execution/scripts/run-openpbs | 5 +++++ openpbs/server/Dockerfile | 1 + openpbs/server/config/supervisord.conf | 3 ++- openpbs/server/scripts/run-openpbs | 17 +++++++++++++++++ openpbs/server/scripts/run-sched | 8 ++++++++ 8 files changed, 43 insertions(+), 7 deletions(-) create mode 100755 
openpbs/server/scripts/run-sched diff --git a/openpbs/README.md b/openpbs/README.md index 0c1406d..1e1f896 100644 --- a/openpbs/README.md +++ b/openpbs/README.md @@ -7,7 +7,7 @@ This repository contains the source code of different container images: - `alphaunito/openpbs-server:23.06.06`, which runs the OpenPBS control plane - `alphaunito/openpbs-execution:23.06.06`, which runs a OpenPBS compute node -Plus, it also contains a [docker-compose.yml](./docker-compose.yml) file that can deplyo an entire OpenPBS cluster with a single controller and a set of compute nodes. All these components are detailed below +Plus, it also contains a [docker-compose.yml](./docker-compose.yml) file that can deploy an entire OpenPBS cluster with a single controller and a set of compute nodes. All these components are detailed below ## OpenPBS Server @@ -29,11 +29,11 @@ To correctly register the execution nodes, an `openpbs-server` container needs 2 - The `PBS_EXECUTION_NODES` variable must contain the number of compute nodes that OpenPBS should manage. If this variable is not set, the container displays an error message and terminates - The `PBS_EXECUTION_HOST_NAME_PREFIX` variable should contain the prefix of the hostname used to identify compute nodes. If this variable is not set, the container displays an error message and terminates -Note that all the compute nodes in the simulated HPC cluster should have a reachable hostname equal to `"${PBS_EXECUTION_NODES}${X}"`, where `X` is an integer in the range `[1, ${PBS_EXECUTION_NODES}]` +Note that all the compute nodes in the simulated HPC cluster should have a reachable hostname equal to `"${PBS_EXECUTION_HOST_NAME_PREFIX}-${X}"`, where `X` is an integer in the range `[1, ${PBS_EXECUTION_NODES}]`. ## OpenPBS Execution -The `pbs_mom` process is the compute node daemon for OpenPBS. 
It places jobs into execution as directed by the server, establishes resource usage limits, monitors the job's usage, and notifies the server when the job completes. The `openpbs-execution` Docker image can be build and published using the following commands +The `pbs_mom` process is the compute node daemon for OpenPBS. It places jobs into execution as directed by the server, establishes resource usage limits, monitors the job's usage, and notifies the server when the job completes. The `openpbs-execution` Docker image can be built and published using the following commands. ```bash docker build -t alphaunito/openpbs-execution:23.06.06 execution @@ -48,6 +48,8 @@ The `openpbs-server` and `openpbs-execution` images described above can be used Note that the `openpbs-server` node should have an identifiable hostname, as compute nodes must register with the control plane to be addressable. In Docker Compose, an explicit hostname can be set for a given service using the `hostname` keyword. +**Beware of long hostnames.** The Docker FQDN (`<container>.<project>_<network>`) includes the compose project name as part of the domain. If the FQDN exceeds PBS's `HOST_NAME_MAX` (64), `pbs_mom` fails with `Failed to get fullhostname`, jobs abort with `Exit_status = -3`, and the server holds them after too many retries. Use a short project name (e.g. `docker compose -p openpbs up` or `export COMPOSE_PROJECT_NAME=openpbs`). + To allow for unprivileged workloads, an `hpcuser` has been configured inside the images. Commands can be executed by explicitly impersonating this user, through the `--user hpcuser` flag. 
For example ```bash diff --git a/openpbs/execution/config/supervisord.conf b/openpbs/execution/config/supervisord.conf index a3223e1..acc0944 100644 --- a/openpbs/execution/config/supervisord.conf +++ b/openpbs/execution/config/supervisord.conf @@ -17,6 +17,7 @@ serverurl=unix:///var/run/supervisor.sock [program:munge] command=run-munge autostart=true +priority=10 stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 redirect_stderr=true @@ -24,6 +25,7 @@ redirect_stderr=true [program:pbs_mom] command=run-openpbs autostart=true +priority=20 stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 redirect_stderr=true diff --git a/openpbs/execution/scripts/run-munge b/openpbs/execution/scripts/run-munge index e9711de..11bb9be 100755 --- a/openpbs/execution/scripts/run-munge +++ b/openpbs/execution/scripts/run-munge @@ -2,9 +2,9 @@ MUNGE_KEY=/etc/munge/munge.key while [[ ! -s "${MUNGE_KEY}" || \ - "$(stat -c "%U" "${MUNGE_KEY}")" != "munge" \ - "$(stat -c "%a" "${MUNGE_KEY}")" != "400" ]]; do - echo "Waiting for munge.key..." + "$(stat -c "%U" "${MUNGE_KEY}" 2>/dev/null)" != "munge" || \ + "$(stat -c "%a" "${MUNGE_KEY}" 2>/dev/null)" != "400" ]]; do + echo "Waiting for munge.key with correct ownership (munge) and permissions (400)..." sleep 2 done diff --git a/openpbs/execution/scripts/run-openpbs b/openpbs/execution/scripts/run-openpbs index 862a90a..17a0faf 100755 --- a/openpbs/execution/scripts/run-openpbs +++ b/openpbs/execution/scripts/run-openpbs @@ -13,4 +13,9 @@ sed -i "s/__PBS_SERVER_HOST_NAME__/${PBS_SERVER_HOST_NAME}/g" /etc/pbs.conf source /etc/pbs.conf +until munge -n >/dev/null 2>&1; do + echo "Waiting for munged to become ready..." 
+ sleep 1 +done + ${PBS_EXEC}/sbin/pbs_mom -N diff --git a/openpbs/server/Dockerfile b/openpbs/server/Dockerfile index 0dee28d..3fa1b20 100644 --- a/openpbs/server/Dockerfile +++ b/openpbs/server/Dockerfile @@ -56,6 +56,7 @@ COPY scripts/healthcheck scripts/run-munge \ scripts/run-openpbs \ scripts/run-sshd \ + scripts/run-sched \ /bin/ HEALTHCHECK --start-period=10s CMD healthcheck diff --git a/openpbs/server/config/supervisord.conf b/openpbs/server/config/supervisord.conf index e9ffb88..3d24d83 100644 --- a/openpbs/server/config/supervisord.conf +++ b/openpbs/server/config/supervisord.conf @@ -29,7 +29,7 @@ stdout_logfile_maxbytes=0 redirect_stderr=true [program:pbs_sched] -command=/opt/pbs/sbin/pbs_sched -N +command=run-sched autostart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 @@ -38,6 +38,7 @@ redirect_stderr=true [program:pbs_server] command=run-openpbs autostart=true +autorestart=false stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 redirect_stderr=true diff --git a/openpbs/server/scripts/run-openpbs b/openpbs/server/scripts/run-openpbs index 9518204..2b58cbe 100755 --- a/openpbs/server/scripts/run-openpbs +++ b/openpbs/server/scripts/run-openpbs @@ -24,6 +24,23 @@ sed -i "s/__PBS_SERVER_HOST_NAME__/${PBS_SERVER_HOST_NAME}/g" /etc/pbs.conf source /etc/pbs.conf +MAX_HOSTNAME_LEN=$(getconf HOST_NAME_MAX 2>/dev/null || echo 64) +for NODE in $(seq 1 ${PBS_EXECUTION_NODES}); do + NODE_NAME="${PBS_EXECUTION_HOST_NAME_PREFIX}-${NODE}" + NODE_IP=$(getent hosts "${NODE_NAME}" | awk '{print $1}' 2>/dev/null || true) + if [ -n "$NODE_IP" ]; then + FQDN=$(python3 -c "import socket; print(socket.gethostbyaddr('${NODE_IP}')[0])" 2>/dev/null || echo "") + if [ -n "$FQDN" ] && [ ${#FQDN} -ge ${MAX_HOSTNAME_LEN} ]; then + >&2 echo "ERROR: FQDN for node ${NODE_NAME} is ${#FQDN} chars (>= ${MAX_HOSTNAME_LEN})." + >&2 echo "PBS cannot resolve hostnames this long. Use a shorter project name" + >&2 echo "or set explicit short hostnames in docker-compose.yml." 
+ exit 1 + fi + fi +done + +touch /tmp/pbs.conf.ready + if [ ! -f "${PBS_HOME}/pbs_version" ]; then echo "PBS Home directory ${PBS_HOME} needs updating." echo "Running ${PBS_EXEC}/libexec/pbs_habitat to update it." diff --git a/openpbs/server/scripts/run-sched b/openpbs/server/scripts/run-sched new file mode 100755 index 0000000..dbd38c6 --- /dev/null +++ b/openpbs/server/scripts/run-sched @@ -0,0 +1,8 @@ +#!/bin/bash -e + +while [[ ! -f "/tmp/pbs.conf.ready" ]]; do + echo "Waiting for /tmp/pbs.conf.ready..." + sleep 2 +done + +/opt/pbs/sbin/pbs_sched -N \ No newline at end of file From 111c73b872bcad48a237fdb485cb61d01641393e Mon Sep 17 00:00:00 2001 From: LanderOtto Date: Wed, 20 May 2026 12:45:38 +0200 Subject: [PATCH 3/3] Remove run-sched, pbs.conf.ready, supervisord priority - run-sched file-based sync was overengineered; pbs_sched can run directly since /etc/pbs.conf is substituted before it launches - /tmp/pbs.conf.ready was unused after removing the sync - supervisord priority was redundant (alphabetical order + munge wait loop already handle ordering) --- openpbs/execution/config/supervisord.conf | 2 -- openpbs/server/Dockerfile | 1 - openpbs/server/config/supervisord.conf | 2 +- openpbs/server/scripts/run-openpbs | 1 - openpbs/server/scripts/run-sched | 8 -------- 5 files changed, 1 insertion(+), 13 deletions(-) delete mode 100755 openpbs/server/scripts/run-sched diff --git a/openpbs/execution/config/supervisord.conf b/openpbs/execution/config/supervisord.conf index acc0944..a3223e1 100644 --- a/openpbs/execution/config/supervisord.conf +++ b/openpbs/execution/config/supervisord.conf @@ -17,7 +17,6 @@ serverurl=unix:///var/run/supervisor.sock [program:munge] command=run-munge autostart=true -priority=10 stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 redirect_stderr=true @@ -25,7 +24,6 @@ redirect_stderr=true [program:pbs_mom] command=run-openpbs autostart=true -priority=20 stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 redirect_stderr=true 
diff --git a/openpbs/server/Dockerfile b/openpbs/server/Dockerfile index 3fa1b20..0dee28d 100644 --- a/openpbs/server/Dockerfile +++ b/openpbs/server/Dockerfile @@ -56,7 +56,6 @@ COPY scripts/healthcheck scripts/run-munge \ scripts/run-openpbs \ scripts/run-sshd \ - scripts/run-sched \ /bin/ HEALTHCHECK --start-period=10s CMD healthcheck diff --git a/openpbs/server/config/supervisord.conf b/openpbs/server/config/supervisord.conf index 3d24d83..73d280f 100644 --- a/openpbs/server/config/supervisord.conf +++ b/openpbs/server/config/supervisord.conf @@ -29,7 +29,7 @@ stdout_logfile_maxbytes=0 redirect_stderr=true [program:pbs_sched] -command=run-sched +command=/opt/pbs/sbin/pbs_sched -N autostart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 diff --git a/openpbs/server/scripts/run-openpbs b/openpbs/server/scripts/run-openpbs index 2b58cbe..8cb7af0 100755 --- a/openpbs/server/scripts/run-openpbs +++ b/openpbs/server/scripts/run-openpbs @@ -39,7 +39,6 @@ for NODE in $(seq 1 ${PBS_EXECUTION_NODES}); do fi done -touch /tmp/pbs.conf.ready if [ ! -f "${PBS_HOME}/pbs_version" ]; then echo "PBS Home directory ${PBS_HOME} needs updating." diff --git a/openpbs/server/scripts/run-sched b/openpbs/server/scripts/run-sched deleted file mode 100755 index dbd38c6..0000000 --- a/openpbs/server/scripts/run-sched +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -e - -while [[ ! -f "/tmp/pbs.conf.ready" ]]; do - echo "Waiting for /tmp/pbs.conf.ready..." - sleep 2 -done - -/opt/pbs/sbin/pbs_sched -N \ No newline at end of file