From bde3ab3433b16a015cc6e12169882b2ef98525ef Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Tue, 10 Mar 2026 15:51:27 -0700 Subject: [PATCH 01/66] Fix SPINDLE_DEBUG + SPINDLE_TEST in SPANK plugin --- src/slurm_plugin/slurm_plugin.c | 34 ++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 5401b337..365e2b27 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -53,6 +53,7 @@ static int set_spindle_args(spank_t spank, spindle_args_t *params, int argc, cha static int get_spindle_args(spank_t spank, spindle_args_t *params); #endif +static int forward_environment_to_slurmstepd(spank_t spank); static int launchFE(char **hostlist, spindle_args_t *params); static int launchBE(spank_t spank, spindle_args_t *params); static int prepApp(spank_t spank, spindle_args_t *params); @@ -80,6 +81,32 @@ struct spank_option spank_options[] = SPANK_OPTIONS_TABLE_END }; +static int forward_environment_to_slurmstepd(spank_t spank) +{ + char *debugEnv, *testEnv, *tmpEnv; + + debugEnv= readSpankEnv(spank, "SPINDLE_DEBUG"); + testEnv = readSpankEnv(spank, "SPINDLE_TEST"); + tmpEnv = readSpankEnv(spank, "TMPDIR"); + + if (debugEnv) { + setenv("SPINDLE_DEBUG", debugEnv, 1); + free(debugEnv); + } + + if (testEnv) { + setenv("SPINDLE_TEST", testEnv, 1); + free(testEnv); + } + + if (tmpEnv) { + setenv("TMPDIR", tmpEnv, 1); + free(tmpEnv); + } + + return 0; +} + int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) { spank_context_t context; @@ -107,8 +134,13 @@ int slurm_spank_task_init(spank_t spank, int site_argc, char *site_argv[]) return 0; } - push_env(spank, &env); + // We need to acquire the job environment before we do anything that + // will spawn the log daemon so that SPINDLE_DEBUG and SPINDLE_TEST + // are set appropriately. 
+ forward_environment_to_slurmstepd(spank); + sdprintf(1, "Beginning spindle plugin\n"); + push_env(spank, &env); result = process_spindle_args(spank, site_argc, site_argv, ¶ms, NULL, NULL); if (result == -1) { From dcb530e75e7a7fd62f21b4c82655d30f339d9d67 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:50:08 -0700 Subject: [PATCH 02/66] Cachepath: parse_loc utilities. --- src/client/beboot/parseloc.h | 7 ++ src/utils/parseloc.c | 145 +++++++++++++++++++++++++++++------ 2 files changed, 127 insertions(+), 25 deletions(-) diff --git a/src/client/beboot/parseloc.h b/src/client/beboot/parseloc.h index c5362e2e..1731906a 100644 --- a/src/client/beboot/parseloc.h +++ b/src/client/beboot/parseloc.h @@ -24,6 +24,13 @@ extern "C" { #include "spindle_launch.h" char *parse_location(char *loc, number_t number); +char *parse_location_noerr(char *loc, number_t number); +char *realize(char *path); +char **parse_colonsep_prefixes(char *colonsep_list, number_t number); +int is_local_prefix(const char *path, char **local_prefixes); +static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); +void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ); +void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); #if defined(__cplusplus) } diff --git a/src/utils/parseloc.c b/src/utils/parseloc.c index 00cdcf24..3ec20047 100644 --- a/src/utils/parseloc.c +++ b/src/utils/parseloc.c @@ -22,6 +22,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include #include +#include #if !defined(USE_PLUGIN_DEBUG) #include "spindle_debug.h" @@ -34,13 +35,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ccwarns.h" #include "spindle_launch.h" -#if defined(__cplusplus) -extern "C" { -#endif - char *parse_location(char *loc, number_t number); -#if defined(__cplusplus) -} -#endif +extern int 
spindle_mkdir(char *orig_path); #if defined(CUSTOM_GETENV) extern char *custom_getenv(); @@ -168,38 +163,64 @@ char *parse_location_noerr(char *loc, number_t number) **/ char *realize(char *path) { + int local_errno; char *result; - char *origpath, *cur_slash = NULL, *trailing; - struct stat buf; + char *origpath, *cur_slash = NULL, *prev_slash = NULL; + struct stat *buf = calloc( 1, sizeof( struct stat ) ); char newpath[MAX_PATH_LEN+1]; int lastpos; newpath[MAX_PATH_LEN] = '\0'; origpath = strdup(path); - for (;;) { - if (stat(origpath, &buf) != -1) - break; - if (cur_slash) - *cur_slash = '/'; + errno=0; + while( stat( origpath, buf ) == -1 ){ + local_errno = errno; + debug_printf("Failed to stat '%s' (%s).\n", origpath, strerror(local_errno)); + prev_slash = cur_slash; cur_slash = strrchr(origpath, '/'); - if (!cur_slash) - break; - *cur_slash = '\0'; + if( prev_slash ) + *prev_slash = '/'; + if( cur_slash ) + *cur_slash = '\0'; + else{ + debug_printf("Nothing in the original path can be stat'ed. (%s)\n", path); + return NULL; + } } - if (cur_slash) - trailing = cur_slash + 1; - else - trailing = ""; + errno = 0; result = realpath(origpath, newpath); if (!result) { + local_errno = errno; + err_printf( + "Error: realpath(3) failed to create canonical version of '%s' (%s). 
Returning '%s'.\n", + origpath, strerror(local_errno), path ); + errno = 0; + int rc = stat( origpath, buf ); + local_errno = errno; + err_printf( + " Statting that path results in rc=%d, errno=%d, error='%s'.\n", + rc, local_errno, strerror(local_errno)); free(origpath); - return path; + return NULL; } + free(buf); - strncat(newpath, "/", MAX_PATH_LEN); - strncat(newpath, trailing, MAX_PATH_LEN); - newpath[MAX_PATH_LEN] = '\0'; + if( cur_slash ){ + if( strlen( newpath ) + strlen( cur_slash+1 ) > MAX_PATH_LEN ){ + err_printf( + "Error: The realized path exceeds MAX_PATH_LEN (%d).\n" + " Original path: '%s'\n" + " Statable part: '%s'\n" + " Canonical version: '%s'\n" + " Returning original path.\n", + MAX_PATH_LEN, path, origpath, newpath); + free(origpath); + return path; + } + strncat(newpath, "/", 2); + strncat(newpath, cur_slash+1, MAX_PATH_LEN - strlen( newpath )); + } free(origpath); lastpos = strlen(newpath)-1; @@ -280,3 +301,77 @@ int is_local_prefix(const char *path, char **local_prefixes) { return 0; } +/* validateCandidatePath determines if candidatePath passes parse_location(), realize(), and spindle_mkdir(), which is to say, can + * spindle create a directory from this path? + * + * If not NULL, then realizedPath, parsedPath, and/or symbolicPath will hold the respective intermediate/final results. + * + * Return 1 if the candidatePath is valid, otherwise 0. 
+ */ +static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ){ + int rc; + char *parsedCandidatePath, *realizedCandidatePath; + parsedCandidatePath = parse_location( candidatePath, number ); + if( parsedCandidatePath ){ + realizedCandidatePath = realize( parsedCandidatePath ); + if( realizedCandidatePath ){ + rc = spindle_mkdir( parsedCandidatePath ); + if( 0 == rc ){ + if( symbolicPath) *symbolicPath = candidatePath; + if( parsedPath ) *parsedPath = parsedCandidatePath; + if( realizedPath) *realizedPath = realizedCandidatePath; + return 1; + }else{ + debug_printf2("Unable to create directory %s, moving on to the next candidate.\n", realizedCandidatePath ); + } + }else{ + debug_printf2( "Unable to realize candidate %s, moving on to the next candidate.\n", parsedCandidatePath ); + } + }else{ + debug_printf2("Unable to parse candidate %s, moving on to the next candidate.\n", candidatePath ); + } + return 0; +} + +/** + * determineValidCachePaths() works exclusively with the cachepaths parameter. Because not all paths may be valid on all + * compute nodes, and because we want to have all nodes reach a consensus on which cache path to use, we + * determine the validity of all paths in the origPathList, save the intermediate results, and return a bit + * index to the user. Via allReduce() all nodes reach a consensus on the set of valid paths, and retrieves + * that informatino via getValidPathByIndex(). 
+ */ +static char *realizedCachePaths[64], *parsedCachePaths[64], *symbolicCachePaths[64]; + +void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ){ + + char *saveptr, *candidatePath, *pathList = strdup( origPathList ); + uint64_t bitoffset = 0; + + *validBitIdx = 0; + debug_printf2("origPathList='%s', number='%lu'.\n", origPathList, number ); + + candidatePath = strtok_r( pathList, ":", &saveptr ); + while( NULL != candidatePath && bitoffset < 64 ){ + *validBitIdx |= validateCandidatePath( + candidatePath, + &realizedCachePaths[bitoffset], + &parsedCachePaths[bitoffset], + &symbolicCachePaths[bitoffset], number ) << bitoffset; + bitoffset++; + candidatePath = strtok_r( NULL, ":", &saveptr ); + } + free( pathList ); +} + +void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ){ + uint64_t bitoffset = 0; + if (!validBitIdx){ + return; + } + while( (bitoffset < 64) && (((1 << bitoffset) & validBitIdx) == 0) ){ + bitoffset++; + } + if( realizedCachePath ) *realizedCachePath = realizedCachePaths[bitoffset]; + if( parsedCachePath ) *parsedCachePath = parsedCachePaths[bitoffset]; + if( symbolicCachePath ) *symbolicCachePath = symbolicCachePaths[bitoffset]; +} From d96b8a4aa87b7e249a57bc07e7d323612eb784ef Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:50:45 -0700 Subject: [PATCH 03/66] Cachepath: remove/rename [orig_]location. 
--- src/client/beboot/spindle_bootstrap.c | 2 +- src/client/client/client.c | 15 +++++++-------- src/client/client/intercept.h | 1 + src/client/client/intercept_exec.c | 5 +---- src/client/client/intercept_readlink.c | 16 +++++++++++----- src/client/client/should_intercept.c | 25 +++++++++++++++---------- src/client/client/should_intercept.h | 1 + 7 files changed, 37 insertions(+), 28 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 7805a9db..98424d50 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -114,7 +114,6 @@ static void setup_environment() setenv("LD_AUDIT", client_lib, 1); setenv("LDCS_LOCATION", location, 1); - setenv("LDCS_ORIG_LOCATION", orig_location, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) @@ -161,6 +160,7 @@ static int parse_cmdline(int argc, char *argv[]) } symbolic_location = argv[i++]; + i++; // Skip over candidate_cachepaths. number_s = argv[i++]; number = (number_t) strtoul(number_s, NULL, 0); opts_s = argv[i++]; diff --git a/src/client/client/client.c b/src/client/client/client.c index a50bb228..d7827d4b 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -41,6 +41,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "exec_util.h" #include "intercept.h" #include "fixlocale.h" +#include "should_intercept.h" errno_location_t app_errno_location; @@ -70,11 +71,8 @@ static const ElfW(Phdr) *libc_phdrs, *interp_phdrs; static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; -/* location has the realize'd path to the local file cache. 
orig_location is not realized and - * may contain symlinks - */ -char *location; -char *orig_location; +static char *location; +static char *chosen_realized_cachepath, *chosen_parsed_cachepath, *chosen_symbolic_cachepath; number_t number; static int have_stat_patches; @@ -201,7 +199,6 @@ static int init_server_connection() return 0; location = getenv("LDCS_LOCATION"); - orig_location = getenv("LDCS_ORIG_LOCATION"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -221,7 +218,6 @@ static int init_server_connection() debug_printf("Disabling environment variables because we're not following forks\n"); unsetenv("LD_AUDIT"); unsetenv("LDCS_LOCATION"); - unsetenv("LDCS_ORIG_LOCATION"); unsetenv("LDCS_NUMBER"); unsetenv("LDCS_CONNECTION"); unsetenv("LDCS_RANKINFO"); @@ -265,6 +261,9 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, &chosen_symbolic_cachepath ); + set_should_intercept_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); + set_intercept_readlink_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); @@ -472,7 +471,7 @@ char *client_library_load(const char *name) char *orig_file_name = (char *) name; if (is_in_spindle_cache(name)) { - debug_printf2("Library %s is in spindle cache (%s). Translating request\n", name, location); + debug_printf2("Library %s is in spindle cache (%s). 
Translating request\n", name, chosen_realized_cachepath); memset(fixed_name, 0, MAX_PATH_LEN+1); send_orig_path_request(ldcsid, orig_file_name, fixed_name); orig_file_name = fixed_name; diff --git a/src/client/client/intercept.h b/src/client/client/intercept.h index 4ace2328..aae968f7 100644 --- a/src/client/client/intercept.h +++ b/src/client/client/intercept.h @@ -89,6 +89,7 @@ int execvpe_wrapper(const char *path, char *const argv[], const char *envp[]); pid_t vfork_wrapper(); char *dlerror_wrapper(); +void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); ssize_t readlink_wrapper(const char *path, char *buf, size_t bufsiz); ssize_t readlinkat_wrapper(int dirfd, const char *pathname, char *buf, size_t bufsiz); diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index 14b555ed..8480b75e 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -142,7 +142,6 @@ static char **removeEnvironmentStrs(char **envp) if (strIsPrefix("LD", envp[i])) { if (strIsPrefix("LD_AUDIT=", envp[i]) || strIsPrefix("LDCS_LOCATION=", envp[i]) || - strIsPrefix("LDCS_ORIG_LOCATION=", envp[i]) || strIsPrefix("LDCS_CONNECTION=", envp[i]) || strIsPrefix("LDCS_RANKINFO=", envp[i]) || strIsPrefix("LDCS_OPTIONS=", envp[i]) || @@ -177,7 +176,6 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp unsetf("SPINDLE"); unsetf("LD_AUDIT"); unsetf("LDCS_LOCATION"); - unsetf("LDCS_ORIG_LOCATION"); unsetf("LDCS_CONNECTION"); unsetf("LDCS_RANKINFO"); unsetf("LDCS_OPTIONS"); @@ -198,13 +196,12 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp if (envp) { debug_printf2("Propogating spindle environment by copying it to new envp list\n"); for (cur = (char **) envp; *cur; cur++, orig_size++); - new_size = orig_size + 10; + new_size = orig_size + 20; newenv = (char **) malloc(new_size * 
sizeof(char*)); propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); propogateEnvironmentStr(envp, newenv, &pos, "LD_AUDIT"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_LOCATION"); - propogateEnvironmentStr(envp, newenv, &pos, "LDCS_ORIG_LOCATION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_CONNECTION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_RANKINFO"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_OPTIONS"); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index 0a326330..2799baee 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -31,19 +31,25 @@ Place, Suite 330, Boston, MA 02111-1307 USA ssize_t (*orig_readlink)(const char *path, char *buf, size_t bufsiz); ssize_t (*orig_readlinkat)(int dirfd, const char *pathname, char *buf, size_t bufsiz); -extern char *location; +static char *cachepath; + +void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ + cachepath = chosen_realized_cachepath; + chosen_parsed_cachepath = chosen_parsed_cachepath; + chosen_symbolic_cachepath = chosen_symbolic_cachepath; +} static int fix_local_readlink(char *buf, size_t bufsiz) { char spindle_id[32]; - int location_len, result; + int cachepath_len, result; char tmp[MAX_PATH_LEN+1]; - location_len = strlen(location); + cachepath_len = strlen(cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); - if (strstr(buf, spindle_id) && strncmp(location, buf, location_len) == 0) { + if (strstr(buf, spindle_id) && strncmp(cachepath, buf, cachepath_len) == 0) { debug_printf2("readlink received spindle cache path %s. 
Translating\n", buf); - result = send_orig_path_request(ldcsid, buf+location_len+1, tmp); + result = send_orig_path_request(ldcsid, buf+cachepath_len+1, tmp); if (result == -1) return -1; debug_printf2("readlink translated spindle local path %s to %s\n", buf, tmp); diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index bfabbbf8..3a348d3d 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -29,22 +29,27 @@ #include "spindle_debug.h" extern int relocate_spindleapi(); +static char *cachepath, *orig_cachepath; + +void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ + cachepath = chosen_realized_cachepath; + orig_cachepath = chosen_parsed_cachepath; + chosen_symbolic_cachepath = chosen_symbolic_cachepath; +} -extern char *location; -extern char *orig_location; int is_in_spindle_cache(const char *pathname) { - static int location_size = 0; - static int orig_location_size = 0; - if (!location_size) { - location_size = strlen(location); + static int cachepath_size = 0; + static int orig_cachepath_size = 0; + if (!cachepath_size) { + cachepath_size = strlen(cachepath); } - if (!orig_location_size) { - orig_location_size = strlen(orig_location); + if (!orig_cachepath_size) { + orig_cachepath_size = strlen(orig_cachepath); } - return ((strncmp(pathname, location, location_size) == 0) || - (strncmp(pathname, orig_location, orig_location_size) == 0)); + return ((strncmp(pathname, cachepath, cachepath_size) == 0) || + (strncmp(pathname, orig_cachepath, orig_cachepath_size) == 0)); } extern int is_local_prefix(const char *path, char **cached_local_prefixes); diff --git a/src/client/client/should_intercept.h b/src/client/client/should_intercept.h index f6a9b510..6a545913 100644 --- a/src/client/client/should_intercept.h +++ b/src/client/client/should_intercept.h @@ -27,6 +27,7 @@ #define EXCL_OPEN 2 #define ERR_CALL 
3 +void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); int open_filter(const char *fname, int flags); int fopen_filter(const char *fname, const char *flags); int exec_filter(const char *fname); From 2ff3bfb9cf7e5b829cefc03caeca7a3084d9409e Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:51:51 -0700 Subject: [PATCH 04/66] Cachepath: Configure-time support. --- config.h.in | 3 +++ configure | 16 ++++++++++++++++ configure.common.ac | 5 +++++ src/client/config.h.in | 3 +++ src/client/configure | 16 ++++++++++++++++ src/fe/config.h.in | 3 +++ src/fe/configure | 16 ++++++++++++++++ src/server/config.h.in | 3 +++ src/server/configure | 16 ++++++++++++++++ 9 files changed, 81 insertions(+) diff --git a/config.h.in b/config.h.in index 8c7a94db..aae4c0da 100644 --- a/config.h.in +++ b/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/configure b/configure index e9a4f95a..00d3f031 100755 --- a/configure +++ b/configure @@ -847,6 +847,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1590,6 +1591,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16662,6 +16665,14 @@ else fi +# Check whether --with-cachepaths was given. 
+if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16698,6 +16709,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/configure.common.ac b/configure.common.ac index e8d311f4..9f360a17 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -17,6 +17,10 @@ AC_ARG_WITH(default-num-ports, [AS_HELP_STRING([--with-default-numports=NUM],[Number of TCP/IP ports to scan for Spindle server communication])], [NUM_COBO_PORTS=${withval}], [NUM_COBO_PORTS=$DEFAULT_NUM_COBO_PORTS]) +AC_ARG_WITH(cachepaths, + [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], + [CACHEPATHS=${withval}], + [CACHEPATHS=$DEFAULT_LOC]) AC_ARG_WITH(localstorage, [AS_HELP_STRING([--with-localstorage=DIR],[Directory on back-ends for storing relocated files])], [SPINDLE_LOC=${withval}], @@ -29,6 +33,7 @@ AC_DEFINE_UNQUOTED([SPINDLE_PORT],[$SPINDLE_PORT],[The default port for Spindle] AC_DEFINE_UNQUOTED([NUM_COBO_PORTS],[$NUM_COBO_PORTS],[Number of ports for COBO to search for an open port]) AC_DEFINE_UNQUOTED([SPINDLE_MAX_PORT],[$(($SPINDLE_PORT + $NUM_COBO_PORTS - 1))],[The maximum port value]) AC_DEFINE_UNQUOTED([SPINDLE_LOC],"[$SPINDLE_LOC]",[The default local directory for Spindle]) +AC_DEFINE_UNQUOTED([CACHEPATHS],"[$CACHEPATHS]",[Colon-separated list of potential back-end cache directories]) AC_DEFINE_UNQUOTED([SPINDLE_LOCAL_PREFIX],"[$SPINDLE_LOCAL_PREFIX]",[The default colon-separated list of directories that Spindle will not cache files out of]) TESTRM=unknown diff --git a/src/client/config.h.in b/src/client/config.h.in index 6c5eae90..a42cc0ab 
100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/client/configure b/src/client/configure index 236528aa..a07d16b7 100755 --- a/src/client/configure +++ b/src/client/configure @@ -810,6 +810,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1532,6 +1533,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -12587,6 +12590,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. 
if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -12623,6 +12634,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/src/fe/config.h.in b/src/fe/config.h.in index 357feea0..5057ef33 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/fe/configure b/src/fe/configure index 75088c9a..3d73ec17 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -831,6 +831,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1570,6 +1571,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16437,6 +16440,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. 
if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16473,6 +16484,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/src/server/config.h.in b/src/server/config.h.in index 6eea236a..dc9439b3 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/server/configure b/src/server/configure index a747c739..92b48ef7 100755 --- a/src/server/configure +++ b/src/server/configure @@ -837,6 +837,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1567,6 +1568,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16434,6 +16437,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. 
if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16470,6 +16481,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF From bc09adc77ed586b5f61ba0ad23512eb7dcbcc378 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:52:29 -0700 Subject: [PATCH 05/66] Cachepath: Internal messaging for path resolution --- src/client/client_comlib/client_api.c | 53 ++++++++++++ src/client/client_comlib/client_api.h | 1 + src/fe/startup/spindle_fe.cc | 18 +++++ src/include/ldcs_api.h | 3 + .../auditserver/ldcs_audit_server_handlers.c | 80 +++++++++++++++++++ .../auditserver/ldcs_audit_server_md_cobo.c | 6 ++ src/server/comlib/ldcs_api_util.c | 3 + 7 files changed, 164 insertions(+) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 96390dca..267b8a93 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -37,6 +37,59 @@ static struct lock_t comm_lock; #define COMM_LOCK do { if (lock(&comm_lock) == -1) return -1; } while (0) #define COMM_UNLOCK unlock(&comm_lock) + +int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath, char **chosen_symbolic_cachepath ){ + ldcs_message_t message; + char buffer[MAX_PATH_LEN+1]; + buffer[MAX_PATH_LEN] = '\0'; + + message.header.type = LDCS_MSG_CHOSEN_CACHEPATH_REQUEST; + message.header.len = MAX_PATH_LEN; + message.data = buffer; + + COMM_LOCK; + + debug_printf3("sending message of type: request_location_path.\n" ); + client_send_msg(fd, &message); + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( 
chosen_realized_cachepath ){ + *chosen_realized_cachepath = strdup( buffer ); + } + + COMM_LOCK; + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_parsed_cachepath ){ + *chosen_parsed_cachepath = strdup( buffer ); + } + + COMM_LOCK; + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_symbolic_cachepath ){ + *chosen_symbolic_cachepath = strdup( buffer ); + } + + return 0; +} + int send_file_query(int fd, char* path, int dso, char** newpath, int *errcode) { ldcs_message_t message; char buffer[MAX_PATH_LEN+1+sizeof(int)]; diff --git a/src/client/client_comlib/client_api.h b/src/client/client_comlib/client_api.h index 74f82346..982c4b1c 100644 --- a/src/client/client_comlib/client_api.h +++ b/src/client/client_comlib/client_api.h @@ -42,6 +42,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath); int send_dirlists_request(int fd, char **local_result, char **exece_result, char **to_free); int send_procmaps_query(int fd, int pid, char *result); int send_pickone_query(int fd, char *key, int *result); +int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath, char **chosen_realized_cachepath ); int get_python_prefix(int fd, char **prefix); diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index cb53023b..2c2879f5 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -41,6 +41,7 @@ static const char *logging_file = NULL; #endif static const char spindle_bootstrap[] = LIBEXECDIR "/spindle_bootstrap"; static bool 
sendAndWaitForAlive(); +static void determineCachepathConsensus(); #define STARTUP_TIMEOUT 60 @@ -71,6 +72,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) buffer_size += sizeof(opt_t); buffer_size += sizeof(unique_id_t); buffer_size += args->location ? strlen(args->location) + 1 : 1; + buffer_size += args->candidate_cachepaths ? strlen(args->candidate_cachepaths) + 1 : 1; buffer_size += args->pythonprefix ? strlen(args->pythonprefix) + 1 : 1; buffer_size += args->preloadfile ? strlen(args->preloadfile) + 1 : 1; buffer_size += args->numa_files ? strlen(args->numa_files) + 1 : 1; @@ -91,6 +93,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) pack_param(args->startup_type, buf, pos); pack_param(args->shm_cache_size, buf, pos); pack_param(args->location, buf, pos); + pack_param(args->candidate_cachepaths, buf, pos); pack_param(args->pythonprefix, buf, pos); pack_param(args->preloadfile, buf, pos); pack_param(args->bundle_timeout_ms, buf, pos); @@ -230,6 +233,7 @@ int getApplicationArgsFE(spindle_args_t *params, int *spindle_argc, char ***spin (*spindle_argv)[n++] = strdup(uniqueid_s); } (*spindle_argv)[n++] = strdup(params->location); + (*spindle_argv)[n++] = strdup(params->candidate_cachepaths); (*spindle_argv)[n++] = strdup(number_s); (*spindle_argv)[n++] = strdup(opt_s); (*spindle_argv)[n++] = strdup(cachesize_s); @@ -395,9 +399,11 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Start FE server */ debug_printf("spindle_args_t { number = %lu; port = %u; num_ports = %u; opts = %lu; unique_id = %lu; " "use_launcher = %u; startup_type = %u; shm_cache_size = %u; location = %s; " + "cachepaths = %s; " "pythonprefix = %s; preloadfile = %s; bundle_timeout_ms = %u; bundle_cachesize_kb = %u }\n", (unsigned long) params->number, params->port, params->num_ports, params->opts, params->unique_id, params->use_launcher, params->startup_type, params->shm_cache_size, params->location, + 
params->candidate_cachepaths, params->pythonprefix, params->preloadfile, params->bundle_timeout_ms, params->bundle_cachesize_kb); printSpindleFlags(params->opts); @@ -427,6 +433,7 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Wait for servers to indicate startup */ sendAndWaitForAlive(); + determineCachepathConsensus(); return 0; } @@ -483,6 +490,17 @@ void markRSHPidReapedFE() clear_fe_rsh_pid(); } +static void determineCachepathConsensus( void ){ + ldcs_message_t consensus_req_msg; + consensus_req_msg.header.type = LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS; + consensus_req_msg.header.len = 0; + consensus_req_msg.data = NULL; + int result = ldcs_audit_server_fe_broadcast(&consensus_req_msg, NULL); + if (result == -1) { + debug_printf("Failure sending cachepath consensus message\n"); + } +} + static bool sendAndWaitForAlive() { int result; diff --git a/src/include/ldcs_api.h b/src/include/ldcs_api.h index e8ffa43d..e6ccbafb 100644 --- a/src/include/ldcs_api.h +++ b/src/include/ldcs_api.h @@ -85,6 +85,9 @@ typedef enum { LDCS_MSG_PICKONE_RESP, LDCS_MSG_ALIVE_REQ, LDCS_MSG_ALIVE_RESP, + LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS, + LDCS_MSG_CHOSEN_CACHEPATH_REQUEST, + LDCS_MSG_CHOSEN_CACHEPATH, LDCS_MSG_UNKNOWN } ldcs_message_ids_t; diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 8a18de24..98c6807a 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -178,7 +178,10 @@ static int handle_setup_alias(ldcs_process_data_t *procdata, char *pathname, cha static int handle_client_dirlists_req(ldcs_process_data_t *procdata, int nc); static int handle_close_client_query(ldcs_process_data_t *procdata, int nc); static int handle_alive_msg(ldcs_process_data_t *procdata, ldcs_message_t *msg); +static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg); +static int 
handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc); +extern void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); /** * Query from client to server. Returns info about client's rank in server data structures. **/ @@ -1890,6 +1893,8 @@ int handle_client_message(ldcs_process_data_t *procdata, int nc, ldcs_message_t return handle_client_pickone_msg(procdata, nc, msg); case LDCS_MSG_END: return handle_client_end(procdata, nc); + case LDCS_MSG_CHOSEN_CACHEPATH_REQUEST: + return handle_chosen_cachepath_request(procdata, nc); default: err_printf("Received unexpected message from client %d: %d\n", nc, (int) msg->header.type); assert(0); @@ -1987,6 +1992,8 @@ int handle_server_message(ldcs_process_data_t *procdata, node_peer_t peer, ldcs_ case LDCS_MSG_ALIVE_REQ: case LDCS_MSG_ALIVE_RESP: return handle_alive_msg(procdata, msg); + case LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS: + return handle_cachepath_consensus(procdata, msg); default: err_printf("Received unexpected message from node: %d\n", (int) msg->header.type); assert(0); @@ -2945,6 +2952,79 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs } } +/** + * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which of the locations, commpaths, and cachepaths are + * available across all of the servers. + */ + +static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ + + int num_children = ldcs_audit_server_md_get_num_children(procdata); + + if (num_children) { + spindle_broadcast(procdata, msg); + msgbundle_force_flush(procdata); + } + + ldcs_audit_server_md_consensus(procdata, msg); + + if( procdata->cachepath_bitidx == 0 ){ + err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); + procdata->cachepath = procdata->location; + }else{ + // ldcs_audit_server_filemngt_init() does its own realize() pass. 
+ getValidCachePathByIndex( procdata->cachepath_bitidx, + &procdata->cachepath, + &procdata->parsed_cachepath, + &procdata->symbolic_cachepath); + } + + debug_printf3("Initializing file cache location %s\n", procdata->location); + ldcs_audit_server_filemngt_init(procdata->cachepath); + + test_printf(" cachepath=%s\n", procdata->cachepath); + return 0; +} + +/** + * Handle LDCS_MSG_CHOSEN_CACHEPATH_REQUEST + */ +static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc){ + ldcs_message_t msg; + int connid; + ldcs_client_t *client; + + assert(nc != -1); + client = procdata->client_table + nc; + connid = client->connid; + if (client->state != LDCS_CLIENT_STATUS_ACTIVE || connid < 0) + return 0; + + + msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; + + msg.header.len = strlen(procdata->cachepath) + 1; + msg.data = procdata->cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + msg.header.len = strlen(procdata->parsed_cachepath) + 1; + msg.data = procdata->parsed_cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + msg.header.len = strlen(procdata->symbolic_cachepath) + 1; + msg.data = procdata->symbolic_cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + return 0; +} + + /** * Handle alive message, which is a broadcast/response ping through all servers */ diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 08c9b952..d8b5442f 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -401,3 +401,9 @@ int 
ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata) cobo_get_num_childs(&num_childs); return num_childs; } + +void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg){ + if( msg->header.type == LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS ){ + cobo_allreduce( &ldcs_process_data->cachepath_bitidx, COBO_OP_BITWISE_AND ); + } +} diff --git a/src/server/comlib/ldcs_api_util.c b/src/server/comlib/ldcs_api_util.c index 2bc2455d..b6beb56d 100644 --- a/src/server/comlib/ldcs_api_util.c +++ b/src/server/comlib/ldcs_api_util.c @@ -91,6 +91,9 @@ char* _message_type_to_str (ldcs_message_ids_t type) { STR_CASE(LDCS_MSG_PICKONE_RESP); STR_CASE(LDCS_MSG_ALIVE_REQ); STR_CASE(LDCS_MSG_ALIVE_RESP); + STR_CASE(LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS); + STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH_REQUEST); + STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH); STR_CASE(LDCS_MSG_UNKNOWN); } return "unknown"; From f876d40bb521a865ffaab54c1f2c4f23820002c3 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:53:25 -0700 Subject: [PATCH 06/66] Cachepath: Adds cobo_allreduce() --- src/cobo/cobo.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/cobo/ldcs_cobo.h | 16 ++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 785a5d03..ad2ea878 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1434,6 +1434,48 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } +int cobo_allreduce( int64_t *pval, cobo_op_t op ){ + + /* if i have any children, receive their data */ + int64_t child_val; + for(int i=cobo_num_child-1; i>=0; i--) { + /* read int64_t from child */ + if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { + err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); + exit(1); + } + + /* compare child's val to our current val */ + switch( op ){ + case COBO_OP_MIN: if( child_val < *pval ) *pval = child_val; break; + case 
COBO_OP_MAX: if( child_val > *pval ) *pval = child_val; break; + case COBO_OP_BITWISE_AND: *pval &= child_val; break; + case COBO_OP_BITWISE_OR: *pval |= child_val; break; + case COBO_OP_LOGICAL_AND: *pval = *pval && child_val; break; + case COBO_OP_LOGICAL_OR: *pval = *pval || child_val; break; + case COBO_OP_SUM: *pval += child_val; break; + case COBO_OP_NOOP: break; + default: + err_printf("Illegal op (%d). Ignoring.\n", op); + break; + } + } + + /* forward data to parent if we're not rank 0, otherwise set the recvbuf */ + if (cobo_me != 0) { + /* not the root, so forward our reduction result to our parent */ + if (cobo_write_fd(cobo_parent_fd, pval, sizeof(*pval)) < 0) { + err_printf("Sending reduced data to parent failed\n"); + exit(1); + } + } + + /* broadcast result of reduction from rank 0 to all tasks */ + cobo_bcast_tree(pval, sizeof(int64_t)); + + return COBO_SUCCESS; +} + /* provide list of ports and number of ports as input, get number of tasks and my rank as output */ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* num_ranks) { diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index edacd4b1..30cc9673 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -55,6 +55,7 @@ extern "C" { #define cobo_allgather COMBINE(COBO_NAMESPACE, cobo_allgather) #define cobo_alltoall COMBINE(COBO_NAMESPACE, cobo_alltoall ) #define cobo_allgather_str COMBINE(COBO_NAMESPACE, cobo_allgather_str) +#define cobo_allreduce COMBINE(COBO_NAMESPACE, cobo_allreduce) #define cobo_server_open COMBINE(COBO_NAMESPACE, cobo_server_open) #define cobo_server_close COMBINE(COBO_NAMESPACE, cobo_server_close) #define cobo_server_get_root_socket COMBINE(COBO_NAMESPACE, cobo_server_get_root_socket) @@ -67,6 +68,19 @@ extern "C" { #define cobo_register_preconnect_cb COMBINE(COBO_NAMESPACE, cobo_register_preconnect_cb) #endif +// Used for cobo_allreduce(). 
+typedef enum{ + COBO_OP_MIN, + COBO_OP_MAX, + COBO_OP_BITWISE_AND, + COBO_OP_BITWISE_OR, + COBO_OP_LOGICAL_AND, + COBO_OP_LOGICAL_OR, + COBO_OP_SUM, + COBO_OP_NOOP, + NUM_COBO_OP +} cobo_op_t; + /* * ========================================================================== * ========================================================================== @@ -128,6 +142,8 @@ int cobo_alltoall (void* sendbuf, int sendcount, void* recvbuf); */ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf); +int cobo_allreduce(int64_t *pval, cobo_op_t op); + /* * ========================================================================== * ========================================================================== From fdd22ccb735fab6838d4f922af9129bcd061d9ce Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:54:01 -0700 Subject: [PATCH 07/66] Cachepath: Adds parameters to config_mgr --- src/fe/startup/config_mgr.cc | 23 +++++++++++++++++++++++ src/fe/startup/config_mgr.h | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index 36a30b20..eb710b36 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -56,6 +56,12 @@ using namespace std; #define SPINDLE_LOC_STR "$TMPDIR" #endif +#if defined(CACHEPATHS) +#define SPINDLE_CACHEPATHS_STR CACHEPATHS +#else +#define SPINDLE_CACHEPATHS_STR "$TMPDIR" +#endif + #if defined(SPINDLE_LOCAL_PREFIX) #define SPINDLE_LOCAL_PREFIX_STR SPINDLE_LOCAL_PREFIX #else @@ -269,6 +275,8 @@ void initOptionsList() "Strip debug and symbol information from binaries before distributing them." }, { confLocation, "location", shortLocation, groupMisc, cvString, {}, SPINDLE_LOC_STR, "Back-end directory for storing relocated files. Should be a non-shared location such as a ramdisk." 
}, + { confCachePaths, "cachepaths", shortCachePaths, groupMisc, cvString, {}, SPINDLE_CACHEPATHS_STR, + "Colon-separated list of candidate paths for cached libraries."}, + { confNoclean, "noclean", shortNoClean, groupMisc, cvBool, {}, "false", "Don't remove local file cache after execution." }, + { confDisableLogging, "disable-logging", shortDisableLogging, groupMisc, cvBool, {}, DISABLE_LOGGING_STR, @@ -740,6 +748,21 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const args.location = strdup(loc.c_str()); break; } + case confCachePaths:{ + // Parameter values are colon-separated lists of paths. + // Append "/spindle.$NUMBER" to each path in the list. + string paths = strresult; + size_t idx = paths.find(":"); + string number_var_with_colon("/spindle.$NUMBER:"); + string number_var_without_colon("/spindle.$NUMBER"); + while( idx != string::npos ){ + paths.replace(idx, 1, number_var_with_colon); + idx = paths.find(":", idx + number_var_with_colon.size()); + }; + paths += number_var_without_colon; + args.candidate_cachepaths = strdup(paths.c_str()); + break; + } case confCachePrefix: case confPythonPrefix: if (args.pythonprefix) diff --git a/src/fe/startup/config_mgr.h index 8e70daa6..27be1ae8 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -30,6 +30,7 @@ enum SpindleConfigID { confPort, confNumPorts, confLocation, + confCachePaths, confCachePrefix, confPythonPrefix, confLocalPrefix, @@ -125,7 +126,8 @@ enum CmdlineShortOptions { shortSpindleLevel = 296, shortLocalPrefix = 297, shortExecExcludes = 298, - shortPatchLdso + shortPatchLdso, + shortCachePaths, }; enum CmdlineGroups { From 6a60e27fcfd10d41ff6417ab605b54ac6ee424b8 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:55:39 -0700 Subject: [PATCH 08/66] Cachepath: adds flux parameter support --- src/flux/flux-spindle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/flux/flux-spindle.c 
b/src/flux/flux-spindle.c index 6c99341a..65de5ed9 100644 --- a/src/flux/flux-spindle.c +++ b/src/flux/flux-spindle.c @@ -382,7 +382,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) const char *relocaout = NULL, *reloclibs = NULL, *relocexec = NULL, *relocpython = NULL; const char *followfork = NULL, *preload = NULL, *level = NULL; const char *pyprefix = NULL, *location = NULL; - char *numafiles = NULL; + char *numafiles = NULL, *cachepaths = NULL; if (flux_shell_getopt_unpack (shell, "spindle", "o", &opts) < 0) return -1; @@ -404,7 +404,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) * supplied by the user, but not unpacked (This handles typos, etc). */ if (json_unpack_ex (opts, &error, JSON_STRICT, - "{s?i s?i s?i s?i s?s s?s s?s s?s s?s s?s s?s s?i s?s s?s s?s}", + "{s?i s?i s?i s?i s?s s?s s?s s?s s?s s?s s?s s?i s?s s?s s?s s?s}", "noclean", &noclean, "nostrip", &nostrip, "push", &push, @@ -419,7 +419,8 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) "numa", &numa, "numa-files", &numafiles, "preload", &preload, - "level", &level) < 0) + "level", &level, + "cachepaths", &cachepaths) < 0) logerrno_printf_and_return(1, "Error in spindle option: %s\n", error.text); if (noclean) @@ -462,6 +463,9 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) free (ctx->params.pythonprefix); ctx->params.pythonprefix = tmp; } + if( cachepaths ){ + ctx->params.candidate_cachepaths = cachepaths; + } if (location) { ctx->params.location = (char *) location; } From f873dcf10cc306d06f327ed498caa0fbd4fd45be Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:56:05 -0700 Subject: [PATCH 09/66] Cachepath: Adds logging support. 
--- src/logging/spindle_logd.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/logging/spindle_logd.cc b/src/logging/spindle_logd.cc index 53eb5dc0..45d58a7d 100644 --- a/src/logging/spindle_logd.cc +++ b/src/logging/spindle_logd.cc @@ -202,7 +202,7 @@ class TestVerifier std::vector err_strings; std::set > target_libs; std::set > libs_loaded; - char *location; + char *cachepath; void logerror(std::string s) { @@ -249,7 +249,8 @@ class TestVerifier tmp_s = getenv("TEMPDIR"); if (!tmp_s) tmp_s = "/tmp"; - location = strdup(tmp_s); + // These are reasonable fallbacks that should be replaced via messages, below. + cachepath = strdup(tmp_s); } ~TestVerifier() @@ -268,7 +269,7 @@ class TestVerifier strstr(filename, "retzero") == NULL && strstr(filename, ".py") == NULL) return true; - bool is_from_temp = (strstr(filename, location) != NULL) && (strncmp(filename, "/__not_exist", 12) != 0); + bool is_from_temp = (strstr(filename, cachepath) != NULL) && (strncmp(filename, "/__not_exist", 12) != 0); bool is_local_test = strstr(filename, "liblocal") != NULL; if (is_from_temp && !is_local_test && ret_code == -1) { @@ -294,12 +295,12 @@ class TestVerifier char buffer[4096]; int ret; - if (strstr(s, " location=" ) == s ){ - free( location ); - const char *loc_start = strstr( s, "=") + 1; - size_t loc_len = strlen( loc_start ); - location = strdup( loc_start ); - location[ loc_len - 1 ] = '\0'; // Remove trailing '\n'. + if (strstr(s, " cachepath=" ) == s ){ + free( cachepath ); + const char *cachepath_start = strstr( s, "=") + 1; + size_t cachepath_len = strlen( cachepath_start ); + cachepath = strdup( cachepath_start ); + cachepath[ cachepath_len - 1 ] = '\0'; // Remove trailing '\n'. 
} if (strstr(s, "open(") == s) { const char *first_quote, *last_quote, *equals; From 846b34b110d706226128fce8b79facbfbaeca498 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:56:34 -0700 Subject: [PATCH 10/66] Cachepath: Removes out-of-root cleanup checks. --- src/server/auditserver/cleanup_proc.cc | 4 ++-- testsuite/test_driver.c | 23 ----------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/src/server/auditserver/cleanup_proc.cc b/src/server/auditserver/cleanup_proc.cc index c23a9f7d..a3d3ddcc 100644 --- a/src/server/auditserver/cleanup_proc.cc +++ b/src/server/auditserver/cleanup_proc.cc @@ -72,7 +72,8 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) continue; if (strncmp(prefix_dir, componentpath.c_str(), prefix_size) != 0) { - err_printf("Tried to clean a file %s that wasn't in our prefix %s\n", componentpath.c_str(), prefix_dir); + // We have multiple directory roots. Not a problem if the directory + // we're looking for isn't in this one. 
continue; } unlink(componentpath.c_str()); @@ -83,7 +84,6 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) sort(ordered_dirs.begin(), ordered_dirs.end(), longest_str_first); for (vector::iterator i = ordered_dirs.begin(); i != ordered_dirs.end(); i++) { if (strncmp(prefix_dir, i->c_str(), prefix_size) != 0) { - err_printf("Tried to rmdir directory %s that wasn't in our prefix %s\n", i->c_str(), prefix_dir); continue; } rmdir(i->c_str()); diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index 13f63fea..a0b46a75 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1169,27 +1169,6 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } -static int checkLinkForLeak(const char *path, const char *spindle_loc) -{ - char link_target[4096]; - int result, error; - memset(link_target, 0, sizeof(link_target)); - - result = readlink(path, link_target, sizeof(link_target)); - if (result == -1) { - error = errno; - err_printf("Failed to read link %s: %s\n", path, strerror(error)); - return -1; - } - - if (strstr(link_target, spindle_loc)) { - err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); - return -1; - } - - return 0; -} - static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) { if (strstr(path, spindle_loc)) { @@ -1286,9 +1265,7 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); - checkLinkForLeak(path, spindle_loc); } - checkLinkForLeak("/proc/self/exe", spindle_loc); /** * Check link_maps for leaked spindle paths From 7db2da13c29cf05304bb605b428e81f96f43092f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:57:06 -0700 Subject: [PATCH 11/66] Cachepath: Set of small, miscellaneous patches. 
--- src/include/spindle_launch.h | 5 +++++ src/server/auditserver/ldcs_audit_server_md.h | 3 +++ src/server/auditserver/ldcs_audit_server_process.c | 5 +++-- src/server/auditserver/ldcs_audit_server_process.h | 5 +++++ src/server/startup/spindle_be.cc | 4 +++- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 47a4d92e..5476734d 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -126,6 +126,11 @@ typedef struct { /* The local-disk location where Spindle will store its cache */ char *location; + /* Path[s] for cached libraries. */ + char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ + char *chosen_cachepath; /* The consensus path (same across all nodes). */ + uint64_t cachepath_bitidx; /* Bit index used by allReduce() to arrive at consensus. */ + /* Colon-seperated list of directories where Python is installed */ char *pythonprefix; diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index eb5bf9f6..ba7943e2 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -107,6 +107,9 @@ int ldcs_audit_server_md_broadcast_noncontig(ldcs_process_data_t *ldcs_process_d int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); int ldcs_audit_server_md_is_parent(node_peer_t peer); + +void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg); + #if defined(__cplusplus) } diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 312095be..dca91c20 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -142,6 +142,9 @@ int ldcs_audit_server_process(spindle_args_t *args) debug_printf3("Initializing server data structures\n"); 
ldcs_process_data.location = args->location; + ldcs_process_data.cachepaths = args->candidate_cachepaths; + ldcs_process_data.cachepath = args->chosen_cachepath; + ldcs_process_data.cachepath_bitidx = args->cachepath_bitidx; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.localprefix = args->local_prefixes; @@ -191,8 +194,6 @@ int ldcs_audit_server_process(spindle_args_t *args) } ldcs_process_data.server_stat.hostname=ldcs_process_data.hostname; - debug_printf3("Initializing file cache location %s\n", ldcs_process_data.location); - ldcs_audit_server_filemngt_init(ldcs_process_data.location); if (ldcs_process_data.opts & OPT_PROCCLEAN) init_cleanup_proc(ldcs_process_data.location); diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 18b3320a..9ba1675e 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -126,6 +126,11 @@ struct ldcs_process_data_struct ldcs_dist_model_t dist_model; ldcs_client_t* client_table; char *location; + char *cachepaths; + char *cachepath; + char *symbolic_cachepath; + char *parsed_cachepath; + int64_t cachepath_bitidx; char *hostname; char *pythonprefix; char *localprefix; diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 7493c020..4f583756 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -59,6 +59,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) unpack_param(args->startup_type, buf, pos); unpack_param(args->shm_cache_size, buf, pos); unpack_param(args->location, buf, pos); + unpack_param(args->candidate_cachepaths, buf, pos); unpack_param(args->pythonprefix, buf, pos); unpack_param(args->preloadfile, buf, pos); unpack_param(args->bundle_timeout_ms, buf, pos); @@ -152,7 +153,8 @@ int spindleRunBE(unsigned int port, unsigned int 
num_ports, unique_id_t unique_i debug_printf("Translated location from %s to %s\n", args.location, new_location); free(args.location); args.location = new_location; - test_printf(" location=%s\n", args.location); + + determineValidCachePaths( &args.cachepath_bitidx, args.candidate_cachepaths, args.number); result = ldcs_audit_server_process(&args); if (result == -1) { From 1cd9c27ccfe8a73aec2286e244030a5c0cdb4ff5 Mon Sep 17 00:00:00 2001 From: Barry Date: Tue, 21 Oct 2025 12:21:37 -0700 Subject: [PATCH 12/66] Fixes per Matt's comments. --- src/client/client/intercept_exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index 8480b75e..27928d35 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -196,7 +196,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp if (envp) { debug_printf2("Propogating spindle environment by copying it to new envp list\n"); for (cur = (char **) envp; *cur; cur++, orig_size++); - new_size = orig_size + 20; + new_size = orig_size + 9; newenv = (char **) malloc(new_size * sizeof(char*)); propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); From 4ba3a6718f5a704f17337aad3868bf99b23339ca Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 22 Oct 2025 12:20:35 -0700 Subject: [PATCH 13/66] Single source of truth for client cachepath. Previously, chosen_realized_cachepath was copied into set_intercept_readlink_cachepath() chosen_realized_cachepath and chosen_parsed_cachepath were copied into set_should_intercept_cachepath() This PR removes both setter functions and makes the original pointers global. 
--- src/client/client/client.c | 7 ++----- src/client/client/intercept.h | 1 - src/client/client/intercept_readlink.c | 12 +++--------- src/client/client/should_intercept.c | 17 +++++------------ src/client/client/should_intercept.h | 1 - 5 files changed, 10 insertions(+), 28 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index d7827d4b..f803e75b 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -72,7 +72,7 @@ static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; static char *location; -static char *chosen_realized_cachepath, *chosen_parsed_cachepath, *chosen_symbolic_cachepath; +char *chosen_realized_cachepath, *chosen_parsed_cachepath; number_t number; static int have_stat_patches; @@ -261,10 +261,7 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } - send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, &chosen_symbolic_cachepath ); - set_should_intercept_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); - set_intercept_readlink_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); - + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, NULL); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); diff --git a/src/client/client/intercept.h b/src/client/client/intercept.h index aae968f7..4ace2328 100644 --- a/src/client/client/intercept.h +++ b/src/client/client/intercept.h @@ -89,7 +89,6 @@ int execvpe_wrapper(const char *path, char *const argv[], const char *envp[]); pid_t vfork_wrapper(); char *dlerror_wrapper(); -void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); ssize_t readlink_wrapper(const char *path, char *buf, size_t bufsiz); ssize_t readlinkat_wrapper(int dirfd, const 
char *pathname, char *buf, size_t bufsiz); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index 2799baee..37d440e4 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -31,23 +31,17 @@ Place, Suite 330, Boston, MA 02111-1307 USA ssize_t (*orig_readlink)(const char *path, char *buf, size_t bufsiz); ssize_t (*orig_readlinkat)(int dirfd, const char *pathname, char *buf, size_t bufsiz); -static char *cachepath; - -void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ - cachepath = chosen_realized_cachepath; - chosen_parsed_cachepath = chosen_parsed_cachepath; - chosen_symbolic_cachepath = chosen_symbolic_cachepath; -} static int fix_local_readlink(char *buf, size_t bufsiz) { char spindle_id[32]; int cachepath_len, result; char tmp[MAX_PATH_LEN+1]; + extern char *chosen_realized_cachepath; - cachepath_len = strlen(cachepath); + cachepath_len = strlen(chosen_realized_cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); - if (strstr(buf, spindle_id) && strncmp(cachepath, buf, cachepath_len) == 0) { + if (strstr(buf, spindle_id) && strncmp(chosen_realized_cachepath, buf, cachepath_len) == 0) { debug_printf2("readlink received spindle cache path %s. 
Translating\n", buf); result = send_orig_path_request(ldcsid, buf+cachepath_len+1, tmp); if (result == -1) diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index 3a348d3d..cee4e43c 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -29,27 +29,20 @@ #include "spindle_debug.h" extern int relocate_spindleapi(); -static char *cachepath, *orig_cachepath; - -void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ - cachepath = chosen_realized_cachepath; - orig_cachepath = chosen_parsed_cachepath; - chosen_symbolic_cachepath = chosen_symbolic_cachepath; -} - int is_in_spindle_cache(const char *pathname) { static int cachepath_size = 0; static int orig_cachepath_size = 0; + extern char *chosen_realized_cachepath, *chosen_parsed_cachepath; if (!cachepath_size) { - cachepath_size = strlen(cachepath); + cachepath_size = strlen(chosen_realized_cachepath); } if (!orig_cachepath_size) { - orig_cachepath_size = strlen(orig_cachepath); + orig_cachepath_size = strlen(chosen_parsed_cachepath); } - return ((strncmp(pathname, cachepath, cachepath_size) == 0) || - (strncmp(pathname, orig_cachepath, orig_cachepath_size) == 0)); + return ((strncmp(pathname, chosen_realized_cachepath, cachepath_size) == 0) || + (strncmp(pathname, chosen_parsed_cachepath, orig_cachepath_size) == 0)); } extern int is_local_prefix(const char *path, char **cached_local_prefixes); diff --git a/src/client/client/should_intercept.h b/src/client/client/should_intercept.h index 6a545913..f6a9b510 100644 --- a/src/client/client/should_intercept.h +++ b/src/client/client/should_intercept.h @@ -27,7 +27,6 @@ #define EXCL_OPEN 2 #define ERR_CALL 3 -void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); int open_filter(const char *fname, int flags); int fopen_filter(const char 
*fname, const char *flags); int exec_filter(const char *fname); From b0fb21e4518c18fc0ba6e077fcc9ddc03948cf68 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 22 Oct 2025 14:39:23 -0700 Subject: [PATCH 14/66] Comments the cachepath variables. --- .../auditserver/ldcs_audit_server_process.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 9ba1675e..f9f4c30c 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -125,12 +125,15 @@ struct ldcs_process_data_struct int exit_readys_recvd; ldcs_dist_model_t dist_model; ldcs_client_t* client_table; - char *location; - char *cachepaths; - char *cachepath; - char *symbolic_cachepath; - char *parsed_cachepath; - int64_t cachepath_bitidx; + char *location; /* Single user-specified path for fifo, daemons, etc. */ + /* (Everything except the cachepath.) */ + char *cachepaths; /* Up to 64 colon-separated list of candidate cachepaths. */ + char *cachepath; /* The earliest path in the list available to all servers. */ + /* (Environment variables replaced, symbolic links realized.) */ + char *symbolic_cachepath; /* The original representation of the cachepath. */ + char *parsed_cachepath; /* The cachepath with environment variables replaced. */ + /* (Symbolic links, if any, remain.) */ + int64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ char *hostname; char *pythonprefix; char *localprefix; From 504e4b27a401406817f3cff38a7ac553e64e78c7 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 13:07:21 -0700 Subject: [PATCH 15/66] Removes internal vars from spindle_launch.h Removes chosen_cachepath and cachepath_bitindex from spindle_launch.h Updates initialization of matching variables in ldcs_process_data. 
determineValidCachePaths() moved from spindle_be.cc to ldcs_audit_server_process.c to get ldcs_process_data visibility. Added #include "parseloc.h" to ldcs_audit_server_process.c to get declaration of determineValidCachePaths(). Relocated "parseloc.h" to src/util so ldcs_audit_server_process.c could find it. Trued up signedness of types caused my making "parseloc.h" more visible, e.g., cachepath_bitidx is now uint64_t everywhere. --- src/cobo/cobo.c | 2 +- src/cobo/ldcs_cobo.h | 2 +- src/include/spindle_launch.h | 2 -- src/server/auditserver/ldcs_audit_server_process.c | 9 +++++++-- src/server/auditserver/ldcs_audit_server_process.h | 2 +- src/server/startup/spindle_be.cc | 2 -- src/{client/beboot => utils}/parseloc.h | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) rename src/{client/beboot => utils}/parseloc.h (91%) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index ad2ea878..59c2f809 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1434,7 +1434,7 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } -int cobo_allreduce( int64_t *pval, cobo_op_t op ){ +int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* if i have any children, receive their data */ int64_t child_val; diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index 30cc9673..fafbda6a 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -142,7 +142,7 @@ int cobo_alltoall (void* sendbuf, int sendcount, void* recvbuf); */ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf); -int cobo_allreduce(int64_t *pval, cobo_op_t op); +int cobo_allreduce(uint64_t *pval, cobo_op_t op); /* * ========================================================================== diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 5476734d..81c0728e 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -128,8 +128,6 @@ typedef struct { /* Path[s] for cached libraries. 
*/ char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ - char *chosen_cachepath; /* The consensus path (same across all nodes). */ - uint64_t cachepath_bitidx; /* Bit index used by allReduce() to arrive at consensus. */ /* Colon-seperated list of directories where Python is installed */ char *pythonprefix; diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index dca91c20..566cad01 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -37,6 +37,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "msgbundle.h" #include "exitnote.h" #include "cleanup_proc.h" +#include "parseloc.h" //#define GPERFTOOLS #if defined(GPERFTOOLS) @@ -143,8 +144,8 @@ int ldcs_audit_server_process(spindle_args_t *args) debug_printf3("Initializing server data structures\n"); ldcs_process_data.location = args->location; ldcs_process_data.cachepaths = args->candidate_cachepaths; - ldcs_process_data.cachepath = args->chosen_cachepath; - ldcs_process_data.cachepath_bitidx = args->cachepath_bitidx; + ldcs_process_data.cachepath = NULL; + ldcs_process_data.cachepath_bitidx = 0; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.localprefix = args->local_prefixes; @@ -230,6 +231,10 @@ int ldcs_audit_server_process(spindle_args_t *args) if (fd != -1) { ldcs_listen_register_fd(fd, serverid, forceExitCB, (void *) &ldcs_process_data); } + determineValidCachePaths( + &ldcs_process_data.cachepath_bitidx, + ldcs_process_data.cachepaths, + ldcs_process_data.number ); return 0; } diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index f9f4c30c..1495cebd 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -133,7 +133,7 @@ struct 
ldcs_process_data_struct char *symbolic_cachepath; /* The original representation of the cachepath. */ char *parsed_cachepath; /* The cachepath with environment variables replaced. */ /* (Symbolic links, if any, remain.) */ - int64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ + uint64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ char *hostname; char *pythonprefix; char *localprefix; diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 4f583756..733d3244 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -154,8 +154,6 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i free(args.location); args.location = new_location; - determineValidCachePaths( &args.cachepath_bitidx, args.candidate_cachepaths, args.number); - result = ldcs_audit_server_process(&args); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); diff --git a/src/client/beboot/parseloc.h b/src/utils/parseloc.h similarity index 91% rename from src/client/beboot/parseloc.h rename to src/utils/parseloc.h index 1731906a..a99409c3 100644 --- a/src/client/beboot/parseloc.h +++ b/src/utils/parseloc.h @@ -28,7 +28,7 @@ char *parse_location_noerr(char *loc, number_t number); char *realize(char *path); char **parse_colonsep_prefixes(char *colonsep_list, number_t number); int is_local_prefix(const char *path, char **local_prefixes); -static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); +int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ); void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); From 
10a72336bb78ab7c07ef9f05f73e1d0d014f83ad Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 14:18:59 -0700 Subject: [PATCH 16/66] Client cachepath message now uses single response. The three-message-reply response is now a single message with two strings. The symbolic version of the cachepath is no longer communicated as it was not being used. --- src/client/client/client.c | 2 +- src/client/client_comlib/client_api.c | 31 ++++--------------- src/client/client_comlib/client_api.h | 2 +- .../auditserver/ldcs_audit_server_handlers.c | 19 +++--------- 4 files changed, 13 insertions(+), 41 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index f803e75b..ac8d4b63 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -261,7 +261,7 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } - send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, NULL); + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 267b8a93..cdff2bb5 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -36,9 +36,9 @@ static struct lock_t comm_lock; #define COMM_LOCK do { if (lock(&comm_lock) == -1) return -1; } while (0) #define COMM_UNLOCK unlock(&comm_lock) - -int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath, char **chosen_symbolic_cachepath ){ + +int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath){ ldcs_message_t message; char buffer[MAX_PATH_LEN+1]; buffer[MAX_PATH_LEN] = '\0'; @@ -59,32 +59,13 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose err_printf("Got unexpected message of type %d\n", 
(int) message.header.type); assert(0); } + char *local_crc = strdup( buffer ); + char *local_cpc = strdup( &buffer[ strlen(local_crc) + 1 ] ); if( chosen_realized_cachepath ){ - *chosen_realized_cachepath = strdup( buffer ); - } - - COMM_LOCK; - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); - COMM_UNLOCK; - - if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { - err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + *chosen_realized_cachepath = local_crc; } if( chosen_parsed_cachepath ){ - *chosen_parsed_cachepath = strdup( buffer ); - } - - COMM_LOCK; - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); - COMM_UNLOCK; - - if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { - err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); - } - if( chosen_symbolic_cachepath ){ - *chosen_symbolic_cachepath = strdup( buffer ); + *chosen_parsed_cachepath = local_cpc; } return 0; diff --git a/src/client/client_comlib/client_api.h b/src/client/client_comlib/client_api.h index 982c4b1c..3d7c41be 100644 --- a/src/client/client_comlib/client_api.h +++ b/src/client/client_comlib/client_api.h @@ -42,7 +42,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath); int send_dirlists_request(int fd, char **local_result, char **exece_result, char **to_free); int send_procmaps_query(int fd, int pid, char *result); int send_pickone_query(int fd, char *key, int *result); -int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath, char **chosen_realized_cachepath ); +int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath); int get_python_prefix(int fd, char **prefix); diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 98c6807a..c8e76d54 100644 --- 
a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -3003,21 +3003,12 @@ static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; - msg.header.len = strlen(procdata->cachepath) + 1; - msg.data = procdata->cachepath; - ldcs_send_msg(connid, &msg); - procdata->server_stat.clientmsg.cnt++; - procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; - - msg.header.len = strlen(procdata->parsed_cachepath) + 1; - msg.data = procdata->parsed_cachepath; - ldcs_send_msg(connid, &msg); - procdata->server_stat.clientmsg.cnt++; - procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; - - msg.header.len = strlen(procdata->symbolic_cachepath) + 1; - msg.data = procdata->symbolic_cachepath; + msg.header.len = strlen(procdata->cachepath) + 1 + strlen(procdata->parsed_cachepath) + 1; + msg.data = calloc( 1, msg.header.len ); + strcpy( msg.data, procdata->cachepath ); + strcpy( &msg.data[ strlen(procdata->cachepath)+1 ], procdata->parsed_cachepath ); ldcs_send_msg(connid, &msg); + free( msg.data ); procdata->server_stat.clientmsg.cnt++; procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; From d63aabda8c09750cda385da0302e7a786af75d5f Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 14:33:07 -0700 Subject: [PATCH 17/66] Removes assert(0) in network error paths. 
--- src/client/client_comlib/client_api.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index cdff2bb5..4b0f9ded 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -57,7 +57,7 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + return -1; } char *local_crc = strdup( buffer ); char *local_cpc = strdup( &buffer[ strlen(local_crc) + 1 ] ); @@ -102,7 +102,7 @@ int send_file_query(int fd, char* path, int dso, char** newpath, int *errcode) { if (message.header.type != LDCS_MSG_FILE_QUERY_ANSWER) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + return -1; } if (message.header.len > sizeof(int)) { @@ -195,7 +195,7 @@ int send_existance_test(int fd, char *path, int *exists) if (message.header.type != LDCS_MSG_EXISTS_ANSWER || message.header.len != sizeof(uint32_t)) { err_printf("Got unexpected message after existance test: %d\n", (int) message.header.type); - assert(0); + return -1; } memcpy(exists, buffer, sizeof(*exists)); @@ -232,7 +232,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath) if (message.header.type != LDCS_MSG_ORIGPATH_ANSWER || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message after existance test: %d\n", (int) message.header.type); - assert(0); + return -1; } strncpy(newpath, buffer, MAX_PATH_LEN+1); @@ -380,7 +380,7 @@ int send_ldso_info_request(int fd, const char *ldso_path, char *result_path) if (message.header.type != LDCS_MSG_LOADER_DATA_RESP) { err_printf("Got unexpected message after ldso req: %d\n", (int) message.header.type); - assert(0); + return -1; } return 0; } @@ -422,7 +422,7 @@ 
int send_rankinfo_query(int fd, int *mylrank, int *mylsize, int *mymdrank, int * if (message.header.type != LDCS_MSG_MYRANKINFO_QUERY_ANSWER || message.header.len != 4*sizeof(int)) { err_printf("Received incorrect response to rankinfo query %d\n", message.header.type); *mylrank = *mylsize = *mymdrank = *mymdsize = -1; - assert(0); + return -1; } p = (int *) message.data; @@ -457,7 +457,7 @@ int send_procmaps_query(int fd, int pid, char *result) if (message.header.type != LDCS_MSG_PROCMAPS_RESP) { err_printf("Received incorrect response to procmaps query %d\n", message.header.type); - assert(0); + return -1; } memcpy(result, buffer, MAX_PATH_LEN); @@ -488,7 +488,7 @@ int send_pickone_query(int fd, char *key, int *result) if (message.header.type != LDCS_MSG_PICKONE_RESP) { err_printf("Received incorrect response to procmaps query %d\n", message.header.type); - assert(0); + return -1; } *result = *((int *) message.data); From bc944b98ccbd3e751931de5b9d205ab918ea5d07 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:18:40 -0700 Subject: [PATCH 18/66] Renames ldcs_audit_server_md_consensus(). New name is ldcs_audit_server_md_allreduce_AND(). If we get to the point where we're using other allreduce operations we can solve the problem of duplicating the op list in md-land and cobo-land. For now, we're only using one op in md-land, so the op can go into the function name. 
--- src/server/auditserver/ldcs_audit_server_handlers.c | 2 +- src/server/auditserver/ldcs_audit_server_md.h | 2 +- src/server/auditserver/ldcs_audit_server_md_cobo.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index c8e76d54..29107bad 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2966,7 +2966,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag msgbundle_force_flush(procdata); } - ldcs_audit_server_md_consensus(procdata, msg); + ldcs_audit_server_md_allreduce_AND( &procdata->cachepath_bitidx ); if( procdata->cachepath_bitidx == 0 ){ err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index ba7943e2..a4640370 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -108,7 +108,7 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); int ldcs_audit_server_md_is_parent(node_peer_t peer); -void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg); +void ldcs_audit_server_md_allreduce_AND( uint64_t *val ); #if defined(__cplusplus) diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index d8b5442f..27393a55 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -402,8 +402,6 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata) return num_childs; } -void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg){ - if( msg->header.type == 
LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS ){ - cobo_allreduce( &ldcs_process_data->cachepath_bitidx, COBO_OP_BITWISE_AND ); - } +void ldcs_audit_server_md_allreduce_AND( uint64_t *val ){ + cobo_allreduce( val, COBO_OP_BITWISE_AND ); } From a8ddaeb98cdafd6449b9ca0dab1ade077bc270bd Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:27:27 -0700 Subject: [PATCH 19/66] Adds explicit enum values to CmdlineShortOptions. --- src/fe/startup/config_mgr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index 27be1ae8..c2d3cd7e 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -126,8 +126,8 @@ enum CmdlineShortOptions { shortSpindleLevel = 296, shortLocalPrefix = 297, shortExecExcludes = 298, - shortPatchLdso, - shortCachePaths, + shortPatchLdso = 299, + shortCachePaths = 300, }; enum CmdlineGroups { From 5a612962562d49f7818d91e3c6699a5b8fd7beee Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:38:19 -0700 Subject: [PATCH 20/66] Return instead of exit on network errors. 
--- src/cobo/cobo.c | 58 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 59c2f809..65741008 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -160,7 +160,7 @@ static char* cobo_getenv(char* envvar, int type) char* str = getenv(envvar); if (str == NULL && type == ENV_REQUIRED) { err_printf("Missing required environment variable: %s\n", envvar); - exit(1); + return NULL; } return str; } @@ -171,7 +171,7 @@ static void* cobo_malloc(size_t n, char* msg) void* p = malloc(n); if (!p) { err_printf("Call to malloc(%lu) failed: %s (%m errno %d)\n", n, msg, errno); - exit(1); + return NULL; } return p; } @@ -513,7 +513,7 @@ static int cobo_connect_hostname(char* hostname, int rank) break; case HSHAKE_INTERNAL_ERROR: err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); + return -1; break; case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); @@ -768,7 +768,7 @@ static int cobo_open_tree() if (sockfd < 0) { err_printf("Creating parent socket (socket() %m errno=%d)\n", errno); - exit(1); + return -1; } setCloseOnExec(sockfd); @@ -817,7 +817,7 @@ static int cobo_open_tree() if (!port_is_bound) { /* TODO: would like to send an abort back to server */ err_printf("Failed to open socket on any port\n"); - exit(1); + return -1; } /* accept a connection from parent and receive socket table */ @@ -837,7 +837,7 @@ static int cobo_open_tree() break; case HSHAKE_INTERNAL_ERROR: err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); + return -1; break; case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); @@ -907,26 +907,26 @@ static int cobo_open_tree() /* read our rank number */ if (cobo_read_fd(cobo_parent_fd, &cobo_me, sizeof(int)) < 0) { err_printf("Receiving my rank from parent failed\n"); - exit(1); + return -1; } /* discover how 
many ranks are in our world */ if (cobo_read_fd(cobo_parent_fd, &cobo_nprocs, sizeof(int)) < 0) { err_printf("Receiving number of tasks from parent failed\n"); - exit(1); + return -1; } /* read the size of the hostlist (in bytes) */ if (cobo_read_fd(cobo_parent_fd, &cobo_hostlist_size, sizeof(int)) < 0) { err_printf("Receiving size of hostname table from parent failed\n"); - exit(1); + return -1; } /* allocate space for the hostlist and read it in */ cobo_hostlist = (void*) cobo_malloc(cobo_hostlist_size, "Hostlist data buffer"); if (cobo_read_fd(cobo_parent_fd, cobo_hostlist, cobo_hostlist_size) < 0) { err_printf("Receiving hostname table from parent failed\n"); - exit(1); + return -1; } /* @@ -969,7 +969,7 @@ static int cobo_open_tree() if (cobo_child_fd[i] == -1) { err_printf("Failed to connect to child (rank %d) on %s failed\n", c, child_hostname); - exit(1); + return -1; } /* tell child what rank he is and forward the hostname table to him */ @@ -978,7 +978,7 @@ static int cobo_open_tree() if (forward != COBO_SUCCESS) { err_printf("Failed to forward hostname table to child (rank %d) on %s failed\n", c, child_hostname); - exit(1); + return -1; } /* free the child hostname string */ @@ -1033,7 +1033,7 @@ static int cobo_bcast_tree(void* buf, int size) if (cobo_me != 0) { if (cobo_read_fd(cobo_parent_fd, buf, size) < 0) { err_printf("Receiving broadcast data from parent failed\n"); - exit(1); + return -1; } } @@ -1042,7 +1042,7 @@ static int cobo_bcast_tree(void* buf, int size) if (cobo_write_fd(cobo_child_fd[i], buf, size) < 0) { err_printf("Broadcasting data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } } @@ -1058,7 +1058,7 @@ int cobo_bcast_down(void* buf, int size) if (cobo_write_fd(cobo_child_fd[i], buf, size) < 0) { err_printf("Broadcasting data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } } return rc; @@ -1080,7 +1080,7 @@ static int cobo_allreduce_max_int_tree(int* sendbuf, int* recvbuf) if 
(cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(child_val)) < 0) { err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } /* compare child's max to our current max */ @@ -1094,7 +1094,7 @@ static int cobo_allreduce_max_int_tree(int* sendbuf, int* recvbuf) /* not the root, so forward our reduction result to our parent */ if (cobo_write_fd(cobo_parent_fd, &max_val, sizeof(max_val)) < 0) { err_printf("Sending reduced data to parent failed\n"); - exit(1); + return -1; } } else { /* we're the root, got the result, set the recvbuf */ @@ -1129,7 +1129,7 @@ static int cobo_gather_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_read_fd(cobo_child_fd[i], (char*)bigbuf + offset, sendcount * cobo_child_incl[i]) < 0) { err_printf("Gathering data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } offset += sendcount * cobo_child_incl[i]; } @@ -1138,7 +1138,7 @@ static int cobo_gather_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_me != 0) { if (cobo_write_fd(cobo_parent_fd, bigbuf, bigcount) < 0) { err_printf("Sending gathered data to parent failed\n"); - exit(1); + return -1; } cobo_free(bigbuf); } @@ -1158,7 +1158,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) bigbuf = (void*) cobo_malloc(bigcount, "Temporary scatter buffer in cobo_scatter_tree"); if (cobo_read_fd(cobo_parent_fd, bigbuf, bigcount) < 0) { err_printf("Receiving scatter data from parent failed\n"); - exit(1); + return -1; } } @@ -1169,7 +1169,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_write_fd(cobo_child_fd[i], (char*)bigbuf + offset, sendcount * cobo_child_incl[i]) < 0) { err_printf("Scattering data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } offset += sendcount * cobo_child_incl[i]; } @@ -1257,7 +1257,7 @@ int cobo_bcast(void* buf, int sendcount, int root) rc = cobo_bcast_tree(buf, sendcount); } else { 
err_printf("Cannot execute bcast from non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1283,7 +1283,7 @@ int cobo_gather(void* sendbuf, int sendcount, void* recvbuf, int root) rc = cobo_gather_tree(sendbuf, sendcount, recvbuf); } else { err_printf("Cannot execute gather to non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1309,7 +1309,7 @@ int cobo_scatter(void* sendbuf, int sendcount, void* recvbuf, int root) rc = cobo_scatter_tree(sendbuf, sendcount, recvbuf); } else { err_printf("Cannot execute scatter from non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1354,7 +1354,7 @@ int cobo_alltoall(void* sendbuf, int sendcount, void* recvbuf) int rc = COBO_SUCCESS; err_printf("Cannot execute alltoall\n"); - exit(1); + return -1; cobo_gettimeofday(&end); debug_printf3("Exiting cobo_alltoall(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); @@ -1442,7 +1442,7 @@ int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* read int64_t from child */ if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } /* compare child's val to our current val */ @@ -1466,7 +1466,7 @@ int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* not the root, so forward our reduction result to our parent */ if (cobo_write_fd(cobo_parent_fd, pval, sizeof(*pval)) < 0) { err_printf("Sending reduced data to parent failed\n"); - exit(1); + return -1; } } @@ -1524,7 +1524,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* cobo_ports = cobo_int_dup(portlist, num_ports); if (cobo_ports == NULL) { err_printf("Failed to copy port list\n"); - exit(1); + return -1; } /* open the tree */ @@ -1533,7 +1533,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* /* need to check that tree opened successfully before returning, so do a barrier 
*/ if (cobo_barrier() != COBO_SUCCESS) { err_printf("Failed to open tree\n"); - exit(1); + return -1; } if (cobo_me == 0) { From 1823f938c89e0e9b242e4720483972e6a1264671 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 24 Oct 2025 06:54:47 -0700 Subject: [PATCH 21/66] Apply rename to configuration and parameters. --- config.h.in | 6 +++--- configure | 17 +++++++++-------- configure.common.ac | 12 ++++++------ src/client/config.h.in | 6 +++--- src/client/configure | 17 +++++++++-------- src/fe/config.h.in | 6 +++--- src/fe/configure | 17 +++++++++-------- src/fe/startup/config_mgr.cc | 16 ++++++++-------- src/fe/startup/config_mgr.h | 4 ++-- src/flux/sessionmgr.c | 8 ++++---- src/server/config.h.in | 6 +++--- src/server/configure | 17 +++++++++-------- 12 files changed, 68 insertions(+), 64 deletions(-) diff --git a/config.h.in b/config.h.in index aae4c0da..921fe5d8 100644 --- a/config.h.in +++ b/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -134,9 +137,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/configure b/configure index 00d3f031..05524e9f 100755 --- a/configure +++ b/configure @@ -848,7 +848,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1593,7 +1593,8 @@ Optional Packages: communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + --with-compath=DIR Back-end directory for 
communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -16673,11 +16674,11 @@ else fi -# Check whether --with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -16685,7 +16686,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -16705,7 +16706,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF diff --git a/configure.common.ac b/configure.common.ac index 9f360a17..5438173a 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -21,18 +21,18 @@ AC_ARG_WITH(cachepaths, [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], [CACHEPATHS=${withval}], [CACHEPATHS=$DEFAULT_LOC]) -AC_ARG_WITH(localstorage, - [AS_HELP_STRING([--with-localstorage=DIR],[Directory on back-ends for storing relocated files])], - [SPINDLE_LOC=${withval}], - [SPINDLE_LOC=$DEFAULT_LOC]) +AC_ARG_WITH(commpath, + [AS_HELP_STRING([--with-compath=DIR],[Back-end directory for communication and housekeeping])], + [COMMPATH=${withval}], + [COMMPATH=$DEFAULT_LOC]) AC_ARG_WITH(default-local-prefix, [AS_HELP_STRING([--with-default-local-prefix=DIRS],[Colon-seperated list of directories that Spindle will not cache files out of])], [SPINDLE_LOCAL_PREFIX=${withval}], - [SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC"]) + [SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH"]) 
AC_DEFINE_UNQUOTED([SPINDLE_PORT],[$SPINDLE_PORT],[The default port for Spindle]) AC_DEFINE_UNQUOTED([NUM_COBO_PORTS],[$NUM_COBO_PORTS],[Number of ports for COBO to search for an open port]) AC_DEFINE_UNQUOTED([SPINDLE_MAX_PORT],[$(($SPINDLE_PORT + $NUM_COBO_PORTS - 1))],[The maximum port value]) -AC_DEFINE_UNQUOTED([SPINDLE_LOC],"[$SPINDLE_LOC]",[The default local directory for Spindle]) +AC_DEFINE_UNQUOTED([COMMPATH],"[$COMMPATH]",[Back-end directory for communication and housekeeping]) AC_DEFINE_UNQUOTED([CACHEPATHS],"[$CACHEPATHS]",[Colon-separated list of potential back-end cache directories]) AC_DEFINE_UNQUOTED([SPINDLE_LOCAL_PREFIX],"[$SPINDLE_LOCAL_PREFIX]",[The default colon-separated list of directories that Spindle will not cache files out of]) diff --git a/src/client/config.h.in b/src/client/config.h.in index a42cc0ab..d78a3e7f 100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -121,9 +124,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/src/client/configure b/src/client/configure index a07d16b7..aef2c665 100755 --- a/src/client/configure +++ b/src/client/configure @@ -811,7 +811,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1535,7 +1535,8 @@ Optional Packages: communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + 
--with-compath=DIR Back-end directory for communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -12598,11 +12599,11 @@ else fi -# Check whether --with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -12610,7 +12611,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -12630,7 +12631,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF diff --git a/src/fe/config.h.in b/src/fe/config.h.in index 5057ef33..3ac96e42 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -166,9 +169,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/src/fe/configure b/src/fe/configure index 3d73ec17..b5c73f35 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -832,7 +832,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1573,7 +1573,8 @@ Optional Packages: 
communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + --with-compath=DIR Back-end directory for communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -16448,11 +16449,11 @@ else fi -# Check whether --with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -16460,7 +16461,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -16480,7 +16481,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index eb710b36..16c4a0f8 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -50,10 +50,10 @@ using namespace std; #define SPINDLE_NUM_PORTS_STR "250" #endif -#if defined(SPINDLE_LOC) -#define SPINDLE_LOC_STR SPINDLE_LOC +#if defined(COMMPATH) +#define SPINDLE_COMMPATH_STR COMMPATH #else -#define SPINDLE_LOC_STR "$TMPDIR" +#define SPINDLE_COMMPATH_STR "$TMPDIR" #endif #if defined(CACHEPATHS) @@ -273,8 +273,8 @@ void initOptionsList() "Provides a text file containing a white-space separated list of files that should be relocated to each node before execution begins" }, { confStrip, "strip", shortStrip, groupMisc, cvBool, {}, "true", "Strip debug and symbol information from binaries before distributing them." 
}, - { confLocation, "location", shortLocation, groupMisc, cvString, {}, SPINDLE_LOC_STR, - "Back-end directory for storing relocated files. Should be a non-shared location such as a ramdisk." }, + { confCommPath, "commpath", shortCommPath, groupMisc, cvString, {}, SPINDLE_COMMPATH_STR, + "Back-end directory communication and housekeeping. Should be a non-shared location such as a ramdisk." }, { confCachePaths, "cachepaths", shortCachePaths, groupMisc, cvString, {}, SPINDLE_CACHEPATHS_STR, "Colon-separated list of candidate paths for cached libraries."}, { confNoclean, "noclean", shortNoClean, groupMisc, cvBool, {}, "false", @@ -743,9 +743,9 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const case confNumPorts: args.num_ports = numresult; break; - case confLocation: { - string loc = strresult + "/spindle.$NUMBER"; - args.location = strdup(loc.c_str()); + case confCommPath: { + string path = strresult + "/spindle.$NUMBER"; + args.location = strdup(path.c_str()); break; } case confCachePaths:{ diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index c2d3cd7e..ec3c8135 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -29,7 +29,7 @@ enum SpindleConfigID { confCmdlineNewgroup, confPort, confNumPorts, - confLocation, + confCommPath, confCachePaths, confCachePrefix, confPythonPrefix, @@ -83,7 +83,7 @@ enum CmdlineShortOptions { shortAuditType = 'k', shortRelocSO = 'l', shortNoClean = 'n', - shortLocation = 'o', + shortCommPath = 'o', shortPush = 'p', shortPull = 'q', shortPythonPrefix = 'r', diff --git a/src/flux/sessionmgr.c b/src/flux/sessionmgr.c index 49324a2f..17027163 100644 --- a/src/flux/sessionmgr.c +++ b/src/flux/sessionmgr.c @@ -109,16 +109,16 @@ char **strip_start_from_argv(int argc, char **argv) extern char *parse_location(char *loc, int number); extern int spindle_mkdir(char *orig_path); -#if !defined(SPINDLE_LOC) -#error SPINDLE_LOC must be defined in config.h +#if !defined(COMMPATH) 
+#error COMMPATH must be defined in config.h #endif const char *get_session_dir() { int result; char *dir; - dir = parse_location((char *) (SPINDLE_LOC "/spindle_session"), 0); + dir = parse_location((char *) (COMMPATH "/spindle_session"), 0); if (!dir) { - spindle_debug_printf(1, "ERROR: Could not parse directory for spindle session location from %s/spindle_session\n", SPINDLE_LOC); + spindle_debug_printf(1, "ERROR: Could not parse directory for spindle session location from %s/spindle_session\n", COMMPATH); return NULL; } diff --git a/src/server/config.h.in b/src/server/config.h.in index dc9439b3..87fbf990 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -148,9 +151,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/src/server/configure b/src/server/configure index 92b48ef7..bf356eca 100755 --- a/src/server/configure +++ b/src/server/configure @@ -838,7 +838,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1570,7 +1570,8 @@ Optional Packages: communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + --with-compath=DIR Back-end directory for communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -16445,11 +16446,11 @@ else fi -# Check whether 
--with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -16457,7 +16458,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -16477,7 +16478,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF From 9bef38b1be79787ed1ce8bc101baefea3ee2ae8b Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 24 Oct 2025 12:29:41 -0700 Subject: [PATCH 22/66] Renaming location variables/fields to commpath. --- src/fe/startup/config_mgr.cc | 2 +- src/fe/startup/parse_launcher.cc | 4 ++-- src/fe/startup/spindle_fe.cc | 10 +++++----- src/flux/flux-spindle.c | 8 ++++---- src/include/spindle_launch.h | 4 ++-- .../auditserver/ldcs_audit_server_handlers.c | 12 ++++++------ .../auditserver/ldcs_audit_server_process.c | 14 +++++++------- .../auditserver/ldcs_audit_server_process.h | 2 +- src/server/startup/spindle_be.cc | 16 ++++++++-------- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index 16c4a0f8..1b9d9ade 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -745,7 +745,7 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const break; case confCommPath: { string path = strresult + "/spindle.$NUMBER"; - args.location = strdup(path.c_str()); + args.commpath = strdup(path.c_str()); break; } case confCachePaths:{ diff --git a/src/fe/startup/parse_launcher.cc b/src/fe/startup/parse_launcher.cc index f05ba18a..4484e998 100644 --- 
a/src/fe/startup/parse_launcher.cc +++ b/src/fe/startup/parse_launcher.cc @@ -292,7 +292,7 @@ void ModifyArgv::modifyCmdLine() snprintf(options_str, 32, "%lu", (unsigned long) params->opts); string options(options_str); - string location(params->location); + string commpath(params->commpath); char number_str[32]; snprintf(number_str, 32, "%lu", (unsigned long) params->number); @@ -319,7 +319,7 @@ void ModifyArgv::modifyCmdLine() if (p == parser->appExecutableAt()) { #if defined(os_bluegene) string bg_env_str = parser->getParser()->getBGString(); - parser->getParser()->addBGEnvStr(n, new_argv, bg_env_str, default_libstr, intercept_libstr, location, number, options, shmcache_size); + parser->getParser()->addBGEnvStr(n, new_argv, bg_env_str, default_libstr, intercept_libstr, commpath, number, options, shmcache_size); #else char **a_argv; int a_argc; diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index 2c2879f5..a038e201 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -71,7 +71,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) buffer_size += sizeof(number_t); buffer_size += sizeof(opt_t); buffer_size += sizeof(unique_id_t); - buffer_size += args->location ? strlen(args->location) + 1 : 1; + buffer_size += args->commpath ? strlen(args->commpath) + 1 : 1; buffer_size += args->candidate_cachepaths ? strlen(args->candidate_cachepaths) + 1 : 1; buffer_size += args->pythonprefix ? strlen(args->pythonprefix) + 1 : 1; buffer_size += args->preloadfile ? 
strlen(args->preloadfile) + 1 : 1; @@ -92,7 +92,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) pack_param(args->use_launcher, buf, pos); pack_param(args->startup_type, buf, pos); pack_param(args->shm_cache_size, buf, pos); - pack_param(args->location, buf, pos); + pack_param(args->commpath, buf, pos); pack_param(args->candidate_cachepaths, buf, pos); pack_param(args->pythonprefix, buf, pos); pack_param(args->preloadfile, buf, pos); @@ -232,7 +232,7 @@ int getApplicationArgsFE(spindle_args_t *params, int *spindle_argc, char ***spin (*spindle_argv)[n++] = strdup(numports_s); (*spindle_argv)[n++] = strdup(uniqueid_s); } - (*spindle_argv)[n++] = strdup(params->location); + (*spindle_argv)[n++] = strdup(params->commpath); (*spindle_argv)[n++] = strdup(params->candidate_cachepaths); (*spindle_argv)[n++] = strdup(number_s); (*spindle_argv)[n++] = strdup(opt_s); @@ -398,11 +398,11 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Start FE server */ debug_printf("spindle_args_t { number = %lu; port = %u; num_ports = %u; opts = %lu; unique_id = %lu; " - "use_launcher = %u; startup_type = %u; shm_cache_size = %u; location = %s; " + "use_launcher = %u; startup_type = %u; shm_cache_size = %u; commpath = %s; " "cachepaths = %s; " "pythonprefix = %s; preloadfile = %s; bundle_timeout_ms = %u; bundle_cachesize_kb = %u }\n", (unsigned long) params->number, params->port, params->num_ports, params->opts, params->unique_id, - params->use_launcher, params->startup_type, params->shm_cache_size, params->location, + params->use_launcher, params->startup_type, params->shm_cache_size, params->commpath, params->candidate_cachepaths, params->pythonprefix, params->preloadfile, params->bundle_timeout_ms, params->bundle_cachesize_kb); diff --git a/src/flux/flux-spindle.c b/src/flux/flux-spindle.c index 65de5ed9..101bb6ac 100644 --- a/src/flux/flux-spindle.c +++ b/src/flux/flux-spindle.c @@ -381,7 +381,7 @@ static int sp_getopts 
(flux_shell_t *shell, struct spindle_ctx *ctx) int numa = 0; const char *relocaout = NULL, *reloclibs = NULL, *relocexec = NULL, *relocpython = NULL; const char *followfork = NULL, *preload = NULL, *level = NULL; - const char *pyprefix = NULL, *location = NULL; + const char *pyprefix = NULL, *commpath = NULL; char *numafiles = NULL, *cachepaths = NULL; if (flux_shell_getopt_unpack (shell, "spindle", "o", &opts) < 0) @@ -415,7 +415,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) "reloc-exec", &relocexec, "reloc-python", &relocpython, "python-prefix", &pyprefix, - "location", &location, + "commpath", &commpath, "numa", &numa, "numa-files", &numafiles, "preload", &preload, @@ -466,8 +466,8 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) if( cachepaths ){ ctx->params.candidate_cachepaths = cachepaths; } - if (location) { - ctx->params.location = (char *) location; + if (commpath) { + ctx->params.commpath = (char *) commpath; } if (level) { if (strcmp(level, "high") == 0) { diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 81c0728e..ca7b8d3d 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -123,8 +123,8 @@ typedef struct { /* Size of client shared memory cache */ unsigned int shm_cache_size; - /* The local-disk location where Spindle will store its cache */ - char *location; + /* The local-disk location for communication and housekeeping. */ + char *commpath; /* Path[s] for cached libraries. 
*/ char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 29107bad..91d8779c 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -202,7 +202,7 @@ static int handle_client_info_msg(ldcs_process_data_t *procdata, int nc, ldcs_me else if(msg->header.type == LDCS_MSG_LOCATION) { strncpy(client->remote_location, msg->data, sizeof(client->remote_location)-1); client->remote_location[sizeof(client->remote_location)-1] = '\0'; - debug_printf2("Server recvd location %s from %d\n", msg->data, nc); + debug_printf2("Server recvd remote_location %s from %d\n", msg->data, nc); } else if (msg->header.type == LDCS_MSG_CPU) { int clientcpu; @@ -2953,7 +2953,7 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs } /** - * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which of the locations, commpaths, and cachepaths are + * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which cachepaths are * available across all of the servers. */ @@ -2969,8 +2969,8 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag ldcs_audit_server_md_allreduce_AND( &procdata->cachepath_bitidx ); if( procdata->cachepath_bitidx == 0 ){ - err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); - procdata->cachepath = procdata->location; + err_printf("No valid cachepath path available. Falling back to \"commpath\" path (%s).\n", procdata->commpath); + procdata->cachepath = procdata->commpath; }else{ // ldcs_audit_server_filemngt_init() does it's own realize() pass. 
getValidCachePathByIndex( procdata->cachepath_bitidx, @@ -2979,7 +2979,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag &procdata->symbolic_cachepath); } - debug_printf3("Initializing file cache location %s\n", procdata->location); + debug_printf3("Initializing file cache cachepath %s\n", procdata->cachepath); ldcs_audit_server_filemngt_init(procdata->cachepath); test_printf(" cachepath=%s\n", procdata->cachepath); @@ -3224,7 +3224,7 @@ int exit_note_cb(int fd, int serverid, void *data) eresult = -1; } - result = handleExitNote(fd, procdata->location); + result = handleExitNote(fd, procdata->commpath); if (result == -1) { debug_printf("handleExitNote failed\n"); eresult = -1; diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 566cad01..a73a7b7f 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -113,7 +113,7 @@ void startprofile(spindle_args_t *args) char hostname[257]; char *home = getenv("HOME"); if (!home || !*home) - home = ldcs_process_data.location; + home = ldcs_process_data.commpath; gethostname(hostname, sizeof(hostname)); snprintf(filename, 4096, "%s/spindled.%lu.%s.%d.prof", home, (unsigned long) args->number, hostname, getpid()); ProfilerStart(filename); @@ -142,7 +142,7 @@ int ldcs_audit_server_process(spindle_args_t *args) startprofile(args); debug_printf3("Initializing server data structures\n"); - ldcs_process_data.location = args->location; + ldcs_process_data.commpath = args->commpath; ldcs_process_data.cachepaths = args->candidate_cachepaths; ldcs_process_data.cachepath = NULL; ldcs_process_data.cachepath_bitidx = 0; @@ -196,11 +196,11 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_process_data.server_stat.hostname=ldcs_process_data.hostname; if (ldcs_process_data.opts & OPT_PROCCLEAN) - init_cleanup_proc(ldcs_process_data.location); + 
init_cleanup_proc(ldcs_process_data.commpath); debug_printf3("Initializing connections for clients at %s and %lu\n", - ldcs_process_data.location, (unsigned long) ldcs_process_data.number); - serverid = ldcs_create_server(ldcs_process_data.location, ldcs_process_data.number); + ldcs_process_data.commpath, (unsigned long) ldcs_process_data.number); + serverid = ldcs_create_server(ldcs_process_data.commpath, ldcs_process_data.number); if (serverid == -1) { err_printf("Unable to setup area for client connections\n"); return -1; @@ -216,7 +216,7 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_listen_register_fd(fd, serverid, &_ldcs_server_CB, (void *) &ldcs_process_data); if (args->opts & OPT_BEEXIT) { - fd = createExitNote(args->location); + fd = createExitNote(args->commpath); if (fd != -1) { ldcs_listen_register_fd(fd, serverid, exit_note_cb, (void *) &ldcs_process_data); } @@ -254,7 +254,7 @@ int ldcs_audit_server_run() _ldcs_server_stat_print(&ldcs_process_data.server_stat); - debug_printf("destroy server (%s,%lu)\n", ldcs_process_data.location, (unsigned long) ldcs_process_data.number); + debug_printf("destroy server (%s,%lu)\n", ldcs_process_data.commpath, (unsigned long) ldcs_process_data.number); ldcs_destroy_server(ldcs_process_data.serverid); /* destroy md support (multi-daemon) */ diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 1495cebd..82b60023 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -125,7 +125,7 @@ struct ldcs_process_data_struct int exit_readys_recvd; ldcs_dist_model_t dist_model; ldcs_client_t* client_table; - char *location; /* Single user-specified path for fifo, daemons, etc. */ + char *commpath; /* Single user-specified path for fifo, daemons, etc. */ /* (Everything except the cachepath.) */ char *cachepaths; /* Up to 64 colon-separated list of candidate cachepaths. 
*/ char *cachepath; /* The earliest path in the list available to all servers. */ diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 733d3244..fa0eccb3 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -58,7 +58,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) unpack_param(args->use_launcher, buf, pos); unpack_param(args->startup_type, buf, pos); unpack_param(args->shm_cache_size, buf, pos); - unpack_param(args->location, buf, pos); + unpack_param(args->commpath, buf, pos); unpack_param(args->candidate_cachepaths, buf, pos); unpack_param(args->pythonprefix, buf, pos); unpack_param(args->preloadfile, buf, pos); @@ -144,15 +144,15 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i assert(args.port == port); - /* Expand environment variables in location. */ - char *new_location = parse_location(args.location, args.number); - if (!new_location) { - err_printf("Failed to convert location %s\n", args.location); + /* Expand environment variables in commpath. */ + char *new_commpath = parse_location(args.commpath, args.number); + if (!new_commpath) { + err_printf("Failed to convert commpath %s\n", args.commpath); return -1; } - debug_printf("Translated location from %s to %s\n", args.location, new_location); - free(args.location); - args.location = new_location; + debug_printf("Translated commpath from %s to %s\n", args.commpath, new_commpath); + free(args.commpath); + args.commpath = new_commpath; result = ldcs_audit_server_process(&args); if (result == -1) { From 0db1331c7e30096010e4e1868ae6383343466ec9 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 24 Oct 2025 12:45:10 -0700 Subject: [PATCH 23/66] Renames enums. 
--- src/client/client/client.c | 4 ++-- src/client/client/intercept_exec.c | 6 +++--- src/client/client_comlib/client_api.c | 2 +- src/fe/startup/parse_launcher_args.cc | 12 ++++++------ src/include/ldcs_api.h | 2 +- src/server/auditserver/ldcs_audit_server_handlers.c | 4 ++-- .../auditserver/ldcs_audit_server_md_msocket.c | 2 +- src/server/comlib/ldcs_api_util.c | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index ac8d4b63..1635e060 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -198,7 +198,7 @@ static int init_server_connection() if (!use_ldcs) return 0; - location = getenv("LDCS_LOCATION"); + location = getenv("LDCS_COMMPATH"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -217,7 +217,7 @@ static int init_server_connection() if (!(opts & OPT_FOLLOWFORK)) { debug_printf("Disabling environment variables because we're not following forks\n"); unsetenv("LD_AUDIT"); - unsetenv("LDCS_LOCATION"); + unsetenv("LDCS_COMMPATH"); unsetenv("LDCS_NUMBER"); unsetenv("LDCS_CONNECTION"); unsetenv("LDCS_RANKINFO"); diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index 27928d35..bae9eba3 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -141,7 +141,7 @@ static char **removeEnvironmentStrs(char **envp) continue; if (strIsPrefix("LD", envp[i])) { if (strIsPrefix("LD_AUDIT=", envp[i]) || - strIsPrefix("LDCS_LOCATION=", envp[i]) || + strIsPrefix("LDCS_COMMPATH=", envp[i]) || strIsPrefix("LDCS_CONNECTION=", envp[i]) || strIsPrefix("LDCS_RANKINFO=", envp[i]) || strIsPrefix("LDCS_OPTIONS=", envp[i]) || @@ -175,7 +175,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp unsetf = orig_unsetenv ? 
orig_unsetenv : unsetenv; unsetf("SPINDLE"); unsetf("LD_AUDIT"); - unsetf("LDCS_LOCATION"); + unsetf("LDCS_COMMPATH"); unsetf("LDCS_CONNECTION"); unsetf("LDCS_RANKINFO"); unsetf("LDCS_OPTIONS"); @@ -201,7 +201,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); propogateEnvironmentStr(envp, newenv, &pos, "LD_AUDIT"); - propogateEnvironmentStr(envp, newenv, &pos, "LDCS_LOCATION"); + propogateEnvironmentStr(envp, newenv, &pos, "LDCS_COMMPATH"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_CONNECTION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_RANKINFO"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_OPTIONS"); diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 4b0f9ded..f5e07cc0 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -349,7 +349,7 @@ int send_cpu(int fd, int cpu) { int send_location(int fd, char *location) { ldcs_message_t message; - message.header.type = LDCS_MSG_LOCATION; + message.header.type = LDCS_MSG_COMMPATH; message.header.len = strlen(location)+1; message.data = location; diff --git a/src/fe/startup/parse_launcher_args.cc b/src/fe/startup/parse_launcher_args.cc index 84cf7f23..710b009d 100644 --- a/src/fe/startup/parse_launcher_args.cc +++ b/src/fe/startup/parse_launcher_args.cc @@ -127,7 +127,7 @@ static cmdoption_t openmpi_options[] = { }; -static const char *openmpi_bg_env_str = "-x LD_AUDIT=%s -x LDCS_LOCATION=%s -x LDCS_NUMBER=%s -x LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *openmpi_bg_env_str = "-x LD_AUDIT=%s -x LDCS_COMMPATH=%s -x LDCS_NUMBER=%s -x LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; static const unsigned int openmpi_size = (sizeof(openmpi_options) / sizeof(cmdoption_t)); static cmdoption_t srun_options[] = { @@ -218,11 +218,11 @@ static cmdoption_t srun_options[] = { { NULL, "--usage", 0 }, { "-V", "--version", 0 } }; -static 
const char *srun_bg_env_str = "--runjob-opts=--envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_str_nopreload = "--runjob-opts=--envs LD_AUDIT=%s%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_bare_str = "%s --envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_bare_str_preload = "%sLD_PRELOAD=%s:%s LD_AUDIT=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_bare_str_nopreload = "%s LD_AUDIT=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_str = "--runjob-opts=--envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_str_nopreload = "--runjob-opts=--envs LD_AUDIT=%s%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_bare_str = "%s --envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_bare_str_preload = "%sLD_PRELOAD=%s:%s LD_AUDIT=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_bare_str_nopreload = "%s LD_AUDIT=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; static const unsigned int srun_size (sizeof(srun_options) / sizeof(cmdoption_t)); diff --git a/src/include/ldcs_api.h b/src/include/ldcs_api.h index e6ccbafb..49ba45e3 100644 --- a/src/include/ldcs_api.h +++ b/src/include/ldcs_api.h @@ -45,7 +45,7 @@ typedef enum { LDCS_MSG_END, LDCS_MSG_CWD, LDCS_MSG_PID, - LDCS_MSG_LOCATION, + LDCS_MSG_COMMPATH, LDCS_MSG_CPU, LDCS_MSG_MYRANKINFO_QUERY, LDCS_MSG_MYRANKINFO_QUERY_ANSWER, diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c 
b/src/server/auditserver/ldcs_audit_server_handlers.c index 91d8779c..0129c25d 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -199,7 +199,7 @@ static int handle_client_info_msg(ldcs_process_data_t *procdata, int nc, ldcs_me client->remote_pid=mypid; debug_printf2("Server recvd pid %d from %d\n", mypid, nc); } - else if(msg->header.type == LDCS_MSG_LOCATION) { + else if(msg->header.type == LDCS_MSG_COMMPATH) { strncpy(client->remote_location, msg->data, sizeof(client->remote_location)-1); client->remote_location[sizeof(client->remote_location)-1] = '\0'; debug_printf2("Server recvd remote_location %s from %d\n", msg->data, nc); @@ -1866,7 +1866,7 @@ int handle_client_message(ldcs_process_data_t *procdata, int nc, ldcs_message_t switch (msg->header.type) { case LDCS_MSG_CWD: case LDCS_MSG_PID: - case LDCS_MSG_LOCATION: + case LDCS_MSG_COMMPATH: case LDCS_MSG_CPU: return handle_client_info_msg(procdata, nc, msg); case LDCS_MSG_PYTHONPREFIX_REQ: diff --git a/src/server/auditserver/ldcs_audit_server_md_msocket.c b/src/server/auditserver/ldcs_audit_server_md_msocket.c index c9d616c2..6db18bd4 100644 --- a/src/server/auditserver/ldcs_audit_server_md_msocket.c +++ b/src/server/auditserver/ldcs_audit_server_md_msocket.c @@ -62,7 +62,7 @@ int ldcs_audit_server_md_init ( ldcs_process_data_t *ldcs_process_data ) { int rc=0; char* ldcs_nportsstr=getenv("LDCS_NPORTS"); - char* ldcs_locmodstr=getenv("LDCS_LOCATION_MOD"); + char* ldcs_locmodstr=getenv("LDCS_COMMPATH_MOD"); int usedport; int serverfd, serverid, i; diff --git a/src/server/comlib/ldcs_api_util.c b/src/server/comlib/ldcs_api_util.c index b6beb56d..5ea51ab8 100644 --- a/src/server/comlib/ldcs_api_util.c +++ b/src/server/comlib/ldcs_api_util.c @@ -51,7 +51,7 @@ char* _message_type_to_str (ldcs_message_ids_t type) { STR_CASE(LDCS_MSG_END); STR_CASE(LDCS_MSG_CWD); STR_CASE(LDCS_MSG_PID); - STR_CASE(LDCS_MSG_LOCATION); + 
STR_CASE(LDCS_MSG_COMMPATH); STR_CASE(LDCS_MSG_CPU); STR_CASE(LDCS_MSG_MYRANKINFO_QUERY); STR_CASE(LDCS_MSG_MYRANKINFO_QUERY_ANSWER); From e322f7107e5ac0aeda7decb1fa0a059af7c3e003 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 5 Nov 2025 15:28:52 -0800 Subject: [PATCH 24/66] Use strdup() for commpath instead of stack var. Unlikely it would ever make a difference, but this is much more correct. --- src/server/startup/spindle_be.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index fa0eccb3..c3e7bbd9 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -152,7 +152,7 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i } debug_printf("Translated commpath from %s to %s\n", args.commpath, new_commpath); free(args.commpath); - args.commpath = new_commpath; + args.commpath = strdup(new_commpath); result = ldcs_audit_server_process(&args); if (result == -1) { From 3acca2c5db74c816bcecb0aec03f1edb4014d30d Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 12 Nov 2025 10:13:59 -0800 Subject: [PATCH 25/66] Adds LDCS_COMMPATH --- src/client/client/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index 1635e060..69488b10 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -198,7 +198,7 @@ static int init_server_connection() if (!use_ldcs) return 0; - location = getenv("LDCS_COMMPATH"); + location = getenv("LDCS_LOCATION"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); From 06876b640dab05dec43f2a3955a299d5706afa6b Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 13 Nov 2025 12:56:57 -0800 Subject: [PATCH 26/66] Restores checkLinkForLeak() to test_driver.c --- testsuite/test_driver.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 
insertions(+) diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index a0b46a75..544bfd20 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1169,6 +1169,28 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } +static int checkLinkForLeak(const char *path, const char *spindle_loc) +{ + char link_target[4096]; + int result, error; + memset(link_target, 0, sizeof(link_target)); + + result = readlink(path, link_target, sizeof(link_target)); + if (result == -1) { + error = errno; + err_printf("Failed to read link %s: %s\n", path, strerror(error)); + return -1; + } + + if (strstr(link_target, spindle_loc)) { + err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); + return -1; + } + + return 0; +} + + static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) { if (strstr(path, spindle_loc)) { @@ -1265,7 +1287,9 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); + checkLinkForLeak(path, spindle_loc); } + checkLinkForLeak("/proc/self/exe", spindle_loc); /** * Check link_maps for leaked spindle paths From ce7af376be5f77bae71693ab448685e26b4caec5 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 13 Nov 2025 15:24:58 -0800 Subject: [PATCH 27/66] Replacing "location" with "commpath" as needed. 
src/client/beboot/spindle_bootstrap.c Moved orig_location from static global to local Renamed symbolic_location to symbolic_commpath Renamed orig_location to orig_commpath Renamed location to commpath Renamed LDCS_LOCATION to LDCS_COMMPATH src/client/client/client.c Renamed LDCS_LOCATION to LDCS_COMMPATH Renamed location to commpath --- src/client/beboot/spindle_bootstrap.c | 24 ++++++++++++------------ src/client/client/client.c | 22 +++++++++++----------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 98424d50..d5f8a197 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -53,7 +53,7 @@ static int rankinfo[4]={-1,-1,-1,-1}; number_t number; static int use_cache; static unsigned int cachesize; -static char *location, *number_s, *orig_location, *symbolic_location; +static char *commpath, *number_s, *symbolic_commpath; static char **cmdline; static char *executable; static char *client_lib; @@ -91,7 +91,7 @@ extern char *realize(char *path); static int establish_connection() { debug_printf2("Opening connection to server\n"); - ldcsid = client_open_connection(location, number); + ldcsid = client_open_connection(commpath, number); if (ldcsid == -1) return -1; @@ -113,7 +113,7 @@ static void setup_environment() connection_str = client_get_connection_string(ldcsid); setenv("LD_AUDIT", client_lib, 1); - setenv("LDCS_LOCATION", location, 1); + setenv("LDCS_COMMPATH", commpath, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) @@ -159,7 +159,7 @@ static int parse_cmdline(int argc, char *argv[]) daemon_args[i - 3] = NULL; } - symbolic_location = argv[i++]; + symbolic_commpath = argv[i++]; i++; // Skip over candidate_cachepaths. 
number_s = argv[i++]; number = (number_t) strtoul(number_s, NULL, 0); @@ -173,7 +173,7 @@ static int parse_cmdline(int argc, char *argv[]) return 0; } -static void launch_daemon(char *location) +static void launch_daemon(char *commpath) { /*grand-child fork, then execv daemon. By grand-child forking we ensure that the app won't get confused by seeing an unknown process as a child. */ @@ -183,12 +183,12 @@ static void launch_daemon(char *location) char unique_file[MAX_PATH_LEN+1]; char buffer[32]; - result = spindle_mkdir(location); + result = spindle_mkdir(commpath); if (result == -1) { debug_printf("Exiting due to spindle_mkdir error\n"); exit(-1); } - snprintf(unique_file, MAX_PATH_LEN, "%s/spindle_daemon_pid", location); + snprintf(unique_file, MAX_PATH_LEN, "%s/spindle_daemon_pid", commpath); unique_file[MAX_PATH_LEN] = '\0'; fd = open(unique_file, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd == -1) { @@ -343,14 +343,14 @@ int main(int argc, char *argv[]) } } - orig_location = parse_location(symbolic_location, number); - if (!orig_location) { + char *orig_commpath = parse_location(symbolic_commpath, number); + if (!orig_commpath) { return -1; } - location = realize(orig_location); + commpath = realize(orig_commpath); if (daemon_args) { - launch_daemon(location); + launch_daemon(commpath); } result = establish_connection(); @@ -374,7 +374,7 @@ int main(int argc, char *argv[]) #else shm_cache_limit = cachesize; #endif - shmcache_init(location, number, cachesize, shm_cache_limit); + shmcache_init(commpath, number, cachesize, shm_cache_limit); use_cache = 1; } diff --git a/src/client/client/client.c b/src/client/client/client.c index 69488b10..0f0b6047 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -71,7 +71,7 @@ static const ElfW(Phdr) *libc_phdrs, *interp_phdrs; static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; -static char *location; +static char *commpath; char *chosen_realized_cachepath, 
*chosen_parsed_cachepath; number_t number; static int have_stat_patches; @@ -198,7 +198,7 @@ static int init_server_connection() if (!use_ldcs) return 0; - location = getenv("LDCS_LOCATION"); + commpath = getenv("LDCS_COMMPATH"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -207,9 +207,9 @@ static int init_server_connection() opts = strtoul(opts_s, NULL, 10); shm_cachesize = atoi(cachesize_s) * 1024; - if (strchr(location, '$')) { - location = parse_location(location, number); - if (!location) { + if (strchr(commpath, '$')) { + commpath = parse_location(commpath, number); + if (!commpath) { exit(-1); } } @@ -231,14 +231,14 @@ static int init_server_connection() #else shm_cache_limit = shm_cachesize; #endif - shmcache_init(location, number, shm_cachesize, shm_cache_limit); + shmcache_init(commpath, number, shm_cachesize, shm_cache_limit); } if (connection) { /* boostrapper established the connection for us. Reuse it. 
*/ debug_printf("Recreating existing connection to server\n"); - debug_printf3("location = %s, number = %lu, connection = %s, rankinfo = %s\n", - location, (unsigned long) number, connection, rankinfo_s); + debug_printf3("commpath = %s, number = %lu, connection = %s, rankinfo = %s\n", + commpath, (unsigned long) number, connection, rankinfo_s); ldcsid = client_register_connection(connection); if (ldcsid == -1) return -1; @@ -248,13 +248,13 @@ static int init_server_connection() } else { /* Establish a new connection */ - debug_printf("open connection to ldcs %s %lu\n", location, (unsigned long) number); - ldcsid = client_open_connection(location, number); + debug_printf("open connection to ldcs %s %lu\n", commpath, (unsigned long) number); + ldcsid = client_open_connection(commpath, number); if (ldcsid == -1) return -1; send_pid(ldcsid); - send_location(ldcsid, location); + send_location(ldcsid, commpath); send_rankinfo_query(ldcsid, rankinfo+0, rankinfo+1, rankinfo+2, rankinfo+3); #if defined(LIBNUMA) if (opts & OPT_NUMA) From 73d13fccd920fe295c01c51722e9f07717b32454 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 13 Nov 2025 16:06:35 -0800 Subject: [PATCH 28/66] Continues location rename. 
src/client/client_comlib/client_api.c Added setenv("LDCS_CHOSEN_PARSED_CACHEPATH", local_cpc); testsuite/test_driver.c Replaced LDCS_LOCATION and LDCS_ORIG_LOCATION checks for cachepath with LDCS_CHOSEN_PARSED_CACHEPATH Replaced spindle_loc with cachepath --- src/client/client_comlib/client_api.c | 2 ++ testsuite/test_driver.c | 48 +++++++++++++-------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index f5e07cc0..827acbf0 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -67,6 +67,8 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if( chosen_parsed_cachepath ){ *chosen_parsed_cachepath = local_cpc; } + // Required by testsuite/test_driver.c + setenv("LDCS_CHOSEN_PARSED_CACHEPATH", local_cpc, 1); return 0; } diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index 544bfd20..01bf3238 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1169,7 +1169,7 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } -static int checkLinkForLeak(const char *path, const char *spindle_loc) +static int checkLinkForLeak(const char *path, const char *cachepath) { char link_target[4096]; int result, error; @@ -1182,8 +1182,8 @@ static int checkLinkForLeak(const char *path, const char *spindle_loc) return -1; } - if (strstr(link_target, spindle_loc)) { - err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); + if (strstr(link_target, cachepath)) { + err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, cachepath); return -1; } @@ -1191,10 +1191,10 @@ static int checkLinkForLeak(const char *path, const char *spindle_loc) } -static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) +static int checkPathForLeak(const char 
*what, const char *path, const char *cachepath) { - if (strstr(path, spindle_loc)) { - err_printf("%s: Path '%s' leaks spindle path with '%s'\n", what, path, spindle_loc); + if (strstr(path, cachepath)) { + err_printf("%s: Path '%s' leaks spindle path with '%s'\n", what, path, cachepath); return -1; } return 0; @@ -1202,14 +1202,14 @@ static int checkPathForLeak(const char *what, const char *path, const char *spin static int leak_check_cb(struct dl_phdr_info *p, size_t psize, void *opaque) { - char *spindle_loc = (char *) opaque; + char *cachepath = (char *) opaque; if (!p->dlpi_name || p->dlpi_name[0] == '\0') return 0; - checkPathForLeak("dl_iterate_phdr", p->dlpi_name, spindle_loc); + checkPathForLeak("dl_iterate_phdr", p->dlpi_name, cachepath); return 0; } -static int check_proc_maps(char *path, char *spindle_loc) +static int check_proc_maps(char *path, char *cachepath) { int fd, error, result; struct stat statbuf; @@ -1248,8 +1248,8 @@ static int check_proc_maps(char *path, char *spindle_loc) maps[filesize] = '\0'; close(fd); - if (strstr(maps, spindle_loc)) { - err_printf("Found leaked spindle path '%s' in maps '%s'\n", spindle_loc, path); + if (strstr(maps, cachepath)) { + err_printf("Found leaked spindle path '%s' in maps '%s'\n", cachepath, path); return -1; } @@ -1259,17 +1259,15 @@ static int check_proc_maps(char *path, char *spindle_loc) void check_for_path_leaks() { - char *spindle_loc = NULL; + char *cachepath = NULL; DIR *proc_fds = NULL; struct dirent *d; char path[4096]; struct link_map *lm; char *dlerr_msg = NULL; - spindle_loc = getCacheLocation("LDCS_LOCATION"); - if (!spindle_loc) - spindle_loc = getCacheLocation("LDCS_ORIG_LOCATION"); - if (!spindle_loc) { + cachepath = getCacheLocation("LDCS_CHOSEN_PARSED_CACHEPATH"); + if (!cachepath) { err_printf("Failed to calculate cache location"); goto done; } @@ -1287,9 +1285,9 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, 
sizeof(path)-1); - checkLinkForLeak(path, spindle_loc); + checkLinkForLeak(path, cachepath); } - checkLinkForLeak("/proc/self/exe", spindle_loc); + checkLinkForLeak("/proc/self/exe", cachepath); /** * Check link_maps for leaked spindle paths @@ -1297,22 +1295,22 @@ void check_for_path_leaks() for (lm = _r_debug.r_map; lm != NULL; lm = lm->l_next) { if (!lm->l_name || lm->l_name[0] == '\0') continue; - checkPathForLeak("link_map", lm->l_name, spindle_loc); + checkPathForLeak("link_map", lm->l_name, cachepath); } /** * Check libraries in dl_iterate_phdr for leaked paths **/ - dl_iterate_phdr(leak_check_cb, spindle_loc); + dl_iterate_phdr(leak_check_cb, cachepath); /** * Check /proc/pid/maps under various aliases for leaked names **/ - check_proc_maps("/proc/self/maps", spindle_loc); + check_proc_maps("/proc/self/maps", cachepath); snprintf(path, sizeof(path), "/proc/self/task/%d/maps", getpid()); - check_proc_maps(path, spindle_loc); + check_proc_maps(path, cachepath); snprintf(path, sizeof(path), "/proc/%d/maps", getpid()); - check_proc_maps(path, spindle_loc); + check_proc_maps(path, cachepath); /** * Check that dlerror doesn't leak the /__not_exists/ prefix @@ -1324,8 +1322,8 @@ void check_for_path_leaks() } done: - if (spindle_loc) - free(spindle_loc); + if (cachepath) + free(cachepath); if (proc_fds) closedir(proc_fds); } From b49a922347ffdfd3178a949e8ad6270ecef2aede Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 7 Dec 2025 11:04:26 -0800 Subject: [PATCH 29/66] Fixes -Wsign-compare warning in new code. 
--- src/cobo/cobo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 65741008..2d1a53a9 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1437,7 +1437,7 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* if i have any children, receive their data */ - int64_t child_val; + uint64_t child_val; for(int i=cobo_num_child-1; i>=0; i--) { /* read int64_t from child */ if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { From ab21dedd37c762eb5cdc22caca9db915329b97d0 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 7 Dec 2025 11:33:43 -0800 Subject: [PATCH 30/66] LDCS_CHOSEN_PARSED_CACHEPATH set in bootstrap. --- src/client/beboot/spindle_bootstrap.c | 6 +++++- src/client/client_comlib/client_api.c | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index d5f8a197..d0e06439 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -107,13 +107,17 @@ static void setup_environment() { char rankinfo_str[256]; snprintf(rankinfo_str, 256, "%d %d %d %d %d", ldcsid, rankinfo[0], rankinfo[1], rankinfo[2], rankinfo[3]); - + char *connection_str = NULL; if (opts & OPT_RELOCAOUT) connection_str = client_get_connection_string(ldcsid); + char *chosen_parsed_cachepath; + send_cachepath_query( ldcsid , NULL, &chosen_parsed_cachepath); + setenv("LD_AUDIT", client_lib, 1); setenv("LDCS_COMMPATH", commpath, 1); + setenv("LDCS_CHOSEN_PARSED_CACHEPATH", chosen_parsed_cachepath, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 827acbf0..f5e07cc0 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -67,8 +67,6 @@ int 
send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if( chosen_parsed_cachepath ){ *chosen_parsed_cachepath = local_cpc; } - // Required by testsuite/test_driver.c - setenv("LDCS_CHOSEN_PARSED_CACHEPATH", local_cpc, 1); return 0; } From 30362bcbd8113bc3b9ca9e2ad05757381f6341dc Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 18 Dec 2025 19:46:36 -0800 Subject: [PATCH 31/66] Updates test_driver.c to ignore FIFO files. All tests pass with both distinct and identical commpaths/cachepaths. --- testsuite/test_driver.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index 01bf3238..aae6c5c8 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1283,6 +1283,11 @@ void check_for_path_leaks() for (d = readdir(proc_fds); d != NULL; d = readdir(proc_fds)) { if (d->d_name[0] == '.') continue; + // Ignore Spindle fifo files for now. + if ( strncmp( "315", d->d_name, 3 ) == 0 ) + continue; + if ( strncmp( "316", d->d_name, 3 ) == 0 ) + continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); checkLinkForLeak(path, cachepath); From 78b45b42b5fc84286842d6901b568162108773cc Mon Sep 17 00:00:00 2001 From: Barry Date: Mon, 12 Jan 2026 01:38:42 -0800 Subject: [PATCH 32/66] Sets TMPDIR=/tmp in each Dockerfile Additionally populates /etc/environment just in case ssh is used to set up the servers. 
--- containers/spindle-flux-ubuntu/Dockerfile | 2 ++ containers/spindle-serial-ubuntu/Dockerfile | 2 ++ containers/spindle-slurm-ubuntu/base/Dockerfile | 2 ++ containers/spindle-slurm-ubuntu/testing/Dockerfile | 2 ++ 4 files changed, 8 insertions(+) diff --git a/containers/spindle-flux-ubuntu/Dockerfile b/containers/spindle-flux-ubuntu/Dockerfile index 3af607a8..57badbfc 100644 --- a/containers/spindle-flux-ubuntu/Dockerfile +++ b/containers/spindle-flux-ubuntu/Dockerfile @@ -5,6 +5,8 @@ FROM fluxrm/flux-sched:${flux_sched_version} AS builder ARG replicas=4 ENV workers=${replicas} USER root +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ && apt-get -qq install -y --no-install-recommends \ diff --git a/containers/spindle-serial-ubuntu/Dockerfile b/containers/spindle-serial-ubuntu/Dockerfile index 3070596e..62c0cbf5 100644 --- a/containers/spindle-serial-ubuntu/Dockerfile +++ b/containers/spindle-serial-ubuntu/Dockerfile @@ -1,6 +1,8 @@ ARG ubuntu_version=noble FROM ubuntu:${ubuntu_version} USER root +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ # install latest pkg utils: diff --git a/containers/spindle-slurm-ubuntu/base/Dockerfile b/containers/spindle-slurm-ubuntu/base/Dockerfile index d4724276..d6eee9f2 100644 --- a/containers/spindle-slurm-ubuntu/base/Dockerfile +++ b/containers/spindle-slurm-ubuntu/base/Dockerfile @@ -1,6 +1,8 @@ ARG UBUNTU_VERSION=noble FROM ubuntu:${UBUNTU_VERSION} USER root +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment RUN apt-get update \ && DEBIAN_FRONTEND="noninteractive" apt-get -qq install -y --no-install-recommends \ diff --git a/containers/spindle-slurm-ubuntu/testing/Dockerfile b/containers/spindle-slurm-ubuntu/testing/Dockerfile index 91d6901c..99768535 100644 --- a/containers/spindle-slurm-ubuntu/testing/Dockerfile +++ b/containers/spindle-slurm-ubuntu/testing/Dockerfile @@ -2,6 
+2,8 @@ ARG BASE_VERSION=latest FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} ARG replicas=4 ENV workers=${replicas} +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing From 2e0db40e0a413825682dc85168ca0fb0a1940f62 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 10:41:36 -0800 Subject: [PATCH 33/66] Restores --with-localstorage to generate error. The option is marked as obsolete in configure --help and will cause an error in configure if it is specified. As updates the CI configure scripts to use --with-cachepaths and --with-commpath instead of --with-localstorage. --- configure | 9 +++++++++ configure.common.ac | 4 ++++ containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- .../spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- .../testing/scripts/build_spindle.sh | 2 +- src/client/configure | 9 +++++++++ src/fe/configure | 9 +++++++++ src/server/configure | 9 +++++++++ 8 files changed, 43 insertions(+), 3 deletions(-) diff --git a/configure b/configure index 05524e9f..d1b936c0 100755 --- a/configure +++ b/configure @@ -847,6 +847,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1591,6 +1592,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -16666,6 +16669,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." 
"$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} diff --git a/configure.common.ac b/configure.common.ac index 5438173a..c507b053 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -17,6 +17,10 @@ AC_ARG_WITH(default-num-ports, [AS_HELP_STRING([--with-default-numports=NUM],[Number of TCP/IP ports to scan for Spindle server communication])], [NUM_COBO_PORTS=${withval}], [NUM_COBO_PORTS=$DEFAULT_NUM_COBO_PORTS]) +AC_ARG_WITH(localstorage, + [AS_HELP_STRING([--with-localstorage=DIR (obsolete)],[Use --with-cachepaths and --with-commpath instead.])], + [AC_MSG_ERROR(requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead.)], + []) AC_ARG_WITH(cachepaths, [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], [CACHEPATHS=${withval}], diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 9257f85e..283e4451 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index 37b6491a..4fee85b4 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ 
b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index 6943e49a..17e7197f 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/src/client/configure b/src/client/configure index aef2c665..b854bc84 100755 --- a/src/client/configure +++ b/src/client/configure @@ -810,6 +810,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1533,6 +1534,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. 
--with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -12591,6 +12594,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." "$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} diff --git a/src/fe/configure b/src/fe/configure index b5c73f35..7384873e 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -831,6 +831,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1571,6 +1572,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -16441,6 +16444,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." "$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. 
if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} diff --git a/src/server/configure b/src/server/configure index bf356eca..adac8311 100755 --- a/src/server/configure +++ b/src/server/configure @@ -837,6 +837,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1568,6 +1569,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -16438,6 +16441,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." "$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} From cc1ed050b014707e9b0583c72f8f80c96eac7646 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 16:27:54 -0800 Subject: [PATCH 34/66] Updates spank plugin to use commpath. Replaces args->location with args->commpath. 
--- src/slurm_plugin/slurm_plugin.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 365e2b27..e5d1ccdb 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -71,6 +71,7 @@ static int enable_spindle = 0; extern char **environ; extern char *parse_location(char *loc, number_t number); +extern char *realize(char *path); struct spank_option spank_options[] = { @@ -260,7 +261,7 @@ static unique_id_t getUniqueID(spank_t spank) static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv, unique_id_t unique_id) { int result; - char *oldlocation; + char *symbolic_commpath, *orig_commpath; char *err_string; args->unique_id = unique_id; @@ -279,10 +280,14 @@ static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv args->use_launcher = slurm_plugin_launcher; args->startup_type = startup_external; - oldlocation = args->location; + symbolic_commpath = args->commpath; + orig_commpath = parse_location(xmbolic_commpath, args->number); + if( !orig_commpath ){ + return -1; + } + args->commpath = realize(orig_commpath) + current_spank = spank; - args->location = parse_location(oldlocation, args->number); - free(oldlocation); return 0; } @@ -695,16 +700,16 @@ static int handleExit(void *params, char **output_str) return 0; } - if (!args.location) { - sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); + if (!args.commpath) { + sdprintf(2, "WARNING: spindleExitBE not called since commpath is NULL\n"); } else { // The task_exit callback is run for _each proc_, so we use // isBEProc to pick only one proc per node to call spindleExitBE. 
is_be_leader = isBEProc(&args, 1); if (is_be_leader) { - result = spindleExitBE(args.location); + result = spindleExitBE(args.commpath); if (result == -1) { - sdprintf(1, "ERROR: spindleExitBE returned an error on location %s\n", args.location); + sdprintf(1, "ERROR: spindleExitBE returned an error on commpath %s\n", args.commpath); return -1; } } From a9c8809d6da0b7d5b147b79d010f7d794faa59b9 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 17:15:21 -0800 Subject: [PATCH 35/66] Updates a configure script to use commpath. --- .../testing-plugin/scripts/build_spindle.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 2a252b32..879a28c6 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From 73b2d694578adccb5e2cc71ccb5ac3ce5adc3c6f Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 20:12:23 -0800 Subject: [PATCH 36/66] Additional integration for commpath + spank-plugin. 
--- src/slurm_plugin/plugin_utils.c | 6 +++--- src/slurm_plugin/slurm_plugin.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 3be45742..392677c4 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -269,7 +269,7 @@ int isBEProc(spindle_args_t *params, unsigned int exit_phase) int beproc_result = -1; int fd = -1, error; - dir = params->location; + dir = params->commpath; if (!dir) { sdprintf(1, "ERROR: Location not filled in\n"); goto done; @@ -615,7 +615,7 @@ int registerFEPid(pid_t pid, spindle_args_t *args) int fd; int result; - snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->location); + snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->commpath); pid_file[sizeof(pid_file)-1] = '\0'; snprintf(pid_s, sizeof(pid_s), "%d\n", (int) pid); @@ -646,7 +646,7 @@ int readFEPid(pid_t *pid, spindle_args_t *args) pid_t pid_result; int fd, result; - snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->location); + snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->commpath); pid_file[sizeof(pid_file)-1] = '\0'; sdprintf(2, "Reading FE pid from %s\n", pid_file); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index e5d1ccdb..501c3cfe 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -281,11 +281,11 @@ static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv args->startup_type = startup_external; symbolic_commpath = args->commpath; - orig_commpath = parse_location(xmbolic_commpath, args->number); + orig_commpath = parse_location(symbolic_commpath, args->number); if( !orig_commpath ){ return -1; } - args->commpath = realize(orig_commpath) + args->commpath = realize(orig_commpath); current_spank = spank; From 46e6e5141b3ded3485f91defda43a6772194bc72 Mon Sep 17 00:00:00 2001 From: Barry Date: Sat, 14 Feb 2026 11:45:32 -0800 Subject: [PATCH 37/66] 
Fixes two silly bugs. TMPDIR left out of a docker script, and --cachepath instead of --cachepaths. --- configure | 8 ++++++++ configure.common.ac | 4 ++++ containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile | 1 + .../testing-plugin/scripts/build_spindle.sh | 2 +- src/client/configure | 8 ++++++++ src/fe/configure | 8 ++++++++ src/server/configure | 8 ++++++++ 7 files changed, 38 insertions(+), 1 deletion(-) diff --git a/configure b/configure index d1b936c0..e838bb77 100755 --- a/configure +++ b/configure @@ -849,6 +849,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1596,6 +1597,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -16683,6 +16685,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? "use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. 
if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} diff --git a/configure.common.ac b/configure.common.ac index c507b053..2d158fce 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -25,6 +25,10 @@ AC_ARG_WITH(cachepaths, [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], [CACHEPATHS=${withval}], [CACHEPATHS=$DEFAULT_LOC]) +AC_ARG_WITH(cachepath, + [[],[]], + [AC_MSG_ERROR(use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths)], + []) AC_ARG_WITH(commpath, [AS_HELP_STRING([--with-compath=DIR],[Back-end directory for communication and housekeeping])], [COMMPATH=${withval}], diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile index 7b66a155..da39cbe5 100644 --- a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile +++ b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile @@ -2,6 +2,7 @@ ARG BASE_VERSION=latest FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} ARG replicas=4 ENV workers=${replicas} +ENV TMPDIR=/tmp ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-plugin diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 879a28c6..1aa9889c 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge 
--with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepaths=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/src/client/configure b/src/client/configure index b854bc84..785c4d0e 100755 --- a/src/client/configure +++ b/src/client/configure @@ -812,6 +812,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1538,6 +1539,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -12608,6 +12610,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? "use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} diff --git a/src/fe/configure b/src/fe/configure index 7384873e..9f333077 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -833,6 +833,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1576,6 +1577,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -16458,6 +16460,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? 
"use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} diff --git a/src/server/configure b/src/server/configure index adac8311..6c208220 100755 --- a/src/server/configure +++ b/src/server/configure @@ -839,6 +839,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1573,6 +1574,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -16455,6 +16457,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? "use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} From f1094fcdec48d6589fdd261186f7af4ed2c7c38e Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 19 Feb 2026 23:29:55 -0800 Subject: [PATCH 38/66] Testing non-overlapping cache/commpath directories. 
--- containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile | 1 + .../testing-plugin/scripts/build_spindle.sh | 2 +- .../spindle-slurm-ubuntu/testing/scripts/build_spindle.sh | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 283e4451..90dbbb21 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index 4fee85b4..cdc18537 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make 
install diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile index da39cbe5..951480f1 100644 --- a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile +++ b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile @@ -3,6 +3,7 @@ FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} ARG replicas=4 ENV workers=${replicas} ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-plugin diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 1aa9889c..5fbb5c97 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepaths=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp/commpath --with-cachepaths=/tmp/cachepath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index 17e7197f..be6b933a 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch 
--with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From b9d29a5523374f6f90fea7226e456e2382387f54 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 20 Feb 2026 09:10:25 -0800 Subject: [PATCH 39/66] Testing commpath as subdirectory of cachepath. --- containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- .../testing-plugin/scripts/build_spindle.sh | 2 +- .../spindle-slurm-ubuntu/testing/scripts/build_spindle.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 90dbbb21..02f44096 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index cdc18537..42604a2f 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ 
b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 5fbb5c97..1e88d116 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp/commpath --with-cachepaths=/tmp/cachepath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index be6b933a..fffe6a55 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build 
-/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From 2b57b08f8da9bd5f3ee183cf0fdc482541f62199 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 20 Feb 2026 09:22:33 -0800 Subject: [PATCH 40/66] Testing cachepath as a subdirectory of commpath. --- containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- .../testing-plugin/scripts/build_spindle.sh | 2 +- .../spindle-slurm-ubuntu/testing/scripts/build_spindle.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 02f44096..a63b5d8d 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh 
b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index 42604a2f..1022ba14 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 1e88d116..2b36be90 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index fffe6a55..7fcb48e3 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ 
b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From 2cfb6ce302d0870687e3de951cb99793b944af6b Mon Sep 17 00:00:00 2001 From: Barry Date: Mon, 9 Mar 2026 16:23:24 -0700 Subject: [PATCH 41/66] Generate debug log artifacts as part of github ci --- .github/workflows/ci.yml | 286 +++++++++++++++++++++++++++++---------- 1 file changed, 212 insertions(+), 74 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d58ff869..ebb1e8fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,9 +1,9 @@ name: ci - on: push: branches: - devel + - commpath pull_request: branches: - devel @@ -17,11 +17,18 @@ concurrency: cancel-in-progress: true jobs: +################################################################################ +# spindle-serial-ubuntu +################################################################################ spindle-serial-ubuntu: - name: Testsuite (Serial, Ubuntu) + name: Testsuite (Serial, Ubuntu, debug=${{ matrix.debug }}) environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + debug: [3, 2, 1, 0] steps: - name: Check out Spindle uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -48,10 +55,38 @@ jobs: run: | docker exec spindlenode bash -c 'munge -n | unmunge' +# Matrix across 
"debug" - name: Run spindle-serial-ubuntu testsuite - id: serial-ubuntu-testsuite + timeout-minutes: ${{ matrix.debug == 0 && 14 || 5 }} + run: | + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec spindlenode bash -c \ + 'cd Spindle-build/testsuite && ./runTests' + else + docker exec spindlenode bash -c \ + "cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} ./runTests" + fi + +# If we saw any failures, tar up the logfiles for extraction. + - name: On failure, pull logs out of the container(s) + id: serial-ubuntu-copy-from-container + if: ${{ failure() && matrix.debug != 0}} + continue-on-error: true run: | - docker exec spindlenode bash -c 'cd Spindle-build/testsuite && ./runTests' + docker exec spindlenode bash -c \ + "cd /home/spindleuser/Spindle-build && \ + tar cjvf ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 \ + ./testsuite/spindle_output*" + docker cp \ + spindlenode:/home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 . + + - name: Upload ubuntu-serial logs + id: serial-ubuntu-copy-to-artifact + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: Ubuntu serial logs tarball + path: ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down @@ -61,43 +96,96 @@ jobs: cd containers/spindle-serial-ubuntu docker compose down - spindle-flux-ubuntu: - name: Testsuite (Flux, Ubuntu) +################################################################################ +# spindle-flux-ubuntu +################################################################################ + spindle-flux-ubuntu-debug3: + name: Testsuite (Flux, Ubuntu, SPINDLE_DEBUG=3) environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + debug: [3, 2, 1, 0] steps: - - name: Check out Spindle + - name: Check out Spindle (spindle-flux-ubuntu-debug3) uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - - name: Setup Docker 
Compose + - name: Setup Docker Compose (spindle-flux-ubuntu-debug3) uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 with: version: latest - - name: Build spindle-flux-ubuntu image + - name: Build spindle-flux-ubuntu-debug3 image id: flux-ubuntu-build run: | cd containers/spindle-flux-ubuntu docker compose --progress=plain build - name: Bring spindle-flux-ubuntu up - id: flux-ubuntu-up + id: flux-ubuntu--up run: | cd containers/spindle-flux-ubuntu docker compose up -d --wait --wait-timeout 60 - - name: Verify munge works in spindle-flux-ubuntu - id: flux-ubuntu-munge + - name: Verify munge works in spindle-flux-ubuntu-debug3 + id: flux-ubuntu-debug3-munge run: | docker exec node-1 bash -c 'munge -n | unmunge' + # Observed time: 5m 12s - name: Run spindle-flux-ubuntu testsuite + timeout-minutes: 7 id: flux-ubuntu-testsuite run: | - docker exec node-1 bash -c 'cd Spindle-build/testsuite && flux alloc --nodes=${workers} ./runTests --nodes=${workers} --tasks-per-node=3' - - - name: Bring spindle-flux-ubuntu down + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec node-1 bash -c \ + 'cd Spindle-build/testsuite && \ + flux alloc --nodes=${workers} \ + ./runTests --nodes=${workers} --tasks-per-node=3' + else + docker exec node-1 bash -c \ + 'cd Spindle-build/testsuite && \ + SPINDLE_DEBUG=${{ matrix.debug }} \ + flux alloc --nodes=${workers} \ + ./runTests --nodes=${workers} --tasks-per-node=3' + fi + + # If we saw any failures, tar up the logfiles for extraction. 
Observed time: 7m 51s + - name: Extract logs from spindle-flux-ubuntu + timeout-minutes: 9 + id: flux-ubuntu-tar + if: ${{ failure() && matrix.debug != 0 }} + continue-on-error: true + run: | + docker exec node-1 bash -c 'ls /home/fluxuser/Spindle-build/testsuite' + docker exec node-1 bash -c 'cd /home/fluxuser && \ + tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' + docker cp node-1:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . + docker exec node-2 bash -c 'cd /home/fluxuser && \ + tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' + docker cp node-2:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker exec node-3 bash -c 'cd /home/fluxuser && \ + tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' + docker cp node-3:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker exec node-4 bash -c 'cd /home/fluxuser && \ + tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' + docker cp node-4:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . 
+ + - name: Upload logs to artifacts + id: flux-ubuntu-artifact + if: ${{ failure() && matrix.debug != 0 }} + uses: actions/upload-artifact@v4 + with: + name: Ubuntu flux logs tarball + path: | + ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + + - name: spindle-flux-ubuntu teardown container id: flux-ubuntu-down if: ${{ always() }} continue-on-error: true @@ -105,11 +193,18 @@ jobs: cd containers/spindle-flux-ubuntu docker compose down +################################################################################ +# spindle-slurm-ubuntu +################################################################################ spindle-slurm-ubuntu: name: Testsuite (Slurm, Ubuntu) environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + debug: [3, 2, 1, 0] steps: - name: Check out Spindle uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -150,74 +245,117 @@ jobs: run: | docker exec slurm-head bash -c 'munge -n | unmunge' +# Matrix across "debug" - name: Run spindle-slurm-ubuntu testsuite - id: slurm-ubuntu-testsuite + timeout-minutes: 8 + id: slurm-ubuntu-debug3-testsuite + if: ${{ success() }} run: | - docker exec slurm-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}' - - - name: Bring spindle-slurm-ubuntu down - id: slurm-ubuntu-down - if: ${{ always() }} + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec slurm-head bash -c \ + 'cd Spindle-build/testsuite && \ + salloc -n${workers} -N${workers} ./runTests ${workers}' + else + docker exec slurm-head bash -c \ + 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ + salloc -n${workers} -N${workers} ./runTests ${workers}' + fi + +# Remove logs on failure + - name: Pull logs out of the container(s) + id: 
slurm-ubuntu-copy-from-container + if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | - cd containers/spindle-slurm-ubuntu/testing - docker compose down - - spindle-slurm-plugin-ubuntu: - name: Testsuite (Slurm Plugin, Ubuntu) - environment: Spindle CI - runs-on: ubuntu-latest - timeout-minutes: 20 - steps: - - name: Check out Spindle - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - - - name: Setup Docker Compose - uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 + docker exec slurm-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' + docker exec slurm-node-1 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' + docker cp slurm-node-1:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . + docker exec slurm-node-2 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' + docker cp slurm-node-2:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker exec slurm-node-3 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' + docker cp slurm-node-3:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker exec slurm-node-4 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' + docker cp slurm-node-4:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . 
+ + - name: Upload slurm ubuntu logs + id: slurm-ubuntu-copy-to-artifact + if: ${{ failure() && matrix.debug != 0 }} + uses: actions/upload-artifact@v4 with: - version: latest + name: Ubuntu slurm logs tarball + path: | + ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - - name: Login to GitHub Container Registry - if: ${{ !env.ACT }} - uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Generate MariaDB configuration - id: slurm-ubuntu-mariadb - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - ./generate_config.sh - - - name: Build spindle-slurm-plugin-ubuntu image - id: slurm-ubuntu-build - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - docker compose --progress=plain build - - - name: Bring spindle-slurm-plugin-ubuntu up - id: slurm-ubuntu-up - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - docker compose up -d --wait --wait-timeout 120 - - - name: Verify munge works in spindle-slurm-plugin-ubuntu - id: slurm-ubuntu-munge - run: | - docker exec slurm-plugin-head bash -c 'munge -n | unmunge' - - - name: Run spindle-slurm-plugin-ubuntu testsuite - id: slurm-ubuntu-testsuite - run: | - docker exec slurm-plugin-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}' - - - name: Bring spindle-slurm-plugin-ubuntu down + - name: Bring spindle-slurm-ubuntu down id: slurm-ubuntu-down if: ${{ always() }} continue-on-error: true run: | - cd containers/spindle-slurm-ubuntu/testing-plugin + cd containers/spindle-slurm-ubuntu/testing docker compose down +## spindle-slurm-plugin-ubuntu: +## name: Testsuite (Slurm Plugin, Ubuntu) +## environment: Spindle CI +## runs-on: ubuntu-latest +## timeout-minutes: 20 
+## steps: +## - name: Check out Spindle +## uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 +## +## - name: Setup Docker Compose +## uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 +## with: +## version: latest +## +## - name: Login to GitHub Container Registry +## if: ${{ !env.ACT }} +## uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef +## with: +## registry: ghcr.io +## username: ${{ github.actor }} +## password: ${{ secrets.GITHUB_TOKEN }} +## +## - name: Generate MariaDB configuration +## id: slurm-ubuntu-mariadb +## run: | +## cd containers/spindle-slurm-ubuntu/testing-plugin +## ./generate_config.sh +## +## - name: Build spindle-slurm-plugin-ubuntu image +## id: slurm-ubuntu-build +## run: | +## cd containers/spindle-slurm-ubuntu/testing-plugin +## docker compose --progress=plain build +## +## - name: Bring spindle-slurm-plugin-ubuntu up +## id: slurm-ubuntu-up +## run: | +## cd containers/spindle-slurm-ubuntu/testing-plugin +## docker compose up -d --wait --wait-timeout 120 +## +## - name: Verify munge works in spindle-slurm-plugin-ubuntu +## id: slurm-ubuntu-munge +## run: | +## docker exec slurm-plugin-head bash -c 'munge -n | unmunge' +## +## - name: Run spindle-slurm-plugin-ubuntu testsuite +## id: slurm-ubuntu-testsuite +## run: | +## docker exec slurm-plugin-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}' +## +## - name: Bring spindle-slurm-plugin-ubuntu down +## id: slurm-ubuntu-down +## if: ${{ always() }} +## continue-on-error: true +## run: | +## cd containers/spindle-slurm-ubuntu/testing-plugin +## docker compose down +## From 382036aca1869694961b2fc270252c53f2e7e3cc Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 13:13:13 -0700 Subject: [PATCH 42/66] Adds slurm-plugin to matrix refactor. 
--- .github/workflows/ci.yml | 184 ++++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 68 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ebb1e8fb..b885f354 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -86,7 +86,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: Ubuntu serial logs tarball - path: ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 + path: /home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down @@ -180,10 +180,10 @@ jobs: with: name: Ubuntu flux logs tarball path: | - ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - name: spindle-flux-ubuntu teardown container id: flux-ubuntu-down @@ -249,7 +249,6 @@ jobs: - name: Run spindle-slurm-ubuntu testsuite timeout-minutes: 8 id: slurm-ubuntu-debug3-testsuite - if: ${{ success() }} run: | if [ "${{ matrix.debug }}" = "0" ]; then docker exec slurm-head bash -c \ @@ -288,10 +287,10 @@ jobs: with: name: Ubuntu slurm logs tarball path: | - ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - name: Bring 
spindle-slurm-ubuntu down id: slurm-ubuntu-down @@ -301,61 +300,110 @@ jobs: cd containers/spindle-slurm-ubuntu/testing docker compose down -## spindle-slurm-plugin-ubuntu: -## name: Testsuite (Slurm Plugin, Ubuntu) -## environment: Spindle CI -## runs-on: ubuntu-latest -## timeout-minutes: 20 -## steps: -## - name: Check out Spindle -## uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 -## -## - name: Setup Docker Compose -## uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 -## with: -## version: latest -## -## - name: Login to GitHub Container Registry -## if: ${{ !env.ACT }} -## uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef -## with: -## registry: ghcr.io -## username: ${{ github.actor }} -## password: ${{ secrets.GITHUB_TOKEN }} -## -## - name: Generate MariaDB configuration -## id: slurm-ubuntu-mariadb -## run: | -## cd containers/spindle-slurm-ubuntu/testing-plugin -## ./generate_config.sh -## -## - name: Build spindle-slurm-plugin-ubuntu image -## id: slurm-ubuntu-build -## run: | -## cd containers/spindle-slurm-ubuntu/testing-plugin -## docker compose --progress=plain build -## -## - name: Bring spindle-slurm-plugin-ubuntu up -## id: slurm-ubuntu-up -## run: | -## cd containers/spindle-slurm-ubuntu/testing-plugin -## docker compose up -d --wait --wait-timeout 120 -## -## - name: Verify munge works in spindle-slurm-plugin-ubuntu -## id: slurm-ubuntu-munge -## run: | -## docker exec slurm-plugin-head bash -c 'munge -n | unmunge' -## -## - name: Run spindle-slurm-plugin-ubuntu testsuite -## id: slurm-ubuntu-testsuite -## run: | -## docker exec slurm-plugin-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}' -## -## - name: Bring spindle-slurm-plugin-ubuntu down -## id: slurm-ubuntu-down -## if: ${{ always() }} -## continue-on-error: true -## run: | -## cd containers/spindle-slurm-ubuntu/testing-plugin -## docker compose down -## 
+################################################################################ +# spindle-slurm-plugin-ubuntu +################################################################################ + spindle-slurm-plugin-ubuntu: + name: Testsuite (Slurm Plugin, Ubuntu) + environment: Spindle CI + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + debug: [3, 2, 1, 0] + steps: + - name: Check out Spindle + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 + + - name: Setup Docker Compose + uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 + with: + version: latest + + - name: Login to GitHub Container Registry + if: ${{ !env.ACT }} + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate MariaDB configuration + id: slurm-ubuntu-mariadb + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + ./generate_config.sh + + - name: Build spindle-slurm-plugin-ubuntu image + id: slurm-ubuntu-build + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose --progress=plain build + + - name: Bring spindle-slurm-plugin-ubuntu up + id: slurm-ubuntu-up + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose up -d --wait --wait-timeout 120 + + - name: Verify munge works in spindle-slurm-plugin-ubuntu + id: slurm-ubuntu-munge + run: | + docker exec slurm-plugin-head bash -c 'munge -n | unmunge' + +# Matrix across "debug" + - name: Run spindle-slurm-plugin-ubuntu testsuite + timeout-minutes: 8 + id: slurm-ubuntu-testsuite + run: | + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec slurm-plugin-head bash -c + 'cd Spindle-build/testsuite && + salloc -n${workers} -N${workers} ./runTests ${workers}' + else + docker exec slurm-plugin-head bash -c + 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ + salloc -n${workers} 
-N${workers} ./runTests ${workers}' + fi +# Remove logs on failure + - name: Pull logs out of the container(s) + id: slurm-ubuntu-copy-from-container + if: ${{ failure() && matrix.debug != 0 }} + continue-on-error: true + run: | + docker exec slurm-plugin-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' + docker exec slurm-plugin-node-1 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' + docker cp slurm-plugin-node-1:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . + docker exec slurm-plugin-node-2 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' + docker cp slurm-plugin-node-2:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker exec slurm-plugin-node-3 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' + docker cp slurm-plugin-node-3:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker exec slurm-plugin-node-4 bash -c 'cd /home/slurmuser && \ + tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' + docker cp slurm-plugin-node-4:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . 
+ + - name: Upload slurm ubuntu logs + id: slurm-ubuntu-copy-to-artifact + if: ${{ failure() && matrix.debug != 0 }} + uses: actions/upload-artifact@v4 + with: + name: Ubuntu slurm logs tarball + path: | + /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + + + - name: Bring spindle-slurm-plugin-ubuntu down + id: slurm-ubuntu-down + if: ${{ always() }} + continue-on-error: true + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose down + From 01af996838aa303344f0006fb9bc3c878297ea3f Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 13:20:16 -0700 Subject: [PATCH 43/66] Fixup --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b885f354..18885ffe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -358,11 +358,11 @@ jobs: id: slurm-ubuntu-testsuite run: | if [ "${{ matrix.debug }}" = "0" ]; then - docker exec slurm-plugin-head bash -c - 'cd Spindle-build/testsuite && + docker exec slurm-plugin-head bash -c \ + 'cd Spindle-build/testsuite && \ salloc -n${workers} -N${workers} ./runTests ${workers}' else - docker exec slurm-plugin-head bash -c + docker exec slurm-plugin-head bash -c \ 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ salloc -n${workers} -N${workers} ./runTests ${workers}' fi From 9dba19e2dfbebf55444d5faa104f1094c2728b95 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 15:09:59 -0700 Subject: [PATCH 44/66] Finalizes logging on spindleRunBE error paths Also ups the timeout for slurm-plugin test cases. 
--- .github/workflows/ci.yml | 2 +- src/server/startup/spindle_be.cc | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 18885ffe..d358fb7b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -354,7 +354,7 @@ jobs: # Matrix across "debug" - name: Run spindle-slurm-plugin-ubuntu testsuite - timeout-minutes: 8 + timeout-minutes: 13 id: slurm-ubuntu-testsuite run: | if [ "${{ matrix.debug }}" = "0" ]; then diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index c3e7bbd9..f91ce2f6 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -136,6 +136,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i result = ldcs_audit_server_network_setup(port, num_ports, unique_id, &setup_data, &setup_data_size); if (result == -1) { err_printf("Error setting up network in spindleRunBE\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } unpack_data(&args, setup_data, setup_data_size); @@ -148,6 +150,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i char *new_commpath = parse_location(args.commpath, args.number); if (!new_commpath) { err_printf("Failed to convert commpath %s\n", args.commpath); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } debug_printf("Translated commpath from %s to %s\n", args.commpath, new_commpath); @@ -157,6 +161,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i result = ldcs_audit_server_process(&args); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } @@ -164,6 +170,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i result = post_setup(&args); if (result == -1) { err_printf("post_setup callback errored. 
Returning\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } } @@ -172,11 +180,12 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i ldcs_audit_server_run(); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } - - if (args.startup_type == startup_external) + if (args.startup_type == startup_external) LOGGING_FINI; return 0; From 5d9b2802f36bca9d67b646d65ca9bfcda2baf607 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 15:50:22 -0700 Subject: [PATCH 45/66] Several small fixes, add head node to slurm* --- .github/workflows/ci.yml | 72 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d358fb7b..9eb45801 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,3 +1,5 @@ +# FIXME Handle case where only a subset of nodes generate logs. 
+# FIXME More refactoring, possibly with bash functions and filename templates as ENV variables name: ci on: push: @@ -100,7 +102,7 @@ jobs: # spindle-flux-ubuntu ################################################################################ spindle-flux-ubuntu-debug3: - name: Testsuite (Flux, Ubuntu, SPINDLE_DEBUG=3) + name: Testsuite (Flux, Ubuntu) environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 @@ -141,12 +143,12 @@ jobs: run: | if [ "${{ matrix.debug }}" = "0" ]; then docker exec node-1 bash -c \ - 'cd Spindle-build/testsuite && \ + 'cd /home/fluxuser/Spindle-build/testsuite && \ flux alloc --nodes=${workers} \ ./runTests --nodes=${workers} --tasks-per-node=3' else docker exec node-1 bash -c \ - 'cd Spindle-build/testsuite && \ + 'cd /home/fluxuser/Spindle-build/testsuite && \ SPINDLE_DEBUG=${{ matrix.debug }} \ flux alloc --nodes=${workers} \ ./runTests --nodes=${workers} --tasks-per-node=3' @@ -160,18 +162,18 @@ jobs: continue-on-error: true run: | docker exec node-1 bash -c 'ls /home/fluxuser/Spindle-build/testsuite' - docker exec node-1 bash -c 'cd /home/fluxuser && \ + docker exec node-1 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp node-1:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . - docker exec node-2 bash -c 'cd /home/fluxuser && \ + docker cp node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . + docker exec node-2 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp node-2:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . - docker exec node-3 bash -c 'cd /home/fluxuser && \ + docker cp node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . 
+ docker exec node-3 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp node-3:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . - docker exec node-4 bash -c 'cd /home/fluxuser && \ + docker cp node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker exec node-4 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp node-4:/home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker cp node-4:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . - name: Upload logs to artifacts id: flux-ubuntu-artifact @@ -180,10 +182,10 @@ jobs: with: name: Ubuntu flux logs tarball path: | - /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - /home/fluxuser/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - name: spindle-flux-ubuntu teardown container id: flux-ubuntu-down @@ -267,18 +269,21 @@ jobs: continue-on-error: true run: | docker exec slurm-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' - docker exec slurm-node-1 bash -c 'cd /home/slurmuser && \ + docker exec slurm-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-1:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . 
- docker exec slurm-node-2 bash -c 'cd /home/slurmuser && \ + docker cp slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . + docker exec slurm-node-2 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-2:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . - docker exec slurm-node-3 bash -c 'cd /home/slurmuser && \ + docker cp slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker exec slurm-node-3 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-3:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . - docker exec slurm-node-4 bash -c 'cd /home/slurmuser && \ + docker cp slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker exec slurm-node-4 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-4:/home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker cp slurm-node-4:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker exec slurm-head bash -c 'cd /home/slurmuser/Spindle-build && \ + tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*' + docker cp slurm-head:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 . 
- name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -291,6 +296,7 @@ jobs: /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 - name: Bring spindle-slurm-ubuntu down id: slurm-ubuntu-down @@ -373,18 +379,21 @@ jobs: continue-on-error: true run: | docker exec slurm-plugin-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' - docker exec slurm-plugin-node-1 bash -c 'cd /home/slurmuser && \ + docker exec slurm-plugin-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-1:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . - docker exec slurm-plugin-node-2 bash -c 'cd /home/slurmuser && \ + docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . + docker exec slurm-plugin-node-2 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-2:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . - docker exec slurm-plugin-node-3 bash -c 'cd /home/slurmuser && \ + docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker exec slurm-plugin-node-3 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-3:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . 
- docker exec slurm-plugin-node-4 bash -c 'cd /home/slurmuser && \ + docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker exec slurm-plugin-node-4 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-4:/home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker exec slurm-plugin-head bash -c 'cd /home/slurmuser/Spindle-build && \ + tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*' + docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 . - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -397,6 +406,7 @@ jobs: /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 - name: Bring spindle-slurm-plugin-ubuntu down From d2bd28c058f3e2f276363019c877ff5643301ab1 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 16:45:47 -0700 Subject: [PATCH 46/66] Restoring non-canonical paths in runner. 
--- .github/workflows/ci.yml | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9eb45801..3a633a6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,7 +88,8 @@ jobs: uses: actions/upload-artifact@v4 with: name: Ubuntu serial logs tarball - path: /home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 + # NOTE: This is the runner path, not the container path. + path: ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down @@ -181,11 +182,12 @@ jobs: uses: actions/upload-artifact@v4 with: name: Ubuntu flux logs tarball + # NOTE: This is the runner path, not the container path. path: | - /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - /home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - name: spindle-flux-ubuntu teardown container id: flux-ubuntu-down @@ -291,12 +293,13 @@ jobs: uses: actions/upload-artifact@v4 with: name: Ubuntu slurm logs tarball + # NOTE: This is the runner path, not the container path. 
path: | - /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - /home/slurmuser/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 - name: Bring spindle-slurm-ubuntu down id: slurm-ubuntu-down @@ -401,12 +404,13 @@ jobs: uses: actions/upload-artifact@v4 with: name: Ubuntu slurm logs tarball + # NOTE: This is the runner path, not the container path. path: | - /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - /home/slurmuser/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 + ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 + ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 + ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 + ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 - name: Bring spindle-slurm-plugin-ubuntu down From c5a584c3e97e1cc69602326a94329cf2eab4e72a Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 22:37:37 -0700 Subject: [PATCH 47/66] Tarball filenames now allow multiple artifacts --- .github/workflows/ci.yml | 113 +++++++++++++++++++++++++++------------ 1 file changed, 80 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a633a6e..1b7aea90 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,6 +27,8 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + env: + TARBALL: ./serial_ubunutu_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 strategy: fail-fast: false matrix: @@ -80,7 +82,8 @@ jobs: tar cjvf ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 \ ./testsuite/spindle_output*" docker cp \ - spindlenode:/home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 . + spindlenode:/home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 \ + $TARBALL - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact @@ -89,7 +92,7 @@ jobs: with: name: Ubuntu serial logs tarball # NOTE: This is the runner path, not the container path. - path: ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 + path: $TARBALL - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down @@ -107,6 +110,11 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + env: | + TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 strategy: fail-fast: false matrix: @@ -165,16 +173,24 @@ jobs: docker exec node-1 bash -c 'ls /home/fluxuser/Spindle-build/testsuite' docker exec node-1 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 . 
+ docker cp \ + node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ + $TARBALL1 docker exec node-2 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker cp \ + node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 \ + $TARBALL2 docker exec node-3 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker cp \ + node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 \ + $TARBALL3 docker exec node-4 bash -c 'cd /home/fluxuser/Spindle-build && \ tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp node-4:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker cp \ + node-4:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 \ + $TARBALL4 - name: Upload logs to artifacts id: flux-ubuntu-artifact @@ -184,10 +200,10 @@ jobs: name: Ubuntu flux logs tarball # NOTE: This is the runner path, not the container path. 
path: | - ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 + $TARBALL1 + $TARBALL2 + $TARBALL3 + $TARBALL4 - name: spindle-flux-ubuntu teardown container id: flux-ubuntu-down @@ -205,6 +221,12 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + env: | + TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./slurm_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./slurm_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALLH: ./slurm_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 strategy: fail-fast: false matrix: @@ -264,7 +286,7 @@ jobs: salloc -n${workers} -N${workers} ./runTests ${workers}' fi -# Remove logs on failure +# Extract logs on failure - name: Pull logs out of the container(s) id: slurm-ubuntu-copy-from-container if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec slurm-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' docker exec slurm-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 .
+ docker cp \ + slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ + $TARBALL1 docker exec slurm-node-2 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker cp \ + slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 \ + $TARBALL2 docker exec slurm-node-3 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker cp \ + slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 \ + $TARBALL3 docker exec slurm-node-4 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp slurm-node-4:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker cp \ + slurm-node-4:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 \ + $TARBALL4 docker exec slurm-head bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*' - docker cp slurm-head:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 . + docker cp \ + slurm-head:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 \ + $TARBALLH - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -295,11 +327,11 @@ jobs: name: Ubuntu slurm logs tarball # NOTE: This is the runner path, not the container path. 
path: | - ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 + $TARBALL1 + $TARBALL2 + $TARBALL3 + $TARBALL4 + $TARBALLH - name: Bring spindle-slurm-ubuntu down id: slurm-ubuntu-down @@ -317,6 +349,12 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 + env: | + TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./plugin_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./plugin_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALLH: ./plugin_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 strategy: fail-fast: false matrix: @@ -375,7 +413,7 @@ 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ salloc -n${workers} -N${workers} ./runTests ${workers}' fi -# Remove logs on failure +# Extract logs on failure - name: Pull logs out of the container(s) id: slurm-ubuntu-copy-from-container if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec slurm-plugin-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' docker exec slurm-plugin-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 .
+ docker cp + slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ + $TARBALL1 docker exec slurm-plugin-node-2 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 . + docker cp \ + slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 \ + $TARBALL2 docker exec slurm-plugin-node-3 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 . + docker cp \ + slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 \ + $TARBALL3 docker exec slurm-plugin-node-4 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 . + docker cp \ + slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 \ + $TARBALL4 docker exec slurm-plugin-head bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*' - docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 . 
+ docker cp \ + slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 \ + $TARBALLH - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -406,12 +454,11 @@ jobs: name: Ubuntu slurm logs tarball # NOTE: This is the runner path, not the container path. path: | - ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 - ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 - ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 - ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 - ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 - + $TARBALL1 + $TARBALL2 + $TARBALL3 + $TARBALL4 + $TARBALLH - name: Bring spindle-slurm-plugin-ubuntu down id: slurm-ubuntu-down From f9ad3c4c5ff8a541af04d0226d39d2129a4e9d3d Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 22:43:32 -0700 Subject: [PATCH 48/66] env doesn't take an array. --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1b7aea90..d6d298d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -110,7 +110,7 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 - env: | + env: TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 @@ -221,7 +221,7 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 - env: | + env: TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL3: ./slurm_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt 
}}.tar.bz2 @@ -349,7 +349,7 @@ jobs: environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 - env: | + env: TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL3: ./plugin_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 From d8a981bbd839a0c4aabd70254dd30b0190178906 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 23:11:08 -0700 Subject: [PATCH 49/66] printf, awkwardly. --- .github/workflows/ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d6d298d4..26740d5a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,19 @@ jobs: - name: Build spindle-serial-ubuntu image id: serial-ubuntu-build run: | + echo "Testing unquoted:" + echo $TARBALL + echo "Testing quoted:" + echo "$TARBALL" + echo "Testing context:" + echo ${{ env.TARBALL }} + echo "Testing quoted context" + echo "${{ env.TARBALL }}" + echo "Testing single quote:" + echo '$TARBALL' + echo "Testing single-quote context:" + echo '${{ env.TARBALL }}' + cd containers/spindle-serial-ubuntu docker compose --progress=plain build From f2b2223e4f6e35fd0201e2b6ac302a662197826c Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Mar 2026 23:36:57 -0700 Subject: [PATCH 50/66] One bugfix, one upgrade --- .github/workflows/ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 26740d5a..1c4d8832 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,7 +54,7 @@ jobs: echo "Testing quoted context" echo "${{ env.TARBALL }}" echo "Testing single quote:" - echo '$TARBALL' + echo '$TARBALL' # This is the only one that doesn't work echo "Testing single-quote context:" echo '${{ env.TARBALL }}' @@ 
-101,7 +101,7 @@ jobs: - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact if: ${{ failure() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: Ubuntu serial logs tarball # NOTE: This is the runner path, not the container path. @@ -208,7 +208,7 @@ jobs: - name: Upload logs to artifacts id: flux-ubuntu-artifact if: ${{ failure() && matrix.debug != 0 }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: Ubuntu flux logs tarball # NOTE: This is the runner path, not the container path. @@ -335,7 +335,7 @@ jobs: - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact if: ${{ failure() && matrix.debug != 0 }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: Ubuntu slurm logs tarball # NOTE: This is the runner path, not the container path. @@ -435,7 +435,7 @@ jobs: docker exec slurm-plugin-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' docker exec slurm-plugin-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp + docker cp \ slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ $TARBALL1 docker exec slurm-plugin-node-2 bash -c 'cd /home/slurmuser/Spindle-build && \ @@ -462,7 +462,7 @@ jobs: - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact if: ${{ failure() && matrix.debug != 0 }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: Ubuntu slurm logs tarball # NOTE: This is the runner path, not the container path. 
From 0a1e1ade9e6038fb62898c3f842280d2c43973bd Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 00:09:19 -0700 Subject: [PATCH 51/66] I bet we can't resolve yaml variables in bash --- .github/workflows/ci.yml | 110 ++++++++++++--------------------------- 1 file changed, 34 insertions(+), 76 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1c4d8832..3ba0f63a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,7 +74,7 @@ jobs: # Matrix across "debug" - name: Run spindle-serial-ubuntu testsuite - timeout-minutes: ${{ matrix.debug == 0 && 14 || 5 }} + timeout-minutes: ${{ matrix.debug == 0 && 2 || 2 }} run: | if [ "${{ matrix.debug }}" = "0" ]; then docker exec spindlenode bash -c \ @@ -184,26 +184,14 @@ jobs: continue-on-error: true run: | docker exec node-1 bash -c 'ls /home/fluxuser/Spindle-build/testsuite' - docker exec node-1 bash -c 'cd /home/fluxuser/Spindle-build && \ - tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp \ - node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ - $TARBALL1 - docker exec node-2 bash -c 'cd /home/fluxuser/Spindle-build && \ - tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp \ - node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 \ - $TARBALL2 - docker exec node-3 bash -c 'cd /home/fluxuser/Spindle-build && \ - tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp \ - node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 \ - $TARBALL3 - docker exec node-4 bash -c 'cd /home/fluxuser/Spindle-build && \ - tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp \ - node-4:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 \ - $TARBALL4 + docker exec node-1 bash -c 
"cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*" + docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*" + docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*" + docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*" + docker cp node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 + docker cp node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 + docker cp node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 + docker cp node-4:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 - name: Upload logs to artifacts id: flux-ubuntu-artifact @@ -233,7 +221,7 @@ jobs: name: Testsuite (Slurm, Ubuntu) environment: Spindle CI runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 12 env: TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 @@ -286,7 +274,7 @@ jobs: # Matrix across "debug" - name: Run spindle-slurm-ubuntu testsuite - timeout-minutes: 8 + timeout-minutes: 4 id: slurm-ubuntu-debug3-testsuite run: | if [ "${{ matrix.debug }}" = "0" ]; then @@ -306,31 +294,16 @@ jobs: continue-on-error: true run: | docker exec slurm-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' - docker exec slurm-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp \ - 
slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ - $TARBALL1 - docker exec slurm-node-2 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 \ - $TARBALL2 - docker exec slurm-node-3 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 \ - $TARBALL3 - docker exec slurm-node-4 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-node-4:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 \ - $TARBALL4 - docker exec slurm-head bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-head:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 \ - $TARBALLH + docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*" + docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ 
matrix.debug }}_head.tar.bz2 testsuite/spindle_output*" + docker cp slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 + docker cp slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 + docker cp slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 + docker cp slurm-node-4:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 + docker cp slurm-head:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 $TARBALLH - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -361,7 +334,7 @@ jobs: name: Testsuite (Slurm Plugin, Ubuntu) environment: Spindle CI runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 12 env: TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 @@ -414,7 +387,7 @@ jobs: # Matrix across "debug" - name: Run spindle-slurm-plugin-ubuntu testsuite - timeout-minutes: 13 + timeout-minutes: 4 id: slurm-ubuntu-testsuite run: | if [ "${{ matrix.debug }}" = "0" ]; then @@ -428,36 +401,21 @@ jobs: fi # Extract logs on failure - name: Pull logs out of the container(s) - id: slurm-ubuntu-copy-from-container + id: plugin-ubuntu-copy-from-container if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec slurm-plugin-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' - docker exec slurm-plugin-node-1 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 \ - $TARBALL1 - docker exec slurm-plugin-node-2 bash -c 'cd 
/home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 \ - $TARBALL2 - docker exec slurm-plugin-node-3 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 \ - $TARBALL3 - docker exec slurm-plugin-node-4 bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 \ - $TARBALL4 - docker exec slurm-plugin-head bash -c 'cd /home/slurmuser/Spindle-build && \ - tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*' - docker cp \ - slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 \ - $TARBALLH + docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*" + docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 + docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*" + docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 + docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*" + docker cp 
slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 + docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*" + docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 + docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*" + docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 $TARBALLH - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact From 66a505c2fdb9720d54b1a3a39c26319420407c89 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 00:28:56 -0700 Subject: [PATCH 52/66] Remove glob by tarring entire testsuite dir. --- .github/workflows/ci.yml | 43 ++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ba0f63a..ad6b5185 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -90,13 +90,8 @@ jobs: if: ${{ failure() && matrix.debug != 0}} continue-on-error: true run: | - docker exec spindlenode bash -c \ - "cd /home/spindleuser/Spindle-build && \ - tar cjvf ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 \ - ./testsuite/spindle_output*" - docker cp \ - spindlenode:/home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 \ - $TARBALL + docker exec spindlenode bash -c "cd /home/spindleuser/Spindle-build && tar cjvf ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 ./testsuite/spindle_output*" + docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 $TARBALL - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact @@ -183,11 +178,11 @@ jobs: 
if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | - docker exec node-1 bash -c 'ls /home/fluxuser/Spindle-build/testsuite' - docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*" - docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*" - docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*" - docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*" + docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" + docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite" + docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite" + docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite" + docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite" docker cp node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 docker cp node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 docker cp node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 @@ -293,12 +288,12 @@ jobs: if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | - docker exec slurm-node-1 bash -c 'ls /home/slurmuser/Spindle-build/testsuite' - docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug 
}}_node1.tar.bz2 testsuite/spindle_output*" - docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*" - docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*" - docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite/spindle_output*" - docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite" + docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite" + docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite" + docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite" + docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite" docker cp slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 docker cp slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 docker cp slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 @@ -405,16 +400,16 @@ jobs: if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | - docker exec slurm-plugin-node-1 bash -c 'ls 
/home/slurmuser/Spindle-build/testsuite' - docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite/spindle_output*" + docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite" + docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite" + docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite" + docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite" + docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite" docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 - docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite/spindle_output*" docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 - docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite/spindle_output*" docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 - docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 
testsuite/spindle_output*" docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 - docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite/spindle_output*" docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 $TARBALLH - name: Upload slurm ubuntu logs From e8702a19d9ca2171de3aa660b75efa9605ecb624 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 00:49:45 -0700 Subject: [PATCH 53/66] Removes YAML variables from bash tar commands --- .github/workflows/ci.yml | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ad6b5185..c01909f2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -90,8 +90,8 @@ jobs: if: ${{ failure() && matrix.debug != 0}} continue-on-error: true run: | - docker exec spindlenode bash -c "cd /home/spindleuser/Spindle-build && tar cjvf ./ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 ./testsuite/spindle_output*" - docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs_dbg${{ matrix.debug }}.tar.bz2 $TARBALL + docker exec spindlenode bash -c "cd /home/spindleuser/Spindle-build && tar cjvf ./ubuntu_serial_logs.tar.bz2 ./testsuite/spindle_output*" + docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 $TARBALL - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact @@ -179,14 +179,14 @@ jobs: continue-on-error: true run: | docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" - docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite" - docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug 
}}_node2.tar.bz2 testsuite" - docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite" - docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite" - docker cp node-1:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 - docker cp node-2:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 - docker cp node-3:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 - docker cp node-4:/home/fluxuser/Spindle-build/flux_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 + docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node1.tar.bz2 testsuite/spindle_output*" + docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node2.tar.bz2 testsuite/spindle_output*" + docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node3.tar.bz2 testsuite/spindle_output*" + docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node4.tar.bz2 testsuite/spindle_output*" + docker cp node-1:/home/fluxuser/Spindle-build/node1.tar.bz2 $TARBALL1 + docker cp node-2:/home/fluxuser/Spindle-build/node2.tar.bz2 $TARBALL2 + docker cp node-3:/home/fluxuser/Spindle-build/node3.tar.bz2 $TARBALL3 + docker cp node-4:/home/fluxuser/Spindle-build/node4.tar.bz2 $TARBALL4 - name: Upload logs to artifacts id: flux-ubuntu-artifact @@ -289,16 +289,16 @@ jobs: continue-on-error: true run: | docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite" - docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite" - docker exec 
slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite" - docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite" - docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite" - docker cp slurm-node-1:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 - docker cp slurm-node-2:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 - docker cp slurm-node-3:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 - docker cp slurm-node-4:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 - docker cp slurm-head:/home/slurmuser/Spindle-build/slurm_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 $TARBALLH + docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node1.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node2.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node3.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node4.tar.bz2 testsuite/spindle_output*" + docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./head.tar.bz2 testsuite/spindle_output*" + docker cp slurm-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 + docker cp slurm-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 + docker cp slurm-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 + docker cp slurm-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 + docker cp slurm-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH - 
name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -401,16 +401,16 @@ jobs: continue-on-error: true run: | docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 testsuite" - docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 testsuite" - docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 testsuite" - docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 testsuite" - docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 testsuite" - docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node1.tar.bz2 $TARBALL1 - docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node2.tar.bz2 $TARBALL2 - docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node3.tar.bz2 $TARBALL3 - docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_node4.tar.bz2 $TARBALL4 - docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/slurm_plugin_ubuntu_dbg${{ matrix.debug }}_head.tar.bz2 $TARBALLH + docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node1.tar.bz2 testsuite/spindle_output*" + docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node2.tar.bz2 testsuite/spindle_output*" + docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf 
node3.tar.bz2 testsuite/spindle_output*" + docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node4.tar.bz2 testsuite/spindle_output*" + docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf head.tar.bz2 testsuite/spindle_output*" + docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 + docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 + docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 + docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 + docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact From df873e5063864c43bed38aa5d4f02a4f3b3883aa Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 01:02:09 -0700 Subject: [PATCH 54/66] Adds ./ in front of testsuite dir. If this doesn't work, we'll use the canonical dir next. 
--- .github/workflows/ci.yml | 41 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c01909f2..d81bca73 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,19 +45,6 @@ jobs: - name: Build spindle-serial-ubuntu image id: serial-ubuntu-build run: | - echo "Testing unquoted:" - echo $TARBALL - echo "Testing quoted:" - echo "$TARBALL" - echo "Testing context:" - echo ${{ env.TARBALL }} - echo "Testing quoted context" - echo "${{ env.TARBALL }}" - echo "Testing single quote:" - echo '$TARBALL' # This is the only one that doesn't work - echo "Testing single-quote context:" - echo '${{ env.TARBALL }}' - cd containers/spindle-serial-ubuntu docker compose --progress=plain build @@ -179,10 +166,10 @@ jobs: continue-on-error: true run: | docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" - docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node1.tar.bz2 testsuite/spindle_output*" - docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node2.tar.bz2 testsuite/spindle_output*" - docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node3.tar.bz2 testsuite/spindle_output*" - docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node4.tar.bz2 testsuite/spindle_output*" + docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" + docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" + docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" + docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" docker cp node-1:/home/fluxuser/Spindle-build/node1.tar.bz2 $TARBALL1 docker cp 
node-2:/home/fluxuser/Spindle-build/node2.tar.bz2 $TARBALL2 docker cp node-3:/home/fluxuser/Spindle-build/node3.tar.bz2 $TARBALL3 @@ -289,11 +276,11 @@ jobs: continue-on-error: true run: | docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node1.tar.bz2 testsuite/spindle_output*" - docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node2.tar.bz2 testsuite/spindle_output*" - docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node3.tar.bz2 testsuite/spindle_output*" - docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node4.tar.bz2 testsuite/spindle_output*" - docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./head.tar.bz2 testsuite/spindle_output*" + docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./head.tar.bz2 ./testsuite/spindle_output*" docker cp slurm-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 docker cp slurm-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 docker cp slurm-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 @@ -401,11 +388,11 @@ jobs: continue-on-error: true run: | docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && 
tar cjvf node1.tar.bz2 testsuite/spindle_output*" - docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node2.tar.bz2 testsuite/spindle_output*" - docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node3.tar.bz2 testsuite/spindle_output*" - docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node4.tar.bz2 testsuite/spindle_output*" - docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf head.tar.bz2 testsuite/spindle_output*" + docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node1.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node2.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node3.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node4.tar.bz2 ./testsuite/spindle_output*" + docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf head.tar.bz2 ./testsuite/spindle_output*" docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 From 785e0afc3d66c1c470386a46d1eb5bf3284647f1 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 01:18:52 -0700 Subject: [PATCH 55/66] Using full paths, serial only, force-"fail" --- .github/workflows/ci.yml | 654 +++++++++++++++++++-------------------- 1 file changed, 327 insertions(+), 327 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d81bca73..a0c0e006 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,15 +74,15 @@ jobs: # If we saw any failures, tar up 
the logfiles for extraction. - name: On failure, pull logs out of the container(s) id: serial-ubuntu-copy-from-container - if: ${{ failure() && matrix.debug != 0}} + if: ${{ success() && matrix.debug != 0}} # REVERT! continue-on-error: true run: | - docker exec spindlenode bash -c "cd /home/spindleuser/Spindle-build && tar cjvf ./ubuntu_serial_logs.tar.bz2 ./testsuite/spindle_output*" + docker exec spindlenode bash -c "tar cjvf /home/spindleuser/ubuntu_serial_logs.tar.bz2 /home/spindleuser/Spindle-build//testsuite/spindle_output*" docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 $TARBALL - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact - if: ${{ failure() }} + if: ${{ success() }} # REVERT! uses: actions/upload-artifact@v6 with: name: Ubuntu serial logs tarball @@ -97,327 +97,327 @@ jobs: cd containers/spindle-serial-ubuntu docker compose down -################################################################################ -# spindle-flux-ubuntu -################################################################################ - spindle-flux-ubuntu-debug3: - name: Testsuite (Flux, Ubuntu) - environment: Spindle CI - runs-on: ubuntu-latest - timeout-minutes: 20 - env: - TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - strategy: - fail-fast: false - matrix: - debug: [3, 2, 1, 0] - steps: - - name: Check out Spindle (spindle-flux-ubuntu-debug3) - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - - - name: Setup Docker Compose (spindle-flux-ubuntu-debug3) - uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 - with: - version: latest - - 
- name: Build spindle-flux-ubuntu-debug3 image - id: flux-ubuntu-build - run: | - cd containers/spindle-flux-ubuntu - docker compose --progress=plain build - - - name: Bring spindle-flux-ubuntu up - id: flux-ubuntu--up - run: | - cd containers/spindle-flux-ubuntu - docker compose up -d --wait --wait-timeout 60 - - - name: Verify munge works in spindle-flux-ubuntu-debug3 - id: flux-ubuntu-debug3-munge - run: | - docker exec node-1 bash -c 'munge -n | unmunge' - - # Observed time: 5m 12s - - name: Run spindle-flux-ubuntu testsuite - timeout-minutes: 7 - id: flux-ubuntu-testsuite - run: | - if [ "${{ matrix.debug }}" = "0" ]; then - docker exec node-1 bash -c \ - 'cd /home/fluxuser/Spindle-build/testsuite && \ - flux alloc --nodes=${workers} \ - ./runTests --nodes=${workers} --tasks-per-node=3' - else - docker exec node-1 bash -c \ - 'cd /home/fluxuser/Spindle-build/testsuite && \ - SPINDLE_DEBUG=${{ matrix.debug }} \ - flux alloc --nodes=${workers} \ - ./runTests --nodes=${workers} --tasks-per-node=3' - fi - - # If we saw any failures, tar up the logfiles for extraction. 
Observed time: 7m 51s - - name: Extract logs from spindle-flux-ubuntu - timeout-minutes: 9 - id: flux-ubuntu-tar - if: ${{ failure() && matrix.debug != 0 }} - continue-on-error: true - run: | - docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" - docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" - docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" - docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" - docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" - docker cp node-1:/home/fluxuser/Spindle-build/node1.tar.bz2 $TARBALL1 - docker cp node-2:/home/fluxuser/Spindle-build/node2.tar.bz2 $TARBALL2 - docker cp node-3:/home/fluxuser/Spindle-build/node3.tar.bz2 $TARBALL3 - docker cp node-4:/home/fluxuser/Spindle-build/node4.tar.bz2 $TARBALL4 - - - name: Upload logs to artifacts - id: flux-ubuntu-artifact - if: ${{ failure() && matrix.debug != 0 }} - uses: actions/upload-artifact@v6 - with: - name: Ubuntu flux logs tarball - # NOTE: This is the runner path, not the container path. 
- path: | - $TARBALL1 - $TARBALL2 - $TARBALL3 - $TARBALL4 - - - name: spindle-flux-ubuntu teardown container - id: flux-ubuntu-down - if: ${{ always() }} - continue-on-error: true - run: | - cd containers/spindle-flux-ubuntu - docker compose down - -################################################################################ -# spindle-slurm-ubuntu -################################################################################ - spindle-slurm-ubuntu: - name: Testsuite (Slurm, Ubuntu) - environment: Spindle CI - runs-on: ubuntu-latest - timeout-minutes: 12 - env: - TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL3: ./slurm_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL4: ./slurm_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL5: ./slurm_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - strategy: - fail-fast: false - matrix: - debug: [3, 2, 1, 0] - steps: - - name: Check out Spindle - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - - - name: Setup Docker Compose - uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 - with: - version: latest - - - name: Login to GitHub Container Registry - if: ${{ !env.ACT }} - uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Generate MariaDB configuration - id: slurm-ubuntu-mariadb - run: | - cd containers/spindle-slurm-ubuntu/testing - ./generate_config.sh - - - name: Build spindle-slurm-ubuntu image - id: slurm-ubuntu-build - run: | - cd containers/spindle-slurm-ubuntu/testing - docker compose --progress=plain build - - - name: Bring spindle-slurm-ubuntu up - id: 
slurm-ubuntu-up - run: | - cd containers/spindle-slurm-ubuntu/testing - docker compose up -d --wait --wait-timeout 120 - - - name: Verify munge works in spindle-slurm-ubuntu - id: slurm-ubuntu-munge - run: | - docker exec slurm-head bash -c 'munge -n | unmunge' - -# Matrix across "debug" - - name: Run spindle-slurm-ubuntu testsuite - timeout-minutes: 4 - id: slurm-ubuntu-debug3-testsuite - run: | - if [ "${{ matrix.debug }}" = "0" ]; then - docker exec slurm-head bash -c \ - 'cd Spindle-build/testsuite && \ - salloc -n${workers} -N${workers} ./runTests ${workers}' - else - docker exec slurm-head bash -c \ - 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ - salloc -n${workers} -N${workers} ./runTests ${workers}' - fi - -# Extract logs on failure - - name: Pull logs out of the container(s) - id: slurm-ubuntu-copy-from-container - if: ${{ failure() && matrix.debug != 0 }} - continue-on-error: true - run: | - docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./head.tar.bz2 ./testsuite/spindle_output*" - docker cp slurm-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 - docker cp slurm-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 - docker cp slurm-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 - docker cp slurm-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 - docker cp 
slurm-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH - - - name: Upload slurm ubuntu logs - id: slurm-ubuntu-copy-to-artifact - if: ${{ failure() && matrix.debug != 0 }} - uses: actions/upload-artifact@v6 - with: - name: Ubuntu slurm logs tarball - # NOTE: This is the runner path, not the container path. - path: | - $TARBALL1 - $TARBALL2 - $TARBALL3 - $TARBALL4 - $TARBALLH - - - name: Bring spindle-slurm-ubuntu down - id: slurm-ubuntu-down - if: ${{ always() }} - continue-on-error: true - run: | - cd containers/spindle-slurm-ubuntu/testing - docker compose down - -################################################################################ -# spindle-slurm-plugin-ubuntu -################################################################################ - spindle-slurm-plugin-ubuntu: - name: Testsuite (Slurm Plugin, Ubuntu) - environment: Spindle CI - runs-on: ubuntu-latest - timeout-minutes: 12 - env: - TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL3: ./plugin_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL4: ./plugin_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL5: ./plugin_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - strategy: - fail-fast: false - matrix: - debug: [3, 2, 1, 0] - steps: - - name: Check out Spindle - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 - - - name: Setup Docker Compose - uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 - with: - version: latest - - - name: Login to GitHub Container Registry - if: ${{ !env.ACT }} - uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - 
name: Generate MariaDB configuration - id: slurm-ubuntu-mariadb - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - ./generate_config.sh - - - name: Build spindle-slurm-plugin-ubuntu image - id: slurm-ubuntu-build - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - docker compose --progress=plain build - - - name: Bring spindle-slurm-plugin-ubuntu up - id: slurm-ubuntu-up - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - docker compose up -d --wait --wait-timeout 120 - - - name: Verify munge works in spindle-slurm-plugin-ubuntu - id: slurm-ubuntu-munge - run: | - docker exec slurm-plugin-head bash -c 'munge -n | unmunge' - -# Matrix across "debug" - - name: Run spindle-slurm-plugin-ubuntu testsuite - timeout-minutes: 4 - id: slurm-ubuntu-testsuite - run: | - if [ "${{ matrix.debug }}" = "0" ]; then - docker exec slurm-plugin-head bash -c \ - 'cd Spindle-build/testsuite && \ - salloc -n${workers} -N${workers} ./runTests ${workers}' - else - docker exec slurm-plugin-head bash -c \ - 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ - salloc -n${workers} -N${workers} ./runTests ${workers}' - fi -# Extract logs on failure - - name: Pull logs out of the container(s) - id: plugin-ubuntu-copy-from-container - if: ${{ failure() && matrix.debug != 0 }} - continue-on-error: true - run: | - docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node1.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node2.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node3.tar.bz2 ./testsuite/spindle_output*" - docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node4.tar.bz2 ./testsuite/spindle_output*" - docker exec 
slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf head.tar.bz2 ./testsuite/spindle_output*" - docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 - docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 - docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 - docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 - docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH - - - name: Upload slurm ubuntu logs - id: slurm-ubuntu-copy-to-artifact - if: ${{ failure() && matrix.debug != 0 }} - uses: actions/upload-artifact@v6 - with: - name: Ubuntu slurm logs tarball - # NOTE: This is the runner path, not the container path. - path: | - $TARBALL1 - $TARBALL2 - $TARBALL3 - $TARBALL4 - $TARBALLH - - - name: Bring spindle-slurm-plugin-ubuntu down - id: slurm-ubuntu-down - if: ${{ always() }} - continue-on-error: true - run: | - cd containers/spindle-slurm-ubuntu/testing-plugin - docker compose down - +################################################################################# +## spindle-flux-ubuntu +################################################################################# +# spindle-flux-ubuntu-debug3: +# name: Testsuite (Flux, Ubuntu) +# environment: Spindle CI +# runs-on: ubuntu-latest +# timeout-minutes: 20 +# env: +# TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# strategy: +# fail-fast: false +# matrix: +# debug: [3, 2, 1, 0] +# steps: +# - name: Check out Spindle (spindle-flux-ubuntu-debug3) +# uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd +# +# - name: Setup Docker Compose (spindle-flux-ubuntu-debug3) +# uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 +# with: +# version: latest +# +# - name: Build spindle-flux-ubuntu-debug3 image +# id: flux-ubuntu-build +# run: | +# cd containers/spindle-flux-ubuntu +# docker compose --progress=plain build +# +# - name: Bring spindle-flux-ubuntu up +# id: flux-ubuntu--up +# run: | +# cd containers/spindle-flux-ubuntu +# docker compose up -d --wait --wait-timeout 60 +# +# - name: Verify munge works in spindle-flux-ubuntu-debug3 +# id: flux-ubuntu-debug3-munge +# run: | +# docker exec node-1 bash -c 'munge -n | unmunge' +# +# # Observed time: 5m 12s +# - name: Run spindle-flux-ubuntu testsuite +# timeout-minutes: 7 +# id: flux-ubuntu-testsuite +# run: | +# if [ "${{ matrix.debug }}" = "0" ]; then +# docker exec node-1 bash -c \ +# 'cd /home/fluxuser/Spindle-build/testsuite && \ +# flux alloc --nodes=${workers} \ +# ./runTests --nodes=${workers} --tasks-per-node=3' +# else +# docker exec node-1 bash -c \ +# 'cd /home/fluxuser/Spindle-build/testsuite && \ +# SPINDLE_DEBUG=${{ matrix.debug }} \ +# flux alloc --nodes=${workers} \ +# ./runTests --nodes=${workers} --tasks-per-node=3' +# fi +# +# # If we saw any failures, tar up the logfiles for extraction. 
Observed time: 7m 51s +# - name: Extract logs from spindle-flux-ubuntu +# timeout-minutes: 9 +# id: flux-ubuntu-tar +# if: ${{ failure() && matrix.debug != 0 }} +# continue-on-error: true +# run: | +# docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" +# docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" +# docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" +# docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" +# docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" +# docker cp node-1:/home/fluxuser/Spindle-build/node1.tar.bz2 $TARBALL1 +# docker cp node-2:/home/fluxuser/Spindle-build/node2.tar.bz2 $TARBALL2 +# docker cp node-3:/home/fluxuser/Spindle-build/node3.tar.bz2 $TARBALL3 +# docker cp node-4:/home/fluxuser/Spindle-build/node4.tar.bz2 $TARBALL4 +# +# - name: Upload logs to artifacts +# id: flux-ubuntu-artifact +# if: ${{ failure() && matrix.debug != 0 }} +# uses: actions/upload-artifact@v6 +# with: +# name: Ubuntu flux logs tarball +# # NOTE: This is the runner path, not the container path. 
+# path: | +# $TARBALL1 +# $TARBALL2 +# $TARBALL3 +# $TARBALL4 +# +# - name: spindle-flux-ubuntu teardown container +# id: flux-ubuntu-down +# if: ${{ always() }} +# continue-on-error: true +# run: | +# cd containers/spindle-flux-ubuntu +# docker compose down +# +################################################################################# +## spindle-slurm-ubuntu +################################################################################# +# spindle-slurm-ubuntu: +# name: Testsuite (Slurm, Ubuntu) +# environment: Spindle CI +# runs-on: ubuntu-latest +# timeout-minutes: 12 +# env: +# TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL3: ./slurm_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL4: ./slurm_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL5: ./slurm_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# strategy: +# fail-fast: false +# matrix: +# debug: [3, 2, 1, 0] +# steps: +# - name: Check out Spindle +# uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd +# +# - name: Setup Docker Compose +# uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 +# with: +# version: latest +# +# - name: Login to GitHub Container Registry +# if: ${{ !env.ACT }} +# uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 +# with: +# registry: ghcr.io +# username: ${{ github.actor }} +# password: ${{ secrets.GITHUB_TOKEN }} +# +# - name: Generate MariaDB configuration +# id: slurm-ubuntu-mariadb +# run: | +# cd containers/spindle-slurm-ubuntu/testing +# ./generate_config.sh +# +# - name: Build spindle-slurm-ubuntu image +# id: slurm-ubuntu-build +# run: | +# cd containers/spindle-slurm-ubuntu/testing +# docker compose --progress=plain 
build +# +# - name: Bring spindle-slurm-ubuntu up +# id: slurm-ubuntu-up +# run: | +# cd containers/spindle-slurm-ubuntu/testing +# docker compose up -d --wait --wait-timeout 120 +# +# - name: Verify munge works in spindle-slurm-ubuntu +# id: slurm-ubuntu-munge +# run: | +# docker exec slurm-head bash -c 'munge -n | unmunge' +# +## Matrix across "debug" +# - name: Run spindle-slurm-ubuntu testsuite +# timeout-minutes: 4 +# id: slurm-ubuntu-debug3-testsuite +# run: | +# if [ "${{ matrix.debug }}" = "0" ]; then +# docker exec slurm-head bash -c \ +# 'cd Spindle-build/testsuite && \ +# salloc -n${workers} -N${workers} ./runTests ${workers}' +# else +# docker exec slurm-head bash -c \ +# 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ +# salloc -n${workers} -N${workers} ./runTests ${workers}' +# fi +# +## Extract logs on failure +# - name: Pull logs out of the container(s) +# id: slurm-ubuntu-copy-from-container +# if: ${{ failure() && matrix.debug != 0 }} +# continue-on-error: true +# run: | +# docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" +# docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./head.tar.bz2 ./testsuite/spindle_output*" +# docker cp slurm-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 +# docker cp slurm-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 +# docker cp slurm-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 +# 
docker cp slurm-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 +# docker cp slurm-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH +# +# - name: Upload slurm ubuntu logs +# id: slurm-ubuntu-copy-to-artifact +# if: ${{ failure() && matrix.debug != 0 }} +# uses: actions/upload-artifact@v6 +# with: +# name: Ubuntu slurm logs tarball +# # NOTE: This is the runner path, not the container path. +# path: | +# $TARBALL1 +# $TARBALL2 +# $TARBALL3 +# $TARBALL4 +# $TARBALLH +# +# - name: Bring spindle-slurm-ubuntu down +# id: slurm-ubuntu-down +# if: ${{ always() }} +# continue-on-error: true +# run: | +# cd containers/spindle-slurm-ubuntu/testing +# docker compose down +# +################################################################################# +## spindle-slurm-plugin-ubuntu +################################################################################# +# spindle-slurm-plugin-ubuntu: +# name: Testsuite (Slurm Plugin, Ubuntu) +# environment: Spindle CI +# runs-on: ubuntu-latest +# timeout-minutes: 12 +# env: +# TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL3: ./plugin_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL4: ./plugin_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# TARBALL5: ./plugin_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 +# strategy: +# fail-fast: false +# matrix: +# debug: [3, 2, 1, 0] +# steps: +# - name: Check out Spindle +# uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 +# +# - name: Setup Docker Compose +# uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 +# with: +# version: latest +# +# - name: Login to GitHub Container Registry +# if: ${{ !env.ACT }} +# uses: 
docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef +# with: +# registry: ghcr.io +# username: ${{ github.actor }} +# password: ${{ secrets.GITHUB_TOKEN }} +# +# - name: Generate MariaDB configuration +# id: slurm-ubuntu-mariadb +# run: | +# cd containers/spindle-slurm-ubuntu/testing-plugin +# ./generate_config.sh +# +# - name: Build spindle-slurm-plugin-ubuntu image +# id: slurm-ubuntu-build +# run: | +# cd containers/spindle-slurm-ubuntu/testing-plugin +# docker compose --progress=plain build +# +# - name: Bring spindle-slurm-plugin-ubuntu up +# id: slurm-ubuntu-up +# run: | +# cd containers/spindle-slurm-ubuntu/testing-plugin +# docker compose up -d --wait --wait-timeout 120 +# +# - name: Verify munge works in spindle-slurm-plugin-ubuntu +# id: slurm-ubuntu-munge +# run: | +# docker exec slurm-plugin-head bash -c 'munge -n | unmunge' +# +## Matrix across "debug" +# - name: Run spindle-slurm-plugin-ubuntu testsuite +# timeout-minutes: 4 +# id: slurm-ubuntu-testsuite +# run: | +# if [ "${{ matrix.debug }}" = "0" ]; then +# docker exec slurm-plugin-head bash -c \ +# 'cd Spindle-build/testsuite && \ +# salloc -n${workers} -N${workers} ./runTests ${workers}' +# else +# docker exec slurm-plugin-head bash -c \ +# 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ +# salloc -n${workers} -N${workers} ./runTests ${workers}' +# fi +## Extract logs on failure +# - name: Pull logs out of the container(s) +# id: plugin-ubuntu-copy-from-container +# if: ${{ failure() && matrix.debug != 0 }} +# continue-on-error: true +# run: | +# docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" +# docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node1.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node2.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-plugin-node-3 bash -c "cd 
/home/slurmuser/Spindle-build && tar cjvf node3.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node4.tar.bz2 ./testsuite/spindle_output*" +# docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf head.tar.bz2 ./testsuite/spindle_output*" +# docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 +# docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 +# docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 +# docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 +# docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH +# +# - name: Upload slurm ubuntu logs +# id: slurm-ubuntu-copy-to-artifact +# if: ${{ failure() && matrix.debug != 0 }} +# uses: actions/upload-artifact@v6 +# with: +# name: Ubuntu slurm logs tarball +# # NOTE: This is the runner path, not the container path. +# path: | +# $TARBALL1 +# $TARBALL2 +# $TARBALL3 +# $TARBALL4 +# $TARBALLH +# +# - name: Bring spindle-slurm-plugin-ubuntu down +# id: slurm-ubuntu-down +# if: ${{ always() }} +# continue-on-error: true +# run: | +# cd containers/spindle-slurm-ubuntu/testing-plugin +# docker compose down +# From 37c508ea0ca5247e9ac25e74fa0f0c620f658fe0 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 01:32:20 -0700 Subject: [PATCH 56/66] Testing hard-coded filename for one DEBUG case --- .github/workflows/ci.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a0c0e006..21f0e0e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: strategy: fail-fast: false matrix: - debug: [3, 2, 1, 0] + debug: [3] # revert! 
steps: - name: Check out Spindle uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -78,7 +78,7 @@ jobs: continue-on-error: true run: | docker exec spindlenode bash -c "tar cjvf /home/spindleuser/ubuntu_serial_logs.tar.bz2 /home/spindleuser/Spindle-build//testsuite/spindle_output*" - docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 $TARBALL + docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs.tar.bz2 - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact @@ -86,8 +86,7 @@ jobs: uses: actions/upload-artifact@v6 with: name: Ubuntu serial logs tarball - # NOTE: This is the runner path, not the container path. - path: $TARBALL + path: ./ubuntu_serial_logs.tar.bz2 - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down From 4d68fc337acc8b6529936e0fce81a2a6a7728e8f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 01:41:28 -0700 Subject: [PATCH 57/66] Gingerly reintroducing a YAML variable. --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21f0e0e5..3064ee7e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,7 +78,7 @@ jobs: continue-on-error: true run: | docker exec spindlenode bash -c "tar cjvf /home/spindleuser/ubuntu_serial_logs.tar.bz2 /home/spindleuser/Spindle-build//testsuite/spindle_output*" - docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs.tar.bz2 + docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs_${{ maxtrix.debug }}.tar.bz2 - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact @@ -86,7 +86,7 @@ jobs: uses: actions/upload-artifact@v6 with: name: Ubuntu serial logs tarball - path: ./ubuntu_serial_logs.tar.bz2 + path: ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down From 
553f8a9586c88e3aa595afdcc5fc339c49c932b9 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 01:46:22 -0700 Subject: [PATCH 58/66] Fix typo --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3064ee7e..da464f3f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,7 +78,7 @@ jobs: continue-on-error: true run: | docker exec spindlenode bash -c "tar cjvf /home/spindleuser/ubuntu_serial_logs.tar.bz2 /home/spindleuser/Spindle-build//testsuite/spindle_output*" - docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs_${{ maxtrix.debug }}.tar.bz2 + docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact From 24d851406838f32da06ad37d2cb50c4f38f8266f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 01:54:28 -0700 Subject: [PATCH 59/66] Gingerly restoring environment variables. 
--- .github/workflows/ci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da464f3f..66b07947 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,7 +78,9 @@ jobs: continue-on-error: true run: | docker exec spindlenode bash -c "tar cjvf /home/spindleuser/ubuntu_serial_logs.tar.bz2 /home/spindleuser/Spindle-build//testsuite/spindle_output*" - docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 + docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ${{ env.TARBALL }} + # This works + #docker cp spindlenode:/home/spindleuser/ubuntu_serial_logs.tar.bz2 ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact @@ -86,7 +88,9 @@ jobs: uses: actions/upload-artifact@v6 with: name: Ubuntu serial logs tarball - path: ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 + path: ${{ env.TARBALL }} + # This works + #path: ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 - name: Bring spindle-serial-ubuntu down id: serial-ubuntu-down From b33298ea36e90ba84916d7d1216b5fd29bb9af27 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 08:36:58 -0700 Subject: [PATCH 60/66] Bring flux back online with lessons learned. --- .github/workflows/ci.yml | 205 ++++++++++++++++++++------------------- 1 file changed, 105 insertions(+), 100 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 66b07947..71d0508a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,10 @@ # FIXME Handle case where only a subset of nodes generate logs. # FIXME More refactoring, possibly with bash functions and filename templates as ENV variables +# Lessons learned: +# Invoking tar via bash -c really wants full paths for both parameters. +# Given that, there's no need for a cd call ahead of it. 
+# YAML env variables don't get translated in bash -c strings. +# Use ${{ env.FOO }} to dereference YAML environment variables. name: ci on: push: @@ -32,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - debug: [3] # revert! + debug: [3] # REVERT! steps: - name: Check out Spindle uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -100,105 +105,105 @@ jobs: cd containers/spindle-serial-ubuntu docker compose down -################################################################################# -## spindle-flux-ubuntu -################################################################################# -# spindle-flux-ubuntu-debug3: -# name: Testsuite (Flux, Ubuntu) -# environment: Spindle CI -# runs-on: ubuntu-latest -# timeout-minutes: 20 -# env: -# TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# strategy: -# fail-fast: false -# matrix: -# debug: [3, 2, 1, 0] -# steps: -# - name: Check out Spindle (spindle-flux-ubuntu-debug3) -# uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd -# -# - name: Setup Docker Compose (spindle-flux-ubuntu-debug3) -# uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 -# with: -# version: latest -# -# - name: Build spindle-flux-ubuntu-debug3 image -# id: flux-ubuntu-build -# run: | -# cd containers/spindle-flux-ubuntu -# docker compose --progress=plain build -# -# - name: Bring spindle-flux-ubuntu up -# id: flux-ubuntu--up -# run: | -# cd containers/spindle-flux-ubuntu -# docker compose up -d --wait --wait-timeout 60 -# -# - name: Verify munge works in spindle-flux-ubuntu-debug3 -# id: flux-ubuntu-debug3-munge -# run: | -# 
docker exec node-1 bash -c 'munge -n | unmunge' -# -# # Observed time: 5m 12s -# - name: Run spindle-flux-ubuntu testsuite -# timeout-minutes: 7 -# id: flux-ubuntu-testsuite -# run: | -# if [ "${{ matrix.debug }}" = "0" ]; then -# docker exec node-1 bash -c \ -# 'cd /home/fluxuser/Spindle-build/testsuite && \ -# flux alloc --nodes=${workers} \ -# ./runTests --nodes=${workers} --tasks-per-node=3' -# else -# docker exec node-1 bash -c \ -# 'cd /home/fluxuser/Spindle-build/testsuite && \ -# SPINDLE_DEBUG=${{ matrix.debug }} \ -# flux alloc --nodes=${workers} \ -# ./runTests --nodes=${workers} --tasks-per-node=3' -# fi -# -# # If we saw any failures, tar up the logfiles for extraction. Observed time: 7m 51s -# - name: Extract logs from spindle-flux-ubuntu -# timeout-minutes: 9 -# id: flux-ubuntu-tar -# if: ${{ failure() && matrix.debug != 0 }} -# continue-on-error: true -# run: | -# docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" -# docker exec node-1 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" -# docker exec node-2 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" -# docker exec node-3 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" -# docker exec node-4 bash -c "cd /home/fluxuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" -# docker cp node-1:/home/fluxuser/Spindle-build/node1.tar.bz2 $TARBALL1 -# docker cp node-2:/home/fluxuser/Spindle-build/node2.tar.bz2 $TARBALL2 -# docker cp node-3:/home/fluxuser/Spindle-build/node3.tar.bz2 $TARBALL3 -# docker cp node-4:/home/fluxuser/Spindle-build/node4.tar.bz2 $TARBALL4 -# -# - name: Upload logs to artifacts -# id: flux-ubuntu-artifact -# if: ${{ failure() && matrix.debug != 0 }} -# uses: actions/upload-artifact@v6 -# with: -# name: Ubuntu flux logs tarball -# # NOTE: This is the runner path, not the container 
path. -# path: | -# $TARBALL1 -# $TARBALL2 -# $TARBALL3 -# $TARBALL4 -# -# - name: spindle-flux-ubuntu teardown container -# id: flux-ubuntu-down -# if: ${{ always() }} -# continue-on-error: true -# run: | -# cd containers/spindle-flux-ubuntu -# docker compose down -# +################################################################################ +# spindle-flux-ubuntu +################################################################################ + spindle-flux-ubuntu: + name: Testsuite (Flux, Ubuntu) + environment: Spindle CI + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + strategy: + fail-fast: false + matrix: + debug: [0, 1, 2, 3] + steps: + - name: Check out Spindle (spindle-flux-ubuntu-debug3) + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + + - name: Setup Docker Compose (spindle-flux-ubuntu-debug3) + uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 + with: + version: latest + + - name: Build spindle-flux-ubuntu-debug3 image + id: flux-ubuntu-build + run: | + cd containers/spindle-flux-ubuntu + docker compose --progress=plain build + + - name: Bring spindle-flux-ubuntu up + id: flux-ubuntu--up + run: | + cd containers/spindle-flux-ubuntu + docker compose up -d --wait --wait-timeout 60 + + - name: Verify munge works in spindle-flux-ubuntu-debug3 + id: flux-ubuntu-debug3-munge + run: | + docker exec node-1 bash -c 'munge -n | unmunge' + + # Observed time: 5m 12s + - name: Run spindle-flux-ubuntu testsuite + timeout-minutes: 7 + 
id: flux-ubuntu-testsuite + run: | + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec node-1 bash -c \ + 'cd /home/fluxuser/Spindle-build/testsuite && \ + flux alloc --nodes=${workers} \ + ./runTests --nodes=${workers} --tasks-per-node=3' + else + docker exec node-1 bash -c \ + 'cd /home/fluxuser/Spindle-build/testsuite && \ + SPINDLE_DEBUG=${{ matrix.debug }} \ + flux alloc --nodes=${workers} \ + ./runTests --nodes=${workers} --tasks-per-node=3' + fi + + # If we saw any failures, tar up the logfiles for extraction. Observed time: 7m 51s + - name: Extract logs from spindle-flux-ubuntu + timeout-minutes: 9 + id: flux-ubuntu-tar + if: ${{ success() && matrix.debug != 0 }} # REVERT! + continue-on-error: true + run: | + docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" + docker exec node-1 bash -c "tar cjvf /home/fluxuser/node1.tar.bz2 /home/fluxuser/Spindle-build/testsuite/spindle_output*" + docker exec node-2 bash -c "tar cjvf /home/fluxuser/node2.tar.bz2 /home/fluxuser/Spindle-build/testsuite/spindle_output*" + docker exec node-3 bash -c "tar cjvf /home/fluxuser/node3.tar.bz2 /home/fluxuser/Spindle-build/testsuite/spindle_output*" + docker exec node-4 bash -c "tar cjvf /home/fluxuser/node4.tar.bz2 /home/fluxuser/Spindle-build/testsuite/spindle_output*" + docker cp node-1:/home/fluxuser/node1.tar.bz2 ${{ env.TARBALL1 }} + docker cp node-2:/home/fluxuser/node2.tar.bz2 ${{ env.TARBALL2 }} + docker cp node-3:/home/fluxuser/node3.tar.bz2 ${{ env.TARBALL3 }} + docker cp node-4:/home/fluxuser/node4.tar.bz2 ${{ env.TARBALL4 }} + + - name: Upload logs to artifacts + id: flux-ubuntu-artifact + if: ${{ success() && matrix.debug != 0 }} # REVERT! + uses: actions/upload-artifact@v6 + with: + name: Ubuntu flux logs tarball + # NOTE: This is the runner path, not the container path. 
+ path: | + ${{ env.TARBALL1 }} + ${{ env.TARBALL2 }} + ${{ env.TARBALL3 }} + ${{ env.TARBALL4 }} + + - name: spindle-flux-ubuntu teardown container + id: flux-ubuntu-down + if: ${{ always() }} + continue-on-error: true + run: | + cd containers/spindle-flux-ubuntu + docker compose down + ################################################################################# ## spindle-slurm-ubuntu ################################################################################# From e46cca2298f002427023144f5b75d0d7b585bdb0 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 08:58:34 -0700 Subject: [PATCH 61/66] Pass unique name to upload action. --- .github/workflows/ci.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 71d0508a..c3424c56 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - debug: [3] # REVERT! + debug: [0, 1, 2, 3] steps: - name: Check out Spindle uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -114,10 +114,11 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 20 env: - TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_${{ github.sha }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + TARBALL1: ./flux_node1_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_sha${{ 
github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + UNIQNAME: ./flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} strategy: fail-fast: false matrix: @@ -188,7 +189,7 @@ jobs: if: ${{ success() && matrix.debug != 0 }} # REVERT! uses: actions/upload-artifact@v6 with: - name: Ubuntu flux logs tarball + name: ${{ env.UNIQNAME }} # NOTE: This is the runner path, not the container path. path: | ${{ env.TARBALL1 }} From 2c5dc7d8fd3bb2ce204136e2dab7f153f7f73864 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 09:19:38 -0700 Subject: [PATCH 62/66] Tweaks. --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c3424c56..b857be36 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: timeout-minutes: 20 env: TARBALL: ./serial_ubunutu_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 + UNIQNAME: flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} strategy: fail-fast: false matrix: @@ -92,7 +93,7 @@ jobs: if: ${{ success() }} # REVERT! 
uses: actions/upload-artifact@v6 with: - name: Ubuntu serial logs tarball + name: ${{ env.UNIQNAME }} path: ${{ env.TARBALL }} # This works #path: ./ubuntu_serial_logs_${{ matrix.debug }}.tar.bz2 @@ -118,7 +119,7 @@ jobs: TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 - UNIQNAME: ./flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} + UNIQNAME: flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} strategy: fail-fast: false matrix: From a66b780cabb4df2d26179ca9fbb26873b949c6e6 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 09:32:33 -0700 Subject: [PATCH 63/66] Unique name fixes. --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b857be36..83965729 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,7 @@ # Given that, there's no need for a cd call ahead of it. # YAML env variables don't get translated in bash -c strings. # Use ${{ env.FOO }} to dereference YAML environment variables. +# The env space is global across jobs. 
 name: ci on: push: @@ -34,7 +34,7 @@ jobs: timeout-minutes: 20 env: TARBALL: ./serial_ubunutu_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 - UNIQNAME: flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} + UNIQNAME: spindle_serial_ubuntu_${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} strategy: fail-fast: false matrix: @@ -119,7 +120,7 @@ TARBALL2: ./flux_node2_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 TARBALL3: ./flux_node3_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 TARBALL4: ./flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 - UNIQNAME: flux_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} + UNIQNAME: spindle_flux_ubuntu_${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} strategy: fail-fast: false matrix: From 0e8f8e21bb31c7779f6971efe62f6c1a6ef0bccf Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 09:56:35 -0700 Subject: [PATCH 64/66] slurm and plugin back online --- .github/workflows/ci.yml | 452 ++++++++++++++++++++------------------- 1 file changed, 227 insertions(+), 225 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83965729..af671581 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -207,228 +207,230 @@ jobs: cd containers/spindle-flux-ubuntu docker compose down -################################################################################# -## spindle-slurm-ubuntu -################################################################################# -# spindle-slurm-ubuntu: -# name: Testsuite (Slurm, Ubuntu) -# environment: Spindle CI -# 
runs-on: ubuntu-latest -# timeout-minutes: 12 -# env: -# TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL3: ./slurm_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL4: ./slurm_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL5: ./slurm_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# strategy: -# fail-fast: false -# matrix: -# debug: [3, 2, 1, 0] -# steps: -# - name: Check out Spindle -# uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd -# -# - name: Setup Docker Compose -# uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 -# with: -# version: latest -# -# - name: Login to GitHub Container Registry -# if: ${{ !env.ACT }} -# uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 -# with: -# registry: ghcr.io -# username: ${{ github.actor }} -# password: ${{ secrets.GITHUB_TOKEN }} -# -# - name: Generate MariaDB configuration -# id: slurm-ubuntu-mariadb -# run: | -# cd containers/spindle-slurm-ubuntu/testing -# ./generate_config.sh -# -# - name: Build spindle-slurm-ubuntu image -# id: slurm-ubuntu-build -# run: | -# cd containers/spindle-slurm-ubuntu/testing -# docker compose --progress=plain build -# -# - name: Bring spindle-slurm-ubuntu up -# id: slurm-ubuntu-up -# run: | -# cd containers/spindle-slurm-ubuntu/testing -# docker compose up -d --wait --wait-timeout 120 -# -# - name: Verify munge works in spindle-slurm-ubuntu -# id: slurm-ubuntu-munge -# run: | -# docker exec slurm-head bash -c 'munge -n | unmunge' -# -## Matrix across "debug" -# - name: Run spindle-slurm-ubuntu testsuite -# timeout-minutes: 4 -# id: slurm-ubuntu-debug3-testsuite -# run: | -# if [ "${{ matrix.debug }}" = "0" ]; then -# docker exec slurm-head 
bash -c \ -# 'cd Spindle-build/testsuite && \ -# salloc -n${workers} -N${workers} ./runTests ${workers}' -# else -# docker exec slurm-head bash -c \ -# 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ -# salloc -n${workers} -N${workers} ./runTests ${workers}' -# fi -# -## Extract logs on failure -# - name: Pull logs out of the container(s) -# id: slurm-ubuntu-copy-from-container -# if: ${{ failure() && matrix.debug != 0 }} -# continue-on-error: true -# run: | -# docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" -# docker exec slurm-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node1.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node2.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node3.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./node4.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf ./head.tar.bz2 ./testsuite/spindle_output*" -# docker cp slurm-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 -# docker cp slurm-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 -# docker cp slurm-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 -# docker cp slurm-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 -# docker cp slurm-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH -# -# - name: Upload slurm ubuntu logs -# id: slurm-ubuntu-copy-to-artifact -# if: ${{ failure() && matrix.debug != 0 }} -# uses: actions/upload-artifact@v6 -# with: -# name: Ubuntu slurm logs tarball -# # NOTE: This is the runner path, not the container path. 
-# path: | -# $TARBALL1 -# $TARBALL2 -# $TARBALL3 -# $TARBALL4 -# $TARBALLH -# -# - name: Bring spindle-slurm-ubuntu down -# id: slurm-ubuntu-down -# if: ${{ always() }} -# continue-on-error: true -# run: | -# cd containers/spindle-slurm-ubuntu/testing -# docker compose down -# -################################################################################# -## spindle-slurm-plugin-ubuntu -################################################################################# -# spindle-slurm-plugin-ubuntu: -# name: Testsuite (Slurm Plugin, Ubuntu) -# environment: Spindle CI -# runs-on: ubuntu-latest -# timeout-minutes: 12 -# env: -# TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL3: ./plugin_node3_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL4: ./plugin_node4_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# TARBALL5: ./plugin_nodeH_dbg${{ matrix.debug }}_${{ github.run_id }}_${{ github.run_attempt }}.tar.bz2 -# strategy: -# fail-fast: false -# matrix: -# debug: [3, 2, 1, 0] -# steps: -# - name: Check out Spindle -# uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 -# -# - name: Setup Docker Compose -# uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 -# with: -# version: latest -# -# - name: Login to GitHub Container Registry -# if: ${{ !env.ACT }} -# uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef -# with: -# registry: ghcr.io -# username: ${{ github.actor }} -# password: ${{ secrets.GITHUB_TOKEN }} -# -# - name: Generate MariaDB configuration -# id: slurm-ubuntu-mariadb -# run: | -# cd containers/spindle-slurm-ubuntu/testing-plugin -# ./generate_config.sh -# -# - name: Build spindle-slurm-plugin-ubuntu image -# id: slurm-ubuntu-build -# run: | -# cd 
containers/spindle-slurm-ubuntu/testing-plugin -# docker compose --progress=plain build -# -# - name: Bring spindle-slurm-plugin-ubuntu up -# id: slurm-ubuntu-up -# run: | -# cd containers/spindle-slurm-ubuntu/testing-plugin -# docker compose up -d --wait --wait-timeout 120 -# -# - name: Verify munge works in spindle-slurm-plugin-ubuntu -# id: slurm-ubuntu-munge -# run: | -# docker exec slurm-plugin-head bash -c 'munge -n | unmunge' -# -## Matrix across "debug" -# - name: Run spindle-slurm-plugin-ubuntu testsuite -# timeout-minutes: 4 -# id: slurm-ubuntu-testsuite -# run: | -# if [ "${{ matrix.debug }}" = "0" ]; then -# docker exec slurm-plugin-head bash -c \ -# 'cd Spindle-build/testsuite && \ -# salloc -n${workers} -N${workers} ./runTests ${workers}' -# else -# docker exec slurm-plugin-head bash -c \ -# 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ -# salloc -n${workers} -N${workers} ./runTests ${workers}' -# fi -## Extract logs on failure -# - name: Pull logs out of the container(s) -# id: plugin-ubuntu-copy-from-container -# if: ${{ failure() && matrix.debug != 0 }} -# continue-on-error: true -# run: | -# docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" -# docker exec slurm-plugin-node-1 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node1.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-plugin-node-2 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node2.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-plugin-node-3 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node3.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-plugin-node-4 bash -c "cd /home/slurmuser/Spindle-build && tar cjvf node4.tar.bz2 ./testsuite/spindle_output*" -# docker exec slurm-plugin-head bash -c "cd /home/slurmuser/Spindle-build && tar cjvf head.tar.bz2 ./testsuite/spindle_output*" -# docker cp slurm-plugin-node-1:/home/slurmuser/Spindle-build/node1.tar.bz2 $TARBALL1 -# 
docker cp slurm-plugin-node-2:/home/slurmuser/Spindle-build/node2.tar.bz2 $TARBALL2 -# docker cp slurm-plugin-node-3:/home/slurmuser/Spindle-build/node3.tar.bz2 $TARBALL3 -# docker cp slurm-plugin-node-4:/home/slurmuser/Spindle-build/node4.tar.bz2 $TARBALL4 -# docker cp slurm-plugin-head:/home/slurmuser/Spindle-build/head.tar.bz2 $TARBALLH -# -# - name: Upload slurm ubuntu logs -# id: slurm-ubuntu-copy-to-artifact -# if: ${{ failure() && matrix.debug != 0 }} -# uses: actions/upload-artifact@v6 -# with: -# name: Ubuntu slurm logs tarball -# # NOTE: This is the runner path, not the container path. -# path: | -# $TARBALL1 -# $TARBALL2 -# $TARBALL3 -# $TARBALL4 -# $TARBALLH -# -# - name: Bring spindle-slurm-plugin-ubuntu down -# id: slurm-ubuntu-down -# if: ${{ always() }} -# continue-on-error: true -# run: | -# cd containers/spindle-slurm-ubuntu/testing-plugin -# docker compose down -# +################################################################################ +# spindle-slurm-ubuntu +################################################################################ + spindle-slurm-ubuntu: + name: Testsuite (Slurm, Ubuntu) + environment: Spindle CI + runs-on: ubuntu-latest + timeout-minutes: 12 + env: + TARBALL1: ./slurm_node1_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./slurm_node2_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./slurm_node3_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./slurm_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL5: ./slurm_nodeH_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + UNIQNAME: spindle_slurm_ubuntu_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id 
}}_attempt${{ github.run_attempt }} + strategy: + fail-fast: false + matrix: + debug: [3, 2, 1, 0] + steps: + - name: Check out Spindle + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + + - name: Setup Docker Compose + uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 + with: + version: latest + + - name: Login to GitHub Container Registry + if: ${{ !env.ACT }} + uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate MariaDB configuration + id: slurm-ubuntu-mariadb + run: | + cd containers/spindle-slurm-ubuntu/testing + ./generate_config.sh + + - name: Build spindle-slurm-ubuntu image + id: slurm-ubuntu-build + run: | + cd containers/spindle-slurm-ubuntu/testing + docker compose --progress=plain build + + - name: Bring spindle-slurm-ubuntu up + id: slurm-ubuntu-up + run: | + cd containers/spindle-slurm-ubuntu/testing + docker compose up -d --wait --wait-timeout 120 + + - name: Verify munge works in spindle-slurm-ubuntu + id: slurm-ubuntu-munge + run: | + docker exec slurm-head bash -c 'munge -n | unmunge' + +# Matrix across "debug" + - name: Run spindle-slurm-ubuntu testsuite + timeout-minutes: 4 + id: slurm-ubuntu-debug3-testsuite + run: | + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec slurm-head bash -c \ + 'cd Spindle-build/testsuite && \ + salloc -n${workers} -N${workers} ./runTests ${workers}' + else + docker exec slurm-head bash -c \ + 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ + salloc -n${workers} -N${workers} ./runTests ${workers}' + fi + +# Extract logs on failure + - name: Pull logs out of the container(s) + id: slurm-ubuntu-copy-from-container + if: ${{ success() && matrix.debug != 0 }} # REVERT! 
+ continue-on-error: true + run: | + docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-node-1 bash -c "tar cjvf /home/slurmuser/node1.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-node-2 bash -c "tar cjvf /home/slurmuser/node2.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-node-3 bash -c "tar cjvf /home/slurmuser/node3.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-node-4 bash -c "tar cjvf /home/slurmuser/node4.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-head bash -c "tar cjvf /home/slurmuser/head.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker cp slurm-node-1:/home/slurmuser/node1.tar.bz2 ${{ env.TARBALL1 }} + docker cp slurm-node-2:/home/slurmuser/node2.tar.bz2 ${{ env.TARBALL2 }} + docker cp slurm-node-3:/home/slurmuser/node3.tar.bz2 ${{ env.TARBALL3 }} + docker cp slurm-node-4:/home/slurmuser/node4.tar.bz2 ${{ env.TARBALL4 }} + docker cp slurm-head:/home/slurmuser/head.tar.bz2 ${{ env.TARBALLH }} + + - name: Upload slurm ubuntu logs + id: slurm-ubuntu-copy-to-artifact + if: ${{ success() && matrix.debug != 0 }} # REVERT + uses: actions/upload-artifact@v6 + with: + name: ${{ env.UNIQNAME }} + # NOTE: This is the runner path, not the container path. 
+ path: | + ${{ env.TARBALL1 }} + ${{ env.TARBALL2 }} + ${{ env.TARBALL3 }} + ${{ env.TARBALL4 }} + ${{ env.TARBALLH }} + + - name: Bring spindle-slurm-ubuntu down + id: slurm-ubuntu-down + if: ${{ always() }} + continue-on-error: true + run: | + cd containers/spindle-slurm-ubuntu/testing + docker compose down + +################################################################################ +# spindle-slurm-plugin-ubuntu +################################################################################ + spindle-slurm-plugin-ubuntu: + name: Testsuite (Slurm Plugin, Ubuntu) + environment: Spindle CI + runs-on: ubuntu-latest + timeout-minutes: 12 + env: + TARBALL1: ./plugin_node1_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL2: ./plugin_node2_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL3: ./plugin_node3_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL4: ./plugin_node4_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + TARBALL5: ./plugin_nodeH_dbg${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }}.tar.bz2 + UNIQNAME: spindle_plugin_ubuntu_${{ matrix.debug }}_sha${{ github.sha }}_run${{ github.run_id }}_attempt${{ github.run_attempt }} + strategy: + fail-fast: false + matrix: + debug: [3, 2, 1, 0] + steps: + - name: Check out Spindle + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 + + - name: Setup Docker Compose + uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746 + with: + version: latest + + - name: Login to GitHub Container Registry + if: ${{ !env.ACT }} + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ 
secrets.GITHUB_TOKEN }} + + - name: Generate MariaDB configuration + id: slurm-ubuntu-mariadb + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + ./generate_config.sh + + - name: Build spindle-slurm-plugin-ubuntu image + id: slurm-ubuntu-build + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose --progress=plain build + + - name: Bring spindle-slurm-plugin-ubuntu up + id: slurm-ubuntu-up + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose up -d --wait --wait-timeout 120 + + - name: Verify munge works in spindle-slurm-plugin-ubuntu + id: slurm-ubuntu-munge + run: | + docker exec slurm-plugin-head bash -c 'munge -n | unmunge' + +# Matrix across "debug" + - name: Run spindle-slurm-plugin-ubuntu testsuite + timeout-minutes: 4 + id: slurm-ubuntu-testsuite + run: | + if [ "${{ matrix.debug }}" = "0" ]; then + docker exec slurm-plugin-head bash -c \ + 'cd Spindle-build/testsuite && \ + salloc -n${workers} -N${workers} ./runTests ${workers}' + else + docker exec slurm-plugin-head bash -c \ + 'cd Spindle-build/testsuite && SPINDLE_DEBUG=${{ matrix.debug }} \ + salloc -n${workers} -N${workers} ./runTests ${workers}' + fi +# Extract logs on failure + - name: Pull logs out of the container(s) + id: plugin-ubuntu-copy-from-container + if: ${{ success() && matrix.debug != 0 }} # REVERT! 
+ continue-on-error: true + run: | + docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-node-1 bash -c "tar cjvf /home/slurmuser/node1.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-node-2 bash -c "tar cjvf /home/slurmuser/node2.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-node-3 bash -c "tar cjvf /home/slurmuser/node3.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-node-4 bash -c "tar cjvf /home/slurmuser/node4.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-head bash -c "tar cjvf /home/slurmuser/head.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker cp slurm-plugin-node-1:/home/slurmuser/node1.tar.bz2 ${{ env.TARBALL1 }} + docker cp slurm-plugin-node-2:/home/slurmuser/node2.tar.bz2 ${{ env.TARBALL2 }} + docker cp slurm-plugin-node-3:/home/slurmuser/node3.tar.bz2 ${{ env.TARBALL3 }} + docker cp slurm-plugin-node-4:/home/slurmuser/node4.tar.bz2 ${{ env.TARBALL4 }} + docker cp slurm-plugin-head:/home/slurmuser/head.tar.bz2 ${{ env.TARBALLH }} + + - name: Upload slurm ubuntu logs + id: slurm-ubuntu-copy-to-artifact + if: ${{ success() && matrix.debug != 0 }} # REVERT! + uses: actions/upload-artifact@v6 + with: + name: ${{ env.UNIQNAME }} + # NOTE: This is the runner path, not the container path. + path: | + ${{ env.TARBALL1 }} + ${{ env.TARBALL2 }} + ${{ env.TARBALL3 }} + ${{ env.TARBALL4 }} + ${{ env.TARBALLH }} + + - name: Bring spindle-slurm-plugin-ubuntu down + id: slurm-ubuntu-down + if: ${{ always() }} + continue-on-error: true + run: | + cd containers/spindle-slurm-ubuntu/testing-plugin + docker compose down + From b115678a5fc264a55d679e3317b2531e6ee6329f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 10:11:43 -0700 Subject: [PATCH 65/66] Removed debugging code. 
--- .github/workflows/ci.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index af671581..d1a86d97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,7 +81,7 @@ jobs: # If we saw any failures, tar up the logfiles for extraction. - name: On failure, pull logs out of the container(s) id: serial-ubuntu-copy-from-container - if: ${{ success() && matrix.debug != 0}} # REVERT! + if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec spindlenode bash -c "tar cjvf /home/spindleuser/ubuntu_serial_logs.tar.bz2 /home/spindleuser/Spindle-build//testsuite/spindle_output*" @@ -91,7 +91,7 @@ jobs: - name: Upload ubuntu-serial logs id: serial-ubuntu-copy-to-artifact - if: ${{ success() }} # REVERT! + if: ${{ failure() && matrix.debug != 0 }} uses: actions/upload-artifact@v6 with: name: ${{ env.UNIQNAME }} @@ -173,7 +173,7 @@ jobs: - name: Extract logs from spindle-flux-ubuntu timeout-minutes: 9 id: flux-ubuntu-tar - if: ${{ success() && matrix.debug != 0 }} # REVERT! + if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec node-1 bash -c "ls /home/fluxuser/Spindle-build/testsuite/spindle_output*" @@ -188,7 +188,7 @@ jobs: - name: Upload logs to artifacts id: flux-ubuntu-artifact - if: ${{ success() && matrix.debug != 0 }} # REVERT! + if: ${{ failure() && matrix.debug != 0 }} uses: actions/upload-artifact@v6 with: name: ${{ env.UNIQNAME }} @@ -284,7 +284,7 @@ jobs: # Extract logs on failure - name: Pull logs out of the container(s) id: slurm-ubuntu-copy-from-container - if: ${{ success() && matrix.debug != 0 }} # REVERT! 
+ if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec slurm-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" @@ -301,7 +301,7 @@ jobs: - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact - if: ${{ success() && matrix.debug != 0 }} # REVERT + if: ${{ failure() && matrix.debug != 0 }} uses: actions/upload-artifact@v6 with: name: ${{ env.UNIQNAME }} @@ -397,7 +397,7 @@ jobs: # Extract logs on failure - name: Pull logs out of the container(s) id: plugin-ubuntu-copy-from-container - if: ${{ success() && matrix.debug != 0 }} # REVERT! + if: ${{ failure() && matrix.debug != 0 }} continue-on-error: true run: | docker exec slurm-plugin-node-1 bash -c "ls /home/slurmuser/Spindle-build/testsuite/spindle_output*" @@ -414,7 +414,7 @@ jobs: - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact - if: ${{ success() && matrix.debug != 0 }} # REVERT! + if: ${{ failure() && matrix.debug != 0 }} uses: actions/upload-artifact@v6 with: name: ${{ env.UNIQNAME }} From f5787ca01f2a840602f5f09b52fe51a478022036 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 13 Mar 2026 10:32:30 -0700 Subject: [PATCH 66/66] Handle absent log in head node cases. 
--- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d1a86d97..e14c5ee1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -292,12 +292,12 @@ jobs: docker exec slurm-node-2 bash -c "tar cjvf /home/slurmuser/node2.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" docker exec slurm-node-3 bash -c "tar cjvf /home/slurmuser/node3.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" docker exec slurm-node-4 bash -c "tar cjvf /home/slurmuser/node4.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-head bash -c "tar cjvf /home/slurmuser/head.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-head bash -c "tar cjvf /home/slurmuser/head.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output* || /bin/true " docker cp slurm-node-1:/home/slurmuser/node1.tar.bz2 ${{ env.TARBALL1 }} docker cp slurm-node-2:/home/slurmuser/node2.tar.bz2 ${{ env.TARBALL2 }} docker cp slurm-node-3:/home/slurmuser/node3.tar.bz2 ${{ env.TARBALL3 }} docker cp slurm-node-4:/home/slurmuser/node4.tar.bz2 ${{ env.TARBALL4 }} - docker cp slurm-head:/home/slurmuser/head.tar.bz2 ${{ env.TARBALLH }} + docker cp slurm-head:/home/slurmuser/head.tar.bz2 ${{ env.TARBALLH }} || /bin/true - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact @@ -405,12 +405,12 @@ jobs: docker exec slurm-plugin-node-2 bash -c "tar cjvf /home/slurmuser/node2.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" docker exec slurm-plugin-node-3 bash -c "tar cjvf /home/slurmuser/node3.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" docker exec slurm-plugin-node-4 bash -c "tar cjvf /home/slurmuser/node4.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output*" - docker exec slurm-plugin-head bash -c "tar cjvf /home/slurmuser/head.tar.bz2 
/home/slurmuser/Spindle-build/testsuite/spindle_output*" + docker exec slurm-plugin-head bash -c "tar cjvf /home/slurmuser/head.tar.bz2 /home/slurmuser/Spindle-build/testsuite/spindle_output* || /bin/true" docker cp slurm-plugin-node-1:/home/slurmuser/node1.tar.bz2 ${{ env.TARBALL1 }} docker cp slurm-plugin-node-2:/home/slurmuser/node2.tar.bz2 ${{ env.TARBALL2 }} docker cp slurm-plugin-node-3:/home/slurmuser/node3.tar.bz2 ${{ env.TARBALL3 }} docker cp slurm-plugin-node-4:/home/slurmuser/node4.tar.bz2 ${{ env.TARBALL4 }} - docker cp slurm-plugin-head:/home/slurmuser/head.tar.bz2 ${{ env.TARBALLH }} + docker cp slurm-plugin-head:/home/slurmuser/head.tar.bz2 ${{ env.TARBALLH }} || /bin/true - name: Upload slurm ubuntu logs id: slurm-ubuntu-copy-to-artifact