From ee93a00f09c2b280d33712ff73d2d12d46ed1a56 Mon Sep 17 00:00:00 2001 From: skarakuzu Date: Tue, 4 Nov 2025 11:37:18 -0500 Subject: [PATCH 1/6] started implementing slurm job submission from the gui --- src/nsls2ptycho/core/ptycho_recon.py | 41 ++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/nsls2ptycho/core/ptycho_recon.py b/src/nsls2ptycho/core/ptycho_recon.py index eb6c612..58cdf5c 100755 --- a/src/nsls2ptycho/core/ptycho_recon.py +++ b/src/nsls2ptycho/core/ptycho_recon.py @@ -10,6 +10,10 @@ import traceback import time + +import requests +import json + from .databroker_api import load_metadata, save_data from .utils import use_mpi_machinefile, set_flush_early from .ptycho.utils import save_config @@ -98,7 +102,37 @@ def clear_slurm_header(self): os.remove(slurm_header) except: pass - + + def submit_job(self): + url = "https://orion-api-staging.nsls2.bnl.gov/api/v1/compute/orion/jobs" + bearer_token = os.getenv("SLURM_JWT") + + payload = { + "script": "#!/bin/bash -l\n" + "#SBATCH --job-name=trial\n" + "#SBATCH --time=0-00:10:00\n" + "#SBATCH --nodes=1\n" + "#SBATCH --ntasks-per-node=2\n" + "#SBATCH --error=%x.err\n" + "#SBATCH --output=%x.out\n\n" + "module purge\n" + "module load beamline-aliases\n\n" + "load-hxn\n\n" + "srun python solve.py", + "environment": ["PATH=/usr/bin:/bin:/usr/sbin:/sbin", "SLURM_EXPORT_ENV=ALL"], + "working_dir_path": "/nsls2/users/skarakuzu1/orion_ptycho" + } + + print(payload) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {bearer_token}" + } + + response = requests.post(url, headers=headers, json=payload) + print("response is ", response) + + def recon_remote(self, param:Param, update_fcn=None): self.fname_full = os.path.join(self.remote_path,'ptycho_'+str(param.scan_num)+'_'+param.sign) @@ -119,6 +153,9 @@ def recon_remote(self, param:Param, update_fcn=None): self.return_value = 0 # Assume the recon will succeed unless later detects failure and modify it. + print("Submitting job from the gui") + self.submit_job() + # try: time.sleep(1) out = self.msg.readlines() @@ -157,7 +194,7 @@ def recon_remote(self, param:Param, update_fcn=None): # pass def run(self): - print('Ptycho thread started') + print('Ptycho thread started helloooo***') try: self.recon_remote(self.param, self.update_signal.emit) except IndexError: From f58758163cbfac987c3caa6b8d0753e768fe5b2e Mon Sep 17 00:00:00 2001 From: skarakuzu Date: Tue, 4 Nov 2025 13:14:33 -0500 Subject: [PATCH 2/6] change source file --- src/nsls2ptycho/core/ptycho_recon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nsls2ptycho/core/ptycho_recon.py b/src/nsls2ptycho/core/ptycho_recon.py index 58cdf5c..5434f5d 100755 --- a/src/nsls2ptycho/core/ptycho_recon.py +++ b/src/nsls2ptycho/core/ptycho_recon.py @@ -117,7 +117,7 @@ def submit_job(self): "#SBATCH --output=%x.out\n\n" "module purge\n" "module load beamline-aliases\n\n" - "load-hxn\n\n" + "source load-hxn\n\n" "srun python solve.py", "environment": ["PATH=/usr/bin:/bin:/usr/sbin:/sbin", "SLURM_EXPORT_ENV=ALL"], "working_dir_path": "/nsls2/users/skarakuzu1/orion_ptycho" From 42d9ddae2b85e675d3fbdaef92fd84a8fef5e362 Mon Sep 17 00:00:00 2001 From: skarakuzu Date: Wed, 3 Dec 2025 19:34:41 -0500 Subject: [PATCH 3/6] job summission and deletion works but orion-api still directs the jobs to cpu nodes. to be continued ... 
--- src/nsls2ptycho/core/ptycho_recon.py | 97 +++++++++++++++++++--------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/src/nsls2ptycho/core/ptycho_recon.py b/src/nsls2ptycho/core/ptycho_recon.py index 5434f5d..920541f 100755 --- a/src/nsls2ptycho/core/ptycho_recon.py +++ b/src/nsls2ptycho/core/ptycho_recon.py @@ -18,6 +18,65 @@ from .utils import use_mpi_machinefile, set_flush_early from .ptycho.utils import save_config + +class RemoteJobHandler: + def __init__(self): + self.url = "https://orion-api-staging.nsls2.bnl.gov/api/v1/compute/orion/jobs" + self.api_key = os.getenv("APIKEY") + self.headers = { + "Content-Type": "application/json", + "x-api-key": f"{self.api_key}" + } + self.params = {"expand_info": False} + + def submit_job(self, remote_path, param): + print("inside submit job with path ", remote_path) + print("inside submit job num_gpus ", param.gpus) + + #remote_path = "/nsls2/users/skarakuzu1/orion_ptycho" + + srun_command = "python -W ignore -m nsls2ptycho.core.ptycho.recon_ptycho_gui /nsls2/users/skarakuzu1/ptycho_test/remote_orion/ptycho_320045_t1" + #srun_command = "python solve.py" + + payload = { + "script": "#!/bin/bash -l\n" + "#SBATCH --job-name=trial\n" + "#SBATCH --partition=normal\n" + f"#SBATCH --gres=gpu:{len(param.gpus)}\n" + "#SBATCH --time=0-00:10:00\n" + f"#SBATCH --ntasks={len(param.gpus)}\n" + "#SBATCH --gpus-per-task=1\n" + "#SBATCH --error=%x.err\n" + "#SBATCH --output=%x.out\n\n" + "conda activate /nsls2/conda/envs/2025-2.0-py311-tiled\n\n" + "module load orion/gpu\n\n" + f"srun {srun_command}", + "environment": ["PATH=/usr/bin:/bin:/usr/sbin:/sbin", "HOME=/nsls2/users/skarakuzu1", "SLURM_EXPORT_ENV=ALL"], + "working_dir_path": f"{remote_path}" + } + print("printing payload") + print(payload) + sys.stdout.flush() + + response = requests.post(self.url, headers=self.headers, json=payload) + resp_json = response.json() + print("response is ", response.status_code, resp_json) + + if response.status_code == 200: + self.remote_job_id = resp_json['job_id'] + print("job_id is ", self.remote_job_id) + + return response.status_code + + + def cancel_job(self): + response = requests.delete(f"{self.url}/{self.remote_job_id}", headers=self.headers, params=self.params) + + resp_json = response.json() + print("response is ", response.status_code, resp_json) + + return response.status_code + class PtychoReconRemote(QtCore.QThread): update_signal = QtCore.pyqtSignal(int, object) # (interation number, chi arrays) @@ -38,6 +97,8 @@ def __init__(self, param:Param=None, parent=None): pass self.msg = open(self.msg_file,'r') self.msg.readlines() + + self.remote_job_handler = RemoteJobHandler() def _parse_message(self, tokens): def _parser(current, upper_limit, target_list): @@ -103,35 +164,6 @@ def clear_slurm_header(self): except: pass - def submit_job(self): - url = "https://orion-api-staging.nsls2.bnl.gov/api/v1/compute/orion/jobs" - bearer_token = os.getenv("SLURM_JWT") - - payload = { - "script": "#!/bin/bash -l\n" - "#SBATCH --job-name=trial\n" - "#SBATCH --time=0-00:10:00\n" - "#SBATCH --nodes=1\n" - "#SBATCH --ntasks-per-node=2\n" - "#SBATCH --error=%x.err\n" - "#SBATCH --output=%x.out\n\n" - "module purge\n" - "module load beamline-aliases\n\n" - "source load-hxn\n\n" - "srun python solve.py", - "environment": ["PATH=/usr/bin:/bin:/usr/sbin:/sbin", "SLURM_EXPORT_ENV=ALL"], - "working_dir_path": "/nsls2/users/skarakuzu1/orion_ptycho" - } - - print(payload) - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {bearer_token}" - 
} - - response = requests.post(url, headers=headers, json=payload) - print("response is ", response) - def recon_remote(self, param:Param, update_fcn=None): @@ -153,8 +185,10 @@ def recon_remote(self, param:Param, update_fcn=None): self.return_value = 0 # Assume the recon will succeed unless later detects failure and modify it. + #bearer_token = os.getenv("SLURM_JWT") print("Submitting job from the gui") - self.submit_job() + status = self.remote_job_handler.submit_job(self.remote_path, param) + # try: time.sleep(1) @@ -210,9 +244,12 @@ def run(self): finally: self.clear_slurm_header() + print("Cancelling from the gui") + self.remote_job_handler.cancel_job() print('finally?') def kill(self): + print("In the kill section") if os.path.isdir(self.remote_path): with open(os.path.join(self.remote_path,'abort'),'w') as f: pass From ad381714ad9dc6893367ff05b055282fb89637ff Mon Sep 17 00:00:00 2001 From: skarakuzu Date: Tue, 9 Dec 2025 13:23:53 -0500 Subject: [PATCH 4/6] changed submission. now can get jobid status and also cancel. to be continued --- src/nsls2ptycho/core/ptycho_recon.py | 115 +++++++++++++++++++++------ 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/src/nsls2ptycho/core/ptycho_recon.py b/src/nsls2ptycho/core/ptycho_recon.py index 920541f..424b30d 100755 --- a/src/nsls2ptycho/core/ptycho_recon.py +++ b/src/nsls2ptycho/core/ptycho_recon.py @@ -38,22 +38,38 @@ def submit_job(self, remote_path, param): srun_command = "python -W ignore -m nsls2ptycho.core.ptycho.recon_ptycho_gui /nsls2/users/skarakuzu1/ptycho_test/remote_orion/ptycho_320045_t1" #srun_command = "python solve.py" + overrides = { + "name": "trial", + "partition": "normal", + "tasks": f"{len(param.gpus)}", + "time_limit": 720, + "tres_per_task": "cpu=1,gres/gpu=1", + "standard_output": "trial.out", + "standard_error": "trial.err", + } + payload = { - "script": "#!/bin/bash -l\n" - "#SBATCH --job-name=trial\n" - "#SBATCH --partition=normal\n" - f"#SBATCH --gres=gpu:{len(param.gpus)}\n" - "#SBATCH --time=0-00:10:00\n" - f"#SBATCH --ntasks={len(param.gpus)}\n" - "#SBATCH --gpus-per-task=1\n" - "#SBATCH --error=%x.err\n" - "#SBATCH --output=%x.out\n\n" - "conda activate /nsls2/conda/envs/2025-2.0-py311-tiled\n\n" - "module load orion/gpu\n\n" - f"srun {srun_command}", - "environment": ["PATH=/usr/bin:/bin:/usr/sbin:/sbin", "HOME=/nsls2/users/skarakuzu1", "SLURM_EXPORT_ENV=ALL"], - "working_dir_path": f"{remote_path}" + "script": ( + "#!/bin/bash -l\n" + "module load orion/gpu\n" + "module unload openmpi\n" + "conda activate /nsls2/conda/envs/2025-2.0-py311-tiled/\n" + "nvidia-smi\n" + "echo $(pwd)\n" + "echo $(which mpicc)\n" + #f"mpirun -n 2 {srun_command}\n" + f"srun --mpi=pmix {srun_command}\n" + ), + "working_dir_path": f"{remote_path}", + "environment": [ + "PATH=/usr/bin:/bin:/usr/sbin:/sbin", + "HOME=/nsls2/users/skarakuzu1", + "SLURM_EXPORT_ENV=ALL", + ], + "overrides": overrides, } + + print("printing payload") print(payload) sys.stdout.flush() @@ -66,6 +82,7 @@ def submit_job(self, remote_path, param): self.remote_job_id = resp_json['job_id'] print("job_id is ", self.remote_job_id) + return response.status_code @@ -77,6 +94,17 @@ def cancel_job(self): return response.status_code + def get_job_status(self): + response = requests.get(f"{self.url}/{self.remote_job_id}", headers=self.headers, params=self.params) + + resp_json = response.json() + print("response is ", response.status_code, resp_json) + + if response.status_code == 200: + state = resp_json["jobs"][0]["state"][0] + return state + + class 
PtychoReconRemote(QtCore.QThread): update_signal = QtCore.pyqtSignal(int, object) # (interation number, chi arrays) @@ -186,24 +214,59 @@ def recon_remote(self, param:Param, update_fcn=None): self.return_value = 0 # Assume the recon will succeed unless later detects failure and modify it. #bearer_token = os.getenv("SLURM_JWT") - print("Submitting job from the gui") status = self.remote_job_handler.submit_job(self.remote_path, param) + print("Submitted job from the gui") # try: + #time.sleep(1) + #while not out: + # print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) + # time.sleep(1) + # out = self.msg.readlines() + # if os.path.isfile(os.path.join(self.remote_path,'abort')): + # os.remove(os.path.join(self.remote_path,'abort')) + # if os.path.isfile(os.path.join(self.remote_path,'msg')): + # os.remove(os.path.join(self.remote_path,'msg')) + # if os.path.isfile(self.fname_full): + # os.remove(self.fname_full) + # raise Exception('Remote recon aborted...') + + while self.remote_job_handler.get_job_status() != "RUNNING": + print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) + + time.sleep(1) - out = self.msg.readlines() + print("DEBUG: Attempting to read job output...") + + file_name = f"slurm-{self.remote_job_handler.remote_job_id}.out" + msg_file = os.path.join(self.remote_path, file_name) + + print("DEBUG: msg_file =", msg_file) + print("DEBUG: exists? ", os.path.exists(msg_file)) + + try: + with open(msg_file, "r") as f: + print("DEBUG: opened successfully") + print(f.read()) + except Exception as e: + print("DEBUG: ERROR opening file:", e) + + msg = open(msg_file, "r") + out = None while not out: print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) time.sleep(1) - out = self.msg.readlines() - if os.path.isfile(os.path.join(self.remote_path,'abort')): - os.remove(os.path.join(self.remote_path,'abort')) - if os.path.isfile(os.path.join(self.remote_path,'msg')): - os.remove(os.path.join(self.remote_path,'msg')) - if os.path.isfile(self.fname_full): - os.remove(self.fname_full) - raise Exception('Remote recon aborted...') + out = msg.readlines() + #if os.path.isfile(os.path.join(self.remote_path,'abort')): + # os.remove(os.path.join(self.remote_path,'abort')) + # if os.path.isfile(os.path.join(self.remote_path,'msg')): + # os.remove(os.path.join(self.remote_path,'msg')) + # if os.path.isfile(self.fname_full): + # os.remove(self.fname_full) + # raise Exception('Remote recon aborted...') + + while True: for line in out: @@ -216,12 +279,12 @@ def recon_remote(self, param:Param, update_fcn=None): #print(result['probe_chi']) if 'aborted' in line: self.return_value = 1 # Aborted - if not os.path.isfile(self.fname_full): break time.sleep(0.1) - out = self.msg.readlines() + out = msg.readlines() + #out = self.msg.readlines() # except: # pass # finally: From ecaed30b7533902a03d163fadbdc396f40cbc1cc Mon Sep 17 00:00:00 2001 From: skarakuzu Date: Wed, 10 Dec 2025 12:31:48 -0500 Subject: [PATCH 5/6] clean a bit more and fix GUI abort issue --- src/nsls2ptycho/core/ptycho_recon.py | 112 ++++++++++++++------------- 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/src/nsls2ptycho/core/ptycho_recon.py b/src/nsls2ptycho/core/ptycho_recon.py index 424b30d..a051ddc 100755 --- a/src/nsls2ptycho/core/ptycho_recon.py +++ b/src/nsls2ptycho/core/ptycho_recon.py @@ -29,14 +29,11 @@ def __init__(self): } self.params = {"expand_info": False} - def submit_job(self, remote_path, param): - print("inside submit 
job with path ", remote_path) - print("inside submit job num_gpus ", param.gpus) + def submit_job(self, remote_path, parent_module, param): - #remote_path = "/nsls2/users/skarakuzu1/orion_ptycho" - - srun_command = "python -W ignore -m nsls2ptycho.core.ptycho.recon_ptycho_gui /nsls2/users/skarakuzu1/ptycho_test/remote_orion/ptycho_320045_t1" - #srun_command = "python solve.py" + fname_full = os.path.join(remote_path,'ptycho_'+str(param.scan_num)+'_'+param.sign) + srun_command = "python " + "-W " + "ignore " + "-m " + parent_module + ".ptycho.recon_ptycho_gui " + fname_full + #srun_command = "python -W ignore -m nsls2ptycho.core.ptycho.recon_ptycho_gui /nsls2/users/skarakuzu1/ptycho_test/remote_orion/ptycho_320045_t1" overrides = { "name": "trial", @@ -70,17 +67,15 @@ def submit_job(self, remote_path, param): } - print("printing payload") - print(payload) sys.stdout.flush() response = requests.post(self.url, headers=self.headers, json=payload) resp_json = response.json() - print("response is ", response.status_code, resp_json) + #print("response is ", response.status_code, resp_json) if response.status_code == 200: self.remote_job_id = resp_json['job_id'] - print("job_id is ", self.remote_job_id) + print("submitted job with id ", self.remote_job_id) return response.status_code @@ -90,7 +85,7 @@ def cancel_job(self): response = requests.delete(f"{self.url}/{self.remote_job_id}", headers=self.headers, params=self.params) resp_json = response.json() - print("response is ", response.status_code, resp_json) + #print("response is ", response.status_code, resp_json) return response.status_code @@ -98,7 +93,7 @@ def get_job_status(self): response = requests.get(f"{self.url}/{self.remote_job_id}", headers=self.headers, params=self.params) resp_json = response.json() - print("response is ", response.status_code, resp_json) + #print("response is ", response.status_code, resp_json) if response.status_code == 200: state = resp_json["jobs"][0]["state"][0] @@ -119,12 +114,12 @@ def __init__(self, param:Param=None, parent=None): if not os.path.isdir(self.remote_path): os.mkdir(self.remote_path) - self.msg_file = os.path.join(os.path.join(self.remote_path,'msg')) - if not os.path.isfile(self.msg_file): - with open(self.msg_file,'w') as f: - pass - self.msg = open(self.msg_file,'r') - self.msg.readlines() + #self.msg_file = os.path.join(os.path.join(self.remote_path,'msg')) + #if not os.path.isfile(self.msg_file): + # with open(self.msg_file,'w') as f: + # pass + #self.msg = open(self.msg_file,'r') + #self.msg.readlines() self.remote_job_handler = RemoteJobHandler() @@ -192,11 +187,20 @@ def clear_slurm_header(self): except: pass + def cleanup(self): + if self.fname_full and os.path.exists(self.fname_full): + os.remove(self.fname_full) + self.fname_full = None + if os.path.exists(os.path.join(self.remote_path,'prb_live.npy')): + os.remove(os.path.join(self.remote_path,'prb_live.npy')) + if os.path.exists(os.path.join(self.remote_path,'obj_live.npy')): + os.remove(os.path.join(self.remote_path,'obj_live.npy')) def recon_remote(self, param:Param, update_fcn=None): self.fname_full = os.path.join(self.remote_path,'ptycho_'+str(param.scan_num)+'_'+param.sign) - + self.parent_module = '.'.join(self.__module__.rsplit('.', 2)[:-1]) # get parent module name to run the correct recon worker + if param.working_directory: param.working_directory = os.path.realpath(param.working_directory)+'/' if param.prb_dir: @@ -213,9 +217,8 @@ def recon_remote(self, param:Param, update_fcn=None): self.return_value = 0 # Assume the 
recon will succeed unless later detects failure and modify it. - #bearer_token = os.getenv("SLURM_JWT") - status = self.remote_job_handler.submit_job(self.remote_path, param) - print("Submitted job from the gui") + status = self.remote_job_handler.submit_job(self.remote_path, self.parent_module, param) + print(f"Submitted job from the gui with status code {status} and reserved job id {self.remote_job_handler.remote_job_id}") # try: @@ -232,43 +235,32 @@ def recon_remote(self, param:Param, update_fcn=None): # os.remove(self.fname_full) # raise Exception('Remote recon aborted...') + time.sleep(1) while self.remote_job_handler.get_job_status() != "RUNNING": print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) + time.sleep(1) - time.sleep(1) - print("DEBUG: Attempting to read job output...") - file_name = f"slurm-{self.remote_job_handler.remote_job_id}.out" - msg_file = os.path.join(self.remote_path, file_name) + self.msg_file = os.path.join(self.remote_path, file_name) - print("DEBUG: msg_file =", msg_file) - print("DEBUG: exists? ", os.path.exists(msg_file)) - - try: - with open(msg_file, "r") as f: - print("DEBUG: opened successfully") - print(f.read()) - except Exception as e: - print("DEBUG: ERROR opening file:", e) - - msg = open(msg_file, "r") - out = None + self.msg = open(self.msg_file, "r") + out = self.msg.readlines() + pos = 0 + + time.sleep(1) while not out: - print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) + print('Waiting for remote worker on %s to start writing...'%param.remote_srv) + out = self.msg.readlines() time.sleep(1) - out = msg.readlines() - #if os.path.isfile(os.path.join(self.remote_path,'abort')): - # os.remove(os.path.join(self.remote_path,'abort')) - # if os.path.isfile(os.path.join(self.remote_path,'msg')): - # os.remove(os.path.join(self.remote_path,'msg')) - # if os.path.isfile(self.fname_full): - # os.remove(self.fname_full) - # raise Exception('Remote recon aborted...') - + time.sleep(1) while True: + self.msg.seek(pos) + out = self.msg.readlines() + pos = self.msg.tell() # remember where we stopped + for line in out: print(line, end='') # because the line already ends with '\n' tokens = line.split() @@ -282,9 +274,17 @@ def recon_remote(self, param:Param, update_fcn=None): if not os.path.isfile(self.fname_full): break + # ask Slurm about job status + status = self.remote_job_handler.get_job_status() + + # stop when job is no longer running AND there was no new data + if status != "RUNNING": + size = os.path.getsize(self.msg_file) + if size == pos: + break + time.sleep(0.1) - out = msg.readlines() - #out = self.msg.readlines() + # except: # pass # finally: @@ -306,13 +306,15 @@ def run(self): self.update_signal.emit(self.param.n_iterations+1,None) finally: - self.clear_slurm_header() - print("Cancelling from the gui") - self.remote_job_handler.cancel_job() print('finally?') + self.clear_slurm_header() + status = self.remote_job_handler.cancel_job() + print(f"Cancelled job with id {self.remote_job_handler.remote_job_id} from the gui with status code {status}") + self.cleanup() def kill(self): - print("In the kill section") + self.remote_job_handler.cancel_job() + print(f"Cancelled job with id {self.remote_job_handler.remote_job_id} from the gui with status code {status}") if os.path.isdir(self.remote_path): with open(os.path.join(self.remote_path,'abort'),'w') as f: pass From 233100fabb424f1b7dfe213ae7c6862665194873 Mon Sep 17 00:00:00 2001 From: skarakuzu Date: Thu, 11 Dec 2025 11:05:53 -0500 
Subject: [PATCH 6/6] cleanup and fix user HOME retrieval --- src/nsls2ptycho/core/ptycho_recon.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/nsls2ptycho/core/ptycho_recon.py b/src/nsls2ptycho/core/ptycho_recon.py index a051ddc..52b7f4f 100755 --- a/src/nsls2ptycho/core/ptycho_recon.py +++ b/src/nsls2ptycho/core/ptycho_recon.py @@ -35,6 +35,8 @@ def submit_job(self, remote_path, parent_module, param): srun_command = "python " + "-W " + "ignore " + "-m " + parent_module + ".ptycho.recon_ptycho_gui " + fname_full #srun_command = "python -W ignore -m nsls2ptycho.core.ptycho.recon_ptycho_gui /nsls2/users/skarakuzu1/ptycho_test/remote_orion/ptycho_320045_t1" + HOME = os.environ["HOME"] + overrides = { "name": "trial", "partition": "normal", @@ -52,6 +54,7 @@ def submit_job(self, remote_path, parent_module, param): "module unload openmpi\n" "conda activate /nsls2/conda/envs/2025-2.0-py311-tiled/\n" "nvidia-smi\n" + "echo $HOME\n" "echo $(pwd)\n" "echo $(which mpicc)\n" #f"mpirun -n 2 {srun_command}\n" @@ -60,7 +63,7 @@ def submit_job(self, remote_path, parent_module, param): "working_dir_path": f"{remote_path}", "environment": [ "PATH=/usr/bin:/bin:/usr/sbin:/sbin", - "HOME=/nsls2/users/skarakuzu1", + f"HOME={HOME}", "SLURM_EXPORT_ENV=ALL", ], "overrides": overrides, @@ -213,28 +216,13 @@ def recon_remote(self, param:Param, update_fcn=None): param.obj_path = os.path.realpath(param.obj_path) save_config(self.fname_full,param) - self.export_slurm_header() + #self.export_slurm_header() self.return_value = 0 # Assume the recon will succeed unless later detects failure and modify it. status = self.remote_job_handler.submit_job(self.remote_path, self.parent_module, param) print(f"Submitted job from the gui with status code {status} and reserved job id {self.remote_job_handler.remote_job_id}") - - # try: - #time.sleep(1) - #while not out: - # print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) - # time.sleep(1) - # out = self.msg.readlines() - # if os.path.isfile(os.path.join(self.remote_path,'abort')): - # os.remove(os.path.join(self.remote_path,'abort')) - # if os.path.isfile(os.path.join(self.remote_path,'msg')): - # os.remove(os.path.join(self.remote_path,'msg')) - # if os.path.isfile(self.fname_full): - # os.remove(self.fname_full) - # raise Exception('Remote recon aborted...') - time.sleep(1) while self.remote_job_handler.get_job_status() != "RUNNING": print('Waiting for remote worker on %s to take the recon task...'%param.remote_srv) @@ -307,7 +295,7 @@ def run(self): finally: print('finally?') - self.clear_slurm_header() + #self.clear_slurm_header() status = self.remote_job_handler.cancel_job() print(f"Cancelled job with id {self.remote_job_handler.remote_job_id} from the gui with status code {status}") self.cleanup()
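
Note on the API interaction the series converges on: the final RemoteJobHandler drives Slurm entirely through three REST calls against the Orion compute API — POST to submit a batch script with sbatch-style overrides, GET to poll the job state, DELETE to cancel. Below is a minimal standalone sketch of that pattern. The endpoint URL, the APIKEY environment variable, the x-api-key header, the overrides fields, and the job_id / jobs[0].state[0] response fields are taken from the diffs above and should be treated as assumptions about the API rather than a documented contract; the function names are illustrative only.

    # Minimal sketch of the REST pattern used in the patches to drive Slurm
    # through the Orion compute API (submit / poll / cancel).
    import os
    import time

    import requests

    ORION_URL = "https://orion-api-staging.nsls2.bnl.gov/api/v1/compute/orion/jobs"
    HEADERS = {
        "Content-Type": "application/json",
        "x-api-key": os.getenv("APIKEY"),   # same env var the patches read
    }
    PARAMS = {"expand_info": False}


    def submit(script: str, working_dir: str, overrides: dict) -> str:
        """POST a batch script plus sbatch-style overrides; return the job id."""
        payload = {
            "script": script,
            "working_dir_path": working_dir,
            "environment": ["PATH=/usr/bin:/bin:/usr/sbin:/sbin", "SLURM_EXPORT_ENV=ALL"],
            "overrides": overrides,
        }
        resp = requests.post(ORION_URL, headers=HEADERS, json=payload)
        resp.raise_for_status()
        return resp.json()["job_id"]


    def state(job_id: str) -> str:
        """GET the job record and pull out the first state string (e.g. RUNNING)."""
        resp = requests.get(f"{ORION_URL}/{job_id}", headers=HEADERS, params=PARAMS)
        resp.raise_for_status()
        return resp.json()["jobs"][0]["state"][0]


    def cancel(job_id: str) -> int:
        """DELETE the job; returns the HTTP status code, like cancel_job() does."""
        resp = requests.delete(f"{ORION_URL}/{job_id}", headers=HEADERS, params=PARAMS)
        return resp.status_code


    if __name__ == "__main__":
        job_id = submit(
            script="#!/bin/bash -l\nsrun hostname\n",
            working_dir="/tmp",  # placeholder; the GUI passes self.remote_path
            overrides={"name": "trial", "partition": "normal", "tasks": "1",
                       "time_limit": 720, "tres_per_task": "cpu=1,gres/gpu=1"},
        )
        while state(job_id) == "PENDING":
            time.sleep(1)
        print(cancel(job_id))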
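
Progress reporting changes in a similar way: PATCH 5/6 drops the old shared msg file and instead tails the job's own slurm-<jobid>.out with seek()/tell(), stopping once Slurm reports the job is no longer RUNNING and the file has stopped growing. A condensed sketch of that loop follows; get_state and handle are hypothetical stand-ins for RemoteJobHandler.get_job_status() and the per-line GUI update done in recon_remote().

    # Sketch of the tail-and-poll loop from PATCH 5/6, under the assumptions
    # named above (get_state/handle are placeholders, not library APIs).
    import os
    import time
    from typing import Callable


    def follow_job_output(msg_file: str, get_state: Callable[[], str],
                          handle: Callable[[str], None], poll: float = 0.1) -> None:
        # Wait for the scheduler to create the output file before opening it.
        while not os.path.exists(msg_file):
            time.sleep(1)

        with open(msg_file, "r") as msg:
            pos = 0
            while True:
                msg.seek(pos)
                for line in msg.readlines():
                    handle(line)       # e.g. parse "[INFO]" tokens and emit to the GUI
                pos = msg.tell()       # remember where we stopped, as the patch does

                # Stop when the job has finished AND no new output appeared.
                if get_state() != "RUNNING" and os.path.getsize(msg_file) == pos:
                    break
                time.sleep(poll)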