From ce52b324ef9ea1707c1c96f0f96a5735e6f842f0 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Fri, 21 Mar 2025 15:26:21 -0400 Subject: [PATCH 01/39] Fixed the change by making the retries attribute private (and also the assigned attribute), improving logging surrounding retries and giving up --- jobQueue.py | 4 +-- tangoObjects.py | 59 +++++++++++++++++++++++++++++++-------------- vmms/localDocker.py | 2 +- worker.py | 14 ++++++++++- 4 files changed, 57 insertions(+), 22 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index a583b3f1..1b0612e8 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -281,9 +281,9 @@ def unassignJob(self, jobId): # Increment the number of retires if job.retries is None: - job.retries = 0 + job.resetRetries() else: - job.retries += 1 + job.incrementRetries() Config.job_retries += 1 self.log.info("unassignJob|Unassigning job %s" % str(job.id)) diff --git a/tangoObjects.py b/tangoObjects.py index 03c33c58..83da888f 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -102,8 +102,8 @@ def __init__( disableNetwork=None, stopBefore="", ): - self.assigned = False - self.retries = 0 + self._assigned = False + self._retries = 0 self.vm = vm if input is None: @@ -126,10 +126,31 @@ def __init__( def __repr__(self): self.syncRemote() return f"ID: {self.id} - Name: {self.name}" + + # Getters for private variables + @property + def assigned(self): + self.syncRemote() # Is it necessary to sync here? + return self._assigned + + @property + def retries(self): + self.syncRemote() + return self._retries def makeAssigned(self): self.syncRemote() - self.assigned = True + self._assigned = True + self.updateRemote() + + def resetRetries(self): + self.syncRemote() + self._retries = 0 + self.updateRemote() + + def incrementRetries(self): + self.syncRemote() + self._retries += 1 self.updateRemote() def makeVM(self, vm): @@ -139,12 +160,12 @@ def makeVM(self, vm): def makeUnassigned(self): self.syncRemote() - self.assigned = False + self._assigned = False self.updateRemote() def isNotAssigned(self): self.syncRemote() - return not self.assigned + return not self._assigned def appendTrace(self, trace_str): self.syncRemote() @@ -161,13 +182,27 @@ def setId(self, new_id): self._remoteLocation = dict_hash + ":" + str(new_id) self.updateRemote() + # Private method + def __updateSelf(self, other_job): + self._assigned = other_job._assigned + self._retries = other_job._retries + self.vm = other_job.vm + self.input = other_job.input + self.outputFile = other_job.outputFile + self.name = other_job.name + self.notifyURL = other_job.notifyURL + self.timeout = other_job.timeout + self.trace = other_job.trace + self.maxOutputFileSize = other_job.maxOutputFileSize + + def syncRemote(self): if Config.USE_REDIS and self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] dictionary = TangoDictionary(dict_hash) temp_job = dictionary.get(key) - self.updateSelf(temp_job) + self.__updateSelf(temp_job) def updateRemote(self): if Config.USE_REDIS and self._remoteLocation is not None: @@ -176,18 +211,6 @@ def updateRemote(self): dictionary = TangoDictionary(dict_hash) dictionary.set(key, self) - def updateSelf(self, other_job): - self.assigned = other_job.assigned - self.retries = other_job.retries - self.vm = other_job.vm - self.input = other_job.input - self.outputFile = other_job.outputFile - self.name = other_job.name - self.notifyURL = other_job.notifyURL - self.timeout = other_job.timeout - self.trace = other_job.trace - self.maxOutputFileSize = other_job.maxOutputFileSize - def TangoIntValue(object_name, obj): if Config.USE_REDIS: diff --git a/vmms/localDocker.py b/vmms/localDocker.py index 45dda03d..1a1e1dd0 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -123,7 +123,7 @@ def waitVM(self, vm, max_secs): """waitVM - Nothing to do for waitVM""" return - def copyIn(self, vm, inputFiles): + def copyIn(self, vm, inputFiles, job_id=None): """copyIn - Create a directory to be mounted as a volume for the docker containers. Copy input files to this directory. """ diff --git a/worker.py b/worker.py index 0c6b6614..f06e1f10 100644 --- a/worker.py +++ b/worker.py @@ -77,6 +77,11 @@ def rescheduleJob(self, hdrfile, ret, err): # Try a few times before giving up if self.job.retries < Config.JOB_RETRIES: + self.log.error("Retrying job %s:%d, retries: %d" % (self.job.name, self.job.id, self.job.retries)) + self.job.appendTrace( + "%s|Retrying job %s:%d, retries: %d" + % (datetime.now().ctime(), self.job.name, self.job.id, self.job.retries) + ) try: os.remove(hdrfile) except OSError: @@ -86,7 +91,12 @@ def rescheduleJob(self, hdrfile, ret, err): # Here is where we give up else: - self.jobQueue.makeDead(self.job.id, err) + self.log.error("Giving up on job %s:%d" % (self.job.name, self.job.id)) + self.job.appendTrace( + "%s|Giving up on job %s:%d" + % (datetime.now().ctime(), self.job.name, self.job.id) + ) + self.jobQueue.makeDead(self.job.id, err) # Note: this reports the error that caused the last call to rescheduleJob to fail self.appendMsg( hdrfile, @@ -149,6 +159,8 @@ def notifyServer(self, job): % (job.notifyURL, response.content) ) fh.close() + else: + self.log.info("No callback URL for job %s:%d" % (self.job.name, self.job.id)) except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) From 28db9e5ef0a13f9ef22a6d7896c4ddb2404f8845 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 23 Mar 2025 13:43:50 -0400 Subject: [PATCH 02/39] Made all of tangoJob's attributes read only --- tango.py | 2 +- tangoObjects.py | 119 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 96 insertions(+), 25 deletions(-) diff --git a/tango.py b/tango.py index 1a03ed0d..e487481a 100755 --- a/tango.py +++ b/tango.py @@ -399,7 +399,7 @@ def __validateJob(self, job, vmms): "validateJob: Setting job.timeout to" " default config value: %d secs", Config.RUNJOB_TIMEOUT, ) - job.timeout = Config.RUNJOB_TIMEOUT + job.setTimeout(Config.RUNJOB_TIMEOUT) # Any problems, return an error status if errors > 0: diff --git a/tangoObjects.py b/tangoObjects.py index 83da888f..dcd97c67 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -105,23 +105,23 @@ def __init__( self._assigned = False self._retries = 0 - self.vm = vm + self._vm = vm if input is None: - self.input = [] + self._input = [] else: - self.input = input - - self.outputFile = outputFile - self.name = name - self.notifyURL = notifyURL - self.timeout = timeout - self.trace = [] - self.maxOutputFileSize = maxOutputFileSize + self._input = input + + self._outputFile = outputFile + self._name = name + self._notifyURL = notifyURL + self._timeout = timeout # How long to run the autodriver on the job for before timing out. + self._trace = [] + self._maxOutputFileSize = maxOutputFileSize self._remoteLocation = None - self.accessKeyId = accessKeyId - self.accessKey = accessKey - self.disableNetwork = disableNetwork - self.stopBefore = "stopBefore" + self._accessKeyId = accessKeyId + self._accessKey = accessKey + self._disableNetwork = disableNetwork + self._stopBefore = "stopBefore" def __repr__(self): self.syncRemote() @@ -138,6 +138,72 @@ def retries(self): self.syncRemote() return self._retries + @property + def vm(self): + self.syncRemote() + return self._vm + + @property + def input(self): + self.syncRemote() + return self._input + + @property + def outputFile(self): + self.syncRemote() + return self._outputFile + + @property + def name(self): + self.syncRemote() + return self._name + + @property + def notifyURL(self): + self.syncRemote() + return self._notifyURL + + @property + def timeout(self): + self.syncRemote() + return self._timeout + + @property + def trace(self): + self.syncRemote() + return self._trace + + @property + def maxOutputFileSize(self): + self.syncRemote() + return self._maxOutputFileSize + + @property + def remoteLocation(self): + self.syncRemote() + return self._remoteLocation + + @property + def accessKeyId(self): + self.syncRemote() + return self._accessKeyId + + @property + def accessKey(self): + self.syncRemote() + return self._accessKey + + @property + def disableNetwork(self): + self.syncRemote() + return self._disableNetwork + + @property + def stopBefore(self): + self.syncRemote() + return self._stopBefore + + def makeAssigned(self): self.syncRemote() self._assigned = True @@ -155,7 +221,7 @@ def incrementRetries(self): def makeVM(self, vm): self.syncRemote() - self.vm = vm + self._vm = vm self.updateRemote() def makeUnassigned(self): @@ -169,7 +235,7 @@ def isNotAssigned(self): def appendTrace(self, trace_str): self.syncRemote() - self.trace.append(trace_str) + self._trace.append(trace_str) self.updateRemote() def setId(self, new_id): @@ -181,19 +247,24 @@ def setId(self, new_id): dictionary.delete(key) self._remoteLocation = dict_hash + ":" + str(new_id) self.updateRemote() + + def setTimeout(self, new_timeout): + self.syncRemote() + self._timeout = new_timeout + self.updateRemote() # Private method def __updateSelf(self, other_job): self._assigned = other_job._assigned self._retries = other_job._retries - self.vm = other_job.vm - self.input = other_job.input - self.outputFile = other_job.outputFile - self.name = other_job.name - self.notifyURL = other_job.notifyURL - self.timeout = other_job.timeout - self.trace = other_job.trace - self.maxOutputFileSize = other_job.maxOutputFileSize + self._vm = other_job._vm + self._input = other_job._input + self._outputFile = other_job._outputFile + self._name = other_job._name + self._notifyURL = other_job._notifyURL + self._timeout = other_job._timeout + self._trace = other_job._trace + self._maxOutputFileSize = other_job._maxOutputFileSize def syncRemote(self): From 9d57597555ed9ee4aa2fd0415c166ef7c9300401 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 24 Aug 2025 13:00:51 -0400 Subject: [PATCH 03/39] fixed the local/remote desyncing issues, left some print statements in there --- jobQueue.py | 40 ++++++++++++++++++++++++---------------- tango.py | 1 + tangoObjects.py | 7 +++++-- worker.py | 13 +++++++------ 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 1b0612e8..707b4e3f 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -136,7 +136,7 @@ def add(self, job): # Since we assume that the job is new, we set the number of retries # of this job to 0 - job.retries = 0 + assert(job.retries == 0) # Add the job to the queue. Careful not to append the trace until we # know the job has actually been added to the queue. @@ -297,28 +297,36 @@ def unassignJob(self, jobId): self.queueLock.release() self.log.debug("unassignJob| Released lock to job queue.") - def makeDead(self, id, reason): + def makeDead(self, job: TangoJob, reason): """makeDead - move a job from live queue to dead queue""" - self.log.info("makeDead| Making dead job ID: " + str(id)) + self.log.info("makeDead| Making dead job ID: " + str(job.id)) self.queueLock.acquire() self.log.debug("makeDead| Acquired lock to job queue.") status = -1 # Check to make sure that the job is in the live jobs queue - if id in self.liveJobs: - self.log.info("makeDead| Found job ID: %s in the live queue" % (id)) - status = 0 - job = self.liveJobs.get(id) - self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason)) - # Add the job to the dead jobs dictionary - self.deadJobs.set(id, job) - # Remove the job from the live jobs dictionary - self.liveJobs.delete(id) + if job.id not in self.liveJobs: + self.log.error("makeDead| Job ID: %s not found in live jobs" % (job.id)) + return -1 + + self.log.info("makeDead| Found job ID: %s in the live queue" % (job.id)) + status = 0 + self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason)) + + print(f"Anthony: {job.id} has remote location {job._remoteLocation} before swapping") + + # Add the job to the dead jobs dictionary + self.deadJobs.set(job.id, job) + # Remove the job from the live jobs dictionary + self.liveJobs.delete(job.id) - # unassign, remove from unassigned jobs queue - job.makeUnassigned() - self.unassignedJobs.remove(int(id)) + print(f"Anthony: {job.id} has remote location {job._remoteLocation} after swapping") + print(f"Anthony: {job.id} has remote location {job._remoteLocation} after syncing") + + # unassign, remove from unassigned jobs queue + job.makeUnassigned() + self.unassignedJobs.remove(int(job.id)) - job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), reason)) + job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), reason)) self.queueLock.release() self.log.debug("makeDead| Released lock to job queue.") return status diff --git a/tango.py b/tango.py index e487481a..8841a38c 100755 --- a/tango.py +++ b/tango.py @@ -278,6 +278,7 @@ def resetTango(self, vmms): self.log.error("resetTango: Call to VMMS %s failed: %s" % (vmms_name, err)) os._exit(1) + # Returns 0 if the job is valid, -1 if the job is invalid def __validateJob(self, job, vmms): """validateJob - validate the input arguments in an addJob request.""" errors = 0 diff --git a/tangoObjects.py b/tangoObjects.py index dcd97c67..1a0156d0 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -272,7 +272,10 @@ def syncRemote(self): dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] dictionary = TangoDictionary(dict_hash) - temp_job = dictionary.get(key) + temp_job = dictionary.get(key) # Key should be in dictionary + if temp_job is None: + print(f"Job {key} not found in dictionary {dict_hash}") # TODO: add better error handling for TangoJob + return self.__updateSelf(temp_job) def updateRemote(self): @@ -446,7 +449,7 @@ def set(self, id, obj): pickled_obj = pickle.dumps(obj) if hasattr(obj, "_remoteLocation"): - obj._remoteLocation = self.hash_name + ":" + str(id) + obj._remoteLocation = self.hash_name + ":" + str(id) # TODO: don't violate the encapsulation of TangoJob self.r.hset(self.hash_name, str(id), pickled_obj) return str(id) diff --git a/worker.py b/worker.py index f06e1f10..1dd8d8e7 100644 --- a/worker.py +++ b/worker.py @@ -13,7 +13,7 @@ from datetime import datetime from config import Config - +from jobQueue import JobQueue # # Worker - The worker class is very simple and very dumb. The goal is # to walk through the VMMS interface, track the job's progress, and if @@ -32,7 +32,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM): self.daemon = True self.job = job self.vmms = vmms - self.jobQueue = jobQueue + self.jobQueue : JobQueue = jobQueue self.preallocator = preallocator self.preVM = preVM threading.Thread.__init__(self) @@ -64,6 +64,7 @@ def detachVM(self, return_vm=False, replace_vm=False): # pool is empty and creates a spurious vm. self.preallocator.removeVM(self.job.vm) + # TODO: figure out what hdrfile, ret and err are def rescheduleJob(self, hdrfile, ret, err): """rescheduleJob - Reschedule a job that has failed because of a system error, such as a VM timing out or a connection @@ -96,7 +97,7 @@ def rescheduleJob(self, hdrfile, ret, err): "%s|Giving up on job %s:%d" % (datetime.now().ctime(), self.job.name, self.job.id) ) - self.jobQueue.makeDead(self.job.id, err) # Note: this reports the error that caused the last call to rescheduleJob to fail + self.jobQueue.makeDead(self.job, err) # Note: this reports the error that caused the last call to rescheduleJob to fail self.appendMsg( hdrfile, @@ -165,13 +166,13 @@ def notifyServer(self, job): self.log.debug("Error in notifyServer: %s" % str(e)) def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): - self.jobQueue.makeDead(self.job.id, msg) - + self.jobQueue.makeDead(self.job, msg) + # Update the text that users see in the autodriver output file self.appendMsg(hdrfile, msg) self.catFiles(hdrfile, self.job.outputFile) os.chmod(self.job.outputFile, 0o644) - + # Thread exit after termination if killVM: self.detachVM(return_vm=returnVM) From 3bd80cf3bbb3a7c0eff7992b3ffeb0bedac7ef03 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 24 Aug 2025 17:06:50 -0400 Subject: [PATCH 04/39] Better encapsulation of TangoJob._remoteLocation --- jobQueue.py | 15 +++++---------- tangoObjects.py | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 707b4e3f..ae23d4f9 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -145,7 +145,7 @@ def add(self, job): self.log.debug("add| Acquired lock to job queue.") # Adds the job to the live jobs dictionary - self.liveJobs.set(job.id, job) + job.addToDict(self.liveJobs) # Add this to the unassigned job queue too self.unassignedJobs.put(int(job.id)) @@ -194,7 +194,7 @@ def addDead(self, job): self.log.debug("addDead|Acquired lock to job queue.") # We add the job into the dead jobs dictionary - self.deadJobs.set(job.id, job) + job.addToDict(self.deadJobs) self.queueLock.release() self.log.debug("addDead|Released lock to job queue.") @@ -312,15 +312,10 @@ def makeDead(self, job: TangoJob, reason): status = 0 self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason)) - print(f"Anthony: {job.id} has remote location {job._remoteLocation} before swapping") - - # Add the job to the dead jobs dictionary - self.deadJobs.set(job.id, job) # Remove the job from the live jobs dictionary - self.liveJobs.delete(job.id) - - print(f"Anthony: {job.id} has remote location {job._remoteLocation} after swapping") - print(f"Anthony: {job.id} has remote location {job._remoteLocation} after syncing") + job.deleteFromDict(self.liveJobs) + # Add the job to the dead jobs dictionary + job.addToDict(self.deadJobs) # unassign, remove from unassigned jobs queue job.makeUnassigned() diff --git a/tangoObjects.py b/tangoObjects.py index 1a0156d0..9dcdf5fe 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -1,3 +1,4 @@ +from __future__ import annotations # tangoREST.py # # Implements objects used to pass state within Tango. @@ -284,6 +285,18 @@ def updateRemote(self): key = self._remoteLocation.split(":")[1] dictionary = TangoDictionary(dict_hash) dictionary.set(key, self) + + def deleteFromDict(self, dictionary : TangoDictionary) -> None: + dictionary.delete(self.id) + self._remoteLocation = None + + def addToDict(self, dictionary : TangoDictionary) -> None: + dictionary.set(self.id, self) + assert self._remoteLocation is None, "Job already has a remote location" + if Config.USE_REDIS: + self._remoteLocation = dictionary.hash_name + ":" + str(self.id) + self.updateRemote() + def TangoIntValue(object_name, obj): @@ -448,9 +461,6 @@ def __contains__(self, id): def set(self, id, obj): pickled_obj = pickle.dumps(obj) - if hasattr(obj, "_remoteLocation"): - obj._remoteLocation = self.hash_name + ":" + str(id) # TODO: don't violate the encapsulation of TangoJob - self.r.hset(self.hash_name, str(id), pickled_obj) return str(id) @@ -474,7 +484,6 @@ def values(self): return valslist def delete(self, id): - self._remoteLocation = None self.r.hdel(self.hash_name, id) def _clean(self): From f0c0d3262bf3fc0327d0198776808cefb16370dd Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 24 Aug 2025 19:01:26 -0400 Subject: [PATCH 05/39] Fixing uses of makeDead that was changed 2 commits ago --- jobManager.py | 4 ++-- jobQueue.py | 4 +++- tangoObjects.py | 10 ++++++++++ tests/testJobQueue.py | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/jobManager.py b/jobManager.py index 2be2147f..1010827c 100644 --- a/jobManager.py +++ b/jobManager.py @@ -28,7 +28,7 @@ class JobManager(object): def __init__(self, queue): self.daemon = True - self.jobQueue = queue + self.jobQueue: JobQueue = queue self.preallocator = self.jobQueue.preallocator self.vmms = self.preallocator.vmms self.log = logging.getLogger("JobManager") @@ -124,7 +124,7 @@ def __manage(self): self.log.info("job_manager: job is None") else: self.log.error("job failed during creation %d %s" % (job.id, str(err))) - self.jobQueue.makeDead(job.id, str(err)) + self.jobQueue.makeDead(job, str(err)) if __name__ == "__main__": diff --git a/jobQueue.py b/jobQueue.py index ae23d4f9..e1c35cec 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -209,6 +209,7 @@ def delJob(self, id, deadjob): """ status = -1 if deadjob == 0: + temp_job = self.liveJobs.getExn(id) try: # Remove the job from the unassigned live jobs queue, if it # is yet to be assigned. @@ -218,7 +219,7 @@ def delJob(self, id, deadjob): self.log.info("delJob | Job ID %s was already assigned" % (id)) return status - return self.makeDead(id, "Requested by operator") + return self.makeDead(temp_job, "Requested by operator") else: self.queueLock.acquire() self.log.debug("delJob| Acquired lock to job queue.") @@ -297,6 +298,7 @@ def unassignJob(self, jobId): self.queueLock.release() self.log.debug("unassignJob| Released lock to job queue.") + # Takes in a job in order to switch its remote location, in which you can't rely on syncing def makeDead(self, job: TangoJob, reason): """makeDead - move a job from live queue to dead queue""" self.log.info("makeDead| Making dead job ID: " + str(job.id)) diff --git a/tangoObjects.py b/tangoObjects.py index 9dcdf5fe..3e57dccb 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -472,6 +472,11 @@ def get(self, id): else: return None + def getExn(self, id): + job = self.get(id) + assert job is not None, f"ID {id} does not exist in this remote dictionary" + return job + def keys(self): keys = map(lambda key: key.decode(), self.r.hkeys(self.hash_name)) return list(keys) @@ -519,6 +524,11 @@ def get(self, id): else: return None + def getExn(self, id): + job = self.get(id) + assert job is not None, f"ID {id} does not exist in this native dictionary" + return job + def keys(self): return list(self.dict.keys()) diff --git a/tests/testJobQueue.py b/tests/testJobQueue.py index 00861d23..27fd3d4f 100644 --- a/tests/testJobQueue.py +++ b/tests/testJobQueue.py @@ -130,7 +130,7 @@ def test_makeDead(self): info = self.jobQueue.getInfo() self.assertEqual(info["size_deadjobs"], 0) self.assertEqual(info["size_unassignedjobs"], 2) - self.jobQueue.makeDead(self.jobId1, "test") + self.jobQueue.makeDead(self.job1, "test") info = self.jobQueue.getInfo() self.assertEqual(info["size_deadjobs"], 1) self.assertEqual(info["size_unassignedjobs"], 1) From ba83509eb984a52f51346992840887b60b1a2e04 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sat, 30 Aug 2025 18:25:38 -0400 Subject: [PATCH 06/39] comments, logging and todos --- clients/tango-cli.py | 2 +- jobManager.py | 4 ++++ jobQueue.py | 4 ++++ tangoObjects.py | 3 ++- vmms/ec2SSH.py | 1 + 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/clients/tango-cli.py b/clients/tango-cli.py index d9f5ab8d..f060bd1d 100755 --- a/clients/tango-cli.py +++ b/clients/tango-cli.py @@ -131,7 +131,7 @@ parser.add_argument("--accessKeyId", default="", help="AWS account access key ID") parser.add_argument("--accessKey", default="", help="AWS account access key content") parser.add_argument("--instanceType", default="", help="AWS EC2 instance type") -parser.add_argument("--ec2", action="store_true", help="Enable ec2SSH VMMS") +parser.add_argument("--ec2", action="store_true", help="Enable ec2SSH VMMS") # TODO: shouldn't this be taken from config.py? parser.add_argument("--stopBefore", default="", help="Stops the worker before a function is executed") def checkKey(): diff --git a/jobManager.py b/jobManager.py index 59c1ebad..7f254cce 100644 --- a/jobManager.py +++ b/jobManager.py @@ -19,10 +19,12 @@ import tango # Written this way to avoid circular imports from config import Config +from jobQueue import JobQueue from tangoObjects import TangoQueue from worker import Worker + class JobManager(object): def __init__(self, queue): self.daemon = True @@ -63,6 +65,7 @@ def __manage(self): # Blocks until we get a next job job = self.jobQueue.getNextPendingJob() if not job.accessKey and Config.REUSE_VMS: + self.log.info(f"job has access key {job.accessKey} and is calling reuseVM") vm = None while vm is None: vm = self.jobQueue.reuseVM(job) @@ -89,6 +92,7 @@ def __manage(self): "EC2 SSH VM initialization failed: see log" ) else: + self.log.info(f"job {job.id} is not an ec2 vmms job") # Try to find a vm on the free list and allocate it to # the worker if successful. if Config.REUSE_VMS: diff --git a/jobQueue.py b/jobQueue.py index e1c35cec..45dd40ac 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -246,6 +246,8 @@ def get(self, id): self.log.debug("get| Released lock to job queue.") return job + # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. + # TODO: It does assign the job to a particular VM though. def assignJob(self, jobId, vm=None): """assignJob - marks a job to be assigned""" self.queueLock.acquire() @@ -389,3 +391,5 @@ def reuseVM(self, job): return job.vm else: raise Exception("Job assigned without vm") + + diff --git a/tangoObjects.py b/tangoObjects.py index 3e57dccb..64c5f309 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -7,6 +7,7 @@ from queue import Queue import pickle import redis +from typing import Optional redisConnection = None @@ -91,7 +92,7 @@ class TangoJob(object): def __init__( self, - vm=None, + vm: Optional[TangoMachine] = None, outputFile=None, name=None, input=None, diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index b2621192..318b6d49 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -162,6 +162,7 @@ def release_vm_semaphore(): """Releases the VM sempahore""" Ec2SSH._vm_semaphore.release() + # TODO: the arguments accessKeyId and accessKey don't do anything def __init__(self, accessKeyId=None, accessKey=None): """log - logger for the instance connection - EC2Connection object that stores the connection From 600803ccd43980c2a2617a6435272a38f19c7322 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Mon, 15 Sep 2025 22:06:33 -0400 Subject: [PATCH 07/39] Code to enable spot instances --- jobManager.py | 4 ++-- jobQueue.py | 2 +- vmms/ec2SSH.py | 24 +++++++++++++++++------- worker.py | 3 ++- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/jobManager.py b/jobManager.py index 7f254cce..df54dec8 100644 --- a/jobManager.py +++ b/jobManager.py @@ -20,7 +20,7 @@ import tango # Written this way to avoid circular imports from config import Config from jobQueue import JobQueue -from tangoObjects import TangoQueue +from tangoObjects import TangoJob, TangoQueue from worker import Worker @@ -63,7 +63,7 @@ def __manage(self): self.running = True while True: # Blocks until we get a next job - job = self.jobQueue.getNextPendingJob() + job: TangoJob = self.jobQueue.getNextPendingJob() if not job.accessKey and Config.REUSE_VMS: self.log.info(f"job has access key {job.accessKey} and is calling reuseVM") vm = None diff --git a/jobQueue.py b/jobQueue.py index 45dd40ac..e3dc4256 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -347,7 +347,7 @@ def reset(self): self.deadJobs._clean() self.unassignedJobs._clean() - def getNextPendingJob(self): + def getNextPendingJob(self) -> TangoJob: """Gets the next unassigned live job. Note that this is a blocking function and we will block till there is an available job. diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 318b6d49..1e21da79 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -22,6 +22,7 @@ import config from tangoObjects import TangoMachine +from typing import Optional # suppress most boto logging logging.getLogger("boto3").setLevel(logging.CRITICAL) @@ -192,7 +193,7 @@ def __init__(self, accessKeyId=None, accessKey=None): self.img2ami = {} self.images = [] try: - self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + self.boto3resource: boto3.EC2.ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) self.boto3client = boto3.client("ec2", config.Config.EC2_REGION) # Get images from ec2 @@ -248,7 +249,7 @@ def domainName(self, vm): # VMMS helper methods # - def tangoMachineToEC2Instance(self, vm: TangoMachine): + def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict: """tangoMachineToEC2Instance - returns an object with EC2 instance type and AMI. Only general-purpose instances are used. Defalt AMI is currently used. @@ -328,7 +329,7 @@ def createSecurityGroup(self): # # VMMS API functions # - def initializeVM(self, vm): + def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]: """initializeVM - Tell EC2 to create a new VM instance. Returns a boto.ec2.instance.Instance object. @@ -350,13 +351,21 @@ def initializeVM(self, vm): self.key_pair_name = self.keyPairName(vm.id, vm.name) self.createKeyPair() - reservation = self.boto3resource.create_instances( + instance_market_options = { + "MarketType": "spot", + "SpotOptions": { + "SpotInstanceType": "one-time", + "InstanceInterruptionBehavior": "terminate" + } + } + reservation: list[boto3.ec2.Instance] = self.boto3resource.create_instances( ImageId=ec2instance["ami"], KeyName=self.key_pair_name, SecurityGroups=[config.Config.DEFAULT_SECURITY_GROUP], InstanceType=ec2instance["instance_type"], MaxCount=1, MinCount=1, + InstanceMarketOptions=instance_market_options, ) # Sleep for a while to prevent random transient errors observed @@ -365,8 +374,9 @@ def initializeVM(self, vm): # reservation is a list of instances created. there is only # one instance created so get index 0. - newInstance = reservation[0] + newInstance: boto3.ec2.Instance = reservation[0] if not newInstance: + # TODO: when does this happen? raise ValueError("Cannot find new instance for %s" % vm.name) # Wait for instance to reach 'running' state @@ -436,7 +446,7 @@ def initializeVM(self, vm): self.log.debug("initializeVM Failed: %s" % e) # if the new instance exists, terminate it - if newInstance: + if newInstance is not None: try: self.boto3resource.instances.filter( InstanceIds=[newInstance.id] @@ -448,7 +458,7 @@ def initializeVM(self, vm): return None return None - def waitVM(self, vm, max_secs): + def waitVM(self, vm, max_secs) -> 0 | -1: """waitVM - Wait at most max_secs for a VM to become ready. Return error if it takes too long. diff --git a/worker.py b/worker.py index 1dd8d8e7..59a17239 100644 --- a/worker.py +++ b/worker.py @@ -247,7 +247,7 @@ def run(self): ) # Host name returned from EC2 is stored in the vm object - self.vmms.initializeVM(self.job.vm) + self.vmms.initializeVM(self.job.vm) # TODO: This can return None if the step fails, check for that self.log.debug("Asigned job to a new VM") vm = self.job.vm @@ -322,6 +322,7 @@ def run(self): self.job.vm.notes = str(self.job.id) + "_" + self.job.name self.job.vm.keep_for_debugging = True self.afterJobExecution(hdrfile, msg, False) + # TODO: no reschedule? return self.log.info( From d4501129334b2bbce4506217d96cf48c7688db1b Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 16 Sep 2025 16:01:11 -0400 Subject: [PATCH 08/39] type annotations --- jobManager.py | 23 +++++------ jobQueue.py | 1 + mypy.ini | 5 +++ restful_tango/server.py | 6 +-- restful_tango/tangoREST.py | 5 --- tango.py | 6 +-- tangoObjects.py | 78 ++++++++++++++++++++++++++++++++------ tests/validate.py | 4 +- vmms/ec2SSH.py | 54 +++++++++++++------------- vmms/interface.py | 42 ++++++++++++++++++++ vmms/localDocker.py | 2 +- 11 files changed, 162 insertions(+), 64 deletions(-) create mode 100644 mypy.ini create mode 100644 vmms/interface.py diff --git a/jobManager.py b/jobManager.py index df54dec8..65d62116 100644 --- a/jobManager.py +++ b/jobManager.py @@ -26,7 +26,7 @@ class JobManager(object): - def __init__(self, queue): + def __init__(self, queue: JobQueue): self.daemon = True self.jobQueue: JobQueue = queue self.preallocator = self.jobQueue.preallocator @@ -36,7 +36,7 @@ def __init__(self, queue): self.nextId = 10000 self.running = False - def start(self): + def start(self) -> None: if self.running: return thread = threading.Thread(target=self.__manage) @@ -59,7 +59,7 @@ def _getNextID(self): self.nextId = 10000 return id - def __manage(self): + def __manage(self) -> None: self.running = True while True: # Blocks until we get a next job @@ -83,7 +83,8 @@ def __manage(self): newVM = copy.deepcopy(job.vm) newVM.id = self._getNextID() try: - preVM = vmms.initializeVM(newVM) + vmms.initializeVM(newVM) + preVM = newVM except Exception as e: self.log.error("ERROR initialization VM: %s", e) self.log.error(traceback.format_exc()) @@ -138,19 +139,19 @@ def __manage(self): JobManager" ) else: - tango = tango.TangoServer() - tango.log.debug("Resetting Tango VMs") - tango.resetTango(tango.preallocator.vmms) - for key in tango.preallocator.machines.keys(): - tango.preallocator.machines.set(key, [[], TangoQueue(key)]) + tango_server = tango.TangoServer() + tango_server.log.debug("Resetting Tango VMs") + tango_server.resetTango(tango_server.preallocator.vmms) + for key in tango_server.preallocator.machines.keys(): + tango_server.preallocator.machines.set(key, [[], TangoQueue(key)]) # The above call sets the total pool empty. But the free pool which # is a queue in redis, may not be empty. When the job manager restarts, # resetting the free queue using the key doesn't change its content. # Therefore we empty the queue, thus the free pool, to keep it consistent # with the total pool. - tango.preallocator.machines.get(key)[1].make_empty() - jobs = JobManager(tango.jobQueue) + tango_server.preallocator.machines.get(key)[1].make_empty() + jobs = JobManager(tango_server.jobQueue) print("Starting the stand-alone Tango JobManager") jobs.run() diff --git a/jobQueue.py b/jobQueue.py index e3dc4256..2d0a9934 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -270,6 +270,7 @@ def assignJob(self, jobId, vm=None): self.log.debug("assignJob| Released lock to job queue.") # return job + # TODO: Rename this job to be more accurate in its description def unassignJob(self, jobId): """unassignJob - marks a job to be unassigned Note: We assume here that a job is to be rescheduled or diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..db776d15 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,5 @@ +[mypy] +explicit_package_bases = True + +[mypy-vmms.tashiSSH] +ignore_errors = True \ No newline at end of file diff --git a/restful_tango/server.py b/restful_tango/server.py index 2eba21fc..b31f0e8b 100755 --- a/restful_tango/server.py +++ b/restful_tango/server.py @@ -9,13 +9,9 @@ import tornado.web from tempfile import NamedTemporaryFile -from tangoREST import TangoREST +from restful_tango.tangoREST import TangoREST import asyncio -currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) -parentdir = os.path.dirname(currentdir) -sys.path.insert(0, parentdir) - from config import Config tangoREST = TangoREST() diff --git a/restful_tango/tangoREST.py b/restful_tango/tangoREST.py index c60cf44d..90dbc300 100644 --- a/restful_tango/tangoREST.py +++ b/restful_tango/tangoREST.py @@ -6,16 +6,11 @@ import sys import os -import inspect import hashlib import json import logging import docker -currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) -parentdir = os.path.dirname(currentdir) -sys.path.insert(0, parentdir) - from config import Config from tangoObjects import TangoJob, TangoMachine, InputFile from tango import TangoServer diff --git a/tango.py b/tango.py index 8841a38c..3f94cd12 100755 --- a/tango.py +++ b/tango.py @@ -75,8 +75,8 @@ def __init__(self): vmms = DistDocker() - self.preallocator = Preallocator({Config.VMMS_NAME: vmms}) - self.jobQueue = JobQueue(self.preallocator) + self.preallocator: Preallocator = Preallocator({Config.VMMS_NAME: vmms}) + self.jobQueue: JobQueue = JobQueue(self.preallocator) if not Config.USE_REDIS: # creates a local Job Manager if there is no persistent # memory between processes. Otherwise, JobManager will @@ -89,7 +89,7 @@ def __init__(self): level=Config.LOGLEVEL, ) self.start_time = time.time() - self.log = logging.getLogger("TangoServer") + self.log: logging.Logger = logging.getLogger("TangoServer") self.log.info("Starting Tango server") def addJob(self, job): diff --git a/tangoObjects.py b/tangoObjects.py index 64c5f309..a5554e1e 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -7,7 +7,8 @@ from queue import Queue import pickle import redis -from typing import Optional +from typing import Optional, Protocol, TypeVar +from abc import abstractmethod redisConnection = None @@ -117,9 +118,9 @@ def __init__( self._name = name self._notifyURL = notifyURL self._timeout = timeout # How long to run the autodriver on the job for before timing out. - self._trace = [] + self._trace: list[str] = [] self._maxOutputFileSize = maxOutputFileSize - self._remoteLocation = None + self._remoteLocation: Optional[str] = None self._accessKeyId = accessKeyId self._accessKey = accessKey self._disableNetwork = disableNetwork @@ -245,7 +246,7 @@ def setId(self, new_id): if self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] - dictionary = TangoDictionary(dict_hash) + dictionary = TangoDictionary.create(dict_hash) dictionary.delete(key) self._remoteLocation = dict_hash + ":" + str(new_id) self.updateRemote() @@ -438,23 +439,72 @@ def make_empty(self): if item is None: break +T = TypeVar('T') +class TangoDictionary(Protocol[T]): + + @staticmethod + def create(dictionary_name: str) -> TangoDictionary[T]: + if Config.USE_REDIS: + return TangoRemoteDictionary(dictionary_name) + else: + return TangoNativeDictionary() + + @property + @abstractmethod + def hash_name(self) -> str: + ... + + @abstractmethod + def __contains__(self, id: str) -> bool: + ... + @abstractmethod + def set(self, id: str, obj: T) -> str: + ... + @abstractmethod + def get(self, id: str) -> Optional[T]: + ... + @abstractmethod + def getExn(self, id: str) -> T: + ... + @abstractmethod + def keys(self) -> list[str]: + ... + @abstractmethod + def values(self) -> list[T]: + ... + @abstractmethod + def delete(self, id: str) -> None: + ... + @abstractmethod + def _clean(self) -> None: + ... + @abstractmethod + def items(self) -> list[tuple[str, T]]: + ... + # This is an abstract class that decides on # if we should initiate a TangoRemoteDictionary or TangoNativeDictionary # Since there are no abstract classes in Python, we use a simple method -def TangoDictionary(object_name): - if Config.USE_REDIS: - return TangoRemoteDictionary(object_name) - else: - return TangoNativeDictionary() +# def TangoDictionary(object_name): +# if Config.USE_REDIS: +# return TangoRemoteDictionary(object_name) +# else: +# return TangoNativeDictionary() + + -class TangoRemoteDictionary(object): +class TangoRemoteDictionary(TangoDictionary[T]): def __init__(self, object_name): self.r = getRedisConnection() - self.hash_name = object_name + self._hash_name = object_name + + @property + def hash_name(self) -> str: + return self._hash_name def __contains__(self, id): return self.r.hexists(self.hash_name, str(id)) @@ -506,10 +556,14 @@ def items(self): ) -class TangoNativeDictionary(object): +class TangoNativeDictionary(TangoDictionary[T]): def __init__(self): self.dict = {} + @property + def hash_name(self) -> str: + raise ValueError("TangoNativeDictionary does not have a hash name") + def __repr__(self): return str(self.dict) diff --git a/tests/validate.py b/tests/validate.py index 546db78b..fe0ab3bf 100644 --- a/tests/validate.py +++ b/tests/validate.py @@ -4,6 +4,8 @@ # directories. # +from typing import List + try: import pyflakes @@ -46,7 +48,7 @@ args = parser.parse_args() # Setup -skip_paths = [] +skip_paths: List[str] = [] # Stats file_count = 0 diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 1e21da79..0e7297a8 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -22,7 +22,9 @@ import config from tangoObjects import TangoMachine -from typing import Optional +from typing import Optional, Literal, List +from mypy_boto3_ec2 import EC2ServiceResource +from mypy_boto3_ec2.service_resource import Instance # suppress most boto logging logging.getLogger("boto3").setLevel(logging.CRITICAL) @@ -193,7 +195,7 @@ def __init__(self, accessKeyId=None, accessKey=None): self.img2ami = {} self.images = [] try: - self.boto3resource: boto3.EC2.ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) + self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) self.boto3client = boto3.client("ec2", config.Config.EC2_REGION) # Get images from ec2 @@ -276,26 +278,26 @@ def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict: def createKeyPair(self): # TODO: SUPPORT raise - # try to delete the key to avoid collision - self.key_pair_path = "%s/%s.pem" % ( - config.Config.DYNAMIC_SECURITY_KEY_PATH, - self.key_pair_name, - ) - self.deleteKeyPair() - key = self.connection.create_key_pair(self.key_pair_name) - key.save(config.Config.DYNAMIC_SECURITY_KEY_PATH) - # change the SSH_FLAG accordingly - self.ssh_flags[1] = self.key_pair_path + # # try to delete the key to avoid collision + # self.key_pair_path: str = "%s/%s.pem" % ( + # config.Config.DYNAMIC_SECURITY_KEY_PATH, + # self.key_pair_name, + # ) + # self.deleteKeyPair() + # key = self.connection.create_key_pair(self.key_pair_name) + # key.save(config.Config.DYNAMIC_SECURITY_KEY_PATH) + # # change the SSH_FLAG accordingly + # self.ssh_flags[1] = self.key_pair_path def deleteKeyPair(self): # TODO: SUPPORT raise - self.boto3client.delete_key_pair(self.key_pair_name) - # try to delete may not exist key file - try: - os.remove(self.key_pair_path) - except OSError: - pass + # self.boto3client.delete_key_pair(self.key_pair_name) + # # try to delete may not exist key file + # try: + # os.remove(self.key_pair_path) + # except OSError: + # pass def createSecurityGroup(self): try: @@ -329,7 +331,7 @@ def createSecurityGroup(self): # # VMMS API functions # - def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]: + def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: """initializeVM - Tell EC2 to create a new VM instance. Returns a boto.ec2.instance.Instance object. @@ -343,8 +345,8 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]: # ensure that security group exists self.createSecurityGroup() if self.useDefaultKeyPair: - self.key_pair_name = config.Config.SECURITY_KEY_NAME - self.key_pair_path = config.Config.SECURITY_KEY_PATH + self.key_pair_name: str = config.Config.SECURITY_KEY_NAME + self.key_pair_path: str = config.Config.SECURITY_KEY_PATH else: # TODO: SUPPORT raise @@ -358,7 +360,7 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]: "InstanceInterruptionBehavior": "terminate" } } - reservation: list[boto3.ec2.Instance] = self.boto3resource.create_instances( + reservation: List[Instance] = self.boto3resource.create_instances( ImageId=ec2instance["ami"], KeyName=self.key_pair_name, SecurityGroups=[config.Config.DEFAULT_SECURITY_GROUP], @@ -440,7 +442,7 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]: vm.domain_name = newInstance.public_ip_address vm.instance_id = newInstance.id self.log.debug("VM %s: %s" % (instanceName, newInstance)) - return vm + return 0 except Exception as e: self.log.debug("initializeVM Failed: %s" % e) @@ -455,10 +457,10 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]: self.log.error( "Exception handling failed for %s: %s" % (vm.name, e) ) - return None - return None + return -1 + return -1 - def waitVM(self, vm, max_secs) -> 0 | -1: + def waitVM(self, vm, max_secs) -> Literal[0, -1]: """waitVM - Wait at most max_secs for a VM to become ready. Return error if it takes too long. diff --git a/vmms/interface.py b/vmms/interface.py new file mode 100644 index 00000000..d71500f9 --- /dev/null +++ b/vmms/interface.py @@ -0,0 +1,42 @@ +from typing import Protocol, Optional, Literal, List +from tangoObjects import TangoMachine +from abc import abstractmethod + + +class VMMSInterface(Protocol): + @abstractmethod + def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: + ... + + @abstractmethod + def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: + ... + + @abstractmethod + def copyIn(self, vm: TangoMachine, inputFiles: List[str], job_id: Optional[int] = None) -> Literal[0, -1]: + ... + + @abstractmethod + def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: # -1 to infinity + ... + + @abstractmethod + def copyOut(self, vm: TangoMachine, destFile: str) -> Literal[0, -1]: + ... + + @abstractmethod + def destroyVM(self, vm: TangoMachine) -> Literal[0, -1]: + ... + + @abstractmethod + def safeDestroyVM(self, vm: TangoMachine) -> Literal[0, -1]: + ... + + @abstractmethod + def getVMs(self) -> List[TangoMachine]: + ... + + @abstractmethod + def existsVM(self, vm: TangoMachine) -> bool: + ... + diff --git a/vmms/localDocker.py b/vmms/localDocker.py index 1a1e1dd0..ada77c09 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -117,7 +117,7 @@ def domainName(self, vm): # def initializeVM(self, vm): """initializeVM - Nothing to do for initializeVM""" - return vm + return 0 def waitVM(self, vm, max_secs): """waitVM - Nothing to do for waitVM""" From 029c1a9f386551c886d01d6393e93faf30db6c64 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 16 Sep 2025 16:21:04 -0400 Subject: [PATCH 09/39] finished mypy stuff up to not using --check-untyped-defs --- Dockerfile | 3 +++ requirements.txt | 49 +++++++++++++++++++++++++++++++++++++++++------- vmms/ec2SSH.py | 45 ++++++++++++++++++++++---------------------- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/Dockerfile b/Dockerfile index c120e15f..d6c7dac8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,5 +70,8 @@ RUN mkdir -p /var/log/docker /var/log/supervisor RUN cp /opt/TangoService/Tango/deployment/config/nginx.conf /etc/nginx/nginx.conf RUN cp /opt/TangoService/Tango/deployment/config/supervisord.conf /etc/supervisor/supervisord.conf +# Set up PYTHONPATH +ENV PYTHONPATH="/opt/TangoService/Tango" + # Reload new config scripts CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] diff --git a/requirements.txt b/requirements.txt index 548d9625..8ebee4d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,47 @@ +async-timeout==5.0.1 +backoff==2.2.1 backports.ssl-match-hostname==3.7.0.1 -boto3 +boto3==1.37.38 +boto3-stubs==1.40.31 +botocore==1.37.38 +botocore-stubs==1.38.30 +certifi==2025.8.3 +cffi==1.17.1 +charset-normalizer==3.4.3 +cryptography==45.0.7 +distlib==0.4.0 +docker==5.0.3 +filelock==3.16.1 +idna==3.10 +jmespath==1.0.1 +mypy==1.14.1 +mypy-boto3-ec2==1.40.24 +mypy-extensions==1.1.0 +platformdirs==4.3.6 +pycparser==2.23 pyflakes==2.1.1 +python-dateutil==2.9.0.post0 +pytz==2025.2 +PyYAML==6.0.2 redis==4.4.4 requests==2.31.0 -# needed for tashi, rpyc==4.1.4 +s3transfer==0.11.5 +six==1.17.0 +supervisor==4.1.0 +tomli==2.2.1 tornado==6.4.1 -urllib3==1.26.19 -docker==5.0.3 -backoff==2.2.1 -pytz -pyyaml +types-awscrt==0.27.6 +types-cffi==1.16.0.20241221 +types-docker==7.1.0.20241229 +types-jmespath==1.0.2.20240106 +types-pyflakes==3.2.0.20240813 +types-pyOpenSSL==24.1.0.20240722 +types-PyYAML==6.0.12.20241230 +types-redis==4.6.0.20241004 +types-requests==2.32.0.20241016 +types-s3transfer==0.13.1 +types-setuptools==75.8.0.20250110 +typing-extensions==4.13.2 +urllib3==2.2.3 +virtualenv==20.34.0 +websocket-client==1.8.0 diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 0e7297a8..8f59e1d0 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -22,9 +22,10 @@ import config from tangoObjects import TangoMachine -from typing import Optional, Literal, List +from typing import Optional, Literal, List, Sequence from mypy_boto3_ec2 import EC2ServiceResource from mypy_boto3_ec2.service_resource import Instance +from mypy_boto3_ec2.type_defs import FilterTypeDef # suppress most boto logging logging.getLogger("boto3").setLevel(logging.CRITICAL) @@ -189,7 +190,14 @@ def __init__(self, accessKeyId=None, accessKey=None): self.useDefaultKeyPair = True # key pair settings, for now, use default security key - + if self.useDefaultKeyPair: + self.key_pair_name: str = config.Config.SECURITY_KEY_NAME + self.key_pair_path: str = config.Config.SECURITY_KEY_PATH + else: + # TODO: SUPPORT. Know that this if/else block used to be under initializeVM, using vm for a unique identifier + raise + # self.key_pair_name = self.keyPairName(vm.id, vm.name) + # self.createKeyPair() # create boto3resource self.img2ami = {} @@ -336,7 +344,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: Returns a boto.ec2.instance.Instance object. """ - newInstance = None + newInstance: Optional[Instance] = None # Create the instance and obtain the reservation try: instanceName = self.instanceName(vm.id, vm.name) @@ -344,22 +352,8 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: self.log.debug("instanceName: %s" % instanceName) # ensure that security group exists self.createSecurityGroup() - if self.useDefaultKeyPair: - self.key_pair_name: str = config.Config.SECURITY_KEY_NAME - self.key_pair_path: str = config.Config.SECURITY_KEY_PATH - else: - # TODO: SUPPORT - raise - self.key_pair_name = self.keyPairName(vm.id, vm.name) - self.createKeyPair() - - instance_market_options = { - "MarketType": "spot", - "SpotOptions": { - "SpotInstanceType": "one-time", - "InstanceInterruptionBehavior": "terminate" - } - } + + reservation: List[Instance] = self.boto3resource.create_instances( ImageId=ec2instance["ami"], KeyName=self.key_pair_name, @@ -367,7 +361,14 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: InstanceType=ec2instance["instance_type"], MaxCount=1, MinCount=1, - InstanceMarketOptions=instance_market_options, + InstanceMarketOptions= + { + "MarketType": "spot", + "SpotOptions": { + "SpotInstanceType": "one-time", + "InstanceInterruptionBehavior": "terminate" + } + }, ) # Sleep for a while to prevent random transient errors observed @@ -376,7 +377,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: # reservation is a list of instances created. there is only # one instance created so get index 0. - newInstance: boto3.ec2.Instance = reservation[0] + newInstance = reservation[0] if not newInstance: # TODO: when does this happen? raise ValueError("Cannot find new instance for %s" % vm.name) @@ -385,7 +386,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: start_time = time.time() while True: - filters = [ + filters: Sequence[FilterTypeDef] = [ {"Name": "instance-state-name", "Values": ["running"]} ] instances = self.boto3resource.instances.filter(Filters=filters) From 85bdf7c2398092bef6d15def3915721c6e57dfdf Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Mon, 22 Sep 2025 17:44:24 -0400 Subject: [PATCH 10/39] doesn't quite work, but wanted a checkpoint before muddling with the preallocator --- jobQueue.py | 11 +++++----- mypy.ini | 3 +++ preallocator.py | 11 ++++++---- tangoObjects.py | 7 +++--- tests/testObjects.py | 2 +- vmms/ec2SSH.py | 10 ++++++--- vmms/localDocker.py | 4 +++- worker.py | 51 ++++++++++++++++++++++++++++++++++++++------ 8 files changed, 75 insertions(+), 24 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 2d0a9934..ee5b2956 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -53,8 +53,8 @@ def __init__(self, preallocator): is needed since there are multiple worker threads and they might be using the makeUnassigned api. """ - self.liveJobs = TangoDictionary("liveJobs") - self.deadJobs = TangoDictionary("deadJobs") + self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs") + self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") self.unassignedJobs = TangoQueue("unassignedLiveJobs") self.queueLock = threading.Lock() self.preallocator = preallocator @@ -187,7 +187,7 @@ def addDead(self, job): self.log.info("addDead|Unassigning job %s" % str(job.id)) job.makeUnassigned() - job.retries = 0 + job.resetRetries() self.log.debug("addDead|Acquiring lock to job queue.") self.queueLock.acquire() @@ -248,12 +248,13 @@ def get(self, id): # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. # TODO: It does assign the job to a particular VM though. - def assignJob(self, jobId, vm=None): + # Precondition: jobId is in self.liveJobs + def assignJob(self, jobId, vm=None) -> None: """assignJob - marks a job to be assigned""" self.queueLock.acquire() self.log.debug("assignJob| Acquired lock to job queue.") - job = self.liveJobs.get(jobId) + job = self.liveJobs.getExn(jobId) print(str(job), jobId) # print(str(self.unassignedJobs)) # print(str(self.liveJobs)) diff --git a/mypy.ini b/mypy.ini index db776d15..f6332a89 100644 --- a/mypy.ini +++ b/mypy.ini @@ -2,4 +2,7 @@ explicit_package_bases = True [mypy-vmms.tashiSSH] +ignore_errors = True + +[mypy-preallocator] ignore_errors = True \ No newline at end of file diff --git a/preallocator.py b/preallocator.py index 93e9c822..6607c552 100644 --- a/preallocator.py +++ b/preallocator.py @@ -6,7 +6,7 @@ import time import copy -from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue +from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine from config import Config # @@ -22,7 +22,7 @@ class Preallocator(object): def __init__(self, vmms): - self.machines = TangoDictionary("machines") + self.machines: TangoDictionary[TangoMachine] = TangoDictionary.create("machines") self.lock = threading.Lock() self.nextID = TangoIntValue("nextID", 1000) self.vmms = vmms @@ -33,7 +33,7 @@ def poolSize(self, vmName): if vmName not in self.machines: return 0 else: - return len(self.machines.get(vmName)[0]) + return len(self.machines.getExn(vmName)[0]) def update(self, vm, num): """update - Updates the number of machines of a certain type @@ -47,7 +47,10 @@ def update(self, vm, num): self.lock.acquire() if vm.name not in self.machines: self.machines.set(vm.name, [[], TangoQueue(vm.name)]) - self.machines.get(vm.name)[1].make_empty() + self.machines.getExn(vm.name)[1].make_empty() # TODO: oh bruh this is incorrect. + # TODO: when used with a TangoRemoteDictionary, this will create a transient copy of the queue (from the redis pickle), + # TODO: then modify it, and then discard it :). self.machines will not be updated by a .get/.getExn call. + # TODO: TangoDictionary's should have value semantics, so the value type should probably be a tuple self.log.debug("Creating empty pool of %s instances" % (vm.name)) self.lock.release() diff --git a/tangoObjects.py b/tangoObjects.py index a5554e1e..9bf60b65 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -246,7 +246,7 @@ def setId(self, new_id): if self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] - dictionary = TangoDictionary.create(dict_hash) + dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash) dictionary.delete(key) self._remoteLocation = dict_hash + ":" + str(new_id) self.updateRemote() @@ -274,7 +274,7 @@ def syncRemote(self): if Config.USE_REDIS and self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] - dictionary = TangoDictionary(dict_hash) + dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash) temp_job = dictionary.get(key) # Key should be in dictionary if temp_job is None: print(f"Job {key} not found in dictionary {dict_hash}") # TODO: add better error handling for TangoJob @@ -285,7 +285,7 @@ def updateRemote(self): if Config.USE_REDIS and self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] - dictionary = TangoDictionary(dict_hash) + dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash) dictionary.set(key, self) def deleteFromDict(self, dictionary : TangoDictionary) -> None: @@ -440,6 +440,7 @@ def make_empty(self): break T = TypeVar('T') +# Dictionary from string to T class TangoDictionary(Protocol[T]): @staticmethod diff --git a/tests/testObjects.py b/tests/testObjects.py index f0b08ea6..d477073f 100644 --- a/tests/testObjects.py +++ b/tests/testObjects.py @@ -18,7 +18,7 @@ def setUp(self): } def runDictionaryTests(self): - test_dict = TangoDictionary("test") + test_dict = TangoDictionary.create("test") self.assertEqual(test_dict.keys(), []) self.assertEqual(test_dict.values(), []) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 8f59e1d0..02a3f577 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -27,6 +27,9 @@ from mypy_boto3_ec2.service_resource import Instance from mypy_boto3_ec2.type_defs import FilterTypeDef +from vmms.interface import VMMSInterface + + # suppress most boto logging logging.getLogger("boto3").setLevel(logging.CRITICAL) logging.getLogger("botocore").setLevel(logging.CRITICAL) @@ -143,7 +146,7 @@ class ec2CallError(Exception): pass -class Ec2SSH(object): +class Ec2SSH(VMMSInterface): _SSH_FLAGS = [ "-i", config.Config.SECURITY_KEY_PATH, @@ -200,10 +203,11 @@ def __init__(self, accessKeyId=None, accessKey=None): # self.createKeyPair() # create boto3resource - self.img2ami = {} + self.img2ami = {} # this is a bad name, should really be img_name to img self.images = [] try: - self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) + # This is a service resource + self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) # TODO: rename this ot self.ec2resource self.boto3client = boto3.client("ec2", config.Config.EC2_REGION) # Get images from ec2 diff --git a/vmms/localDocker.py b/vmms/localDocker.py index ada77c09..8ccfb7d5 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -14,6 +14,8 @@ import config from tangoObjects import TangoMachine +from vmms.interface import VMMSInterface + def timeout(command, time_out=1): """timeout - Run a unix command with a timeout. Return -1 on @@ -72,7 +74,7 @@ def timeoutWithReturnStatus(command, time_out, returnValue=0): # -class LocalDocker(object): +class LocalDocker(VMMSInterface): def __init__(self): """Checks if the machine is ready to run docker containers. Initialize boot2docker if running on OS X. diff --git a/worker.py b/worker.py index 59a17239..14da23f6 100644 --- a/worker.py +++ b/worker.py @@ -6,7 +6,7 @@ import logging import tempfile import requests -from requests.packages.urllib3.util.retry import Retry +from urllib3.util.retry import Retry from requests.adapters import HTTPAdapter import os import shutil @@ -41,6 +41,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM): # # Worker helper functions # + # TODO: These should not have default values in my opinion def detachVM(self, return_vm=False, replace_vm=False): """detachVM - Detach the VM from this worker. The options are to return it to the pool's free list (return_vm), destroy it @@ -106,8 +107,8 @@ def rescheduleJob(self, hdrfile, ret, err): ) self.appendMsg( hdrfile, - "Job status: waitVM=%s copyIn=%s runJob=%s copyOut=%s" - % (ret["waitvm"], ret["copyin"], ret["runjob"], ret["copyout"]), + "Job status: waitVM=%s initializeVM=%s copyIn=%s runJob=%s copyOut=%s" + % (ret["waitvm"], ret["initializevm"], ret["copyin"], ret["runjob"], ret["copyout"]), ) self.catFiles(hdrfile, self.job.outputFile) @@ -165,7 +166,10 @@ def notifyServer(self, job): except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) - def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): + def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): + # TODO: I don't think killVM is a good variable name, it can refer to either returning or destroying the VM + # TODO: Also, what does it mean to not kill the VM? (i.e. not returning it)? It only gets called when we have a job.stopBefore. + # TODO: This directly contradicts the documentation of detachVM ("The worker must always call this function before returning.") self.jobQueue.makeDead(self.job, msg) # Update the text that users see in the autodriver output file @@ -188,6 +192,7 @@ def run(self): # Hash of return codes for each step ret = {} ret["waitvm"] = None + ret["initializevm"] = None ret["copyin"] = None ret["runjob"] = None ret["copyout"] = None @@ -201,6 +206,7 @@ def run(self): # Assigning job to a preallocated VM if self.preVM: # self.preVM: + assert not Config.VMMS_NAME == "ec2ssh", "Unimplemented" self.log.debug("Assigning job to preallocated VM") self.job.makeVM(self.preVM) self.log.info( @@ -221,6 +227,7 @@ def run(self): ) ) self.log.debug("Assigned job to preallocated VM") + ret["initializevm"] = 0 # Vacuous success since it doesn't happen # Assigning job to a new VM else: self.log.debug("Assigning job to a new VM") @@ -247,7 +254,15 @@ def run(self): ) # Host name returned from EC2 is stored in the vm object - self.vmms.initializeVM(self.job.vm) # TODO: This can return None if the step fails, check for that + ret["initializevm"] = self.vmms.initializeVM(self.job.vm) + if ret["initializevm"] != 0: + self.rescheduleJob( + hdrfile, + ret, + "Internal error: initializeVM failed" + ) + return + self.log.debug("Asigned job to a new VM") vm = self.job.vm @@ -321,8 +336,12 @@ def run(self): msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) self.job.vm.notes = str(self.job.id) + "_" + self.job.name self.job.vm.keep_for_debugging = True - self.afterJobExecution(hdrfile, msg, False) - # TODO: no reschedule? + self.log.debug(msg) + self.rescheduleJob( + hdrfile, + ret, + msg + ) return self.log.info( @@ -350,6 +369,13 @@ def run(self): Config.runjob_errors += 1 if ret["runjob"] == -1: Config.runjob_timeouts += 1 + self.rescheduleJob( + hdrfile, + ret, + "Internal error: runJob failed" + ) + return + self.log.info( "Job %s:%d executed [status=%s]" % (self.job.name, self.job.id, ret["runjob"]) @@ -368,6 +394,13 @@ def run(self): ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile) if ret["copyout"] != 0: Config.copyout_errors += 1 + self.rescheduleJob( + hdrfile, + ret, + "Internal error: copyOut failed" + ) + return + self.log.info( "Output copied for job %s:%d [status=%d]" % (self.job.name, self.job.id, ret["copyout"]) @@ -382,6 +415,10 @@ def run(self): # normal termination and doesn't reschedule the job. self.log.info("Success: job %s:%d finished" % (self.job.name, self.job.id)) + for status in ret.values(): + assert status == 0, "Should not get to the success point if any stage failed" + # TODO: test this, then remove everything below this point + # Move the job from the live queue to the dead queue # with an explanatory message msg = "Success: Autodriver returned normally" From 7e6751e61e71b7f7d8f5bb2859c76c567580a520 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Mon, 22 Sep 2025 21:08:06 -0400 Subject: [PATCH 11/39] more typing changes, streamlining the used of the TangoDictionary in preallocator (do note that my initial worry about incorrectness was unfounded due to weird Redis sharing behavior) --- jobManager.py | 8 +++++--- jobQueue.py | 2 +- preallocator.py | 15 ++++++++------- tangoObjects.py | 42 ++++++++++++++++++++++++++++++++---------- tests/testObjects.py | 2 +- 5 files changed, 47 insertions(+), 22 deletions(-) diff --git a/jobManager.py b/jobManager.py index 65d62116..c21d316f 100644 --- a/jobManager.py +++ b/jobManager.py @@ -20,7 +20,8 @@ import tango # Written this way to avoid circular imports from config import Config from jobQueue import JobQueue -from tangoObjects import TangoJob, TangoQueue +from tangoObjects import TangoJob, TangoQueue, TangoMachine +from typing import List, Tuple from worker import Worker @@ -143,14 +144,15 @@ def __manage(self) -> None: tango_server.log.debug("Resetting Tango VMs") tango_server.resetTango(tango_server.preallocator.vmms) for key in tango_server.preallocator.machines.keys(): - tango_server.preallocator.machines.set(key, [[], TangoQueue(key)]) + machine: Tuple[List[TangoMachine], TangoQueue] = ([], TangoQueue.create(key)) + machine[1].make_empty() + tango_server.preallocator.machines.set(key, machine) # The above call sets the total pool empty. But the free pool which # is a queue in redis, may not be empty. When the job manager restarts, # resetting the free queue using the key doesn't change its content. # Therefore we empty the queue, thus the free pool, to keep it consistent # with the total pool. - tango_server.preallocator.machines.get(key)[1].make_empty() jobs = JobManager(tango_server.jobQueue) print("Starting the stand-alone Tango JobManager") diff --git a/jobQueue.py b/jobQueue.py index ee5b2956..34c3ae5b 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -55,7 +55,7 @@ def __init__(self, preallocator): """ self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs") self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") - self.unassignedJobs = TangoQueue("unassignedLiveJobs") + self.unassignedJobs = TangoQueue.create("unassignedLiveJobs") self.queueLock = threading.Lock() self.preallocator = preallocator self.log = logging.getLogger("JobQueue") diff --git a/preallocator.py b/preallocator.py index 6607c552..69291933 100644 --- a/preallocator.py +++ b/preallocator.py @@ -8,6 +8,7 @@ from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine from config import Config +from typing import Tuple, List # # Preallocator - This class maintains a pool of active VMs for future @@ -22,7 +23,7 @@ class Preallocator(object): def __init__(self, vmms): - self.machines: TangoDictionary[TangoMachine] = TangoDictionary.create("machines") + self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue]] = TangoDictionary.create("machines") self.lock = threading.Lock() self.nextID = TangoIntValue("nextID", 1000) self.vmms = vmms @@ -46,11 +47,9 @@ def update(self, vm, num): """ self.lock.acquire() if vm.name not in self.machines: - self.machines.set(vm.name, [[], TangoQueue(vm.name)]) - self.machines.getExn(vm.name)[1].make_empty() # TODO: oh bruh this is incorrect. - # TODO: when used with a TangoRemoteDictionary, this will create a transient copy of the queue (from the redis pickle), - # TODO: then modify it, and then discard it :). self.machines will not be updated by a .get/.getExn call. - # TODO: TangoDictionary's should have value semantics, so the value type should probably be a tuple + initial_queue = TangoQueue.create(vm.name) + initial_queue.make_empty() + self.machines.set(vm.name, ([], initial_queue)) self.log.debug("Creating empty pool of %s instances" % (vm.name)) self.lock.release() @@ -209,7 +208,9 @@ def destroyVM(self, vmName, id): for i in range(size): vm = self.machines.get(vmName)[1].get_nowait() if vm.id != id: - self.machines.get(vmName)[1].put(vm) + machine = self.machines.get(vmName) + machine[1].put(vm) + self.machines.set(vmName, machine) else: dieVM = vm self.lock.release() diff --git a/tangoObjects.py b/tangoObjects.py index 9bf60b65..7d0dfc55 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -342,16 +342,35 @@ def get(self): def set(self, val): self.val = val return val + +class TangoQueue(Protocol): + @staticmethod + def create(key_name: str) -> TangoQueue: + if Config.USE_REDIS: + return TangoRemoteQueue(key_name) + else: + return ExtendedQueue() -def TangoQueue(object_name): - if Config.USE_REDIS: - return TangoRemoteQueue(object_name) - else: - return ExtendedQueue() - + @abstractmethod + def qsize(self) -> int: + ... + def empty(self) -> bool: + ... + def put(self, item) -> None: + ... + def get(self, block=True, timeout=None) -> Optional[T]: + ... + def get_nowait(self) -> Optional[T]: + ... + def remove(self, item) -> None: + ... + def _clean(self) -> None: + ... + def make_empty(self) -> None: + ... -class ExtendedQueue(Queue): +class ExtendedQueue(Queue, TangoQueue): """Python Thread safe Queue with the remove and clean function added""" def test(self): @@ -369,9 +388,12 @@ def remove(self, value): def _clean(self): with self.mutex: self.queue.clear() + + def make_empty(self): + self._clean() + - -class TangoRemoteQueue(object): +class TangoRemoteQueue(TangoQueue): """Simple Queue with Redis Backend""" @@ -433,7 +455,7 @@ def remove(self, item): def _clean(self): self.__db.delete(self.key) - def make_empty(self): + def make_empty(self) -> None: while True: item = self.__db.lpop(self.key) if item is None: diff --git a/tests/testObjects.py b/tests/testObjects.py index d477073f..8ba81270 100644 --- a/tests/testObjects.py +++ b/tests/testObjects.py @@ -69,7 +69,7 @@ def addAllToQueue(self): self.assertEqual(self.testQueue.qsize(), self.expectedSize) def runQueueTests(self): - self.testQueue = TangoQueue("self.testQueue") + self.testQueue = TangoQueue.create("self.testQueue") self.expectedSize = 0 self.assertEqual(self.testQueue.qsize(), self.expectedSize) self.assertTrue(self.testQueue.empty()) From 03e1f3181e23e26d738cb06ce06fb10f82ded926 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 28 Sep 2025 14:08:25 -0400 Subject: [PATCH 12/39] refactored afterJobExecution and detachVM --- worker.py | 92 +++++++++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/worker.py b/worker.py index 14da23f6..271a9b7f 100644 --- a/worker.py +++ b/worker.py @@ -10,6 +10,7 @@ from requests.adapters import HTTPAdapter import os import shutil +from enum import Enum from datetime import datetime from config import Config @@ -25,6 +26,11 @@ # anything else in the system. # +class DetachMethod(Enum): + RETURN_TO_POOL = "return_to_pool" + DESTROY_WITHOUT_REPLACEMENT = "destroy_without_replacement" + DESTROY_AND_REPLACE = "replace" + class Worker(threading.Thread): def __init__(self, job, vmms, jobQueue, preallocator, preVM): @@ -37,12 +43,18 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM): self.preVM = preVM threading.Thread.__init__(self) self.log = logging.getLogger("Worker") + self.cleanupStatus = False # # Worker helper functions # - # TODO: These should not have default values in my opinion - def detachVM(self, return_vm=False, replace_vm=False): + def __del__(self): + if self.job.stopBefore == "": # We don't want to cleanup the VM if we are stopping early for debugging + assert self.cleanupStatus, "Worker must call detachVM before returning" + + + # TODO: Return_vm should not have default values in my opinion + def detachVM(self, detachMethod: DetachMethod): """detachVM - Detach the VM from this worker. The options are to return it to the pool's free list (return_vm), destroy it (not return_vm), and if destroying it, whether to replace it @@ -50,20 +62,25 @@ def detachVM(self, return_vm=False, replace_vm=False): this function before returning. """ # job-owned instance, simply destroy after job is completed + self.cleanupStatus = True if self.job.vm.ec2_vmms: self.vmms.safeDestroyVM(self.job.vm) - elif return_vm: - self.preallocator.freeVM(self.job.vm) else: - self.vmms.safeDestroyVM(self.job.vm) - if replace_vm: + if detachMethod == DetachMethod.RETURN_TO_POOL: + self.preallocator.freeVM(self.job.vm) + elif detachMethod == DetachMethod.DESTROY_WITHOUT_REPLACEMENT: + self.vmms.safeDestroyVM(self.job.vm) + self.preallocator.removeVM(self.job.vm) + elif detachMethod == DetachMethod.DESTROY_AND_REPLACE: + self.vmms.safeDestroyVM(self.job.vm) self.preallocator.createVM(self.job.vm) - - # Important: don't remove the VM from the pool until its - # replacement has been created. Otherwise there is a - # potential race where the job manager thinks that the - # pool is empty and creates a spurious vm. - self.preallocator.removeVM(self.job.vm) + # Important: don't remove the VM from the pool until its + # replacement has been created. Otherwise there is a + # potential race where the job manager thinks that the + # pool is empty and creates a spurious vm. + self.preallocator.removeVM(self.job.vm) + else: + raise ValueError(f"Invalid detach method: {detachMethod}") # TODO: figure out what hdrfile, ret and err are def rescheduleJob(self, hdrfile, ret, err): @@ -88,32 +105,18 @@ def rescheduleJob(self, hdrfile, ret, err): os.remove(hdrfile) except OSError: pass - self.detachVM(return_vm=False, replace_vm=True) + self.detachVM(DetachMethod.DESTROY_AND_REPLACE) self.jobQueue.unassignJob(self.job.id) # Here is where we give up else: - self.log.error("Giving up on job %s:%d" % (self.job.name, self.job.id)) + full_err = f"Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}" + self.log.error(f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err)) self.job.appendTrace( - "%s|Giving up on job %s:%d" - % (datetime.now().ctime(), self.job.name, self.job.id) + "%s|Giving up on job %s:%d. %s" + % (datetime.now().ctime(), self.job.name, self.job.id, full_err) ) - self.jobQueue.makeDead(self.job, err) # Note: this reports the error that caused the last call to rescheduleJob to fail - - self.appendMsg( - hdrfile, - "Internal error: Unable to complete job after %d tries. Pleae resubmit" - % (Config.JOB_RETRIES), - ) - self.appendMsg( - hdrfile, - "Job status: waitVM=%s initializeVM=%s copyIn=%s runJob=%s copyOut=%s" - % (ret["waitvm"], ret["initializevm"], ret["copyin"], ret["runjob"], ret["copyout"]), - ) - - self.catFiles(hdrfile, self.job.outputFile) - self.detachVM(return_vm=False, replace_vm=True) - self.notifyServer(self.job) + self.afterJobExecution(hdrfile, full_err, DetachMethod.DESTROY_AND_REPLACE) def appendMsg(self, filename, msg): """appendMsg - Append a timestamped Tango message to a file""" @@ -166,10 +169,7 @@ def notifyServer(self, job): except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) - def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): - # TODO: I don't think killVM is a good variable name, it can refer to either returning or destroying the VM - # TODO: Also, what does it mean to not kill the VM? (i.e. not returning it)? It only gets called when we have a job.stopBefore. - # TODO: This directly contradicts the documentation of detachVM ("The worker must always call this function before returning.") + def afterJobExecution(self, hdrfile, msg, detachMethod: Optional[DetachMethod]): self.jobQueue.makeDead(self.job, msg) # Update the text that users see in the autodriver output file @@ -178,8 +178,8 @@ def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): os.chmod(self.job.outputFile, 0o644) # Thread exit after termination - if killVM: - self.detachVM(return_vm=returnVM) + if detachMethod is not None: + self.detachVM(detachMethod) self.notifyServer(self.job) return @@ -286,7 +286,7 @@ def run(self): if self.job.stopBefore == "waitvm": msg = "Execution stopped before %s" % self.job.stopBefore returnVM = True - self.afterJobExecution(hdrfile, msg, returnVM, False) + self.afterJobExecution(hdrfile, msg, detachMethod=None) return ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT) @@ -324,7 +324,7 @@ def run(self): if (self.job.stopBefore == "copyin"): msg = "Execution stopped before %s" % self.job.stopBefore returnVM = True - self.afterJobExecution(hdrfile, msg, returnVM, False) + self.afterJobExecution(hdrfile, msg, detachMethod=None) return # Copy input files to VM self.log.debug(f"Before copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}") @@ -356,7 +356,7 @@ def run(self): if (self.job.stopBefore == "runjob"): msg = "Execution stopped before %s" % self.job.stopBefore returnVM = True - self.afterJobExecution(hdrfile, msg, returnVM, False) + self.afterJobExecution(hdrfile, msg, detachMethod=None) return # Run the job on the virtual machine ret["runjob"] = self.vmms.runJob( @@ -388,7 +388,7 @@ def run(self): if (self.job.stopBefore == "copyout"): msg = "Execution stopped before %s" % self.job.stopBefore returnVM = True - self.afterJobExecution(hdrfile, msg, returnVM, False) + self.afterJobExecution(hdrfile, msg, detachMethod=None) return # Copy the output back. ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile) @@ -422,7 +422,7 @@ def run(self): # Move the job from the live queue to the dead queue # with an explanatory message msg = "Success: Autodriver returned normally" - (returnVM, replaceVM) = (True, False) + detachMethod = DetachMethod.RETURN_TO_POOL if ret["copyin"] != 0: msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) elif ret["runjob"] != 0: @@ -436,7 +436,7 @@ def run(self): # and do not retry the job since the job may have damaged # the VM. msg = "Error: OS error while running job on VM" - (returnVM, replaceVM) = (False, True) + detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT self.job.vm.notes = str(self.job.id) + "_" + self.job.name self.job.vm.keep_for_debugging = True else: # This should never happen @@ -447,7 +447,7 @@ def run(self): elif ret["copyout"] != 0: msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"]) - self.afterJobExecution(hdrfile, msg, returnVM) + self.afterJobExecution(hdrfile, msg, detachMethod) return # @@ -463,4 +463,4 @@ def run(self): if self.preVM and not vm: vm = self.job.vm = self.preVM if vm: - self.detachVM(return_vm=False, replace_vm=True) + self.detachVM(DetachMethod.DESTROY_AND_REPLACE) From bc8c8a7c0b33fb33c49af83bcd9c33a8be1ef66b Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 28 Sep 2025 15:48:00 -0400 Subject: [PATCH 13/39] error messages --- worker.py | 81 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/worker.py b/worker.py index 271a9b7f..a57e9e5f 100644 --- a/worker.py +++ b/worker.py @@ -53,7 +53,6 @@ def __del__(self): assert self.cleanupStatus, "Worker must call detachVM before returning" - # TODO: Return_vm should not have default values in my opinion def detachVM(self, detachMethod: DetachMethod): """detachVM - Detach the VM from this worker. The options are to return it to the pool's free list (return_vm), destroy it @@ -65,6 +64,7 @@ def detachVM(self, detachMethod: DetachMethod): self.cleanupStatus = True if self.job.vm.ec2_vmms: self.vmms.safeDestroyVM(self.job.vm) + # TODO: what about the preallocator? else: if detachMethod == DetachMethod.RETURN_TO_POOL: self.preallocator.freeVM(self.job.vm) @@ -110,7 +110,7 @@ def rescheduleJob(self, hdrfile, ret, err): # Here is where we give up else: - full_err = f"Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}" + full_err = f"Internal Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}" self.log.error(f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err)) self.job.appendTrace( "%s|Giving up on job %s:%d. %s" @@ -333,7 +333,7 @@ def run(self): if ret["copyin"] != 0: Config.copyin_errors += 1 - msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) + msg = "Copy in to VM failed (status=%d)" % (ret["copyin"]) self.job.vm.notes = str(self.job.id) + "_" + self.job.name self.job.vm.keep_for_debugging = True self.log.debug(msg) @@ -366,13 +366,31 @@ def run(self): self.job.disableNetwork, ) if ret["runjob"] != 0: - Config.runjob_errors += 1 - if ret["runjob"] == -1: + if ret["runjob"] == 1: # This should never happen + msg = "RunJob: Autodriver usage error (status=%d)" % (ret["runjob"]) + elif ret["runjob"] == 2: + msg = "RunJob: Job timed out after %d seconds" % (self.job.timeout) + elif ret["runjob"] == 3: # EXIT_OSERROR in Autodriver + # Abnormal job termination (Autodriver encountered an OS + # error). Assume that the VM is damaged. Destroy this VM + # and do not retry the job since the job may have damaged + # the VM. + msg = "RunJob: OS error while running job on VM" + # TODO: do we need to not reschedule the job? + self.job.vm.notes = str(self.job.id) + "_" + self.job.name + self.job.vm.keep_for_debugging = True + elif ret["runjob"] == -1: Config.runjob_timeouts += 1 + # TODO: difference between 2 and -1? + else: # This should never happen + msg = "RunJob: Unknown autodriver error (status=%d)" % ( + ret["runjob"] + ) + Config.runjob_errors += 1 self.rescheduleJob( hdrfile, ret, - "Internal error: runJob failed" + msg ) return @@ -397,7 +415,7 @@ def run(self): self.rescheduleJob( hdrfile, ret, - "Internal error: copyOut failed" + f"Internal error: copyOut failed (status={ret['copyout']})" ) return @@ -422,32 +440,31 @@ def run(self): # Move the job from the live queue to the dead queue # with an explanatory message msg = "Success: Autodriver returned normally" - detachMethod = DetachMethod.RETURN_TO_POOL - if ret["copyin"] != 0: - msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) - elif ret["runjob"] != 0: - if ret["runjob"] == 1: # This should never happen - msg = "Error: Autodriver usage error (status=%d)" % (ret["runjob"]) - elif ret["runjob"] == 2: - msg = "Error: Job timed out after %d seconds" % (self.job.timeout) - elif ret["runjob"] == 3: # EXIT_OSERROR in Autodriver - # Abnormal job termination (Autodriver encountered an OS - # error). Assume that the VM is damaged. Destroy this VM - # and do not retry the job since the job may have damaged - # the VM. - msg = "Error: OS error while running job on VM" - detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT - self.job.vm.notes = str(self.job.id) + "_" + self.job.name - self.job.vm.keep_for_debugging = True - else: # This should never happen - msg = "Error: Unknown autodriver error (status=%d)" % ( - ret["runjob"] - ) - - elif ret["copyout"] != 0: - msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"]) + self.afterJobExecution(hdrfile, msg, DetachMethod.RETURN_TO_POOL) + # if ret["copyin"] != 0: + # msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) + # elif ret["runjob"] != 0: + # if ret["runjob"] == 1: # This should never happen + # msg = "Error: Autodriver usage error (status=%d)" % (ret["runjob"]) + # elif ret["runjob"] == 2: + # msg = "Error: Job timed out after %d seconds" % (self.job.timeout) + # elif ret["runjob"] == 3: # EXIT_OSERROR in Autodriver + # # Abnormal job termination (Autodriver encountered an OS + # # error). Assume that the VM is damaged. Destroy this VM + # # and do not retry the job since the job may have damaged + # # the VM. + # msg = "Error: OS error while running job on VM" + # detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT + # self.job.vm.notes = str(self.job.id) + "_" + self.job.name + # self.job.vm.keep_for_debugging = True + # else: # This should never happen + # msg = "Error: Unknown autodriver error (status=%d)" % ( + # ret["runjob"] + # ) + + # elif ret["copyout"] != 0: + # msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"]) - self.afterJobExecution(hdrfile, msg, detachMethod) return # From 20b833b0017de3f25161c59d54087de89850a81c Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 28 Sep 2025 16:55:49 -0400 Subject: [PATCH 14/39] code cleanup: worker always gets initialized with a preallocated VM (else preVM.name will fail) --- vmms/ec2SSH.py | 1 + worker.py | 88 ++++++++++++++------------------------------------ 2 files changed, 25 insertions(+), 64 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 02a3f577..73e63a1b 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -347,6 +347,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: """initializeVM - Tell EC2 to create a new VM instance. Returns a boto.ec2.instance.Instance object. + Reads from vm's id and name, writes to vm's instance_id and domain_name """ newInstance: Optional[Instance] = None # Create the instance and obtain the reservation diff --git a/worker.py b/worker.py index a57e9e5f..8bc84d5f 100644 --- a/worker.py +++ b/worker.py @@ -15,6 +15,8 @@ from datetime import datetime from config import Config from jobQueue import JobQueue +from typing import Optional +from tangoObjects import TangoMachine # # Worker - The worker class is very simple and very dumb. The goal is # to walk through the VMMS interface, track the job's progress, and if @@ -32,8 +34,9 @@ class DetachMethod(Enum): DESTROY_AND_REPLACE = "replace" +# We always preallocate a VM for the worker to use class Worker(threading.Thread): - def __init__(self, job, vmms, jobQueue, preallocator, preVM): + def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine): threading.Thread.__init__(self) self.daemon = True self.job = job @@ -204,69 +207,30 @@ def run(self): hdrfile = tempfile.mktemp() self.appendMsg(hdrfile, "Received job %s:%d" % (self.job.name, self.job.id)) - # Assigning job to a preallocated VM - if self.preVM: # self.preVM: - assert not Config.VMMS_NAME == "ec2ssh", "Unimplemented" - self.log.debug("Assigning job to preallocated VM") - self.job.makeVM(self.preVM) - self.log.info( - "Assigned job %s:%d existing VM %s" - % ( - self.job.name, - self.job.id, - self.vmms.instanceName(self.preVM.id, self.preVM.name), - ) - ) - self.job.appendTrace( - "%s|Assigned job %s:%d existing VM %s" - % ( - datetime.now().ctime(), - self.job.name, - self.job.id, - self.vmms.instanceName(self.preVM.id, self.preVM.name), - ) - ) - self.log.debug("Assigned job to preallocated VM") - ret["initializevm"] = 0 # Vacuous success since it doesn't happen - # Assigning job to a new VM - else: - self.log.debug("Assigning job to a new VM") - self.job.syncRemote() - self.job.vm.id = self.job.id - self.job.updateRemote() - - self.log.info( - "Assigned job %s:%d new VM %s" - % ( - self.job.name, - self.job.id, - self.vmms.instanceName(self.job.vm.id, self.job.vm.name), - ) + # Assigning job to the preallocated VM + self.log.debug("Assigning job to preallocated VM") + self.job.makeVM(self.preVM) + self.log.info( + "Assigned job %s:%d existing VM %s" + % ( + self.job.name, + self.job.id, + self.vmms.instanceName(self.preVM.id, self.preVM.name), ) - self.job.appendTrace( - "%s|Assigned job %s:%d new VM %s" - % ( - datetime.now().ctime(), - self.job.name, - self.job.id, - self.vmms.instanceName(self.job.vm.id, self.job.vm.name), - ) + ) + self.job.appendTrace( + "%s|Assigned job %s:%d existing VM %s" + % ( + datetime.now().ctime(), + self.job.name, + self.job.id, + self.vmms.instanceName(self.preVM.id, self.preVM.name), ) - - # Host name returned from EC2 is stored in the vm object - ret["initializevm"] = self.vmms.initializeVM(self.job.vm) - if ret["initializevm"] != 0: - self.rescheduleJob( - hdrfile, - ret, - "Internal error: initializeVM failed" - ) - return - - self.log.debug("Asigned job to a new VM") + ) + self.log.debug("Assigned job to preallocated VM") + ret["initializevm"] = 0 # Vacuous success since it doesn't happen vm = self.job.vm - returnVM = True # Wait for the instance to be ready self.log.debug( @@ -285,7 +249,6 @@ def run(self): self.log.debug("Waiting for VM") if self.job.stopBefore == "waitvm": msg = "Execution stopped before %s" % self.job.stopBefore - returnVM = True self.afterJobExecution(hdrfile, msg, detachMethod=None) return ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT) @@ -323,7 +286,6 @@ def run(self): if (self.job.stopBefore == "copyin"): msg = "Execution stopped before %s" % self.job.stopBefore - returnVM = True self.afterJobExecution(hdrfile, msg, detachMethod=None) return # Copy input files to VM @@ -355,7 +317,6 @@ def run(self): if (self.job.stopBefore == "runjob"): msg = "Execution stopped before %s" % self.job.stopBefore - returnVM = True self.afterJobExecution(hdrfile, msg, detachMethod=None) return # Run the job on the virtual machine @@ -405,7 +366,6 @@ def run(self): if (self.job.stopBefore == "copyout"): msg = "Execution stopped before %s" % self.job.stopBefore - returnVM = True self.afterJobExecution(hdrfile, msg, detachMethod=None) return # Copy the output back. From 138e33299479eca5ccf10c60bded480a5fcd488c Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 26 Oct 2025 22:17:55 -0400 Subject: [PATCH 15/39] fixed logging type safety --- jobManager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jobManager.py b/jobManager.py index 635df83e..e4e4cb54 100644 --- a/jobManager.py +++ b/jobManager.py @@ -135,7 +135,7 @@ def __manage(self) -> None: if __name__ == "__main__": if not Config.USE_REDIS: - tango.log.error( + print( "You need to have Redis running to be able to initiate stand-alone\ JobManager" ) @@ -155,5 +155,5 @@ def __manage(self) -> None: # with the total pool. jobs = JobManager(tango_server.jobQueue) - tango.log.info("Starting the stand-alone Tango JobManager") + tango_server.log.info("Starting the stand-alone Tango JobManager") jobs.run() From d1a8b27aa7c5725b33f13c3f8104fbddc8f319df Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Sun, 26 Oct 2025 22:25:14 -0400 Subject: [PATCH 16/39] always assert that detachVM is called (taken care of with .keep_for_debuggin) --- worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/worker.py b/worker.py index 2c3e078a..569bb89d 100644 --- a/worker.py +++ b/worker.py @@ -51,8 +51,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine): # Worker helper functions # def __del__(self): - if self.job.stopBefore == "": # We don't want to cleanup the VM if we are stopping early for debugging - assert self.cleanupStatus, "Worker must call detachVM before returning" + assert self.cleanupStatus, "Worker must call detachVM before returning" def detachVM(self, detachMethod: DetachMethod): From de383297745dcba1b27d70aaf55aeae62a683857 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Mon, 27 Oct 2025 01:12:56 -0400 Subject: [PATCH 17/39] more todos --- tangoObjects.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tangoObjects.py b/tangoObjects.py index 63ed8f99..adfa71db 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -106,7 +106,7 @@ def __init__( stopBefore="", ): self._assigned = False - self._retries = 0 + self._retries: int = 0 self._vm = vm if input is None: @@ -130,6 +130,7 @@ def __repr__(self): self.syncRemote() return f"ID: {self.id} - Name: {self.name}" + # TODO: reduce code size/duplication by setting TangoJob as a dataclass # Getters for private variables @property def assigned(self): From 299e9998eaa2b19f664042e51de6f4b6050946b4 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 28 Oct 2025 15:21:47 -0400 Subject: [PATCH 18/39] stop Before bug fix --- tangoObjects.py | 7 ++++++- worker.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/tangoObjects.py b/tangoObjects.py index adfa71db..b0ce2da0 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -124,7 +124,7 @@ def __init__( self._accessKeyId = accessKeyId self._accessKey = accessKey self._disableNetwork = disableNetwork - self._stopBefore = "stopBefore" + self._stopBefore = stopBefore def __repr__(self): self.syncRemote() @@ -257,6 +257,11 @@ def setTimeout(self, new_timeout): self._timeout = new_timeout self.updateRemote() + def setKeepForDebugging(self, keep_for_debugging: bool): + self.syncRemote() + self._vm.keep_for_debugging = keep_for_debugging + self.updateRemote() + # Private method def __updateSelf(self, other_job): self._assigned = other_job._assigned diff --git a/worker.py b/worker.py index 569bb89d..c68400f0 100644 --- a/worker.py +++ b/worker.py @@ -246,7 +246,7 @@ def run(self): self.log.debug("Waiting for VM") if self.job.stopBefore == "waitvm": msg = "Execution stopped before %s" % self.job.stopBefore - self.job.vm.keep_for_debugging = True + self.job.setKeepForDebugging(True) self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) return ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT) @@ -281,11 +281,11 @@ def run(self): self.job.id, ) ) - if (self.job.stopBefore == "copyin"): msg = "Execution stopped before %s" % self.job.stopBefore - self.job.vm.keep_for_debugging = True + self.job.setKeepForDebugging(True) self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) + self.log.debug(msg) return # Copy input files to VM self.log.debug(f"Before copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}") @@ -296,7 +296,7 @@ def run(self): Config.copyin_errors += 1 msg = "Copy in to VM failed (status=%d)" % (ret["copyin"]) self.job.vm.notes = str(self.job.id) + "_" + self.job.name - self.job.vm.keep_for_debugging = True + self.job.setKeepForDebugging(True) self.log.debug(msg) self.rescheduleJob( hdrfile, @@ -316,7 +316,7 @@ def run(self): if (self.job.stopBefore == "runjob"): msg = "Execution stopped before %s" % self.job.stopBefore - self.job.vm.keep_for_debugging = True + self.job.setKeepForDebugging(True) self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) return # Run the job on the virtual machine @@ -339,7 +339,7 @@ def run(self): msg = "RunJob: OS error while running job on VM" # TODO: do we need to not reschedule the job? self.job.vm.notes = str(self.job.id) + "_" + self.job.name - self.job.vm.keep_for_debugging = True + self.job.setKeepForDebugging(True) elif ret["runjob"] == -1: Config.runjob_timeouts += 1 # TODO: difference between 2 and -1? @@ -366,7 +366,7 @@ def run(self): if (self.job.stopBefore == "copyout"): msg = "Execution stopped before %s" % self.job.stopBefore - self.job.vm.keep_for_debugging = True + self.job.setKeepForDebugging(True) self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) return # Copy the output back. @@ -417,7 +417,7 @@ def run(self): # msg = "Error: OS error while running job on VM" # detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT # self.job.vm.notes = str(self.job.id) + "_" + self.job.name - # self.job.vm.keep_for_debugging = True + # self.job.setKeepForDebugging(True) # else: # This should never happen # msg = "Error: Unknown autodriver error (status=%d)" % ( # ret["runjob"] From f44735e10af65112bf7b3837e72ad003fbcb7a69 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Thu, 6 Nov 2025 00:20:31 -0500 Subject: [PATCH 19/39] tangodictionary expanded key type --- jobQueue.py | 5 +++-- tangoObjects.py | 34 ++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 34c3ae5b..0cda7fd3 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -54,7 +54,7 @@ def __init__(self, preallocator): using the makeUnassigned api. """ self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs") - self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") + self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") # Servees as a record of both failed and completed jobs self.unassignedJobs = TangoQueue.create("unassignedLiveJobs") self.queueLock = threading.Lock() self.preallocator = preallocator @@ -168,7 +168,8 @@ def add(self, job): return str(job.id) - def addDead(self, job): + # TODO: get rid of this return value, it is not used anywhere + def addDead(self, job) -> int: """addDead - add a job to the dead queue. Called by validateJob when a job validation fails. Returns -1 on failure and the job id on success diff --git a/tangoObjects.py b/tangoObjects.py index adfa71db..4dfc1170 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -7,7 +7,7 @@ from queue import Queue import pickle import redis -from typing import Optional, Protocol, TypeVar +from typing import Optional, Protocol, TypeVar, Union from abc import abstractmethod redisConnection = None @@ -125,10 +125,11 @@ def __init__( self._accessKey = accessKey self._disableNetwork = disableNetwork self._stopBefore = "stopBefore" + self._id: Optional[int] = None # uninitialized until it gets added to either the live or dead queue def __repr__(self): self.syncRemote() - return f"ID: {self.id} - Name: {self.name}" + return f"ID: {self._id} - Name: {self.name}" # TODO: reduce code size/duplication by setting TangoJob as a dataclass # Getters for private variables @@ -207,6 +208,11 @@ def stopBefore(self): self.syncRemote() return self._stopBefore + @property + def id(self) -> int: + self.syncRemote() + assert self._id is not None, "Job ID is not set, add it to the job queue first" + return self._id def makeAssigned(self): self.syncRemote() @@ -242,8 +248,8 @@ def appendTrace(self, trace_str): self._trace.append(trace_str) self.updateRemote() - def setId(self, new_id): - self.id = new_id + def setId(self, new_id: int) -> None: + self._id = new_id if self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] @@ -269,6 +275,7 @@ def __updateSelf(self, other_job): self._timeout = other_job._timeout self._trace = other_job._trace self._maxOutputFileSize = other_job._maxOutputFileSize + self._id = other_job._id def syncRemote(self): @@ -290,14 +297,16 @@ def updateRemote(self): dictionary.set(key, self) def deleteFromDict(self, dictionary : TangoDictionary) -> None: - dictionary.delete(self.id) + assert self._id is not None + dictionary.delete(self._id) self._remoteLocation = None def addToDict(self, dictionary : TangoDictionary) -> None: - dictionary.set(self.id, self) + assert self._id is not None + dictionary.set(self._id, self) assert self._remoteLocation is None, "Job already has a remote location" if Config.USE_REDIS: - self._remoteLocation = dictionary.hash_name + ":" + str(self.id) + self._remoteLocation = dictionary.hash_name + ":" + str(self._id) self.updateRemote() @@ -460,6 +469,7 @@ def make_empty(self) -> None: self.__db.delete(self.key) T = TypeVar('T') +KeyType = Union[str, int] # Dictionary from string to T class TangoDictionary(Protocol[T]): @@ -476,16 +486,16 @@ def hash_name(self) -> str: ... @abstractmethod - def __contains__(self, id: str) -> bool: + def __contains__(self, id: KeyType) -> bool: ... @abstractmethod - def set(self, id: str, obj: T) -> str: + def set(self, id: KeyType, obj: T) -> str: ... @abstractmethod - def get(self, id: str) -> Optional[T]: + def get(self, id: KeyType) -> Optional[T]: ... @abstractmethod - def getExn(self, id: str) -> T: + def getExn(self, id: KeyType) -> T: ... @abstractmethod def keys(self) -> list[str]: @@ -494,7 +504,7 @@ def keys(self) -> list[str]: def values(self) -> list[T]: ... @abstractmethod - def delete(self, id: str) -> None: + def delete(self, id: KeyType) -> None: ... @abstractmethod def _clean(self) -> None: From f7b4acbe22fbddb63258619043b6ffb99df1145e Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Thu, 6 Nov 2025 00:39:25 -0500 Subject: [PATCH 20/39] more no-op type annotations --- tangoObjects.py | 11 ++++++----- vmms/ec2SSH.py | 10 +++++----- worker.py | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tangoObjects.py b/tangoObjects.py index 82a08a45..8c49b3d3 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -264,9 +264,10 @@ def setTimeout(self, new_timeout): self.updateRemote() def setKeepForDebugging(self, keep_for_debugging: bool): - self.syncRemote() - self._vm.keep_for_debugging = keep_for_debugging - self.updateRemote() + if (self._vm is not None): + self.syncRemote() + self._vm.keep_for_debugging = keep_for_debugging + self.updateRemote() # Private method def __updateSelf(self, other_job): @@ -283,7 +284,7 @@ def __updateSelf(self, other_job): self._id = other_job._id - def syncRemote(self): + def syncRemote(self) -> None: if Config.USE_REDIS and self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] @@ -294,7 +295,7 @@ def syncRemote(self): return self.__updateSelf(temp_job) - def updateRemote(self): + def updateRemote(self) -> None: if Config.USE_REDIS and self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 914de719..6753d757 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -22,9 +22,9 @@ import config from tangoObjects import TangoMachine -from typing import Optional, Literal, List, Sequence +from typing import Optional, Literal, List, Dict, Sequence from mypy_boto3_ec2 import EC2ServiceResource -from mypy_boto3_ec2.service_resource import Instance +from mypy_boto3_ec2.service_resource import Instance, Image from mypy_boto3_ec2.type_defs import FilterTypeDef from vmms.interface import VMMSInterface @@ -170,7 +170,7 @@ def release_vm_semaphore(): Ec2SSH._vm_semaphore.release() # TODO: the arguments accessKeyId and accessKey don't do anything - def __init__(self, accessKeyId=None, accessKey=None): + def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = None) -> None: """log - logger for the instance connection - EC2Connection object that stores the connection info to the EC2 network @@ -203,8 +203,8 @@ def __init__(self, accessKeyId=None, accessKey=None): # self.createKeyPair() # create boto3resource - self.img2ami = {} # this is a bad name, should really be img_name to img - self.images = [] + self.img2ami: Dict[str, Image] = {} # this is a bad name, should really be img_name to img + self.images: List[Image] = [] try: # This is a service resource self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) # TODO: rename this ot self.ec2resource diff --git a/worker.py b/worker.py index c68400f0..692a30de 100644 --- a/worker.py +++ b/worker.py @@ -439,6 +439,6 @@ def run(self): # if vm is not set but self.preVM is set, we still need # to return the VM, but have to initialize self.job.vm first if self.preVM and not vm: - vm = self.job.vm = self.preVM + vm = self.job.vm = self.preVM if vm: self.detachVM(DetachMethod.DESTROY_AND_REPLACE) From b643a9862ca5c2c942a230b91f9d1ae190ab318f Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Thu, 6 Nov 2025 01:01:14 -0500 Subject: [PATCH 21/39] 'no issues found in 25 source files' --- jobQueue.py | 9 +++++---- tango.py | 5 +++-- tangoObjects.py | 15 ++++++++------- vmms/distDocker.py | 2 +- vmms/tashiSSH.py | 4 ++-- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 0cda7fd3..acbb31c8 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -14,6 +14,7 @@ from datetime import datetime from tangoObjects import TangoDictionary, TangoJob, TangoQueue from config import Config +from preallocator import Preallocator # # JobQueue - This class defines the job queue and the functions for @@ -31,7 +32,7 @@ class JobQueue(object): - def __init__(self, preallocator): + def __init__(self, preallocator: Preallocator) -> None: """ Here we maintain several data structures used to keep track of the jobs present for the autograder. @@ -55,9 +56,9 @@ def __init__(self, preallocator): """ self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs") self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") # Servees as a record of both failed and completed jobs - self.unassignedJobs = TangoQueue.create("unassignedLiveJobs") + self.unassignedJobs: TangoQueue[int] = TangoQueue.create("unassignedLiveJobs") self.queueLock = threading.Lock() - self.preallocator = preallocator + self.preallocator: Preallocator = preallocator self.log = logging.getLogger("JobQueue") self.nextID = 1 @@ -357,6 +358,7 @@ def getNextPendingJob(self) -> TangoJob: """ # Blocks till the next item is added id = self.unassignedJobs.get() + assert id is not None, ".get with default arguments should block and never return None" self.log.debug("_getNextPendingJob|Acquiring lock to job queue.") self.queueLock.acquire() @@ -366,7 +368,6 @@ def getNextPendingJob(self) -> TangoJob: job = self.liveJobs.get(id) if job is None: raise Exception("Cannot find unassigned job in live jobs") - self.log.debug("getNextPendingJob| Releasing lock to job queue.") self.queueLock.release() self.log.debug("getNextPendingJob| Released lock to job queue.") diff --git a/tango.py b/tango.py index 3f94cd12..925413f0 100755 --- a/tango.py +++ b/tango.py @@ -48,16 +48,17 @@ from jobQueue import JobQueue from tangoObjects import TangoJob from config import Config +from vmms.interface import VMMSInterface class TangoServer(object): """TangoServer - Implements the API functions that the server accepts""" - def __init__(self): + def __init__(self) -> None: self.daemon = True - vmms = None + vmms: VMMSInterface if Config.VMMS_NAME == "tashiSSH": from vmms.tashiSSH import TashiSSH diff --git a/tangoObjects.py b/tangoObjects.py index 8c49b3d3..8dd4d888 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -360,9 +360,10 @@ def set(self, val): return val -class TangoQueue(Protocol): +QueueElem = TypeVar('QueueElem') +class TangoQueue(Protocol[QueueElem]): @staticmethod - def create(key_name: str) -> TangoQueue: + def create(key_name: str) -> TangoQueue[QueueElem]: if Config.USE_REDIS: return TangoRemoteQueue(key_name) else: @@ -373,20 +374,20 @@ def qsize(self) -> int: ... def empty(self) -> bool: ... - def put(self, item) -> None: + def put(self, item: QueueElem) -> None: ... - def get(self, block=True, timeout=None) -> Optional[T]: + def get(self, block=True, timeout=None) -> Optional[QueueElem]: ... - def get_nowait(self) -> Optional[T]: + def get_nowait(self) -> Optional[QueueElem]: ... - def remove(self, item) -> None: + def remove(self, item: QueueElem) -> None: ... def _clean(self) -> None: ... def make_empty(self) -> None: ... -class ExtendedQueue(Queue, TangoQueue): +class ExtendedQueue(Queue, TangoQueue[QueueElem]): """Python Thread safe Queue with the remove and clean function added""" def test(self): diff --git a/vmms/distDocker.py b/vmms/distDocker.py index b6d498e5..1dc93bb6 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -187,7 +187,7 @@ def waitVM(self, vm, max_secs): # Sleep a bit before trying again time.sleep(config.Config.TIMER_POLL_INTERVAL) - def copyIn(self, vm, inputFiles): + def copyIn(self, vm, inputFiles, job_id=None): """copyIn - Create a directory to be mounted as a volume for the docker containers on the host machine for this VM. Copy input files to this directory on the host machine. diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index e0e7c58e..74113755 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -254,7 +254,7 @@ def waitVM(self, vm, max_secs): # Sleep a bit before trying again time.sleep(config.Config.TIMER_POLL_INTERVAL) - def copyIn(self, vm, inputFiles): + def copyIn(self, vm, inputFiles, job_id=None): """copyIn - Copy input files to VM""" domain_name = self.domainName(vm.id, vm.name) self.log.debug("Creating autolab directory on VM") @@ -292,7 +292,7 @@ def copyIn(self, vm, inputFiles): return ret return 0 - def runJob(self, vm, runTimeout, maxOutputFileSize): + def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): """runJob - Run the make command on a VM using SSH and redirect output to file "output". """ From 5439c159a525982bb71f64eb2f2be49673ecabbf Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Thu, 6 Nov 2025 13:04:02 -0500 Subject: [PATCH 22/39] changed ret in worker.py to not have Nones --- clients/tango-cli.py | 1 - jobQueue.py | 10 +++++----- mypy.ini | 6 ++++++ preallocator.py | 8 ++++---- tangoObjects.py | 6 +++--- vmms/distDocker.py | 3 ++- vmms/localDocker.py | 9 ++++++--- worker.py | 14 +++++--------- 8 files changed, 31 insertions(+), 26 deletions(-) diff --git a/clients/tango-cli.py b/clients/tango-cli.py index a2a47ba0..bfa2c14b 100755 --- a/clients/tango-cli.py +++ b/clients/tango-cli.py @@ -274,7 +274,6 @@ def tango_upload(): def tango_addJob(): try: - requestObj = {} res = checkKey() + checkCourselab() + checkInfiles() if res != 0: raise Exception("Invalid usage: [addJob] " + addJob_help) diff --git a/jobQueue.py b/jobQueue.py index acbb31c8..6633adab 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -12,10 +12,10 @@ import time from datetime import datetime -from tangoObjects import TangoDictionary, TangoJob, TangoQueue +from tangoObjects import TangoDictionary, TangoJob, TangoQueue, TangoMachine from config import Config from preallocator import Preallocator - +from typing import Optional # # JobQueue - This class defines the job queue and the functions for # manipulating it. The actual queue is made up of two smaller @@ -251,7 +251,7 @@ def get(self, id): # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. # TODO: It does assign the job to a particular VM though. # Precondition: jobId is in self.liveJobs - def assignJob(self, jobId, vm=None) -> None: + def assignJob(self, jobId, vm=None): """assignJob - marks a job to be assigned""" self.queueLock.acquire() self.log.debug("assignJob| Acquired lock to job queue.") @@ -274,7 +274,7 @@ def assignJob(self, jobId, vm=None) -> None: # return job # TODO: Rename this job to be more accurate in its description - def unassignJob(self, jobId): + def unassignJob(self, jobId: int) -> None: """unassignJob - marks a job to be unassigned Note: We assume here that a job is to be rescheduled or 'retried' when you unassign it. This retry is done by @@ -284,7 +284,7 @@ def unassignJob(self, jobId): self.log.debug("unassignJob| Acquired lock to job queue.") # Get the current job - job = self.liveJobs.get(jobId) + job = self.liveJobs.getExn(jobId) # Increment the number of retires if job.retries is None: diff --git a/mypy.ini b/mypy.ini index f6332a89..7fe2ea10 100644 --- a/mypy.ini +++ b/mypy.ini @@ -4,5 +4,11 @@ explicit_package_bases = True [mypy-vmms.tashiSSH] ignore_errors = True +[mypy-vmms.distDocker] +ignore_errors = True + +[mypy-tests.*] +ignore_errors = True + [mypy-preallocator] ignore_errors = True \ No newline at end of file diff --git a/preallocator.py b/preallocator.py index 69291933..acfd9a91 100644 --- a/preallocator.py +++ b/preallocator.py @@ -9,7 +9,7 @@ from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine from config import Config from typing import Tuple, List - +from vmms.interface import VMMSInterface # # Preallocator - This class maintains a pool of active VMs for future # job requests. The pool is stored in dictionary called @@ -22,11 +22,11 @@ class Preallocator(object): - def __init__(self, vmms): - self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue]] = TangoDictionary.create("machines") + def __init__(self, vmms: VMMSInterface) -> None: + self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue[TangoMachine]]] = TangoDictionary.create("machines") self.lock = threading.Lock() self.nextID = TangoIntValue("nextID", 1000) - self.vmms = vmms + self.vmms: VMMSInterface = vmms self.log = logging.getLogger("Preallocator") def poolSize(self, vmName): diff --git a/tangoObjects.py b/tangoObjects.py index 8dd4d888..49b05ea9 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -93,7 +93,7 @@ class TangoJob(object): def __init__( self, - vm: Optional[TangoMachine] = None, + vm: TangoMachine, outputFile=None, name=None, input=None, @@ -108,7 +108,7 @@ def __init__( self._assigned = False self._retries: int = 0 - self._vm = vm + self._vm: TangoMachine = vm if input is None: self._input = [] else: @@ -229,7 +229,7 @@ def incrementRetries(self): self._retries += 1 self.updateRemote() - def makeVM(self, vm): + def makeVM(self, vm: TangoMachine) -> None: self.syncRemote() self._vm = vm self.updateRemote() diff --git a/vmms/distDocker.py b/vmms/distDocker.py index 1dc93bb6..ec0e475c 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -21,6 +21,7 @@ import socket import config from tangoObjects import TangoMachine +from vmms.interface import VMMSInterface def timeout(command, time_out=1): @@ -75,7 +76,7 @@ def timeoutWithReturnStatus(command, time_out, returnValue=0): return ret -class DistDocker(object): +class DistDocker(VMMSInterface): _SSH_FLAGS = ["-q", "-o", "BatchMode=yes"] _SSH_AUTH_FLAGS = [ diff --git a/vmms/localDocker.py b/vmms/localDocker.py index 8ccfb7d5..fe2ad73c 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -13,11 +13,11 @@ import shutil import config from tangoObjects import TangoMachine - +from typing import List from vmms.interface import VMMSInterface -def timeout(command, time_out=1): +def timeout(command: List[str], time_out: float = 1) -> int: """timeout - Run a unix command with a timeout. Return -1 on timeout, otherwise return the return value from the command, which is typically 0 for success, 1-255 for failure. @@ -35,6 +35,7 @@ def timeout(command, time_out=1): t += config.Config.TIMER_POLL_INTERVAL # Determine why the while loop terminated + returncode: int if p.poll() is None: try: os.kill(p.pid, 9) @@ -42,7 +43,9 @@ def timeout(command, time_out=1): pass returncode = -1 else: - returncode = p.poll() + poll_result = p.poll() + assert poll_result is not None + returncode = poll_result return returncode diff --git a/worker.py b/worker.py index 692a30de..cf6e5a03 100644 --- a/worker.py +++ b/worker.py @@ -16,6 +16,8 @@ from config import Config from jobQueue import JobQueue from tangoObjects import TangoMachine +from typing import Dict, Optional +from vmms.interface import VMMSInterface # # Worker - The worker class is very simple and very dumb. The goal is # to walk through the VMMS interface, track the job's progress, and if @@ -39,7 +41,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine): threading.Thread.__init__(self) self.daemon = True self.job = job - self.vmms = vmms + self.vmms: VMMSInterface = vmms self.jobQueue : JobQueue = jobQueue self.preallocator = preallocator self.preVM = preVM @@ -154,7 +156,7 @@ def notifyServer(self, job): self.log.debug("Sending request to %s" % job.notifyURL) with requests.session() as s: # urllib3 retry, allow POST to be retried, use backoffs - r = Retry(total=10, allowed_methods=False, backoff_factor=1) + r = Retry(total=10, allowed_methods=None, backoff_factor=1) s.mount("http://", HTTPAdapter(max_retries=r)) s.mount("https://", HTTPAdapter(max_retries=r)) response = s.post( @@ -190,13 +192,7 @@ def run(self): """run - Step a job through its execution sequence""" try: # Hash of return codes for each step - ret = {} - ret["waitvm"] = None - ret["initializevm"] = None - ret["copyin"] = None - ret["runjob"] = None - ret["copyout"] = None - print("HELLO") + ret: Dict[str, int] = {} self.log.debug("Run worker") vm = None From 2dc0ee744a48f0972a75f7b39dbdbc89f3a6385a Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 11 Nov 2025 12:27:41 -0500 Subject: [PATCH 23/39] moved timeout into VMMSUtils --- jobManager.py | 2 ++ preallocator.py | 6 ++-- vmms/distDocker.py | 81 +++++++++------------------------------------ vmms/ec2SSH.py | 58 ++------------------------------ vmms/interface.py | 3 ++ vmms/localDocker.py | 63 +++-------------------------------- vmms/tashiSSH.py | 34 ------------------- 7 files changed, 31 insertions(+), 216 deletions(-) diff --git a/jobManager.py b/jobManager.py index e4e4cb54..50cd9553 100644 --- a/jobManager.py +++ b/jobManager.py @@ -23,6 +23,7 @@ from tangoObjects import TangoJob, TangoQueue, TangoMachine from typing import List, Tuple from worker import Worker +from vmms.interface import VMMSInterface @@ -73,6 +74,7 @@ def __manage(self) -> None: # Sleep for a bit and then check again time.sleep(Config.DISPATCH_PERIOD) + vmms: VMMSInterface try: # if the job is a ec2 vmms job # spin up an ec2 instance for that job diff --git a/preallocator.py b/preallocator.py index acfd9a91..16b800b3 100644 --- a/preallocator.py +++ b/preallocator.py @@ -8,7 +8,7 @@ from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine from config import Config -from typing import Tuple, List +from typing import Tuple, List, Dict from vmms.interface import VMMSInterface # # Preallocator - This class maintains a pool of active VMs for future @@ -22,11 +22,11 @@ class Preallocator(object): - def __init__(self, vmms: VMMSInterface) -> None: + def __init__(self, vmms: Dict[str, VMMSInterface]) -> None: self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue[TangoMachine]]] = TangoDictionary.create("machines") self.lock = threading.Lock() self.nextID = TangoIntValue("nextID", 1000) - self.vmms: VMMSInterface = vmms + self.vmms: Dict[str, VMMSInterface] = vmms self.log = logging.getLogger("Preallocator") def poolSize(self, vmName): diff --git a/vmms/distDocker.py b/vmms/distDocker.py index ec0e475c..ab90bcb6 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -22,61 +22,10 @@ import config from tangoObjects import TangoMachine from vmms.interface import VMMSInterface +from vmms.sharedUtils import VMMSUtils -def timeout(command, time_out=1): - """timeout - Run a unix command with a timeout. Return -1 on - timeout, otherwise return the return value from the command, which - is typically 0 for success, 1-255 for failure. - """ - - # Launch the command - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - - # Wait for the command to complete - t = 0.0 - while t < time_out and p.poll() is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - - # Determine why the while loop terminated - if p.poll() is None: - try: - os.kill(p.pid, 9) - except OSError: - pass - returncode = -1 - else: - returncode = p.poll() - return returncode - - -def timeoutWithReturnStatus(command, time_out, returnValue=0): - """timeoutWithReturnStatus - Run a Unix command with a timeout, - until the expected value is returned by the command; On timeout, - return last error code obtained from the command. - """ - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - t = 0.0 - while t < time_out: - ret = p.poll() - if ret is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - elif ret == returnValue: - return ret - else: - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - return ret - - -class DistDocker(VMMSInterface): +class DistDocker(VMMSInterface, VMMSUtils): _SSH_FLAGS = ["-q", "-o", "BatchMode=yes"] _SSH_AUTH_FLAGS = [ @@ -172,7 +121,7 @@ def waitVM(self, vm, max_secs): # If the call to ssh returns timeout (-1) or ssh error # (255), then success. Otherwise, keep trying until we run # out of time. - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -197,7 +146,7 @@ def copyIn(self, vm, inputFiles, job_id=None): volumePath = self.getVolumePath(instanceName) if vm.use_ssh_master: - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -209,7 +158,7 @@ def copyIn(self, vm, inputFiles, job_id=None): return ret # Create a fresh volume - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -225,7 +174,7 @@ def copyIn(self, vm, inputFiles, job_id=None): return ret for file in inputFiles: - ret = timeout( + ret = VMMSUtils.timeout( ["scp"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -261,7 +210,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): volumePath = self.getVolumePath(instanceName) if vm.use_ssh_master: - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -303,7 +252,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): self.log.debug("Running job: %s" % args) - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -324,7 +273,7 @@ def copyOut(self, vm, destFile): volumePath = self.getVolumePath(instanceName) if vm.use_ssh_master: - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -335,7 +284,7 @@ def copyOut(self, vm, destFile): self.log.debug("Lost persistent SSH connection") return ret - ret = timeout( + ret = VMMSUtils.timeout( ["scp"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -356,7 +305,7 @@ def destroyVM(self, vm): instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath(instanceName) if vm.use_ssh_master: - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -372,7 +321,7 @@ def destroyVM(self, vm): # Do a hard kill on corresponding docker container. # Return status does not matter. args = "(docker rm -f %s)" % (instanceName) - timeout( + VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -380,7 +329,7 @@ def destroyVM(self, vm): config.Config.DOCKER_RM_TIMEOUT, ) # Destroy corresponding volume if it exists. - timeout( + VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -389,7 +338,7 @@ def destroyVM(self, vm): ) self.log.debug("Deleted volume %s" % instanceName) if vm.use_ssh_master: - timeout( + VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags @@ -490,7 +439,7 @@ def getPartialOutput(self, vm): instanceName = self.instanceName(vm.id, vm.image) if vm.use_ssh_master: - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + DistDocker._SSH_FLAGS + vm.ssh_flags diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 6753d757..27134f6a 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -28,7 +28,7 @@ from mypy_boto3_ec2.type_defs import FilterTypeDef from vmms.interface import VMMSInterface - +from vmms.sharedUtils import VMMSUtils # suppress most boto logging logging.getLogger("boto3").setLevel(logging.CRITICAL) @@ -36,35 +36,6 @@ logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL) -def timeout(command, time_out=1): - """timeout - Run a unix command with a timeout. Return -1 on - timeout, otherwise return the return value from the command, which - is typically 0 for success, 1-255 for failure. - """ - - # Launch the command - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - - # Wait for the command to complete - t = 0.0 - while t < time_out and p.poll() is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - if t >= time_out: - print("ERROR: timeout trying ", command) - # Determine why the while loop terminated - if p.poll() is None: - try: - os.kill(p.pid, 9) - except OSError: - pass - returncode = -1 - else: - returncode = p.poll() - return returncode - def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2): """timeout - Run a unix command with a timeout. Return -1 on @@ -108,29 +79,6 @@ def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2): return returncode -def timeoutWithReturnStatus(command, time_out, returnValue=0): - """timeoutWithReturnStatus - Run a Unix command with a timeout, - until the expected value is returned by the command; On timeout, - return last error code obtained from the command. - """ - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - t = 0.0 - while t < time_out: - ret = p.poll() - if ret is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - elif ret == returnValue: - return ret - else: - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - return ret - - @backoff.on_exception(backoff.expo, ClientError, max_tries=3, jitter=None) def try_load_instance(newInstance): newInstance.load() @@ -146,7 +94,7 @@ class ec2CallError(Exception): pass -class Ec2SSH(VMMSInterface): +class Ec2SSH(VMMSInterface, VMMSUtils): _SSH_FLAGS = [ "-i", config.Config.SECURITY_KEY_PATH, @@ -517,7 +465,7 @@ def waitVM(self, vm, max_secs) -> Literal[0, -1]: # If the call to ssh returns timeout (-1) or ssh error # (255), then success. Otherwise, keep trying until we run # out of time. - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + self.ssh_flags + ["%s@%s" % (self.ec2User, domain_name), "(:)"], diff --git a/vmms/interface.py b/vmms/interface.py index d71500f9..04043c13 100644 --- a/vmms/interface.py +++ b/vmms/interface.py @@ -40,3 +40,6 @@ def getVMs(self) -> List[TangoMachine]: def existsVM(self, vm: TangoMachine) -> bool: ... + @abstractmethod + def instanceName(self, id: int, name: str) -> str: + ... diff --git a/vmms/localDocker.py b/vmms/localDocker.py index fe2ad73c..b002a06e 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -15,69 +15,16 @@ from tangoObjects import TangoMachine from typing import List from vmms.interface import VMMSInterface +from vmms.sharedUtils import VMMSUtils -def timeout(command: List[str], time_out: float = 1) -> int: - """timeout - Run a unix command with a timeout. Return -1 on - timeout, otherwise return the return value from the command, which - is typically 0 for success, 1-255 for failure. - """ - - # Launch the command - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - - # Wait for the command to complete - t = 0.0 - while t < time_out and p.poll() is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - - # Determine why the while loop terminated - returncode: int - if p.poll() is None: - try: - os.kill(p.pid, 9) - except OSError: - pass - returncode = -1 - else: - poll_result = p.poll() - assert poll_result is not None - returncode = poll_result - return returncode - - -def timeoutWithReturnStatus(command, time_out, returnValue=0): - """timeoutWithReturnStatus - Run a Unix command with a timeout, - until the expected value is returned by the command; On timeout, - return last error code obtained from the command. - """ - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - t = 0.0 - while t < time_out: - ret = p.poll() - if ret is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - elif ret == returnValue: - return ret - else: - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - return ret - # # User defined exceptions # -class LocalDocker(VMMSInterface): +class LocalDocker(VMMSInterface, VMMSUtils): def __init__(self): """Checks if the machine is ready to run docker containers. Initialize boot2docker if running on OS X. @@ -188,7 +135,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): ] self.log.debug("Running job: %s" % str(args)) - ret = timeout(args, runTimeout * 2) + ret = VMMSUtils.timeout(args, runTimeout * 2) self.log.debug("runJob returning %d" % ret) return ret @@ -212,7 +159,7 @@ def destroyVM(self, vm): volumePath = self.getVolumePath("") # Do a hard kill on corresponding docker container. # Return status does not matter. - timeout(["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT) + VMMSUtils.timeout(["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT) # Destroy corresponding volume if it exists. if instanceName in os.listdir(volumePath): shutil.rmtree(volumePath + instanceName) @@ -254,7 +201,7 @@ def existsVM(self, vm): a non-zero status upon not finding a container. """ instanceName = self.instanceName(vm.id, vm.name) - ret = timeout(["docker", "inspect", instanceName]) + ret = VMMSUtils.timeout(["docker", "inspect", instanceName]) return ret == 0 def getImages(self): diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index 74113755..93486bae 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -54,40 +54,6 @@ def timeout(command, time_out=1): return returncode -def timeoutWithReturnStatus(command, time_out, returnValue=0): - """timeoutWithReturnStatus - Run a Unix command with a timeout, - until the expected value is returned by the command; On timeout, - return last error code obtained from the command. - """ - if (config.Config.LOGLEVEL is logging.DEBUG) and ( - "ssh" in command or "scp" in command - ): - out = sys.stdout - err = sys.stderr - else: - out = open("/dev/null", "w") - err = sys.stdout - - # Launch the command - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - - t = 0.0 - while t < time_out: - ret = p.poll() - if ret is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - elif ret == returnValue: - return ret - else: - p = subprocess.Popen( - command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT - ) - return ret - - # # User defined exceptions # From bababf95e39b9d02180faa8fb57aa2895860634b Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 11 Nov 2025 12:48:54 -0500 Subject: [PATCH 24/39] tidied code: return statement in timeout_with_retries --- vmms/ec2SSH.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 27134f6a..e4692c03 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -37,12 +37,12 @@ -def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2): +def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2) -> int: """timeout - Run a unix command with a timeout. Return -1 on timeout, otherwise return the return value from the command, which is typically 0 for success, 1-255 for failure. """ - for attempt in range(retries + 1): + for _ in range(retries + 1): # Launch the command p = subprocess.Popen( command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT @@ -57,26 +57,24 @@ def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2): print("ERROR: timeout trying ", command) # Determine why the while loop terminated - if p.poll() is None: + poll_result: Optional[int] = p.poll() + if poll_result is None: try: os.kill(p.pid, 9) except OSError: pass returncode = -1 else: - returncode = p.poll() + returncode = poll_result # try to retry the command on a timeout if returncode == -1: - if attempt < retries: - print(f"Retrying in {retry_delay} seconds...") - time.sleep(retry_delay) - else: - # attempt == retries -> failure - print("All retries exhausted.") - return -1 + print(f"Retrying in {retry_delay} seconds...") + time.sleep(retry_delay) else: return returncode + print("All retries exhausted.") + return -1 @backoff.on_exception(backoff.expo, ClientError, max_tries=3, jitter=None) @@ -566,7 +564,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): ) # no logging for now - ret = timeout( + ret = VMMSUtils.timeout( ["ssh"] + self.ssh_flags + ["%s@%s" % (self.ec2User, domain_name), runcmd], @@ -625,7 +623,7 @@ def copyOut(self, vm, destFile): # Error copying out the timing data (probably runJob failed) pass - return timeout( + return VMMSUtils.timeout( ["scp"] + self.ssh_flags + [ From ee7d71e4033c4c093ddace6f6e47ffe6d1758f5a Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 11 Nov 2025 12:59:18 -0500 Subject: [PATCH 25/39] added to the VMMSInterfae --- vmms/interface.py | 8 ++++++++ vmms/tashiSSH.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vmms/interface.py b/vmms/interface.py index 04043c13..6235e227 100644 --- a/vmms/interface.py +++ b/vmms/interface.py @@ -43,3 +43,11 @@ def existsVM(self, vm: TangoMachine) -> bool: @abstractmethod def instanceName(self, id: int, name: str) -> str: ... + + @abstractmethod + def getImages(self) -> List[str]: + ... + + @abstractmethod + def getPartialOutput(self, vm: TangoMachine) -> str: + ... diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index 93486bae..6562abfe 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -64,7 +64,7 @@ class tashiCallError(Exception): pass -class TashiSSH(object): +class TashiSSH(VMMSInterface): _SSH_FLAGS = [ "-q", "-i", From 6abab6cab4381fe0212986d1547ed78fd724588d Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 11 Nov 2025 14:14:10 -0500 Subject: [PATCH 26/39] all the changes that are basically no-ops --- restful_tango/tangoREST.py | 8 ++++++-- tango.py | 9 --------- tangoObjects.py | 1 + vmms/ec2SSH.py | 17 +++++++++-------- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/restful_tango/tangoREST.py b/restful_tango/tangoREST.py index 0e67b0b2..d246daba 100644 --- a/restful_tango/tangoREST.py +++ b/restful_tango/tangoREST.py @@ -306,7 +306,9 @@ def upload(self, key, courselab, file, tempfile, fileMD5): os.unlink(tempfile) return self.status.wrong_courselab except Exception as e: - exc_type, exc_obj, exc_tb = sys.exc_info() + exc_type, _, exc_tb = sys.exc_info() + assert exc_type is not None + assert exc_tb is not None # currently handling an exception fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print(exc_type, fname, exc_tb.tb_lineno) self.log.error("upload request failed: %s" % str(e)) @@ -335,7 +337,9 @@ def addJob(self, key, courselab, jobStr): result["jobId"] = jobId return result except Exception as e: - exc_type, exc_obj, exc_tb = sys.exc_info() + exc_type, _, exc_tb = sys.exc_info() + assert exc_type is not None + assert exc_tb is not None # currently handling an exception fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print(exc_type, fname, exc_tb.tb_lineno) self.log.error("addJob request failed: %s" % str(e)) diff --git a/tango.py b/tango.py index 925413f0..a8cfa7dd 100755 --- a/tango.py +++ b/tango.py @@ -355,15 +355,6 @@ def __validateJob(self, job, vmms): ) errors += 1 - # Check for max output file size parameter - if not job.maxOutputFileSize: - self.log.debug( - "validateJob: Setting job.maxOutputFileSize " - "to default value: %d bytes", - Config.MAX_OUTPUT_FILE_SIZE, - ) - job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE - # Check the list of input files hasMakefile = False for inputFile in job.input: diff --git a/tangoObjects.py b/tangoObjects.py index 49b05ea9..9297db13 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -91,6 +91,7 @@ class TangoJob(object): TangoJob - A job that is to be run on a TangoMachine """ + # TODO: do we really want all of these default values? def __init__( self, vm: TangoMachine, diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index e4692c03..b3aa7197 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -260,7 +260,7 @@ def deleteKeyPair(self): def createSecurityGroup(self): try: # Check if the security group already exists - response = self.boto3client.describe_security_groups( + description_response = self.boto3client.describe_security_groups( Filters=[ { "Name": "group-name", @@ -268,18 +268,18 @@ def createSecurityGroup(self): } ] ) - if response["SecurityGroups"]: - security_group_id = response["SecurityGroups"][0]["GroupId"] + if description_response["SecurityGroups"]: + security_group_id = description_response["SecurityGroups"][0]["GroupId"] return except Exception as e: self.log.debug("ERROR checking for existing security group: %s", e) try: - response = self.boto3resource.create_security_group( + security_group_response = self.boto3resource.create_security_group( GroupName=config.Config.DEFAULT_SECURITY_GROUP, Description="Autolab security group - allowing all traffic", ) - security_group_id = response["GroupId"] + security_group_id = security_group_response["GroupId"] self.boto3resource.authorize_security_group_ingress( GroupId=security_group_id ) @@ -619,9 +619,10 @@ def copyOut(self, vm, destFile): except subprocess.CalledProcessError as xxx_todo_changeme: # Error copying out the timing data (probably runJob failed) - re.error = xxx_todo_changeme + # re.error = xxx_todo_changeme # Error copying out the timing data (probably runJob failed) pass + return VMMSUtils.timeout( ["scp"] @@ -689,7 +690,7 @@ def getVMs(self): """ try: vms = list() - filters = [ + filters: Sequence[FilterTypeDef] = [ { "Name": "instance-state-name", "Values": ["running", "pending"], @@ -741,7 +742,7 @@ def getVMs(self): def existsVM(self, vm): """existsVM - Checks whether a VM exists in the vmms.""" # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrationec2.html - filters = [{"Name": "instance-state-name", "Values": ["running"]}] + filters: Sequence[FilterTypeDef] = [{"Name": "instance-state-name", "Values": ["running"]}] # gets all running instances instances = self.boto3resource.instances.filter(Filters=filters) for instance in instances: From 2325e54236133135eaeed797211b2440cb18cedd Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 11 Nov 2025 15:36:35 -0500 Subject: [PATCH 27/39] empty line --- vmms/localDocker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vmms/localDocker.py b/vmms/localDocker.py index 8ccfb7d5..338bc6d4 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -17,6 +17,7 @@ from vmms.interface import VMMSInterface + def timeout(command, time_out=1): """timeout - Run a unix command with a timeout. Return -1 on timeout, otherwise return the return value from the command, which From 5b34e24f76e39cc19cc2a6fb48328d67ef311f1f Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 18 Nov 2025 12:52:13 -0500 Subject: [PATCH 28/39] replaced _clean with make_empty for both TangoQueue and TangoDictionary --- jobQueue.py | 8 ++++---- tangoObjects.py | 17 ++++------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 34c3ae5b..d00969fc 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -341,13 +341,13 @@ def getInfo(self): return info - def reset(self): + def reset(self) -> None: """reset - resets and clears all the internal dictionaries and queues """ - self.liveJobs._clean() - self.deadJobs._clean() - self.unassignedJobs._clean() + self.liveJobs.make_empty() + self.deadJobs.make_empty() + self.unassignedJobs.make_empty() def getNextPendingJob(self) -> TangoJob: """Gets the next unassigned live job. Note that this is a diff --git a/tangoObjects.py b/tangoObjects.py index b0ce2da0..191bf141 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -371,8 +371,6 @@ def get_nowait(self) -> Optional[T]: ... def remove(self, item) -> None: ... - def _clean(self) -> None: - ... def make_empty(self) -> None: ... @@ -391,14 +389,10 @@ def remove(self, value): with self.mutex: self.queue.remove(value) - def _clean(self): + def make_empty(self): with self.mutex: self.queue.clear() - def make_empty(self): - self._clean() - - class TangoRemoteQueue(TangoQueue): """Simple Queue with Redis Backend""" @@ -458,9 +452,6 @@ def remove(self, item): pickled_item = pickle.dumps(item) return self.__db.lrem(self.key, 0, pickled_item) - def _clean(self): - self.__db.delete(self.key) - def make_empty(self) -> None: self.__db.delete(self.key) @@ -502,7 +493,7 @@ def values(self) -> list[T]: def delete(self, id: str) -> None: ... @abstractmethod - def _clean(self) -> None: + def make_empty(self) -> None: ... @abstractmethod def items(self) -> list[tuple[str, T]]: @@ -568,7 +559,7 @@ def values(self): def delete(self, id): self.r.hdel(self.hash_name, id) - def _clean(self): + def make_empty(self): # only for testing self.r.delete(self.hash_name) @@ -629,6 +620,6 @@ def items(self): ] ) - def _clean(self): + def make_empty(self): # only for testing return From ee45c4d9c024271e8dfc1398277e493965404811 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 18 Nov 2025 12:52:59 -0500 Subject: [PATCH 29/39] removed dead code --- worker.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/worker.py b/worker.py index c68400f0..cf62b61f 100644 --- a/worker.py +++ b/worker.py @@ -402,30 +402,6 @@ def run(self): # with an explanatory message msg = "Success: Autodriver returned normally" self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.RETURN_TO_POOL) - # if ret["copyin"] != 0: - # msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) - # elif ret["runjob"] != 0: - # if ret["runjob"] == 1: # This should never happen - # msg = "Error: Autodriver usage error (status=%d)" % (ret["runjob"]) - # elif ret["runjob"] == 2: - # msg = "Error: Job timed out after %d seconds" % (self.job.timeout) - # elif ret["runjob"] == 3: # EXIT_OSERROR in Autodriver - # # Abnormal job termination (Autodriver encountered an OS - # # error). Assume that the VM is damaged. Destroy this VM - # # and do not retry the job since the job may have damaged - # # the VM. - # msg = "Error: OS error while running job on VM" - # detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT - # self.job.vm.notes = str(self.job.id) + "_" + self.job.name - # self.job.setKeepForDebugging(True) - # else: # This should never happen - # msg = "Error: Unknown autodriver error (status=%d)" % ( - # ret["runjob"] - # ) - - # elif ret["copyout"] != 0: - # msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"]) - return # From ff65d39a85e1b841386a636fd682154c8f7aa241 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Mon, 1 Dec 2025 23:51:32 -0500 Subject: [PATCH 30/39] changed some dead code in createSecurityGroup to allow for type safety --- vmms/ec2SSH.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index b3aa7197..edb68280 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -257,6 +257,8 @@ def deleteKeyPair(self): # except OSError: # pass + # Creates a security group if it doesn't exist. + # ^ Note: strangely, the security group id is never used. def createSecurityGroup(self): try: # Check if the security group already exists @@ -273,14 +275,16 @@ def createSecurityGroup(self): return except Exception as e: self.log.debug("ERROR checking for existing security group: %s", e) - + # ! Note: We've never encountered the lines below before (there was a type error), + # ! because we've always had a security group. + # ! Difficult to test because it involves deleting all security groups. try: - security_group_response = self.boto3resource.create_security_group( + security_group_response = self.boto3client.create_security_group( GroupName=config.Config.DEFAULT_SECURITY_GROUP, Description="Autolab security group - allowing all traffic", ) security_group_id = security_group_response["GroupId"] - self.boto3resource.authorize_security_group_ingress( + self.boto3client.authorize_security_group_ingress( GroupId=security_group_id ) except Exception as e: From 0c7d45baf25ccec95f068a12e796738788d3415d Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 09:58:08 -0500 Subject: [PATCH 31/39] removed unused variable --- vmms/ec2SSH.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index edb68280..6681a488 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -726,7 +726,6 @@ def getVMs(self): vm.name = instName vm.id = int(instName.split("-")[1]) - vm.pool = instName.split("-")[2] vm.name = instName # needed for SSH From 40b7ace15b1fe9ba99a25bdd912bf0b9111c115d Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 09:59:11 -0500 Subject: [PATCH 32/39] fixed error logging in server.py --- restful_tango/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/restful_tango/server.py b/restful_tango/server.py index b31f0e8b..1a177ce0 100755 --- a/restful_tango/server.py +++ b/restful_tango/server.py @@ -46,7 +46,7 @@ def prepare(self): if not os.path.exists(tempdir): os.mkdir(tempdir, 0o700) if os.path.exists(tempdir) and not os.path.isdir(tempdir): - tangoREST.log("Cannot process uploads, %s is not a directory" % (tempdir,)) + tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,)) return self.send_error() self.tempfile = NamedTemporaryFile(prefix="upload", dir=tempdir, delete=False) self.hasher = hashlib.md5() @@ -129,7 +129,7 @@ def prepare(self): if not os.path.exists(tempdir): os.mkdir(tempdir, 0o700) if os.path.exists(tempdir) and not os.path.isdir(tempdir): - tangoREST.log("Cannot process uploads, %s is not a directory" % (tempdir,)) + tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,)) return self.send_error() self.tempfile = NamedTemporaryFile(prefix="docker", dir=tempdir, delete=False) From 25a9cf267a439958455c9570baf26ab59b4a84dc Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 10:01:45 -0500 Subject: [PATCH 33/39] proper decoding of bytes --- worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker.py b/worker.py index cf6e5a03..fc439d59 100644 --- a/worker.py +++ b/worker.py @@ -164,7 +164,7 @@ def notifyServer(self, job): ) self.log.info( "Response from callback to %s:%s" - % (job.notifyURL, response.content) + % (job.notifyURL, response.content.decode()) ) fh.close() else: From d4a8c5562ee935ab85360887a16a5a7e25db3c63 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 10:22:32 -0500 Subject: [PATCH 34/39] preallocator changes --- mypy.ini | 3 --- preallocator.py | 47 ++++++++++++++++++++++++++--------------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/mypy.ini b/mypy.ini index 7fe2ea10..f607e0a5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,6 +9,3 @@ ignore_errors = True [mypy-tests.*] ignore_errors = True - -[mypy-preallocator] -ignore_errors = True \ No newline at end of file diff --git a/preallocator.py b/preallocator.py index 16b800b3..a5ba8e63 100644 --- a/preallocator.py +++ b/preallocator.py @@ -47,13 +47,13 @@ def update(self, vm, num): """ self.lock.acquire() if vm.name not in self.machines: - initial_queue = TangoQueue.create(vm.name) + initial_queue: TangoQueue[TangoMachine] = TangoQueue.create(vm.name) initial_queue.make_empty() self.machines.set(vm.name, ([], initial_queue)) self.log.debug("Creating empty pool of %s instances" % (vm.name)) self.lock.release() - delta = num - len(self.machines.get(vm.name)[0]) + delta = num - len(self.machines.getExn(vm.name)[0]) if delta > 0: # We need more self.machines, spin them up. self.log.debug("update: Creating %d new %s instances" % (delta, vm.name)) @@ -75,8 +75,8 @@ def allocVM(self, vmName): if vmName in self.machines: self.lock.acquire() - if not self.machines.get(vmName)[1].empty(): - vm = self.machines.get(vmName)[1].get_nowait() + if not self.machines.getExn(vmName)[1].empty(): + vm = self.machines.getExn(vmName)[1].get_nowait() self.lock.release() @@ -92,8 +92,8 @@ def freeVM(self, vm): # still a member of the pool. not_found = False self.lock.acquire() - if vm and vm.id in self.machines.get(vm.name)[0]: - machine = self.machines.get(vm.name) + if vm and vm.id in self.machines.getExn(vm.name)[0]: + machine = self.machines.getExn(vm.name) machine[1].put(vm) self.machines.set(vm.name, machine) else: @@ -108,7 +108,7 @@ def freeVM(self, vm): def addVM(self, vm): """addVM - add a particular VM instance to the pool""" self.lock.acquire() - machine = self.machines.get(vm.name) + machine = self.machines.getExn(vm.name) machine[0].append(vm.id) self.machines.set(vm.name, machine) self.lock.release() @@ -116,7 +116,7 @@ def addVM(self, vm): def removeVM(self, vm): """removeVM - remove a particular VM instance from the pool""" self.lock.acquire() - machine = self.machines.get(vm.name) + machine = self.machines.getExn(vm.name) machine[0].remove(vm.id) self.machines.set(vm.name, machine) self.lock.release() @@ -167,7 +167,7 @@ def __destroy(self, vm): the free list is empty. """ self.lock.acquire() - dieVM = self.machines.get(vm.name)[1].get_nowait() + dieVM = self.machines.getExn(vm.name)[1].get_nowait() self.lock.release() if dieVM: @@ -203,12 +203,14 @@ def destroyVM(self, vmName, id): dieVM = None self.lock.acquire() - size = self.machines.get(vmName)[1].qsize() - if size == len(self.machines.get(vmName)[0]): - for i in range(size): - vm = self.machines.get(vmName)[1].get_nowait() + size = self.machines.getExn(vmName)[1].qsize() + if size == len(self.machines.getExn(vmName)[0]): + for _ in range(size): + vm = self.machines.getExn(vmName)[1].get_nowait() + if vm is None: + break if vm.id != id: - machine = self.machines.get(vmName) + machine = self.machines.getExn(vmName) machine[1].put(vm) self.machines.set(vmName, machine) else: @@ -217,7 +219,7 @@ def destroyVM(self, vmName, id): if dieVM: self.removeVM(dieVM) - vmms = self.vmms[vm.vmms] + vmms = self.vmms[dieVM.vmms] vmms.safeDestroyVM(dieVM) return 0 else: @@ -229,9 +231,10 @@ def getAllPools(self): result[vmName] = self.getPool(vmName) return result + # TODO: replace with a named tuple def getPool(self, vmName): """getPool - returns the members of a pool and its free list""" - result = {} + result: Dict[str, List[TangoMachine]] = {} if vmName not in self.machines: return result @@ -239,15 +242,17 @@ def getPool(self, vmName): result["free"] = [] free_list = [] self.lock.acquire() - size = self.machines.get(vmName)[1].qsize() - for i in range(size): - vm = self.machines.get(vmName)[1].get_nowait() + size = self.machines.getExn(vmName)[1].qsize() + for _ in range(size): + vm = self.machines.getExn(vmName)[1].get_nowait() + if vm is None: + break free_list.append(vm.id) - machine = self.machines.get(vmName) + machine = self.machines.getExn(vmName) machine[1].put(vm) self.machines.set(vmName, machine) self.lock.release() - result["total"] = self.machines.get(vmName)[0] + result["total"] = self.machines.getExn(vmName)[0] result["free"] = free_list return result From e71b5b1120192e6aa61fda6b3a2ee52849118f72 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 11:05:20 -0500 Subject: [PATCH 35/39] fixes --- vmms/sharedUtils.py | 38 ++++++++++++++++++++++++++++++++++++++ worker.py | 1 - 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 vmms/sharedUtils.py diff --git a/vmms/sharedUtils.py b/vmms/sharedUtils.py new file mode 100644 index 00000000..55e16195 --- /dev/null +++ b/vmms/sharedUtils.py @@ -0,0 +1,38 @@ +import subprocess +import time +import os +from typing import List +import config + +class VMMSUtils: + @staticmethod + def timeout(command: List[str], time_out: float = 1) -> int: + """timeout - Run a unix command with a timeout. Return -1 on + timeout, otherwise return the return value from the command, which + is typically 0 for success, 1-255 for failure. + """ + + # Launch the command + p = subprocess.Popen( + command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT + ) + + # Wait for the command to complete + t = 0.0 + while t < time_out and p.poll() is None: + time.sleep(config.Config.TIMER_POLL_INTERVAL) + t += config.Config.TIMER_POLL_INTERVAL + + # Determine why the while loop terminated + returncode: int + poll_result = p.poll() + if poll_result is None: + try: + os.kill(p.pid, 9) + except OSError: + pass + returncode = -1 + else: + returncode = poll_result + return returncode + diff --git a/worker.py b/worker.py index fc439d59..33b2f12d 100644 --- a/worker.py +++ b/worker.py @@ -284,7 +284,6 @@ def run(self): self.log.debug(msg) return # Copy input files to VM - self.log.debug(f"Before copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}") ret["copyin"] = self.vmms.copyIn(vm, self.job.input, self.job.id) self.log.debug(f"After copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}") From 620b949369fa58bd76464be136c6c8761026f2a9 Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 13:27:25 -0500 Subject: [PATCH 36/39] added type annotations to everything such that mypy . doesn't give any --check-untyped-defs suggestions --- preallocator.py | 22 ++++++------- vmms/ec2SSH.py | 80 ++++++++++++++++++++++------------------------- vmms/interface.py | 10 +++--- worker.py | 22 +++++++------ 4 files changed, 65 insertions(+), 69 deletions(-) diff --git a/preallocator.py b/preallocator.py index a5ba8e63..eb2d27c9 100644 --- a/preallocator.py +++ b/preallocator.py @@ -29,14 +29,14 @@ def __init__(self, vmms: Dict[str, VMMSInterface]) -> None: self.vmms: Dict[str, VMMSInterface] = vmms self.log = logging.getLogger("Preallocator") - def poolSize(self, vmName): + def poolSize(self, vmName: str): """poolSize - returns the size of the vmName pool, for external callers""" if vmName not in self.machines: return 0 else: return len(self.machines.getExn(vmName)[0]) - def update(self, vm, num): + def update(self, vm: TangoMachine, num: int): """update - Updates the number of machines of a certain type to be preallocated. @@ -69,7 +69,7 @@ def update(self, vm, num): # If delta == 0 then we are the perfect number! - def allocVM(self, vmName): + def allocVM(self, vmName: str): """allocVM - Allocate a VM from the free list""" vm = None if vmName in self.machines: @@ -86,7 +86,7 @@ def allocVM(self, vmName): return vm - def freeVM(self, vm): + def freeVM(self, vm: TangoMachine): """freeVM - Returns a VM instance to the free list""" # Sanity check: Return a VM to the free list only if it is # still a member of the pool. @@ -105,7 +105,7 @@ def freeVM(self, vm): vmms = self.vmms[vm.vmms] vmms.safeDestroyVM(vm) - def addVM(self, vm): + def addVM(self, vm: TangoMachine): """addVM - add a particular VM instance to the pool""" self.lock.acquire() machine = self.machines.getExn(vm.name) @@ -113,7 +113,7 @@ def addVM(self, vm): self.machines.set(vm.name, machine) self.lock.release() - def removeVM(self, vm): + def removeVM(self, vm: TangoMachine): """removeVM - remove a particular VM instance from the pool""" self.lock.acquire() machine = self.machines.getExn(vm.name) @@ -137,7 +137,7 @@ def _getNextID(self): self.lock.release() return id - def __create(self, vm, cnt): + def __create(self, vm: TangoMachine, cnt: int): """__create - Creates count VMs and adds them to the pool This function should always be called in a thread since it @@ -157,7 +157,7 @@ def __create(self, vm, cnt): self.freeVM(newVM) self.log.debug("__create: Added vm %s to pool %s " % (newVM.id, newVM.name)) - def __destroy(self, vm): + def __destroy(self, vm: TangoMachine): """__destroy - Removes a VM from the pool If the user asks for fewer preallocated VMs, then we will @@ -175,7 +175,7 @@ def __destroy(self, vm): vmms = self.vmms[vm.vmms] vmms.safeDestroyVM(dieVM) - def createVM(self, vm): + def createVM(self, vm: TangoMachine): """createVM - Called in non-thread context to create a single VM and add it to the pool """ @@ -192,7 +192,7 @@ def createVM(self, vm): self.freeVM(newVM) self.log.debug("createVM: Added vm %s to pool %s" % (newVM.id, newVM.name)) - def destroyVM(self, vmName, id): + def destroyVM(self, vmName: str, id: int): """destroyVM - Called by the delVM API function to remove and destroy a particular VM instance from a pool. We only allow this function when the system is queiscent (pool size == free @@ -232,7 +232,7 @@ def getAllPools(self): return result # TODO: replace with a named tuple - def getPool(self, vmName): + def getPool(self, vmName: str): """getPool - returns the members of a pool and its free list""" result: Dict[str, List[TangoMachine]] = {} if vmName not in self.machines: diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 6681a488..2c928650 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -2,11 +2,7 @@ # ec2SSH.py - Implements the Tango VMMS interface to run Tango jobs on Amazon EC2. # # This implementation uses the AWS EC2 SDK to manage the virtual machines and -# ssh and scp to access them. The following excecption are raised back -# to the caller: -# -# Ec2Exception - EC2 raises this if it encounters any problem -# ec2CallError - raised by ec2Call() function +# ssh and scp to access them. # import logging @@ -21,11 +17,13 @@ from botocore.exceptions import ClientError import config -from tangoObjects import TangoMachine -from typing import Optional, Literal, List, Dict, Sequence +from tangoObjects import TangoMachine, InputFile +from typing import Optional, Literal, List, Dict, Sequence, Set, get_args +from typing_extensions import TypeGuard from mypy_boto3_ec2 import EC2ServiceResource +from mypy_boto3_ec2.literals import InstanceTypeType from mypy_boto3_ec2.service_resource import Instance, Image -from mypy_boto3_ec2.type_defs import FilterTypeDef +from mypy_boto3_ec2.type_defs import FilterTypeDef, TagTypeDef from vmms.interface import VMMSInterface from vmms.sharedUtils import VMMSUtils @@ -35,9 +33,11 @@ logging.getLogger("botocore").setLevel(logging.CRITICAL) logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL) +valid_instance_types: Set[InstanceTypeType] = set(get_args(InstanceTypeType)) +def check_instance_type(instance_type: str) -> TypeGuard[InstanceTypeType]: + return instance_type in valid_instance_types - -def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2) -> int: +def timeout_with_retries(command: List[str], time_out: float = 1, retries: int = 3, retry_delay: float = 2) -> int: """timeout - Run a unix command with a timeout. Return -1 on timeout, otherwise return the return value from the command, which is typically 0 for success, 1-255 for failure. @@ -82,16 +82,6 @@ def try_load_instance(newInstance): newInstance.load() -# -# User defined exceptions -# -# ec2Call() exception - - -class ec2CallError(Exception): - pass - - class Ec2SSH(VMMSInterface, VMMSUtils): _SSH_FLAGS = [ "-i", @@ -106,12 +96,12 @@ class Ec2SSH(VMMSInterface, VMMSUtils): _vm_semaphore = threading.Semaphore(config.Config.MAX_EC2_VMS) @staticmethod - def acquire_vm_semaphore(): + def acquire_vm_semaphore() -> None: """Blocks until a VM is available to limit load""" Ec2SSH._vm_semaphore.acquire() # This blocks until a slot is available @staticmethod - def release_vm_semaphore(): + def release_vm_semaphore() -> None: """Releases the VM sempahore""" Ec2SSH._vm_semaphore.release() @@ -188,18 +178,18 @@ def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = % str(ignoredAMIs) ) - def instanceName(self, id, name): + def instanceName(self, id: int, name: str) -> str: """instanceName - Constructs a VM instance name. Always use this function when you need a VM instance name. Never generate instance names manually. """ return "%s-%d-%s" % (config.Config.PREFIX, id, name) - def keyPairName(self, id, name): + def keyPairName(self, id: int, name: str) -> str: """keyPairName - Constructs a unique key pair name.""" return "%s-%d-%s" % (config.Config.PREFIX, id, name) - def domainName(self, vm): + def domainName(self, vm: TangoMachine) -> str: """Returns the domain name that is stored in the vm instance. """ @@ -209,7 +199,8 @@ def domainName(self, vm): # VMMS helper methods # - def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict: + # TODO: return a dataclass with the instance_type member of type InstanceTypeType of type str + def tangoMachineToEC2Instance(self, vm: TangoMachine) -> Dict[str, str]: """tangoMachineToEC2Instance - returns an object with EC2 instance type and AMI. Only general-purpose instances are used. Defalt AMI is currently used. @@ -233,7 +224,7 @@ def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict: self.log.info("tangoMachineToEC2Instance: %s" % str(ec2instance)) return ec2instance - def createKeyPair(self): + def createKeyPair(self) -> None: # TODO: SUPPORT raise # # try to delete the key to avoid collision @@ -247,7 +238,7 @@ def createKeyPair(self): # # change the SSH_FLAG accordingly # self.ssh_flags[1] = self.key_pair_path - def deleteKeyPair(self): + def deleteKeyPair(self) -> None: # TODO: SUPPORT raise # self.boto3client.delete_key_pair(self.key_pair_name) @@ -259,7 +250,7 @@ def deleteKeyPair(self): # Creates a security group if it doesn't exist. # ^ Note: strangely, the security group id is never used. - def createSecurityGroup(self): + def createSecurityGroup(self) -> None: try: # Check if the security group already exists description_response = self.boto3client.describe_security_groups( @@ -304,6 +295,9 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: try: instanceName = self.instanceName(vm.id, vm.name) ec2instance = self.tangoMachineToEC2Instance(vm) + instance_type = ec2instance["instance_type"] + if not check_instance_type(instance_type): + raise ValueError(f"Invalid instance type: {instance_type}") self.log.debug("instanceName: %s" % instanceName) # ensure that security group exists self.createSecurityGroup() @@ -313,7 +307,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: ImageId=ec2instance["ami"], KeyName=self.key_pair_name, SecurityGroups=[config.Config.DEFAULT_SECURITY_GROUP], - InstanceType=ec2instance["instance_type"], + InstanceType=instance_type, MaxCount=1, MinCount=1, InstanceMarketOptions= @@ -416,7 +410,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: return -1 return -1 - def waitVM(self, vm, max_secs) -> Literal[0, -1]: + def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: """waitVM - Wait at most max_secs for a VM to become ready. Return error if it takes too long. @@ -482,7 +476,7 @@ def waitVM(self, vm, max_secs) -> Literal[0, -1]: # Sleep a bit before trying again time.sleep(config.Config.TIMER_POLL_INTERVAL) - def copyIn(self, vm, inputFiles, job_id=None): + def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int: """copyIn - Copy input files to VM Args: - vm is a TangoMachine object @@ -546,7 +540,7 @@ def copyIn(self, vm, inputFiles, job_id=None): return 0 - def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): + def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: """runJob - Run the make command on a VM using SSH and redirect output to file "output". """ @@ -578,7 +572,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): # runTimeout * 2 is a temporary hack. The driver will handle the timout return ret - def copyOut(self, vm, destFile): + def copyOut(self, vm: TangoMachine, destFile: str) -> int: """copyOut - Copy the file output on the VM to the file outputFile on the Tango host. """ @@ -638,7 +632,7 @@ def copyOut(self, vm, destFile): config.Config.COPYOUT_TIMEOUT, ) - def destroyVM(self, vm): + def destroyVM(self, vm: TangoMachine) -> None: """destroyVM - Removes a VM from the system""" self.log.info( "destroyVM: %s %s %s %s" @@ -685,15 +679,15 @@ def destroyVM(self, vm): Ec2SSH.release_vm_semaphore() - def safeDestroyVM(self, vm): + def safeDestroyVM(self, vm: TangoMachine) -> None: return self.destroyVM(vm) - def getVMs(self): + def getVMs(self) -> List[TangoMachine]: """getVMs - Returns the complete list of VMs on this account. Each list entry is a boto.ec2.instance.Instance object. """ try: - vms = list() + vms: List[TangoMachine] = [] filters: Sequence[FilterTypeDef] = [ { "Name": "instance-state-name", @@ -742,7 +736,7 @@ def getVMs(self): return vms - def existsVM(self, vm): + def existsVM(self, vm: TangoMachine) -> bool: """existsVM - Checks whether a VM exists in the vmms.""" # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrationec2.html filters: Sequence[FilterTypeDef] = [{"Name": "instance-state-name", "Values": ["running"]}] @@ -757,19 +751,19 @@ def existsVM(self, vm): # for instance in instances.filter(InstanceIds) return False - def getImages(self): + def getImages(self) -> List[str]: """getImages - return a constant; actually use the ami specified in config""" return [key for key in self.img2ami] - # getTag: to do later - def getTag(self, tagList, tagKey): + # TODO: later + def getTag(self, tagList: List[TagTypeDef], tagKey: str) -> Optional[str]: if tagList: for tag in tagList: if tag["Key"] == tagKey: return tag["Value"] return None - def getPartialOutput(self, vm): + def getPartialOutput(self, vm: TangoMachine) -> str: domain_name = self.domainName(vm) runcmd = "head -c %s /home/autograde/output.log" % ( diff --git a/vmms/interface.py b/vmms/interface.py index 6235e227..a437401a 100644 --- a/vmms/interface.py +++ b/vmms/interface.py @@ -1,5 +1,5 @@ from typing import Protocol, Optional, Literal, List -from tangoObjects import TangoMachine +from tangoObjects import TangoMachine, InputFile from abc import abstractmethod @@ -13,7 +13,7 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: ... @abstractmethod - def copyIn(self, vm: TangoMachine, inputFiles: List[str], job_id: Optional[int] = None) -> Literal[0, -1]: + def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int: ... @abstractmethod @@ -21,15 +21,15 @@ def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disa ... @abstractmethod - def copyOut(self, vm: TangoMachine, destFile: str) -> Literal[0, -1]: + def copyOut(self, vm: TangoMachine, destFile: str) -> int: ... @abstractmethod - def destroyVM(self, vm: TangoMachine) -> Literal[0, -1]: + def destroyVM(self, vm: TangoMachine) -> None: ... @abstractmethod - def safeDestroyVM(self, vm: TangoMachine) -> Literal[0, -1]: + def safeDestroyVM(self, vm: TangoMachine) -> None: ... @abstractmethod diff --git a/worker.py b/worker.py index 36f81b97..8d92123c 100644 --- a/worker.py +++ b/worker.py @@ -15,9 +15,10 @@ from datetime import datetime from config import Config from jobQueue import JobQueue -from tangoObjects import TangoMachine +from tangoObjects import TangoMachine, TangoJob from typing import Dict, Optional from vmms.interface import VMMSInterface +from preallocator import Preallocator # # Worker - The worker class is very simple and very dumb. The goal is # to walk through the VMMS interface, track the job's progress, and if @@ -37,7 +38,7 @@ class DetachMethod(Enum): # We always preallocate a VM for the worker to use class Worker(threading.Thread): - def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine): + def __init__(self, job: TangoJob, vmms: VMMSInterface, jobQueue: JobQueue, preallocator: Preallocator, preVM: TangoMachine): threading.Thread.__init__(self) self.daemon = True self.job = job @@ -85,8 +86,7 @@ def detachVM(self, detachMethod: DetachMethod): else: raise ValueError(f"Invalid detach method: {detachMethod}") - # TODO: figure out what hdrfile, ret and err are - def rescheduleJob(self, hdrfile, ret, err): + def rescheduleJob(self, hdrfile: str, ret: Dict[str, int], err: str) -> None: """rescheduleJob - Reschedule a job that has failed because of a system error, such as a VM timing out or a connection failure. @@ -121,13 +121,13 @@ def rescheduleJob(self, hdrfile, ret, err): ) self.afterJobExecution(hdrfile, full_err, DetachMethod.DESTROY_AND_REPLACE) - def appendMsg(self, filename, msg): + def appendMsg(self, filename: str, msg: str) -> None: """appendMsg - Append a timestamped Tango message to a file""" f = open(filename, "a") f.write("Autograder [%s]: %s\n" % (datetime.now().ctime(), msg)) f.close() - def catFiles(self, f1, f2): + def catFiles(self, f1: str, f2: str) -> None: """catFiles - cat f1 f2 > f2, where f1 is the Tango header and f2 is the output from the Autodriver """ @@ -146,7 +146,7 @@ def catFiles(self, f1, f2): os.rename(tmpname, f2) os.remove(f1) - def notifyServer(self, job): + def notifyServer(self, job: TangoJob) -> None: try: if job.notifyURL: outputFileName = job.outputFile.split("/")[-1] # get filename from path @@ -172,7 +172,7 @@ def notifyServer(self, job): except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) - def afterJobExecution(self, hdrfile, msg, detachMethod: DetachMethod): + def afterJobExecution(self, hdrfile: str, msg: str, detachMethod: DetachMethod) -> None: self.jobQueue.makeDead(self.job, msg) # Update the text that users see in the autodriver output file @@ -188,7 +188,7 @@ def afterJobExecution(self, hdrfile, msg, detachMethod: DetachMethod): # # Main worker function # - def run(self): + def run(self) -> None: """run - Step a job through its execution sequence""" try: # Hash of return codes for each step @@ -409,7 +409,9 @@ def run(self): # and detachVM can be run # if vm is not set but self.preVM is set, we still need # to return the VM, but have to initialize self.job.vm first + # TODO: move self.job.makeVM to the start of the try block, so it should be an error if vm fails to be set if self.preVM and not vm: - vm = self.job.vm = self.preVM + self.job.makeVM(self.preVM) + vm = self.preVM if vm: self.detachVM(DetachMethod.DESTROY_AND_REPLACE) From 8eed4dbffdf669404c9be849e8bea5a413f99b0e Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 14:34:45 -0500 Subject: [PATCH 37/39] local docker type annotations --- vmms/ec2SSH.py | 9 ++------- vmms/localDocker.py | 49 ++++++++++++++++++++------------------------- vmms/sharedUtils.py | 8 ++++++++ 3 files changed, 32 insertions(+), 34 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 2c928650..4b2e4090 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -178,13 +178,6 @@ def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = % str(ignoredAMIs) ) - def instanceName(self, id: int, name: str) -> str: - """instanceName - Constructs a VM instance name. Always use - this function when you need a VM instance name. Never generate - instance names manually. - """ - return "%s-%d-%s" % (config.Config.PREFIX, id, name) - def keyPairName(self, id: int, name: str) -> str: """keyPairName - Constructs a unique key pair name.""" return "%s-%d-%s" % (config.Config.PREFIX, id, name) @@ -198,6 +191,8 @@ def domainName(self, vm: TangoMachine) -> str: # # VMMS helper methods # + def instanceName(self, id: int, name: str) -> str: + return VMMSUtils.constructInstanceName(id, name) # TODO: return a dataclass with the instance_type member of type InstanceTypeType of type str def tangoMachineToEC2Instance(self, vm: TangoMachine) -> Dict[str, str]: diff --git a/vmms/localDocker.py b/vmms/localDocker.py index b002a06e..fa727bb6 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -12,8 +12,8 @@ import sys import shutil import config -from tangoObjects import TangoMachine -from typing import List +from tangoObjects import TangoMachine, InputFile +from typing import List, Literal, Optional from vmms.interface import VMMSInterface from vmms.sharedUtils import VMMSUtils @@ -40,25 +40,21 @@ def __init__(self): self.log.error(str(e)) exit(1) - def instanceName(self, id, name): - """instanceName - Constructs a VM instance name. Always use - this function when you need a VM instance name. Never generate - instance names manually. - """ - return "%s-%s-%s" % (config.Config.PREFIX, id, name) + def instanceName(self, id: int, name: str) -> str: + return VMMSUtils.constructInstanceName(id, name) - def getVolumePath(self, instanceName): + def getVolumePath(self, instanceName: str) -> str: volumePath = config.Config.DOCKER_VOLUME_PATH # Last empty string to cause trailing '/' volumePath = os.path.join(volumePath, instanceName, "") return volumePath - def getDockerVolumePath(self, dockerPath, instanceName): + def getDockerVolumePath(self, dockerPath: str, instanceName: str) -> str: # Last empty string to cause trailing '/' volumePath = os.path.join(dockerPath, instanceName, "") return volumePath - def domainName(self, vm): + def domainName(self, vm: TangoMachine) -> str: """Returns the domain name that is stored in the vm instance. """ @@ -67,15 +63,15 @@ def domainName(self, vm): # # VMMS API functions # - def initializeVM(self, vm): + def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: """initializeVM - Nothing to do for initializeVM""" return 0 - def waitVM(self, vm, max_secs): + def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: """waitVM - Nothing to do for waitVM""" - return + return 0 - def copyIn(self, vm, inputFiles, job_id=None): + def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int: """copyIn - Create a directory to be mounted as a volume for the docker containers. Copy input files to this directory. """ @@ -94,7 +90,7 @@ def copyIn(self, vm, inputFiles, job_id=None): ) return 0 - def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): + def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: """runJob - Run a docker container by doing the follows: - mount directory corresponding to this job to /home/autolab in the container @@ -103,10 +99,9 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): """ instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath(instanceName) - if os.getenv("DOCKER_TANGO_HOST_VOLUME_PATH"): - volumePath = self.getDockerVolumePath( - os.getenv("DOCKER_TANGO_HOST_VOLUME_PATH"), instanceName - ) + host_volume_path = os.getenv("DOCKER_TANGO_HOST_VOLUME_PATH") + if host_volume_path: + volumePath = self.getDockerVolumePath(host_volume_path, instanceName) args = ["docker", "run", "--name", instanceName, "-v"] args = args + ["%s:%s" % (volumePath, "/home/mount")] if vm.cores: @@ -140,7 +135,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork): return ret - def copyOut(self, vm, destFile): + def copyOut(self, vm: TangoMachine, destFile: str) -> int: """copyOut - Copy the autograder feedback from container to destFile on the Tango host. Then, destroy that container. Containers are never reused. @@ -153,7 +148,7 @@ def copyOut(self, vm, destFile): return 0 - def destroyVM(self, vm): + def destroyVM(self, vm: TangoMachine) -> None: """destroyVM - Delete the docker container.""" instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath("") @@ -166,7 +161,7 @@ def destroyVM(self, vm): self.log.debug("Deleted volume %s" % instanceName) return - def safeDestroyVM(self, vm): + def safeDestroyVM(self, vm: TangoMachine) -> None: """safeDestroyVM - Delete the docker container and make sure it is removed. """ @@ -178,7 +173,7 @@ def safeDestroyVM(self, vm): self.destroyVM(vm) return - def getVMs(self): + def getVMs(self) -> List[TangoMachine]: """getVMs - Executes and parses `docker ps`. This function is a lot of parsing and can break easily. """ @@ -196,7 +191,7 @@ def getVMs(self): machines.append(machine) return machines - def existsVM(self, vm): + def existsVM(self, vm: TangoMachine) -> bool: """existsVM - Executes `docker inspect CONTAINER`, which returns a non-zero status upon not finding a container. """ @@ -204,7 +199,7 @@ def existsVM(self, vm): ret = VMMSUtils.timeout(["docker", "inspect", instanceName]) return ret == 0 - def getImages(self): + def getImages(self) -> List[str]: """getImages - Executes `docker images` and returns a list of images that can be used to boot a docker container with. This function is a lot of parsing and so can break easily. @@ -221,7 +216,7 @@ def getImages(self): result.add(re.sub(r".*/([^/]*)", r"\1", row_l[0])) return list(result) - def getPartialOutput(self, vm): + def getPartialOutput(self, vm: TangoMachine) -> str: """getPartialOutput - Get the partial output of a job. It does not check if the docker container exists before executing as the command will not fail even if the container does not exist. diff --git a/vmms/sharedUtils.py b/vmms/sharedUtils.py index 55e16195..0165b313 100644 --- a/vmms/sharedUtils.py +++ b/vmms/sharedUtils.py @@ -5,6 +5,14 @@ import config class VMMSUtils: + @staticmethod + def constructInstanceName(id: int, name: str) -> str: + """instanceName - Constructs a VM instance name. Always use + this function when you need a VM instance name. Never generate + instance names manually. + """ + return "%s-%d-%s" % (config.Config.PREFIX, id, name) + @staticmethod def timeout(command: List[str], time_out: float = 1) -> int: """timeout - Run a unix command with a timeout. Return -1 on From 9b1d8980b43f8471decd3035a0542dc025b2af5f Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Tue, 2 Dec 2025 16:12:41 -0500 Subject: [PATCH 38/39] linting fix --- clients/tango-cli.py | 305 ++++++++++++++++---------- jobManager.py | 22 +- jobQueue.py | 23 +- preallocator.py | 5 +- restful_tango/server.py | 8 +- restful_tango/tangoREST.py | 6 +- tangoObjects.py | 97 ++++---- tests/sample_test/expected_output.txt | 2 +- tests/stressTest.py | 118 +++++++--- vmms/ec2SSH.py | 115 +++++----- vmms/interface.py | 19 +- vmms/localDocker.py | 20 +- vmms/sharedUtils.py | 4 +- worker.py | 95 +++++--- 14 files changed, 518 insertions(+), 321 deletions(-) diff --git a/clients/tango-cli.py b/clients/tango-cli.py index 78c5155f..d51d2da3 100755 --- a/clients/tango-cli.py +++ b/clients/tango-cli.py @@ -28,6 +28,7 @@ def get_arg(name, default=None): @dataclass class RequestObj: """Dataclass for job request objects""" + image: str files: str timeout: int @@ -46,6 +47,7 @@ class RequestObj: @dataclass class VmObj: """Dataclass for VM allocation objects""" + vmms: str cores: int memory: int @@ -161,45 +163,48 @@ class VmObj: parser.add_argument("--accessKeyId", default="", help="AWS account access key ID") parser.add_argument("--accessKey", default="", help="AWS account access key content") parser.add_argument("--instanceType", default="", help="AWS EC2 instance type") -parser.add_argument("--stopBefore", default="", help="Stops the worker before a function is executed") +parser.add_argument( + "--stopBefore", default="", help="Stops the worker before a function is executed" +) + def checkKey(): - if get_arg('key') is None: + if get_arg("key") is None: print("Key must be specified with -k") return -1 return 0 def checkCourselab(): - if get_arg('courselab') is None: + if get_arg("courselab") is None: print("Courselab must be specified with -l") return -1 return 0 def checkFilename(): - if get_arg('filename') is None: + if get_arg("filename") is None: print("Filename must be specified with --filename") return -1 return 0 def checkInfiles(): - if get_arg('infiles') is None: + if get_arg("infiles") is None: print("Input files must be specified with --infiles") return -1 return 0 def checkDeadjobs(): - if get_arg('deadJobs') is None: + if get_arg("deadJobs") is None: print("Deadjobs must be specified with --deadJobs") return -1 return 0 def checkImageName(): - if get_arg('imageName') is None: + if get_arg("imageName") is None: print("Image name must be specified with --imageName") return -1 return 0 @@ -218,18 +223,24 @@ def tango_open(): response = requests.get( "%s://%s:%d/open/%s/%s/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')) + % ( + _tango_protocol, + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + ) ) print( "Sent request to %s:%d/open/%s/%s/" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')) + % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("courselab")) ) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/open/%s/%s/" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')) + % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("courselab")) ) print(str(err)) sys.exit(0) @@ -244,28 +255,46 @@ def tango_upload(): if res != 0: raise Exception("Invalid usage: [upload] " + upload_help) - dirs = get_arg('filename').split("/") + dirs = get_arg("filename").split("/") filename = dirs[len(dirs) - 1] header = {"Filename": filename} - f = open(get_arg('filename'), 'rb') + f = open(get_arg("filename"), "rb") response = requests.post( "%s://%s:%d/upload/%s/%s/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')), + % ( + _tango_protocol, + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + ), data=f.read(), headers=header, ) f.close() print( "Sent request to %s:%d/upload/%s/%s/ filename=%s" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), get_arg('filename')) + % ( + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + get_arg("filename"), + ) ) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/upload/%s/%s/ filename=%s" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), get_arg('filename')) + % ( + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + get_arg("filename"), + ) ) print(str(err)) sys.exit(0) @@ -281,36 +310,54 @@ def tango_addJob(): raise Exception("Invalid usage: [addJob] " + addJob_help) requestObj = RequestObj( - image=get_arg('image'), - files=get_arg('infiles'), - timeout=get_arg('timeout'), - max_kb=get_arg('maxsize'), - output_file=get_arg('outputFile'), - jobName=get_arg('jobname'), - accessKeyId=get_arg('accessKeyId'), - accessKey=get_arg('accessKey'), - disable_network=get_arg('disableNetwork'), - instanceType=get_arg('instanceType'), - stopBefore=get_arg('stopBefore'), - notifyURL=get_arg('notifyURL'), - callback_url=get_arg('callbackURL'), + image=get_arg("image"), + files=get_arg("infiles"), + timeout=get_arg("timeout"), + max_kb=get_arg("maxsize"), + output_file=get_arg("outputFile"), + jobName=get_arg("jobname"), + accessKeyId=get_arg("accessKeyId"), + accessKey=get_arg("accessKey"), + disable_network=get_arg("disableNetwork"), + instanceType=get_arg("instanceType"), + stopBefore=get_arg("stopBefore"), + notifyURL=get_arg("notifyURL"), + callback_url=get_arg("callbackURL"), ) response = requests.post( "%s://%s:%d/addJob/%s/%s/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')), + % ( + _tango_protocol, + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + ), data=json.dumps(asdict(requestObj)), ) print( "Sent request to %s:%d/addJob/%s/%s/ \t jobObj=%s" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), json.dumps(asdict(requestObj))) + % ( + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + json.dumps(asdict(requestObj)), + ) ) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/addJob/%s/%s/ \t jobObj=%s" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), json.dumps(asdict(requestObj)) if 'requestObj' in locals() else 'N/A') + % ( + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + json.dumps(asdict(requestObj)) if "requestObj" in locals() else "N/A", + ) ) print(str(err)) sys.exit(0) @@ -325,19 +372,19 @@ def tango_getPartialOutput(): "%s://%s:%d/getPartialOutput/%s/%s/" % ( _tango_protocol, - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('jobid'), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("jobid"), ) ) print( "Sent request to %s:%d/getPartialOutput/%s/%s/" % ( - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('jobid'), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("jobid"), ) ) print(response.text) @@ -345,10 +392,10 @@ def tango_getPartialOutput(): print( "Failed to send request to %s:%d/getPartialOutput/%s/%s/" % ( - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('jobid'), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("jobid"), ) ) print(str(err)) @@ -368,21 +415,21 @@ def tango_poll(): "%s://%s:%d/poll/%s/%s/%s/" % ( _tango_protocol, - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('courselab'), - urllib.parse.quote(get_arg('outputFile')), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + urllib.parse.quote(get_arg("outputFile")), ) ) print( "Sent request to %s:%d/poll/%s/%s/%s/" % ( - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('courselab'), - urllib.parse.quote(get_arg('outputFile')), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + urllib.parse.quote(get_arg("outputFile")), ) ) print(response.text) @@ -391,11 +438,11 @@ def tango_poll(): print( "Failed to send request to %s:%d/poll/%s/%s/%s/" % ( - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('courselab'), - urllib.parse.quote(get_arg('outputFile')), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("courselab"), + urllib.parse.quote(get_arg("outputFile")), ) ) print(str(err)) @@ -412,15 +459,19 @@ def tango_info(): raise Exception("Invalid usage: [info] " + info_help) response = requests.get( - "%s://%s:%d/info/%s/" % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key')) + "%s://%s:%d/info/%s/" + % (_tango_protocol, get_arg("server"), get_arg("port"), get_arg("key")) + ) + print( + "Sent request to %s:%d/info/%s/" + % (get_arg("server"), get_arg("port"), get_arg("key")) ) - print("Sent request to %s:%d/info/%s/" % (get_arg('server'), get_arg('port'), get_arg('key'))) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/info/%s/" - % (get_arg('server'), get_arg('port'), get_arg('key')) + % (get_arg("server"), get_arg("port"), get_arg("key")) ) print(str(err)) sys.exit(0) @@ -437,18 +488,24 @@ def tango_jobs(): response = requests.get( "%s://%s:%d/jobs/%s/%d/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('deadJobs')) + % ( + _tango_protocol, + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("deadJobs"), + ) ) print( "Sent request to %s:%d/jobs/%s/%d/" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('deadJobs')) + % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("deadJobs")) ) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/jobs/%s/%d/" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('deadJobs')) + % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("deadJobs")) ) print(str(err)) sys.exit(0) @@ -465,18 +522,24 @@ def tango_pool(): response = requests.get( "%s://%s:%d/pool/%s/%s/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image')) + % ( + _tango_protocol, + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("image"), + ) ) print( "Sent request to %s:%d/pool/%s/%s/" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image')) + % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("image")) ) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/pool/%s/%s/" - % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image')) + % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("image")) ) print(str(err)) sys.exit(0) @@ -492,23 +555,30 @@ def tango_prealloc(): if res != 0: raise Exception("Invalid usage: [prealloc] " + prealloc_help) - vmObj["vmms"] = get_arg('vmms') - vmObj["cores"] = get_arg('cores') - vmObj["memory"] = get_arg('memory') + vmObj["vmms"] = get_arg("vmms") + vmObj["cores"] = get_arg("cores") + vmObj["memory"] = get_arg("memory") response = requests.post( "%s://%s:%d/prealloc/%s/%s/%s/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image'), get_arg('num')), + % ( + _tango_protocol, + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("image"), + get_arg("num"), + ), data=json.dumps(vmObj), ) print( "Sent request to %s:%d/prealloc/%s/%s/%s/ \t vmObj=%s" % ( - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('image'), - get_arg('num'), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("image"), + get_arg("num"), json.dumps(vmObj), ) ) @@ -518,11 +588,11 @@ def tango_prealloc(): print( "Failed to send request to %s:%d/prealloc/%s/%s/%s/ \t vmObj=%s" % ( - get_arg('server'), - get_arg('port'), - get_arg('key'), - get_arg('image'), - get_arg('num'), + get_arg("server"), + get_arg("port"), + get_arg("key"), + get_arg("image"), + get_arg("num"), json.dumps(vmObj), ) ) @@ -548,21 +618,24 @@ def tango_build(): if res != 0: raise Exception("Invalid usage: [build] " + build_help) - f = open(get_arg('filename'), "rb") - header = {"imageName": get_arg('imageName')} + f = open(get_arg("filename"), "rb") + header = {"imageName": get_arg("imageName")} response = requests.post( "%s://%s:%d/build/%s/" - % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key')), + % (_tango_protocol, get_arg("server"), get_arg("port"), get_arg("key")), data=f.read(), headers=header, ) - print("Sent request to %s:%d/build/%s/" % (get_arg('server'), get_arg('port'), get_arg('key'))) + print( + "Sent request to %s:%d/build/%s/" + % (get_arg("server"), get_arg("port"), get_arg("key")) + ) print(response.text) except Exception as err: print( "Failed to send request to %s:%d/build/%s/" - % (get_arg('server'), get_arg('port'), get_arg('key')) + % (get_arg("server"), get_arg("port"), get_arg("key")) ) print(str(err)) sys.exit(0) @@ -572,11 +645,11 @@ def tango_build(): def tango_runJob(): - if get_arg('runJob') is None: + if get_arg("runJob") is None: print("Invalid usage: [runJob]") sys.exit(0) - dir = get_arg('runJob') + dir = get_arg("runJob") infiles = [ file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file)) ] @@ -585,7 +658,7 @@ def tango_runJob(): args.jobname += "-0" args.outputFile += "-0" - for i in range(1, get_arg('numJobs') + 1): + for i in range(1, get_arg("numJobs") + 1): print( "----------------------------------------- STARTING JOB " + str(i) @@ -608,27 +681,27 @@ def tango_runJob(): def router(): - if get_arg('open'): + if get_arg("open"): tango_open() - elif get_arg('upload'): + elif get_arg("upload"): tango_upload() - elif get_arg('addJob'): + elif get_arg("addJob"): tango_addJob() - elif get_arg('poll'): + elif get_arg("poll"): tango_poll() - elif get_arg('info'): + elif get_arg("info"): tango_info() - elif get_arg('jobs'): + elif get_arg("jobs"): tango_jobs() - elif get_arg('pool'): + elif get_arg("pool"): tango_pool() - elif get_arg('prealloc'): + elif get_arg("prealloc"): tango_prealloc() - elif get_arg('runJob'): + elif get_arg("runJob"): tango_runJob() - elif get_arg('getPartialOutput'): + elif get_arg("getPartialOutput"): tango_getPartialOutput() - elif get_arg('build'): + elif get_arg("build"): tango_build() @@ -637,32 +710,34 @@ def router(): # args = parser.parse_args() if ( - not get_arg('open') - and not get_arg('upload') - and not get_arg('addJob') - and not get_arg('poll') - and not get_arg('info') - and not get_arg('jobs') - and not get_arg('pool') - and not get_arg('prealloc') - and not get_arg('runJob') - and not get_arg('getPartialOutput') - and not get_arg('build') + not get_arg("open") + and not get_arg("upload") + and not get_arg("addJob") + and not get_arg("poll") + and not get_arg("info") + and not get_arg("jobs") + and not get_arg("pool") + and not get_arg("prealloc") + and not get_arg("runJob") + and not get_arg("getPartialOutput") + and not get_arg("build") ): parser.print_help() sys.exit(0) -if get_arg('ssl'): +if get_arg("ssl"): _tango_protocol = "https" - if get_arg('port') == 3000: + if get_arg("port") == 3000: args.port = 443 try: - response = requests.get("%s://%s:%d/" % (_tango_protocol, get_arg('server'), get_arg('port'))) + response = requests.get( + "%s://%s:%d/" % (_tango_protocol, get_arg("server"), get_arg("port")) + ) response.raise_for_status() except BaseException: - print("Tango not reachable on %s:%d!\n" % (get_arg('server'), get_arg('port'))) + print("Tango not reachable on %s:%d!\n" % (get_arg("server"), get_arg("port"))) sys.exit(0) router() diff --git a/jobManager.py b/jobManager.py index 50cd9553..a9f85f46 100644 --- a/jobManager.py +++ b/jobManager.py @@ -26,7 +26,6 @@ from vmms.interface import VMMSInterface - class JobManager(object): def __init__(self, queue: JobQueue): self.daemon = True @@ -67,7 +66,9 @@ def __manage(self) -> None: # Blocks until we get a next job job: TangoJob = self.jobQueue.getNextPendingJob() if not job.accessKey and Config.REUSE_VMS: - self.log.info(f"job has access key {job.accessKey} and is calling reuseVM") + self.log.info( + f"job has access key {job.accessKey} and is calling reuseVM" + ) vm = None while vm is None: vm = self.jobQueue.reuseVM(job) @@ -92,9 +93,7 @@ def __manage(self) -> None: self.log.error("ERROR initialization VM: %s", e) self.log.error(traceback.format_exc()) if preVM is None: - raise Exception( - "EC2 SSH VM initialization failed: see log" - ) + raise Exception("EC2 SSH VM initialization failed: see log") else: self.log.info(f"job {job.id} is not an ec2 vmms job") # Try to find a vm on the free list and allocate it to @@ -122,15 +121,15 @@ def __manage(self) -> None: ) # Mark the job assigned self.jobQueue.assignJob(job.id, preVM) - Worker( - job, vmms, self.jobQueue, self.preallocator, preVM - ).start() + Worker(job, vmms, self.jobQueue, self.preallocator, preVM).start() except Exception as err: if job is None: self.log.info("job_manager: job is None") else: - self.log.error("job failed during creation %d %s" % (job.id, str(err))) + self.log.error( + "job failed during creation %d %s" % (job.id, str(err)) + ) self.jobQueue.makeDead(job, str(err)) @@ -146,7 +145,10 @@ def __manage(self) -> None: tango_server.log.debug("Resetting Tango VMs") tango_server.resetTango(tango_server.preallocator.vmms) for key in tango_server.preallocator.machines.keys(): - machine: Tuple[List[TangoMachine], TangoQueue] = ([], TangoQueue.create(key)) + machine: Tuple[List[TangoMachine], TangoQueue] = ( + [], + TangoQueue.create(key), + ) machine[1].make_empty() tango_server.preallocator.machines.set(key, machine) diff --git a/jobQueue.py b/jobQueue.py index 9e9c664d..56bc2ad7 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -16,6 +16,7 @@ from config import Config from preallocator import Preallocator from typing import Optional + # # JobQueue - This class defines the job queue and the functions for # manipulating it. The actual queue is made up of two smaller @@ -55,7 +56,9 @@ def __init__(self, preallocator: Preallocator) -> None: using the makeUnassigned api. """ self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs") - self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") # Servees as a record of both failed and completed jobs + self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create( + "deadJobs" + ) # Servees as a record of both failed and completed jobs self.unassignedJobs: TangoQueue[int] = TangoQueue.create("unassignedLiveJobs") self.queueLock = threading.Lock() self.preallocator: Preallocator = preallocator @@ -137,7 +140,7 @@ def add(self, job): # Since we assume that the job is new, we set the number of retries # of this job to 0 - assert(job.retries == 0) + assert job.retries == 0 # Add the job to the queue. Careful not to append the trace until we # know the job has actually been added to the queue. @@ -169,8 +172,8 @@ def add(self, job): return str(job.id) - # TODO: get rid of this return value, it is not used anywhere - def addDead(self, job) -> int: + # TODO: get rid of this return value, it is not used anywhere + def addDead(self, job) -> int: """addDead - add a job to the dead queue. Called by validateJob when a job validation fails. Returns -1 on failure and the job id on success @@ -248,7 +251,7 @@ def get(self, id): self.log.debug("get| Released lock to job queue.") return job - # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. + # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. # TODO: It does assign the job to a particular VM though. # Precondition: jobId is in self.liveJobs def assignJob(self, jobId, vm=None): @@ -315,11 +318,11 @@ def makeDead(self, job: TangoJob, reason): if job.id not in self.liveJobs: self.log.error("makeDead| Job ID: %s not found in live jobs" % (job.id)) return -1 - + self.log.info("makeDead| Found job ID: %s in the live queue" % (job.id)) status = 0 self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason)) - + # Remove the job from the live jobs dictionary job.deleteFromDict(self.liveJobs) # Add the job to the dead jobs dictionary @@ -358,7 +361,9 @@ def getNextPendingJob(self) -> TangoJob: """ # Blocks till the next item is added id = self.unassignedJobs.get() - assert id is not None, ".get with default arguments should block and never return None" + assert ( + id is not None + ), ".get with default arguments should block and never return None" self.log.debug("_getNextPendingJob|Acquiring lock to job queue.") self.queueLock.acquire() @@ -395,5 +400,3 @@ def reuseVM(self, job): return job.vm else: raise Exception("Job assigned without vm") - - diff --git a/preallocator.py b/preallocator.py index eb2d27c9..da7f8b1d 100644 --- a/preallocator.py +++ b/preallocator.py @@ -10,6 +10,7 @@ from config import Config from typing import Tuple, List, Dict from vmms.interface import VMMSInterface + # # Preallocator - This class maintains a pool of active VMs for future # job requests. The pool is stored in dictionary called @@ -23,7 +24,9 @@ class Preallocator(object): def __init__(self, vmms: Dict[str, VMMSInterface]) -> None: - self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue[TangoMachine]]] = TangoDictionary.create("machines") + self.machines: TangoDictionary[ + Tuple[List[TangoMachine], TangoQueue[TangoMachine]] + ] = TangoDictionary.create("machines") self.lock = threading.Lock() self.nextID = TangoIntValue("nextID", 1000) self.vmms: Dict[str, VMMSInterface] = vmms diff --git a/restful_tango/server.py b/restful_tango/server.py index 1a177ce0..5b4c495d 100755 --- a/restful_tango/server.py +++ b/restful_tango/server.py @@ -46,7 +46,9 @@ def prepare(self): if not os.path.exists(tempdir): os.mkdir(tempdir, 0o700) if os.path.exists(tempdir) and not os.path.isdir(tempdir): - tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,)) + tangoREST.log.error( + "Cannot process uploads, %s is not a directory" % (tempdir,) + ) return self.send_error() self.tempfile = NamedTemporaryFile(prefix="upload", dir=tempdir, delete=False) self.hasher = hashlib.md5() @@ -129,7 +131,9 @@ def prepare(self): if not os.path.exists(tempdir): os.mkdir(tempdir, 0o700) if os.path.exists(tempdir) and not os.path.isdir(tempdir): - tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,)) + tangoREST.log.error( + "Cannot process uploads, %s is not a directory" % (tempdir,) + ) return self.send_error() self.tempfile = NamedTemporaryFile(prefix="docker", dir=tempdir, delete=False) diff --git a/restful_tango/tangoREST.py b/restful_tango/tangoREST.py index d246daba..1b0f9881 100644 --- a/restful_tango/tangoREST.py +++ b/restful_tango/tangoREST.py @@ -195,7 +195,7 @@ def convertJobObj(self, dirName, jobObj): accessKey=accessKey, accessKeyId=accessKeyId, disableNetwork=disableNetwork, - stopBefore=stopBefore + stopBefore=stopBefore, ) self.log.debug("inputFiles: %s" % [file.localFile for file in input]) @@ -308,7 +308,7 @@ def upload(self, key, courselab, file, tempfile, fileMD5): except Exception as e: exc_type, _, exc_tb = sys.exc_info() assert exc_type is not None - assert exc_tb is not None # currently handling an exception + assert exc_tb is not None # currently handling an exception fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print(exc_type, fname, exc_tb.tb_lineno) self.log.error("upload request failed: %s" % str(e)) @@ -339,7 +339,7 @@ def addJob(self, key, courselab, jobStr): except Exception as e: exc_type, _, exc_tb = sys.exc_info() assert exc_type is not None - assert exc_tb is not None # currently handling an exception + assert exc_tb is not None # currently handling an exception fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print(exc_type, fname, exc_tb.tb_lineno) self.log.error("addJob request failed: %s" % str(e)) diff --git a/tangoObjects.py b/tangoObjects.py index e679bce0..37dbf0dd 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -1,4 +1,5 @@ from __future__ import annotations + # tangoREST.py # # Implements objects used to pass state within Tango. @@ -118,7 +119,9 @@ def __init__( self._outputFile = outputFile self._name = name self._notifyURL = notifyURL - self._timeout = timeout # How long to run the autodriver on the job for before timing out. + self._timeout = ( + timeout # How long to run the autodriver on the job for before timing out. + ) self._trace: list[str] = [] self._maxOutputFileSize = maxOutputFileSize self._remoteLocation: Optional[str] = None @@ -126,19 +129,21 @@ def __init__( self._accessKey = accessKey self._disableNetwork = disableNetwork self._stopBefore = stopBefore - self._id: Optional[int] = None # uninitialized until it gets added to either the live or dead queue + self._id: Optional[ + int + ] = None # uninitialized until it gets added to either the live or dead queue def __repr__(self): self.syncRemote() return f"ID: {self._id} - Name: {self.name}" - + # TODO: reduce code size/duplication by setting TangoJob as a dataclass # Getters for private variables @property def assigned(self): - self.syncRemote() # Is it necessary to sync here? + self.syncRemote() # Is it necessary to sync here? return self._assigned - + @property def retries(self): self.syncRemote() @@ -148,67 +153,67 @@ def retries(self): def vm(self): self.syncRemote() return self._vm - + @property def input(self): self.syncRemote() return self._input - + @property def outputFile(self): self.syncRemote() return self._outputFile - + @property def name(self): self.syncRemote() return self._name - + @property def notifyURL(self): self.syncRemote() return self._notifyURL - + @property def timeout(self): self.syncRemote() return self._timeout - + @property def trace(self): self.syncRemote() return self._trace - + @property def maxOutputFileSize(self): self.syncRemote() return self._maxOutputFileSize - + @property def remoteLocation(self): self.syncRemote() return self._remoteLocation - + @property def accessKeyId(self): self.syncRemote() return self._accessKeyId - + @property def accessKey(self): self.syncRemote() return self._accessKey - + @property def disableNetwork(self): self.syncRemote() return self._disableNetwork - + @property def stopBefore(self): self.syncRemote() return self._stopBefore - + @property def id(self) -> int: self.syncRemote() @@ -224,7 +229,7 @@ def resetRetries(self): self.syncRemote() self._retries = 0 self.updateRemote() - + def incrementRetries(self): self.syncRemote() self._retries += 1 @@ -258,14 +263,14 @@ def setId(self, new_id: int) -> None: dictionary.delete(key) self._remoteLocation = dict_hash + ":" + str(new_id) self.updateRemote() - + def setTimeout(self, new_timeout): self.syncRemote() self._timeout = new_timeout self.updateRemote() def setKeepForDebugging(self, keep_for_debugging: bool): - if (self._vm is not None): + if self._vm is not None: self.syncRemote() self._vm.keep_for_debugging = keep_for_debugging self.updateRemote() @@ -284,15 +289,16 @@ def __updateSelf(self, other_job): self._maxOutputFileSize = other_job._maxOutputFileSize self._id = other_job._id - def syncRemote(self) -> None: if Config.USE_REDIS and self._remoteLocation is not None: dict_hash = self._remoteLocation.split(":")[0] key = self._remoteLocation.split(":")[1] dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash) - temp_job = dictionary.get(key) # Key should be in dictionary + temp_job = dictionary.get(key) # Key should be in dictionary if temp_job is None: - print(f"Job {key} not found in dictionary {dict_hash}") # TODO: add better error handling for TangoJob + print( + f"Job {key} not found in dictionary {dict_hash}" + ) # TODO: add better error handling for TangoJob return self.__updateSelf(temp_job) @@ -302,20 +308,19 @@ def updateRemote(self) -> None: key = self._remoteLocation.split(":")[1] dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash) dictionary.set(key, self) - - def deleteFromDict(self, dictionary : TangoDictionary) -> None: + + def deleteFromDict(self, dictionary: TangoDictionary) -> None: assert self._id is not None dictionary.delete(self._id) self._remoteLocation = None - - def addToDict(self, dictionary : TangoDictionary) -> None: + + def addToDict(self, dictionary: TangoDictionary) -> None: assert self._id is not None dictionary.set(self._id, self) assert self._remoteLocation is None, "Job already has a remote location" if Config.USE_REDIS: self._remoteLocation = dictionary.hash_name + ":" + str(self._id) self.updateRemote() - def TangoIntValue(object_name, obj): @@ -359,9 +364,11 @@ def get(self): def set(self, val): self.val = val return val - -QueueElem = TypeVar('QueueElem') + +QueueElem = TypeVar("QueueElem") + + class TangoQueue(Protocol[QueueElem]): @staticmethod def create(key_name: str) -> TangoQueue[QueueElem]: @@ -373,19 +380,26 @@ def create(key_name: str) -> TangoQueue[QueueElem]: @abstractmethod def qsize(self) -> int: ... + def empty(self) -> bool: ... + def put(self, item: QueueElem) -> None: ... + def get(self, block=True, timeout=None) -> Optional[QueueElem]: ... + def get_nowait(self) -> Optional[QueueElem]: ... + def remove(self, item: QueueElem) -> None: ... + def make_empty(self) -> None: ... + class ExtendedQueue(Queue, TangoQueue[QueueElem]): """Python Thread safe Queue with the remove and clean function added""" @@ -404,7 +418,8 @@ def remove(self, value): def make_empty(self): with self.mutex: self.queue.clear() - + + class TangoRemoteQueue(TangoQueue): """Simple Queue with Redis Backend""" @@ -467,47 +482,55 @@ def remove(self, item): def make_empty(self) -> None: self.__db.delete(self.key) -T = TypeVar('T') + +T = TypeVar("T") KeyType = Union[str, int] # Dictionary from string to T class TangoDictionary(Protocol[T]): - @staticmethod def create(dictionary_name: str) -> TangoDictionary[T]: if Config.USE_REDIS: return TangoRemoteDictionary(dictionary_name) else: return TangoNativeDictionary() - + @property @abstractmethod def hash_name(self) -> str: ... - + @abstractmethod def __contains__(self, id: KeyType) -> bool: ... + @abstractmethod def set(self, id: KeyType, obj: T) -> str: ... + @abstractmethod def get(self, id: KeyType) -> Optional[T]: ... + @abstractmethod def getExn(self, id: KeyType) -> T: ... + @abstractmethod def keys(self) -> list[str]: ... + @abstractmethod def values(self) -> list[T]: ... + @abstractmethod def delete(self, id: KeyType) -> None: ... + @abstractmethod def make_empty(self) -> None: ... + @abstractmethod def items(self) -> list[tuple[str, T]]: ... @@ -525,8 +548,6 @@ def items(self) -> list[tuple[str, T]]: # return TangoNativeDictionary() - - class TangoRemoteDictionary(TangoDictionary[T]): def __init__(self, object_name): self.r = getRedisConnection() diff --git a/tests/sample_test/expected_output.txt b/tests/sample_test/expected_output.txt index 70c379b6..802992c4 100644 --- a/tests/sample_test/expected_output.txt +++ b/tests/sample_test/expected_output.txt @@ -1 +1 @@ -Hello world \ No newline at end of file +Hello world diff --git a/tests/stressTest.py b/tests/stressTest.py index 6494a8c4..eeb46bfa 100644 --- a/tests/stressTest.py +++ b/tests/stressTest.py @@ -14,7 +14,17 @@ start_time = time.time() expected_output = "" -def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"): + +def printProgressBar( + iteration, + total, + prefix="", + suffix="", + decimals=1, + length=100, + fill="█", + printEnd="\r", +): """ Call in a loop to create terminal progress bar @params: @@ -29,39 +39,65 @@ def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, """ percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))) filledLength = int(length * iteration // total) - bar = fill * filledLength + '-' * (length - filledLength) - print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd) + bar = fill * filledLength + "-" * (length - filledLength) + print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=printEnd) # Print New Line on Complete - if iteration == total: + if iteration == total: print() -def run_stress_test(num_submissions, submission_delay, autograder_image, output_file, tango_port, cli_path, - job_name, job_path, instance_type, timeout, ec2): - printProgressBar(0, num_submissions, prefix = 'Jobs Added:', suffix = 'Complete', length = 50) - with open(output_file, 'a') as f: + +def run_stress_test( + num_submissions, + submission_delay, + autograder_image, + output_file, + tango_port, + cli_path, + job_name, + job_path, + instance_type, + timeout, + ec2, +): + printProgressBar( + 0, num_submissions, prefix="Jobs Added:", suffix="Complete", length=50 + ) + with open(output_file, "a") as f: f.write(f"Stress testing with {num_submissions} submissions\n") - + for i in range(1, num_submissions + 1): command = [ - 'python3', cli_path, - '-P', str(tango_port), - '-k', 'test', - '-l', job_name, - '--runJob', job_path, - '--image', autograder_image, - '--instanceType', instance_type, - '--timeout', str(timeout), - '--callbackURL', ("http://localhost:8888/autograde_done?id=%d" % (i)) + "python3", + cli_path, + "-P", + str(tango_port), + "-k", + "test", + "-l", + job_name, + "--runJob", + job_path, + "--image", + autograder_image, + "--instanceType", + instance_type, + "--timeout", + str(timeout), + "--callbackURL", + ("http://localhost:8888/autograde_done?id=%d" % (i)), ] if ec2: - command += ['--ec2'] + command += ["--ec2"] subprocess.run(command, stdout=f, stderr=f) f.write(f"Submission {i} completed\n") - printProgressBar(i, num_submissions, prefix = 'Jobs Added:', suffix = 'Complete', length = 50) + printProgressBar( + i, num_submissions, prefix="Jobs Added:", suffix="Complete", length=50 + ) if submission_delay > 0: time.sleep(submission_delay) print() + class AutogradeDoneHandler(tornado.web.RequestHandler): def post(self): global finished_tests @@ -71,10 +107,16 @@ def post(self): id = self.get_query_argument("id") fileBody = self.request.files["file"][0]["body"].decode() scoreJson = fileBody.split("\n")[-2] - with open(os.path.join(test_dir, "output", "output%s.txt" % id), 'w') as f: + with open(os.path.join(test_dir, "output", "output%s.txt" % id), "w") as f: f.write(fileBody) finished_tests[str(id)] = scoreJson - printProgressBar(len(finished_tests), sub_num, prefix = 'Tests Done:', suffix = 'Complete', length = 50) + printProgressBar( + len(finished_tests), + sub_num, + prefix="Tests Done:", + suffix="Complete", + length=50, + ) self.write("ok") self.flush() @@ -87,6 +129,7 @@ def on_finish(self): print("\nShutting down server...") tornado.ioloop.IOLoop.current().stop() + def create_summary(): success = [] failed = [] @@ -95,7 +138,7 @@ def create_summary(): success.append(i) else: failed.append(i) - with open(os.path.join(test_dir, "summary.txt"), 'w') as f: + with open(os.path.join(test_dir, "summary.txt"), "w") as f: f.write("Total Time: %d seconds\n" % (time.time() - start_time)) f.write("Total Succeeded: %d / %d\n" % (len(success), sub_num)) f.write("Total Failed: %d / %d\n" % (len(failed), sub_num)) @@ -107,21 +150,28 @@ def create_summary(): for i in range(0, len(failed)): f.write("Test Case #%d: %s\n" % (failed[i], finished_tests[str(failed[i])])) + def make_app(): - return tornado.web.Application([ - (r"/autograde_done", AutogradeDoneHandler), - ]) + return tornado.web.Application( + [ + (r"/autograde_done", AutogradeDoneHandler), + ] + ) + def notifyServer(): global shutdown_event app = make_app() app.listen(8888) - printProgressBar(0, sub_num, prefix = 'Tests Done:', suffix = 'Complete', length = 50) + printProgressBar(0, sub_num, prefix="Tests Done:", suffix="Complete", length=50) tornado.ioloop.IOLoop.current().start() + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Stress test script for Tango") - parser.add_argument('--test_dir', type=str, required=True, help="Directory to run the test in") + parser.add_argument( + "--test_dir", type=str, required=True, help="Directory to run the test in" + ) args = parser.parse_args() @@ -129,10 +179,10 @@ def notifyServer(): test_dir = args.test_dir - with open(os.path.join(args.test_dir, dirname + '.yaml'), 'r') as f: + with open(os.path.join(args.test_dir, dirname + ".yaml"), "r") as f: data = yaml.load(f, Loader=yaml.SafeLoader) - - with open(os.path.join(args.test_dir, data["expected_output"]), 'r') as f: + + with open(os.path.join(args.test_dir, data["expected_output"]), "r") as f: expected_output = f.read() sub_num = data["num_submissions"] @@ -152,10 +202,10 @@ def notifyServer(): data["tango_port"], data["cli_path"], dirname, - os.path.join(args.test_dir, 'input'), + os.path.join(args.test_dir, "input"), data["instance_type"], data["timeout"], - data["ec2"] + data["ec2"], ) - notifyServer() \ No newline at end of file + notifyServer() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 4b2e4090..533a8cd0 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -34,10 +34,15 @@ logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL) valid_instance_types: Set[InstanceTypeType] = set(get_args(InstanceTypeType)) + + def check_instance_type(instance_type: str) -> TypeGuard[InstanceTypeType]: return instance_type in valid_instance_types -def timeout_with_retries(command: List[str], time_out: float = 1, retries: int = 3, retry_delay: float = 2) -> int: + +def timeout_with_retries( + command: List[str], time_out: float = 1, retries: int = 3, retry_delay: float = 2 +) -> int: """timeout - Run a unix command with a timeout. Return -1 on timeout, otherwise return the return value from the command, which is typically 0 for success, 1-255 for failure. @@ -106,7 +111,9 @@ def release_vm_semaphore() -> None: Ec2SSH._vm_semaphore.release() # TODO: the arguments accessKeyId and accessKey don't do anything - def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = None) -> None: + def __init__( + self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = None + ) -> None: """log - logger for the instance connection - EC2Connection object that stores the connection info to the EC2 network @@ -139,11 +146,15 @@ def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = # self.createKeyPair() # create boto3resource - self.img2ami: Dict[str, Image] = {} # this is a bad name, should really be img_name to img + self.img2ami: Dict[ + str, Image + ] = {} # this is a bad name, should really be img_name to img self.images: List[Image] = [] try: # This is a service resource - self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) # TODO: rename this ot self.ec2resource + self.boto3resource: EC2ServiceResource = boto3.resource( + "ec2", config.Config.EC2_REGION + ) # TODO: rename this ot self.ec2resource self.boto3client = boto3.client("ec2", config.Config.EC2_REGION) # Get images from ec2 @@ -262,7 +273,7 @@ def createSecurityGroup(self) -> None: except Exception as e: self.log.debug("ERROR checking for existing security group: %s", e) # ! Note: We've never encountered the lines below before (there was a type error), - # ! because we've always had a security group. + # ! because we've always had a security group. # ! Difficult to test because it involves deleting all security groups. try: security_group_response = self.boto3client.create_security_group( @@ -270,9 +281,7 @@ def createSecurityGroup(self) -> None: Description="Autolab security group - allowing all traffic", ) security_group_id = security_group_response["GroupId"] - self.boto3client.authorize_security_group_ingress( - GroupId=security_group_id - ) + self.boto3client.authorize_security_group_ingress(GroupId=security_group_id) except Exception as e: self.log.debug("ERROR in creating security group: %s", e) @@ -297,7 +306,6 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: # ensure that security group exists self.createSecurityGroup() - reservation: List[Instance] = self.boto3resource.create_instances( ImageId=ec2instance["ami"], KeyName=self.key_pair_name, @@ -305,13 +313,12 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: InstanceType=instance_type, MaxCount=1, MinCount=1, - InstanceMarketOptions= - { + InstanceMarketOptions={ "MarketType": "spot", "SpotOptions": { "SpotInstanceType": "one-time", - "InstanceInterruptionBehavior": "terminate" - } + "InstanceInterruptionBehavior": "terminate", + }, }, ) @@ -339,18 +346,13 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]: # reload the state of the new instance try_load_instance(newInstance) for inst in instances.filter(InstanceIds=[newInstance.id]): - self.log.debug( - "VM %s %s: is running" % (vm.name, newInstance.id) - ) + self.log.debug("VM %s %s: is running" % (vm.name, newInstance.id)) instanceRunning = True if instanceRunning: break - if ( - time.time() - start_time - > config.Config.INITIALIZEVM_TIMEOUT - ): + if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT: raise ValueError( "VM %s %s: timeout (%d seconds) before reaching 'running' state" % ( @@ -471,18 +473,21 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: # Sleep a bit before trying again time.sleep(config.Config.TIMER_POLL_INTERVAL) - def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int: + def copyIn( + self, + vm: TangoMachine, + inputFiles: List[InputFile], + job_id: Optional[int] = None, + ) -> int: """copyIn - Copy input files to VM Args: - vm is a TangoMachine object - - inputFiles is a list of objects with attributes localFile and destFile. + - inputFiles is a list of objects with attributes localFile and destFile. localFile is the file on the host, destFile is the file on the VM. - - job_id is the job id of the job being run on the VM. + - job_id is the job id of the job being run on the VM. It is used for logging purposes only. """ - self.log.info( - "copyIn %s - writing files" % self.instanceName(vm.id, vm.name) - ) + self.log.info("copyIn %s - writing files" % self.instanceName(vm.id, vm.name)) domain_name = self.domainName(vm) @@ -504,18 +509,14 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional self.log.info("%s for job %s" % (line, job_id)) self.log.info("Return Code: %s, job: %s" % (result.returncode, job_id)) if result.stderr != 0: - self.log.info( - "Standard Error: %s, job: %s" % (result.stderr, job_id) - ) + self.log.info("Standard Error: %s, job: %s" % (result.stderr, job_id)) # Validate inputFiles structure if not inputFiles or not all( hasattr(file, "localFile") and hasattr(file, "destFile") for file in inputFiles ): - self.log.info( - "Error: Invalid inputFiles Structure, job: %s" % job_id - ) + self.log.info("Error: Invalid inputFiles Structure, job: %s" % job_id) for file in inputFiles: self.log.info("%s - %s" % (file.localFile, file.destFile)) @@ -524,8 +525,7 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional + self.ssh_flags + [ file.localFile, - "%s@%s:~/autolab/%s" - % (self.ec2User, domain_name, file.destFile), + "%s@%s:~/autolab/%s" % (self.ec2User, domain_name, file.destFile), ], config.Config.COPYIN_TIMEOUT, ) @@ -535,7 +535,13 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional return 0 - def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: + def runJob( + self, + vm: TangoMachine, + runTimeout: int, + maxOutputFileSize: int, + disableNetwork: bool, + ) -> int: """runJob - Run the make command on a VM using SSH and redirect output to file "output". """ @@ -558,9 +564,7 @@ def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disa # no logging for now ret = VMMSUtils.timeout( - ["ssh"] - + self.ssh_flags - + ["%s@%s" % (self.ec2User, domain_name), runcmd], + ["ssh"] + self.ssh_flags + ["%s@%s" % (self.ec2User, domain_name), runcmd], runTimeout * 2, ) @@ -615,7 +619,6 @@ def copyOut(self, vm: TangoMachine, destFile: str) -> int: # re.error = xxx_todo_changeme # Error copying out the timing data (probably runJob failed) pass - return VMMSUtils.timeout( ["scp"] @@ -639,9 +642,7 @@ def destroyVM(self, vm: TangoMachine) -> None: InstanceIds=[vm.instance_id] ) if not instances: - self.log.debug( - "no instances found with instance id %s", vm.instance_id - ) + self.log.debug("no instances found with instance id %s", vm.instance_id) # Keep the vm and mark with meaningful tags for debugging if ( hasattr(config.Config, "KEEP_VM_AFTER_FAILURE") @@ -668,9 +669,7 @@ def destroyVM(self, vm: TangoMachine) -> None: if not self.useDefaultKeyPair: self.deleteKeyPair() except Exception as e: - self.log.error( - "destroyVM failed: %s for vm %s" % (e, vm.instance_id) - ) + self.log.error("destroyVM failed: %s for vm %s" % (e, vm.instance_id)) Ec2SSH.release_vm_semaphore() @@ -704,13 +703,8 @@ def getVMs(self) -> List[TangoMachine]: instance.tags, "Name" ) # inst name PREFIX-serial-IMAGE # Name tag is the standard form of prefix-serial-image - if not ( - instName - and re.match("%s-" % config.Config.PREFIX, instName) - ): - self.log.debug( - "getVMs: Instance id %s skipped" % vm.instance_id - ) + if not (instName and re.match("%s-" % config.Config.PREFIX, instName)): + self.log.debug("getVMs: Instance id %s skipped" % vm.instance_id) continue # instance without name tag or proper prefix vm.name = instName @@ -723,8 +717,7 @@ def getVMs(self) -> List[TangoMachine]: vms.append(vm) self.log.debug( - "getVMs: Instance id %s, name %s" - % (vm.instance_id, vm.name) + "getVMs: Instance id %s, name %s" % (vm.instance_id, vm.name) ) except Exception as e: self.log.debug("getVMs Failed: %s" % e) @@ -734,7 +727,9 @@ def getVMs(self) -> List[TangoMachine]: def existsVM(self, vm: TangoMachine) -> bool: """existsVM - Checks whether a VM exists in the vmms.""" # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrationec2.html - filters: Sequence[FilterTypeDef] = [{"Name": "instance-state-name", "Values": ["running"]}] + filters: Sequence[FilterTypeDef] = [ + {"Name": "instance-state-name", "Values": ["running"]} + ] # gets all running instances instances = self.boto3resource.instances.filter(Filters=filters) for instance in instances: @@ -766,13 +761,11 @@ def getPartialOutput(self, vm: TangoMachine) -> str: ) sshcmd = ( - ["ssh"] - + self.ssh_flags - + ["%s@%s" % (self.ec2User, domain_name), runcmd] + ["ssh"] + self.ssh_flags + ["%s@%s" % (self.ec2User, domain_name), runcmd] ) - output = subprocess.check_output( - sshcmd, stderr=subprocess.STDOUT - ).decode("utf-8") + output = subprocess.check_output(sshcmd, stderr=subprocess.STDOUT).decode( + "utf-8" + ) return output diff --git a/vmms/interface.py b/vmms/interface.py index a437401a..33454f13 100644 --- a/vmms/interface.py +++ b/vmms/interface.py @@ -13,11 +13,22 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: ... @abstractmethod - def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int: + def copyIn( + self, + vm: TangoMachine, + inputFiles: List[InputFile], + job_id: Optional[int] = None, + ) -> int: ... @abstractmethod - def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: # -1 to infinity + def runJob( + self, + vm: TangoMachine, + runTimeout: int, + maxOutputFileSize: int, + disableNetwork: bool, + ) -> int: # -1 to infinity ... @abstractmethod @@ -27,7 +38,7 @@ def copyOut(self, vm: TangoMachine, destFile: str) -> int: @abstractmethod def destroyVM(self, vm: TangoMachine) -> None: ... - + @abstractmethod def safeDestroyVM(self, vm: TangoMachine) -> None: ... @@ -47,7 +58,7 @@ def instanceName(self, id: int, name: str) -> str: @abstractmethod def getImages(self) -> List[str]: ... - + @abstractmethod def getPartialOutput(self, vm: TangoMachine) -> str: ... diff --git a/vmms/localDocker.py b/vmms/localDocker.py index fa727bb6..5617b807 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -18,7 +18,6 @@ from vmms.sharedUtils import VMMSUtils - # # User defined exceptions # @@ -71,7 +70,12 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]: """waitVM - Nothing to do for waitVM""" return 0 - def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int: + def copyIn( + self, + vm: TangoMachine, + inputFiles: List[InputFile], + job_id: Optional[int] = None, + ) -> int: """copyIn - Create a directory to be mounted as a volume for the docker containers. Copy input files to this directory. """ @@ -90,7 +94,13 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional ) return 0 - def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: + def runJob( + self, + vm: TangoMachine, + runTimeout: int, + maxOutputFileSize: int, + disableNetwork: bool, + ) -> int: """runJob - Run a docker container by doing the follows: - mount directory corresponding to this job to /home/autolab in the container @@ -154,7 +164,9 @@ def destroyVM(self, vm: TangoMachine) -> None: volumePath = self.getVolumePath("") # Do a hard kill on corresponding docker container. # Return status does not matter. - VMMSUtils.timeout(["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT) + VMMSUtils.timeout( + ["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT + ) # Destroy corresponding volume if it exists. if instanceName in os.listdir(volumePath): shutil.rmtree(volumePath + instanceName) diff --git a/vmms/sharedUtils.py b/vmms/sharedUtils.py index 0165b313..15cb2777 100644 --- a/vmms/sharedUtils.py +++ b/vmms/sharedUtils.py @@ -4,6 +4,7 @@ from typing import List import config + class VMMSUtils: @staticmethod def constructInstanceName(id: int, name: str) -> str: @@ -12,7 +13,7 @@ def constructInstanceName(id: int, name: str) -> str: instance names manually. """ return "%s-%d-%s" % (config.Config.PREFIX, id, name) - + @staticmethod def timeout(command: List[str], time_out: float = 1) -> int: """timeout - Run a unix command with a timeout. Return -1 on @@ -43,4 +44,3 @@ def timeout(command: List[str], time_out: float = 1) -> int: else: returncode = poll_result return returncode - diff --git a/worker.py b/worker.py index 8d92123c..8b7b115b 100644 --- a/worker.py +++ b/worker.py @@ -19,6 +19,7 @@ from typing import Dict, Optional from vmms.interface import VMMSInterface from preallocator import Preallocator + # # Worker - The worker class is very simple and very dumb. The goal is # to walk through the VMMS interface, track the job's progress, and if @@ -30,6 +31,7 @@ # anything else in the system. # + class DetachMethod(Enum): RETURN_TO_POOL = "return_to_pool" DESTROY_WITHOUT_REPLACEMENT = "destroy_without_replacement" @@ -38,12 +40,19 @@ class DetachMethod(Enum): # We always preallocate a VM for the worker to use class Worker(threading.Thread): - def __init__(self, job: TangoJob, vmms: VMMSInterface, jobQueue: JobQueue, preallocator: Preallocator, preVM: TangoMachine): + def __init__( + self, + job: TangoJob, + vmms: VMMSInterface, + jobQueue: JobQueue, + preallocator: Preallocator, + preVM: TangoMachine, + ): threading.Thread.__init__(self) self.daemon = True self.job = job self.vmms: VMMSInterface = vmms - self.jobQueue : JobQueue = jobQueue + self.jobQueue: JobQueue = jobQueue self.preallocator = preallocator self.preVM = preVM threading.Thread.__init__(self) @@ -55,8 +64,7 @@ def __init__(self, job: TangoJob, vmms: VMMSInterface, jobQueue: JobQueue, preal # def __del__(self): assert self.cleanupStatus, "Worker must call detachVM before returning" - - + def detachVM(self, detachMethod: DetachMethod): """detachVM - Detach the VM from this worker. The options are to return it to the pool's free list (return_vm), destroy it @@ -99,7 +107,10 @@ def rescheduleJob(self, hdrfile: str, ret: Dict[str, int], err: str) -> None: # Try a few times before giving up if self.job.retries < Config.JOB_RETRIES: - self.log.error("Retrying job %s:%d, retries: %d" % (self.job.name, self.job.id, self.job.retries)) + self.log.error( + "Retrying job %s:%d, retries: %d" + % (self.job.name, self.job.id, self.job.retries) + ) self.job.appendTrace( "%s|Retrying job %s:%d, retries: %d" % (datetime.now().ctime(), self.job.name, self.job.id, self.job.retries) @@ -114,7 +125,9 @@ def rescheduleJob(self, hdrfile: str, ret: Dict[str, int], err: str) -> None: # Here is where we give up else: full_err = f"Internal Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}" - self.log.error(f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err)) + self.log.error( + f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err) + ) self.job.appendTrace( "%s|Giving up on job %s:%d. %s" % (datetime.now().ctime(), self.job.name, self.job.id, full_err) @@ -168,18 +181,22 @@ def notifyServer(self, job: TangoJob) -> None: ) fh.close() else: - self.log.info("No callback URL for job %s:%d" % (self.job.name, self.job.id)) + self.log.info( + "No callback URL for job %s:%d" % (self.job.name, self.job.id) + ) except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) - def afterJobExecution(self, hdrfile: str, msg: str, detachMethod: DetachMethod) -> None: + def afterJobExecution( + self, hdrfile: str, msg: str, detachMethod: DetachMethod + ) -> None: self.jobQueue.makeDead(self.job, msg) - + # Update the text that users see in the autodriver output file self.appendMsg(hdrfile, msg) self.catFiles(hdrfile, self.job.outputFile) os.chmod(self.job.outputFile, 0o644) - + # Thread exit after termination self.detachVM(detachMethod) self.notifyServer(self.job) @@ -221,7 +238,7 @@ def run(self) -> None: ) ) self.log.debug("Assigned job to preallocated VM") - ret["initializevm"] = 0 # Vacuous success since it doesn't happen + ret["initializevm"] = 0 # Vacuous success since it doesn't happen vm = self.job.vm @@ -243,7 +260,9 @@ def run(self) -> None: if self.job.stopBefore == "waitvm": msg = "Execution stopped before %s" % self.job.stopBefore self.job.setKeepForDebugging(True) - self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) + self.afterJobExecution( + hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE + ) return ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT) @@ -277,15 +296,19 @@ def run(self) -> None: self.job.id, ) ) - if (self.job.stopBefore == "copyin"): + if self.job.stopBefore == "copyin": msg = "Execution stopped before %s" % self.job.stopBefore self.job.setKeepForDebugging(True) - self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) + self.afterJobExecution( + hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE + ) self.log.debug(msg) return # Copy input files to VM ret["copyin"] = self.vmms.copyIn(vm, self.job.input, self.job.id) - self.log.debug(f"After copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}") + self.log.debug( + f"After copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}" + ) if ret["copyin"] != 0: Config.copyin_errors += 1 @@ -293,11 +316,7 @@ def run(self) -> None: self.job.vm.notes = str(self.job.id) + "_" + self.job.name self.job.setKeepForDebugging(True) self.log.debug(msg) - self.rescheduleJob( - hdrfile, - ret, - msg - ) + self.rescheduleJob(hdrfile, ret, msg) return self.log.info( @@ -309,10 +328,12 @@ def run(self) -> None: % (datetime.now().ctime(), self.job.name, self.job.id, ret["copyin"]) ) - if (self.job.stopBefore == "runjob"): + if self.job.stopBefore == "runjob": msg = "Execution stopped before %s" % self.job.stopBefore self.job.setKeepForDebugging(True) - self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) + self.afterJobExecution( + hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE + ) return # Run the job on the virtual machine ret["runjob"] = self.vmms.runJob( @@ -343,13 +364,9 @@ def run(self) -> None: ret["runjob"] ) Config.runjob_errors += 1 - self.rescheduleJob( - hdrfile, - ret, - msg - ) + self.rescheduleJob(hdrfile, ret, msg) return - + self.log.info( "Job %s:%d executed [status=%s]" % (self.job.name, self.job.id, ret["runjob"]) @@ -359,10 +376,12 @@ def run(self) -> None: % (datetime.now().ctime(), self.job.name, self.job.id, ret["runjob"]) ) - if (self.job.stopBefore == "copyout"): + if self.job.stopBefore == "copyout": msg = "Execution stopped before %s" % self.job.stopBefore self.job.setKeepForDebugging(True) - self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE) + self.afterJobExecution( + hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE + ) return # Copy the output back. ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile) @@ -371,10 +390,10 @@ def run(self) -> None: self.rescheduleJob( hdrfile, ret, - f"Internal error: copyOut failed (status={ret['copyout']})" + f"Internal error: copyOut failed (status={ret['copyout']})", ) return - + self.log.info( "Output copied for job %s:%d [status=%d]" % (self.job.name, self.job.id, ret["copyout"]) @@ -390,13 +409,17 @@ def run(self) -> None: self.log.info("Success: job %s:%d finished" % (self.job.name, self.job.id)) for status in ret.values(): - assert status == 0, "Should not get to the success point if any stage failed" + assert ( + status == 0 + ), "Should not get to the success point if any stage failed" # TODO: test this, then remove everything below this point - + # Move the job from the live queue to the dead queue # with an explanatory message msg = "Success: Autodriver returned normally" - self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.RETURN_TO_POOL) + self.afterJobExecution( + hdrfile, msg, detachMethod=DetachMethod.RETURN_TO_POOL + ) return # @@ -412,6 +435,6 @@ def run(self) -> None: # TODO: move self.job.makeVM to the start of the try block, so it should be an error if vm fails to be set if self.preVM and not vm: self.job.makeVM(self.preVM) - vm = self.preVM + vm = self.preVM if vm: self.detachVM(DetachMethod.DESTROY_AND_REPLACE) From b0bb611a66f8955f16f775a63d683808407dd15f Mon Sep 17 00:00:00 2001 From: Anthony Yip Date: Mon, 16 Mar 2026 13:47:08 -0400 Subject: [PATCH 39/39] clean up worker.py --- vmms/ec2SSH.py | 1 - worker.py | 14 +++----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 533a8cd0..f55c3c05 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -745,7 +745,6 @@ def getImages(self) -> List[str]: """getImages - return a constant; actually use the ami specified in config""" return [key for key in self.img2ami] - # TODO: later def getTag(self, tagList: List[TagTypeDef], tagKey: str) -> Optional[str]: if tagList: for tag in tagList: diff --git a/worker.py b/worker.py index 8b7b115b..aab2e46b 100644 --- a/worker.py +++ b/worker.py @@ -38,7 +38,7 @@ class DetachMethod(Enum): DESTROY_AND_REPLACE = "replace" -# We always preallocate a VM for the worker to use +# We always preallocate a VM for the worker to use, hence it isn't Optional class Worker(threading.Thread): def __init__( self, @@ -428,13 +428,5 @@ def run(self) -> None: except Exception as err: self.log.exception("Internal Error") self.appendMsg(self.job.outputFile, "Internal Error: %s" % err) - # if vm is set, then the normal job assignment completed, - # and detachVM can be run - # if vm is not set but self.preVM is set, we still need - # to return the VM, but have to initialize self.job.vm first - # TODO: move self.job.makeVM to the start of the try block, so it should be an error if vm fails to be set - if self.preVM and not vm: - self.job.makeVM(self.preVM) - vm = self.preVM - if vm: - self.detachVM(DetachMethod.DESTROY_AND_REPLACE) + # error must have occurred after the job had its VM set + self.detachVM(DetachMethod.DESTROY_AND_REPLACE)