From ce52b324ef9ea1707c1c96f0f96a5735e6f842f0 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Fri, 21 Mar 2025 15:26:21 -0400
Subject: [PATCH 01/39] Fixed the change by making the retries attribute
 private (and also the assigned attribute), improving logging surrounding
 retries and giving up

---
 jobQueue.py         |  4 +--
 tangoObjects.py     | 59 +++++++++++++++++++++++++++++++--------------
 vmms/localDocker.py |  2 +-
 worker.py           | 14 ++++++++++-
 4 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index a583b3f1..1b0612e8 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -281,9 +281,9 @@ def unassignJob(self, jobId):
 
         # Increment the number of retires
         if job.retries is None:
-            job.retries = 0
+            job.resetRetries()
         else:
-            job.retries += 1
+            job.incrementRetries()
             Config.job_retries += 1
 
         self.log.info("unassignJob|Unassigning job %s" % str(job.id))
diff --git a/tangoObjects.py b/tangoObjects.py
index 03c33c58..83da888f 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -102,8 +102,8 @@ def __init__(
         disableNetwork=None,
         stopBefore="",
     ):
-        self.assigned = False
-        self.retries = 0
+        self._assigned = False
+        self._retries = 0
 
         self.vm = vm
         if input is None:
@@ -126,10 +126,31 @@ def __init__(
     def __repr__(self):
         self.syncRemote()
         return f"ID: {self.id} - Name: {self.name}"
+    
+    # Getters for private variables
+    @property
+    def assigned(self):
+        self.syncRemote() # Is it necessary to sync here?
+        return self._assigned
+    
+    @property
+    def retries(self):
+        self.syncRemote()
+        return self._retries
 
     def makeAssigned(self):
         self.syncRemote()
-        self.assigned = True
+        self._assigned = True
+        self.updateRemote()
+
+    def resetRetries(self):
+        self.syncRemote()
+        self._retries = 0
+        self.updateRemote()
+        
+    def incrementRetries(self):
+        self.syncRemote()
+        self._retries += 1
         self.updateRemote()
 
     def makeVM(self, vm):
@@ -139,12 +160,12 @@ def makeVM(self, vm):
 
     def makeUnassigned(self):
         self.syncRemote()
-        self.assigned = False
+        self._assigned = False
         self.updateRemote()
 
     def isNotAssigned(self):
         self.syncRemote()
-        return not self.assigned
+        return not self._assigned
 
     def appendTrace(self, trace_str):
         self.syncRemote()
@@ -161,13 +182,27 @@ def setId(self, new_id):
             self._remoteLocation = dict_hash + ":" + str(new_id)
             self.updateRemote()
 
+    # Private method
+    def __updateSelf(self, other_job):
+        self._assigned = other_job._assigned
+        self._retries = other_job._retries
+        self.vm = other_job.vm
+        self.input = other_job.input
+        self.outputFile = other_job.outputFile
+        self.name = other_job.name
+        self.notifyURL = other_job.notifyURL
+        self.timeout = other_job.timeout
+        self.trace = other_job.trace
+        self.maxOutputFileSize = other_job.maxOutputFileSize
+
+
     def syncRemote(self):
         if Config.USE_REDIS and self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
             dictionary = TangoDictionary(dict_hash)
             temp_job = dictionary.get(key)
-            self.updateSelf(temp_job)
+            self.__updateSelf(temp_job)
 
     def updateRemote(self):
         if Config.USE_REDIS and self._remoteLocation is not None:
@@ -176,18 +211,6 @@ def updateRemote(self):
             dictionary = TangoDictionary(dict_hash)
             dictionary.set(key, self)
 
-    def updateSelf(self, other_job):
-        self.assigned = other_job.assigned
-        self.retries = other_job.retries
-        self.vm = other_job.vm
-        self.input = other_job.input
-        self.outputFile = other_job.outputFile
-        self.name = other_job.name
-        self.notifyURL = other_job.notifyURL
-        self.timeout = other_job.timeout
-        self.trace = other_job.trace
-        self.maxOutputFileSize = other_job.maxOutputFileSize
-
 
 def TangoIntValue(object_name, obj):
     if Config.USE_REDIS:
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index 45dda03d..1a1e1dd0 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -123,7 +123,7 @@ def waitVM(self, vm, max_secs):
         """waitVM - Nothing to do for waitVM"""
         return
 
-    def copyIn(self, vm, inputFiles):
+    def copyIn(self, vm, inputFiles, job_id=None):
         """copyIn - Create a directory to be mounted as a volume
         for the docker containers. Copy input files to this directory.
         """
diff --git a/worker.py b/worker.py
index 0c6b6614..f06e1f10 100644
--- a/worker.py
+++ b/worker.py
@@ -77,6 +77,11 @@ def rescheduleJob(self, hdrfile, ret, err):
 
         # Try a few times before giving up
         if self.job.retries < Config.JOB_RETRIES:
+            self.log.error("Retrying job %s:%d, retries: %d" % (self.job.name, self.job.id, self.job.retries))
+            self.job.appendTrace(
+                "%s|Retrying job %s:%d, retries: %d"
+                % (datetime.now().ctime(), self.job.name, self.job.id, self.job.retries)
+            )
             try:
                 os.remove(hdrfile)
             except OSError:
@@ -86,7 +91,12 @@ def rescheduleJob(self, hdrfile, ret, err):
 
         # Here is where we give up
         else:
-            self.jobQueue.makeDead(self.job.id, err)
+            self.log.error("Giving up on job %s:%d" % (self.job.name, self.job.id))
+            self.job.appendTrace(
+                "%s|Giving up on job %s:%d"
+                % (datetime.now().ctime(), self.job.name, self.job.id)
+            )
+            self.jobQueue.makeDead(self.job.id, err) # Note: this reports the error that caused the last call to rescheduleJob to fail
 
             self.appendMsg(
                 hdrfile,
@@ -149,6 +159,8 @@ def notifyServer(self, job):
                     % (job.notifyURL, response.content)
                 )
                 fh.close()
+            else:
+                self.log.info("No callback URL for job %s:%d" % (self.job.name, self.job.id))
         except Exception as e:
             self.log.debug("Error in notifyServer: %s" % str(e))
 

From 28db9e5ef0a13f9ef22a6d7896c4ddb2404f8845 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 23 Mar 2025 13:43:50 -0400
Subject: [PATCH 02/39] Made all of tangoJob's attributes read only

---
 tango.py        |   2 +-
 tangoObjects.py | 119 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/tango.py b/tango.py
index 1a03ed0d..e487481a 100755
--- a/tango.py
+++ b/tango.py
@@ -399,7 +399,7 @@ def __validateJob(self, job, vmms):
                 "validateJob: Setting job.timeout to" " default config value: %d secs",
                 Config.RUNJOB_TIMEOUT,
             )
-            job.timeout = Config.RUNJOB_TIMEOUT
+            job.setTimeout(Config.RUNJOB_TIMEOUT)
 
         # Any problems, return an error status
         if errors > 0:
diff --git a/tangoObjects.py b/tangoObjects.py
index 83da888f..dcd97c67 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -105,23 +105,23 @@ def __init__(
         self._assigned = False
         self._retries = 0
 
-        self.vm = vm
+        self._vm = vm
         if input is None:
-            self.input = []
+            self._input = []
         else:
-            self.input = input
-
-        self.outputFile = outputFile
-        self.name = name
-        self.notifyURL = notifyURL
-        self.timeout = timeout
-        self.trace = []
-        self.maxOutputFileSize = maxOutputFileSize
+            self._input = input
+
+        self._outputFile = outputFile
+        self._name = name
+        self._notifyURL = notifyURL
+        self._timeout = timeout # How long to run the autodriver on the job for before timing out.
+        self._trace = []
+        self._maxOutputFileSize = maxOutputFileSize
         self._remoteLocation = None
-        self.accessKeyId = accessKeyId
-        self.accessKey = accessKey
-        self.disableNetwork = disableNetwork
-        self.stopBefore = "stopBefore"
+        self._accessKeyId = accessKeyId
+        self._accessKey = accessKey
+        self._disableNetwork = disableNetwork
+        self._stopBefore = "stopBefore"
 
     def __repr__(self):
         self.syncRemote()
@@ -138,6 +138,72 @@ def retries(self):
         self.syncRemote()
         return self._retries
 
+    @property
+    def vm(self):
+        self.syncRemote()
+        return self._vm
+    
+    @property
+    def input(self):
+        self.syncRemote()
+        return self._input
+    
+    @property
+    def outputFile(self):
+        self.syncRemote()
+        return self._outputFile
+    
+    @property
+    def name(self):
+        self.syncRemote()
+        return self._name
+    
+    @property
+    def notifyURL(self):
+        self.syncRemote()
+        return self._notifyURL
+    
+    @property
+    def timeout(self):
+        self.syncRemote()
+        return self._timeout
+    
+    @property
+    def trace(self):
+        self.syncRemote()
+        return self._trace
+    
+    @property
+    def maxOutputFileSize(self):
+        self.syncRemote()
+        return self._maxOutputFileSize
+    
+    @property
+    def remoteLocation(self):
+        self.syncRemote()
+        return self._remoteLocation
+    
+    @property
+    def accessKeyId(self):
+        self.syncRemote()
+        return self._accessKeyId
+    
+    @property
+    def accessKey(self):
+        self.syncRemote()
+        return self._accessKey
+    
+    @property
+    def disableNetwork(self):
+        self.syncRemote()
+        return self._disableNetwork
+    
+    @property
+    def stopBefore(self):
+        self.syncRemote()
+        return self._stopBefore
+    
+
     def makeAssigned(self):
         self.syncRemote()
         self._assigned = True
@@ -155,7 +221,7 @@ def incrementRetries(self):
 
     def makeVM(self, vm):
         self.syncRemote()
-        self.vm = vm
+        self._vm = vm
         self.updateRemote()
 
     def makeUnassigned(self):
@@ -169,7 +235,7 @@ def isNotAssigned(self):
 
     def appendTrace(self, trace_str):
         self.syncRemote()
-        self.trace.append(trace_str)
+        self._trace.append(trace_str)
         self.updateRemote()
 
     def setId(self, new_id):
@@ -181,19 +247,24 @@ def setId(self, new_id):
             dictionary.delete(key)
             self._remoteLocation = dict_hash + ":" + str(new_id)
             self.updateRemote()
+            
+    def setTimeout(self, new_timeout):
+        self.syncRemote()
+        self._timeout = new_timeout
+        self.updateRemote()
 
     # Private method
     def __updateSelf(self, other_job):
         self._assigned = other_job._assigned
         self._retries = other_job._retries
-        self.vm = other_job.vm
-        self.input = other_job.input
-        self.outputFile = other_job.outputFile
-        self.name = other_job.name
-        self.notifyURL = other_job.notifyURL
-        self.timeout = other_job.timeout
-        self.trace = other_job.trace
-        self.maxOutputFileSize = other_job.maxOutputFileSize
+        self._vm = other_job._vm
+        self._input = other_job._input
+        self._outputFile = other_job._outputFile
+        self._name = other_job._name
+        self._notifyURL = other_job._notifyURL
+        self._timeout = other_job._timeout
+        self._trace = other_job._trace
+        self._maxOutputFileSize = other_job._maxOutputFileSize
 
 
     def syncRemote(self):

From 9d57597555ed9ee4aa2fd0415c166ef7c9300401 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 24 Aug 2025 13:00:51 -0400
Subject: [PATCH 03/39] fixed the local/remote desyncing issues, left some
 print statements in there

---
 jobQueue.py     | 40 ++++++++++++++++++++++++----------------
 tango.py        |  1 +
 tangoObjects.py |  7 +++++--
 worker.py       | 13 +++++++------
 4 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index 1b0612e8..707b4e3f 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -136,7 +136,7 @@ def add(self, job):
 
         # Since we assume that the job is new, we set the number of retries
         # of this job to 0
-        job.retries = 0
+        assert(job.retries == 0)
 
         # Add the job to the queue. Careful not to append the trace until we
         # know the job has actually been added to the queue.
@@ -297,28 +297,36 @@ def unassignJob(self, jobId):
         self.queueLock.release()
         self.log.debug("unassignJob| Released lock to job queue.")
 
-    def makeDead(self, id, reason):
+    def makeDead(self, job: TangoJob, reason):
         """makeDead - move a job from live queue to dead queue"""
-        self.log.info("makeDead| Making dead job ID: " + str(id))
+        self.log.info("makeDead| Making dead job ID: " + str(job.id))
         self.queueLock.acquire()
         self.log.debug("makeDead| Acquired lock to job queue.")
         status = -1
         # Check to make sure that the job is in the live jobs queue
-        if id in self.liveJobs:
-            self.log.info("makeDead| Found job ID: %s in the live queue" % (id))
-            status = 0
-            job = self.liveJobs.get(id)
-            self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason))
-            # Add the job to the dead jobs dictionary
-            self.deadJobs.set(id, job)
-            # Remove the job from the live jobs dictionary
-            self.liveJobs.delete(id)
+        if job.id not in self.liveJobs:
+            self.log.error("makeDead| Job ID: %s not found in live jobs" % (job.id))
+            return -1
+        
+        self.log.info("makeDead| Found job ID: %s in the live queue" % (job.id))
+        status = 0
+        self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason))
+        
+        print(f"Anthony: {job.id} has remote location {job._remoteLocation} before swapping")
+
+        # Add the job to the dead jobs dictionary
+        self.deadJobs.set(job.id, job)
+        # Remove the job from the live jobs dictionary
+        self.liveJobs.delete(job.id)
 
-            # unassign, remove from unassigned jobs queue
-            job.makeUnassigned()
-            self.unassignedJobs.remove(int(id))
+        print(f"Anthony: {job.id} has remote location {job._remoteLocation} after swapping")
+        print(f"Anthony: {job.id} has remote location {job._remoteLocation} after syncing")
+
+        # unassign, remove from unassigned jobs queue
+        job.makeUnassigned()
+        self.unassignedJobs.remove(int(job.id))
 
-            job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), reason))
+        job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), reason))
         self.queueLock.release()
         self.log.debug("makeDead| Released lock to job queue.")
         return status
diff --git a/tango.py b/tango.py
index e487481a..8841a38c 100755
--- a/tango.py
+++ b/tango.py
@@ -278,6 +278,7 @@ def resetTango(self, vmms):
             self.log.error("resetTango: Call to VMMS %s failed: %s" % (vmms_name, err))
             os._exit(1)
 
+    # Returns 0 if the job is valid, -1 if the job is invalid
     def __validateJob(self, job, vmms):
         """validateJob - validate the input arguments in an addJob request."""
         errors = 0
diff --git a/tangoObjects.py b/tangoObjects.py
index dcd97c67..1a0156d0 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -272,7 +272,10 @@ def syncRemote(self):
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
             dictionary = TangoDictionary(dict_hash)
-            temp_job = dictionary.get(key)
+            temp_job = dictionary.get(key) # Key should be in dictionary
+            if temp_job is None:
+                print(f"Job {key} not found in dictionary {dict_hash}") # TODO: add better error handling for TangoJob
+                return
             self.__updateSelf(temp_job)
 
     def updateRemote(self):
@@ -446,7 +449,7 @@ def set(self, id, obj):
         pickled_obj = pickle.dumps(obj)
 
         if hasattr(obj, "_remoteLocation"):
-            obj._remoteLocation = self.hash_name + ":" + str(id)
+            obj._remoteLocation = self.hash_name + ":" + str(id) # TODO: don't violate the encapsulation of TangoJob
 
         self.r.hset(self.hash_name, str(id), pickled_obj)
         return str(id)
diff --git a/worker.py b/worker.py
index f06e1f10..1dd8d8e7 100644
--- a/worker.py
+++ b/worker.py
@@ -13,7 +13,7 @@
 
 from datetime import datetime
 from config import Config
-
+from jobQueue import JobQueue
 #
 # Worker - The worker class is very simple and very dumb. The goal is
 # to walk through the VMMS interface, track the job's progress, and if
@@ -32,7 +32,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM):
         self.daemon = True
         self.job = job
         self.vmms = vmms
-        self.jobQueue = jobQueue
+        self.jobQueue : JobQueue = jobQueue
         self.preallocator = preallocator
         self.preVM = preVM
         threading.Thread.__init__(self)
@@ -64,6 +64,7 @@ def detachVM(self, return_vm=False, replace_vm=False):
             # pool is empty and creates a spurious vm.
             self.preallocator.removeVM(self.job.vm)
 
+    # TODO: figure out what hdrfile, ret and err are
     def rescheduleJob(self, hdrfile, ret, err):
         """rescheduleJob - Reschedule a job that has failed because
         of a system error, such as a VM timing out or a connection
@@ -96,7 +97,7 @@ def rescheduleJob(self, hdrfile, ret, err):
                 "%s|Giving up on job %s:%d"
                 % (datetime.now().ctime(), self.job.name, self.job.id)
             )
-            self.jobQueue.makeDead(self.job.id, err) # Note: this reports the error that caused the last call to rescheduleJob to fail
+            self.jobQueue.makeDead(self.job, err) # Note: this reports the error that caused the last call to rescheduleJob to fail
 
             self.appendMsg(
                 hdrfile,
@@ -165,13 +166,13 @@ def notifyServer(self, job):
             self.log.debug("Error in notifyServer: %s" % str(e))
 
     def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True):
-        self.jobQueue.makeDead(self.job.id, msg)
-
+        self.jobQueue.makeDead(self.job, msg)
+        
         # Update the text that users see in the autodriver output file
         self.appendMsg(hdrfile, msg)
         self.catFiles(hdrfile, self.job.outputFile)
         os.chmod(self.job.outputFile, 0o644)
-
+        
         # Thread exit after termination
         if killVM:
             self.detachVM(return_vm=returnVM)

From 3bd80cf3bbb3a7c0eff7992b3ffeb0bedac7ef03 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 24 Aug 2025 17:06:50 -0400
Subject: [PATCH 04/39] Better encapsulation of TangoJob._remoteLocation

---
 jobQueue.py     | 15 +++++----------
 tangoObjects.py | 17 +++++++++++++----
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index 707b4e3f..ae23d4f9 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -145,7 +145,7 @@ def add(self, job):
         self.log.debug("add| Acquired lock to job queue.")
 
         # Adds the job to the live jobs dictionary
-        self.liveJobs.set(job.id, job)
+        job.addToDict(self.liveJobs)
         # Add this to the unassigned job queue too
         self.unassignedJobs.put(int(job.id))
 
@@ -194,7 +194,7 @@ def addDead(self, job):
         self.log.debug("addDead|Acquired lock to job queue.")
 
         # We add the job into the dead jobs dictionary
-        self.deadJobs.set(job.id, job)
+        job.addToDict(self.deadJobs)
         self.queueLock.release()
         self.log.debug("addDead|Released lock to job queue.")
 
@@ -312,15 +312,10 @@ def makeDead(self, job: TangoJob, reason):
         status = 0
         self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason))
         
-        print(f"Anthony: {job.id} has remote location {job._remoteLocation} before swapping")
-
-        # Add the job to the dead jobs dictionary
-        self.deadJobs.set(job.id, job)
         # Remove the job from the live jobs dictionary
-        self.liveJobs.delete(job.id)
-
-        print(f"Anthony: {job.id} has remote location {job._remoteLocation} after swapping")
-        print(f"Anthony: {job.id} has remote location {job._remoteLocation} after syncing")
+        job.deleteFromDict(self.liveJobs)
+        # Add the job to the dead jobs dictionary
+        job.addToDict(self.deadJobs)
 
         # unassign, remove from unassigned jobs queue
         job.makeUnassigned()
diff --git a/tangoObjects.py b/tangoObjects.py
index 1a0156d0..9dcdf5fe 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -1,3 +1,4 @@
+from __future__ import annotations
 # tangoREST.py
 #
 # Implements objects used to pass state within Tango.
@@ -284,6 +285,18 @@ def updateRemote(self):
             key = self._remoteLocation.split(":")[1]
             dictionary = TangoDictionary(dict_hash)
             dictionary.set(key, self)
+            
+    def deleteFromDict(self, dictionary : TangoDictionary) -> None:
+        dictionary.delete(self.id)
+        self._remoteLocation = None
+        
+    def addToDict(self, dictionary : TangoDictionary) -> None:
+        dictionary.set(self.id, self)
+        assert self._remoteLocation is None, "Job already has a remote location"
+        if Config.USE_REDIS:
+            self._remoteLocation = dictionary.hash_name + ":" + str(self.id)
+            self.updateRemote()
+        
 
 
 def TangoIntValue(object_name, obj):
@@ -448,9 +461,6 @@ def __contains__(self, id):
     def set(self, id, obj):
         pickled_obj = pickle.dumps(obj)
 
-        if hasattr(obj, "_remoteLocation"):
-            obj._remoteLocation = self.hash_name + ":" + str(id) # TODO: don't violate the encapsulation of TangoJob
-
         self.r.hset(self.hash_name, str(id), pickled_obj)
         return str(id)
 
@@ -474,7 +484,6 @@ def values(self):
         return valslist
 
     def delete(self, id):
-        self._remoteLocation = None
         self.r.hdel(self.hash_name, id)
 
     def _clean(self):

From f0c0d3262bf3fc0327d0198776808cefb16370dd Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 24 Aug 2025 19:01:26 -0400
Subject: [PATCH 05/39] Fixing uses of makeDead that was changed 2 commits ago

---
 jobManager.py         |  4 ++--
 jobQueue.py           |  4 +++-
 tangoObjects.py       | 10 ++++++++++
 tests/testJobQueue.py |  2 +-
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/jobManager.py b/jobManager.py
index 2be2147f..1010827c 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -28,7 +28,7 @@
 class JobManager(object):
     def __init__(self, queue):
         self.daemon = True
-        self.jobQueue = queue
+        self.jobQueue: JobQueue = queue
         self.preallocator = self.jobQueue.preallocator
         self.vmms = self.preallocator.vmms
         self.log = logging.getLogger("JobManager")
@@ -124,7 +124,7 @@ def __manage(self):
                     self.log.info("job_manager: job is None")
                 else:
                     self.log.error("job failed during creation %d %s" % (job.id, str(err)))
-                    self.jobQueue.makeDead(job.id, str(err))
+                    self.jobQueue.makeDead(job, str(err))
 
 
 if __name__ == "__main__":
diff --git a/jobQueue.py b/jobQueue.py
index ae23d4f9..e1c35cec 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -209,6 +209,7 @@ def delJob(self, id, deadjob):
         """
         status = -1
         if deadjob == 0:
+            temp_job = self.liveJobs.getExn(id)
             try:
                 # Remove the job from the unassigned live jobs queue, if it
                 # is yet to be assigned.
@@ -218,7 +219,7 @@ def delJob(self, id, deadjob):
                 self.log.info("delJob | Job ID %s was already assigned" % (id))
                 return status
 
-            return self.makeDead(id, "Requested by operator")
+            return self.makeDead(temp_job, "Requested by operator")
         else:
             self.queueLock.acquire()
             self.log.debug("delJob| Acquired lock to job queue.")
@@ -297,6 +298,7 @@ def unassignJob(self, jobId):
         self.queueLock.release()
         self.log.debug("unassignJob| Released lock to job queue.")
 
+    # Takes in a job in order to switch its remote location, in which you can't rely on syncing
     def makeDead(self, job: TangoJob, reason):
         """makeDead - move a job from live queue to dead queue"""
         self.log.info("makeDead| Making dead job ID: " + str(job.id))
diff --git a/tangoObjects.py b/tangoObjects.py
index 9dcdf5fe..3e57dccb 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -472,6 +472,11 @@ def get(self, id):
         else:
             return None
 
+    def getExn(self, id):
+        job = self.get(id)
+        assert job is not None, f"ID {id} does not exist in this remote dictionary"
+        return job
+
     def keys(self):
         keys = map(lambda key: key.decode(), self.r.hkeys(self.hash_name))
         return list(keys)
@@ -519,6 +524,11 @@ def get(self, id):
         else:
             return None
 
+    def getExn(self, id):
+        job = self.get(id)
+        assert job is not None, f"ID {id} does not exist in this native dictionary"
+        return job
+
     def keys(self):
         return list(self.dict.keys())
 
diff --git a/tests/testJobQueue.py b/tests/testJobQueue.py
index 00861d23..27fd3d4f 100644
--- a/tests/testJobQueue.py
+++ b/tests/testJobQueue.py
@@ -130,7 +130,7 @@ def test_makeDead(self):
         info = self.jobQueue.getInfo()
         self.assertEqual(info["size_deadjobs"], 0)
         self.assertEqual(info["size_unassignedjobs"], 2)
-        self.jobQueue.makeDead(self.jobId1, "test")
+        self.jobQueue.makeDead(self.job1, "test")
         info = self.jobQueue.getInfo()
         self.assertEqual(info["size_deadjobs"], 1)
         self.assertEqual(info["size_unassignedjobs"], 1)

From ba83509eb984a52f51346992840887b60b1a2e04 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sat, 30 Aug 2025 18:25:38 -0400
Subject: [PATCH 06/39] comments, logging and todos

---
 clients/tango-cli.py | 2 +-
 jobManager.py        | 4 ++++
 jobQueue.py          | 4 ++++
 tangoObjects.py      | 3 ++-
 vmms/ec2SSH.py       | 1 +
 5 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/clients/tango-cli.py b/clients/tango-cli.py
index d9f5ab8d..f060bd1d 100755
--- a/clients/tango-cli.py
+++ b/clients/tango-cli.py
@@ -131,7 +131,7 @@
 parser.add_argument("--accessKeyId", default="", help="AWS account access key ID")
 parser.add_argument("--accessKey", default="", help="AWS account access key content")
 parser.add_argument("--instanceType", default="", help="AWS EC2 instance type")
-parser.add_argument("--ec2", action="store_true", help="Enable ec2SSH VMMS")
+parser.add_argument("--ec2", action="store_true", help="Enable ec2SSH VMMS") # TODO: shouldn't this be taken from config.py?
 parser.add_argument("--stopBefore", default="", help="Stops the worker before a function is executed")
 
 def checkKey():
diff --git a/jobManager.py b/jobManager.py
index 59c1ebad..7f254cce 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -19,10 +19,12 @@
 
 import tango  # Written this way to avoid circular imports
 from config import Config
+from jobQueue import JobQueue
 from tangoObjects import TangoQueue
 from worker import Worker
 
 
+
 class JobManager(object):
     def __init__(self, queue):
         self.daemon = True
@@ -63,6 +65,7 @@ def __manage(self):
             # Blocks until we get a next job
             job = self.jobQueue.getNextPendingJob()
             if not job.accessKey and Config.REUSE_VMS:
+                self.log.info(f"job has access key {job.accessKey} and is calling reuseVM")
                 vm = None
                 while vm is None:
                     vm = self.jobQueue.reuseVM(job)
@@ -89,6 +92,7 @@ def __manage(self):
                             "EC2 SSH VM initialization failed: see log"
                         )
                 else:
+                    self.log.info(f"job {job.id} is not an ec2 vmms job")
                     # Try to find a vm on the free list and allocate it to
                     # the worker if successful.
                     if Config.REUSE_VMS:
diff --git a/jobQueue.py b/jobQueue.py
index e1c35cec..45dd40ac 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -246,6 +246,8 @@ def get(self, id):
         self.log.debug("get| Released lock to job queue.")
         return job
 
+    # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. 
+    # TODO: It does assign the job to a particular VM though.
     def assignJob(self, jobId, vm=None):
         """assignJob - marks a job to be assigned"""
         self.queueLock.acquire()
@@ -389,3 +391,5 @@ def reuseVM(self, job):
                 return job.vm
             else:
                 raise Exception("Job assigned without vm")
+
+
diff --git a/tangoObjects.py b/tangoObjects.py
index 3e57dccb..64c5f309 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -7,6 +7,7 @@
 from queue import Queue
 import pickle
 import redis
+from typing import Optional
 
 redisConnection = None
 
@@ -91,7 +92,7 @@ class TangoJob(object):
 
     def __init__(
         self,
-        vm=None,
+        vm: Optional[TangoMachine] = None,
         outputFile=None,
         name=None,
         input=None,
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index b2621192..318b6d49 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -162,6 +162,7 @@ def release_vm_semaphore():
         """Releases the VM sempahore"""
         Ec2SSH._vm_semaphore.release()
 
+    # TODO: the arguments accessKeyId and accessKey don't do anything
     def __init__(self, accessKeyId=None, accessKey=None):
         """log - logger for the instance
         connection - EC2Connection object that stores the connection

From 600803ccd43980c2a2617a6435272a38f19c7322 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Mon, 15 Sep 2025 22:06:33 -0400
Subject: [PATCH 07/39] Code to enable spot instances

---
 jobManager.py  |  4 ++--
 jobQueue.py    |  2 +-
 vmms/ec2SSH.py | 24 +++++++++++++++++-------
 worker.py      |  3 ++-
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/jobManager.py b/jobManager.py
index 7f254cce..df54dec8 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -20,7 +20,7 @@
 import tango  # Written this way to avoid circular imports
 from config import Config
 from jobQueue import JobQueue
-from tangoObjects import TangoQueue
+from tangoObjects import TangoJob, TangoQueue
 from worker import Worker
 
 
@@ -63,7 +63,7 @@ def __manage(self):
         self.running = True
         while True:
             # Blocks until we get a next job
-            job = self.jobQueue.getNextPendingJob()
+            job: TangoJob = self.jobQueue.getNextPendingJob()
             if not job.accessKey and Config.REUSE_VMS:
                 self.log.info(f"job has access key {job.accessKey} and is calling reuseVM")
                 vm = None
diff --git a/jobQueue.py b/jobQueue.py
index 45dd40ac..e3dc4256 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -347,7 +347,7 @@ def reset(self):
         self.deadJobs._clean()
         self.unassignedJobs._clean()
 
-    def getNextPendingJob(self):
+    def getNextPendingJob(self) -> TangoJob:
         """Gets the next unassigned live job. Note that this is a
         blocking function and we will block till there is an available
         job.
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 318b6d49..1e21da79 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -22,6 +22,7 @@
 
 import config
 from tangoObjects import TangoMachine
+from typing import Optional
 
 # suppress most boto logging
 logging.getLogger("boto3").setLevel(logging.CRITICAL)
@@ -192,7 +193,7 @@ def __init__(self, accessKeyId=None, accessKey=None):
         self.img2ami = {}
         self.images = []
         try:
-            self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION)
+            self.boto3resource: boto3.EC2.ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION)
             self.boto3client = boto3.client("ec2", config.Config.EC2_REGION)
 
             # Get images from ec2
@@ -248,7 +249,7 @@ def domainName(self, vm):
     # VMMS helper methods
     #
 
-    def tangoMachineToEC2Instance(self, vm: TangoMachine):
+    def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict:
         """tangoMachineToEC2Instance - returns an object with EC2 instance
         type and AMI. Only general-purpose instances are used. Defalt AMI
         is currently used.
@@ -328,7 +329,7 @@ def createSecurityGroup(self):
     #
     # VMMS API functions
     #
-    def initializeVM(self, vm):
+    def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]:
         """initializeVM - Tell EC2 to create a new VM instance.
 
         Returns a boto.ec2.instance.Instance object.
@@ -350,13 +351,21 @@ def initializeVM(self, vm):
                 self.key_pair_name = self.keyPairName(vm.id, vm.name)
                 self.createKeyPair()
 
-            reservation = self.boto3resource.create_instances(
+            instance_market_options = {
+                "MarketType": "spot",
+                "SpotOptions": {
+                    "SpotInstanceType": "one-time",
+                    "InstanceInterruptionBehavior": "terminate"
+                }
+            }
+            reservation: list[boto3.ec2.Instance] = self.boto3resource.create_instances(
                 ImageId=ec2instance["ami"],
                 KeyName=self.key_pair_name,
                 SecurityGroups=[config.Config.DEFAULT_SECURITY_GROUP],
                 InstanceType=ec2instance["instance_type"],
                 MaxCount=1,
                 MinCount=1,
+                InstanceMarketOptions=instance_market_options,
             )
 
             # Sleep for a while to prevent random transient errors observed
@@ -365,8 +374,9 @@ def initializeVM(self, vm):
 
             # reservation is a list of instances created. there is only
             # one instance created so get index 0.
-            newInstance = reservation[0]
+            newInstance: boto3.ec2.Instance = reservation[0]
             if not newInstance:
+                # TODO: when does this happen?
                 raise ValueError("Cannot find new instance for %s" % vm.name)
 
             # Wait for instance to reach 'running' state
@@ -436,7 +446,7 @@ def initializeVM(self, vm):
             self.log.debug("initializeVM Failed: %s" % e)
 
             # if the new instance exists, terminate it
-            if newInstance:
+            if newInstance is not None:
                 try:
                     self.boto3resource.instances.filter(
                         InstanceIds=[newInstance.id]
@@ -448,7 +458,7 @@ def initializeVM(self, vm):
                     return None
             return None
 
-    def waitVM(self, vm, max_secs):
+    def waitVM(self, vm, max_secs) -> 0 | -1:
         """waitVM - Wait at most max_secs for a VM to become
         ready. Return error if it takes too long.
 
diff --git a/worker.py b/worker.py
index 1dd8d8e7..59a17239 100644
--- a/worker.py
+++ b/worker.py
@@ -247,7 +247,7 @@ def run(self):
                 )
 
                 # Host name returned from EC2 is stored in the vm object
-                self.vmms.initializeVM(self.job.vm)
+                self.vmms.initializeVM(self.job.vm) # TODO: This can return None if the step fails, check for that
                 self.log.debug("Asigned job to a new VM")
 
             vm = self.job.vm
@@ -322,6 +322,7 @@ def run(self):
                 self.job.vm.notes = str(self.job.id) + "_" + self.job.name
                 self.job.vm.keep_for_debugging = True
                 self.afterJobExecution(hdrfile, msg, False)
+                # TODO: no reschedule?
                 return
 
             self.log.info(

From d4501129334b2bbce4506217d96cf48c7688db1b Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 16 Sep 2025 16:01:11 -0400
Subject: [PATCH 08/39] type annotations

---
 jobManager.py              | 23 +++++------
 jobQueue.py                |  1 +
 mypy.ini                   |  5 +++
 restful_tango/server.py    |  6 +--
 restful_tango/tangoREST.py |  5 ---
 tango.py                   |  6 +--
 tangoObjects.py            | 78 ++++++++++++++++++++++++++++++++------
 tests/validate.py          |  4 +-
 vmms/ec2SSH.py             | 54 +++++++++++++-------------
 vmms/interface.py          | 42 ++++++++++++++++++++
 vmms/localDocker.py        |  2 +-
 11 files changed, 162 insertions(+), 64 deletions(-)
 create mode 100644 mypy.ini
 create mode 100644 vmms/interface.py

diff --git a/jobManager.py b/jobManager.py
index df54dec8..65d62116 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -26,7 +26,7 @@
 
 
 class JobManager(object):
-    def __init__(self, queue):
+    def __init__(self, queue: JobQueue):
         self.daemon = True
         self.jobQueue: JobQueue = queue
         self.preallocator = self.jobQueue.preallocator
@@ -36,7 +36,7 @@ def __init__(self, queue):
         self.nextId = 10000
         self.running = False
 
-    def start(self):
+    def start(self) -> None:
         if self.running:
             return
         thread = threading.Thread(target=self.__manage)
@@ -59,7 +59,7 @@ def _getNextID(self):
             self.nextId = 10000
         return id
 
-    def __manage(self):
+    def __manage(self) -> None:
         self.running = True
         while True:
             # Blocks until we get a next job
@@ -83,7 +83,8 @@ def __manage(self):
                     newVM = copy.deepcopy(job.vm)
                     newVM.id = self._getNextID()
                     try:
-                        preVM = vmms.initializeVM(newVM)
+                        vmms.initializeVM(newVM)
+                        preVM = newVM
                     except Exception as e:
                         self.log.error("ERROR initialization VM: %s", e)
                         self.log.error(traceback.format_exc())
@@ -138,19 +139,19 @@ def __manage(self):
          JobManager"
         )
     else:
-        tango = tango.TangoServer()
-        tango.log.debug("Resetting Tango VMs")
-        tango.resetTango(tango.preallocator.vmms)
-        for key in tango.preallocator.machines.keys():
-            tango.preallocator.machines.set(key, [[], TangoQueue(key)])
+        tango_server = tango.TangoServer()
+        tango_server.log.debug("Resetting Tango VMs")
+        tango_server.resetTango(tango_server.preallocator.vmms)
+        for key in tango_server.preallocator.machines.keys():
+            tango_server.preallocator.machines.set(key, [[], TangoQueue(key)])
 
             # The above call sets the total pool empty.  But the free pool which
             # is a queue in redis, may not be empty.  When the job manager restarts,
             # resetting the free queue using the key doesn't change its content.
             # Therefore we empty the queue, thus the free pool, to keep it consistent
             # with the total pool.
-            tango.preallocator.machines.get(key)[1].make_empty()
-        jobs = JobManager(tango.jobQueue)
+            tango_server.preallocator.machines.get(key)[1].make_empty()
+        jobs = JobManager(tango_server.jobQueue)
 
         print("Starting the stand-alone Tango JobManager")
         jobs.run()
diff --git a/jobQueue.py b/jobQueue.py
index e3dc4256..2d0a9934 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -270,6 +270,7 @@ def assignJob(self, jobId, vm=None):
         self.log.debug("assignJob| Released lock to job queue.")
         # return job
 
+    # TODO: Rename this job to be more accurate in its description
     def unassignJob(self, jobId):
         """unassignJob - marks a job to be unassigned
         Note: We assume here that a job is to be rescheduled or
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 00000000..db776d15
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,5 @@
+[mypy]
+explicit_package_bases = True
+
+[mypy-vmms.tashiSSH]
+ignore_errors = True
\ No newline at end of file
diff --git a/restful_tango/server.py b/restful_tango/server.py
index 2eba21fc..b31f0e8b 100755
--- a/restful_tango/server.py
+++ b/restful_tango/server.py
@@ -9,13 +9,9 @@
 
 import tornado.web
 from tempfile import NamedTemporaryFile
-from tangoREST import TangoREST
+from restful_tango.tangoREST import TangoREST
 import asyncio
 
-currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
-parentdir = os.path.dirname(currentdir)
-sys.path.insert(0, parentdir)
-
 from config import Config
 
 tangoREST = TangoREST()
diff --git a/restful_tango/tangoREST.py b/restful_tango/tangoREST.py
index c60cf44d..90dbc300 100644
--- a/restful_tango/tangoREST.py
+++ b/restful_tango/tangoREST.py
@@ -6,16 +6,11 @@
 
 import sys
 import os
-import inspect
 import hashlib
 import json
 import logging
 import docker
 
-currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
-parentdir = os.path.dirname(currentdir)
-sys.path.insert(0, parentdir)
-
 from config import Config
 from tangoObjects import TangoJob, TangoMachine, InputFile
 from tango import TangoServer
diff --git a/tango.py b/tango.py
index 8841a38c..3f94cd12 100755
--- a/tango.py
+++ b/tango.py
@@ -75,8 +75,8 @@ def __init__(self):
 
             vmms = DistDocker()
 
-        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
-        self.jobQueue = JobQueue(self.preallocator)
+        self.preallocator: Preallocator = Preallocator({Config.VMMS_NAME: vmms})
+        self.jobQueue: JobQueue = JobQueue(self.preallocator)
         if not Config.USE_REDIS:
             # creates a local Job Manager if there is no persistent
             # memory between processes. Otherwise, JobManager will
@@ -89,7 +89,7 @@ def __init__(self):
             level=Config.LOGLEVEL,
         )
         self.start_time = time.time()
-        self.log = logging.getLogger("TangoServer")
+        self.log: logging.Logger = logging.getLogger("TangoServer")
         self.log.info("Starting Tango server")
 
     def addJob(self, job):
diff --git a/tangoObjects.py b/tangoObjects.py
index 64c5f309..a5554e1e 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -7,7 +7,8 @@
 from queue import Queue
 import pickle
 import redis
-from typing import Optional
+from typing import Optional, Protocol, TypeVar
+from abc import abstractmethod
 
 redisConnection = None
 
@@ -117,9 +118,9 @@ def __init__(
         self._name = name
         self._notifyURL = notifyURL
         self._timeout = timeout # How long to run the autodriver on the job for before timing out.
-        self._trace = []
+        self._trace: list[str] = []
         self._maxOutputFileSize = maxOutputFileSize
-        self._remoteLocation = None
+        self._remoteLocation: Optional[str] = None
         self._accessKeyId = accessKeyId
         self._accessKey = accessKey
         self._disableNetwork = disableNetwork
@@ -245,7 +246,7 @@ def setId(self, new_id):
         if self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
-            dictionary = TangoDictionary(dict_hash)
+            dictionary = TangoDictionary.create(dict_hash)
             dictionary.delete(key)
             self._remoteLocation = dict_hash + ":" + str(new_id)
             self.updateRemote()
@@ -438,23 +439,72 @@ def make_empty(self):
             if item is None:
                 break
 
+T = TypeVar('T')
+class TangoDictionary(Protocol[T]):
+
+    @staticmethod
+    def create(dictionary_name: str) -> TangoDictionary[T]:
+        if Config.USE_REDIS:
+            return TangoRemoteDictionary(dictionary_name)
+        else:
+            return TangoNativeDictionary()
+        
+    @property
+    @abstractmethod
+    def hash_name(self) -> str:
+        ...
+        
+    @abstractmethod
+    def __contains__(self, id: str) -> bool:
+        ...
+    @abstractmethod
+    def set(self, id: str, obj: T) -> str:
+        ...
+    @abstractmethod
+    def get(self, id: str) -> Optional[T]:
+        ...
+    @abstractmethod
+    def getExn(self, id: str) -> T:
+        ...
+    @abstractmethod
+    def keys(self) -> list[str]:
+        ...
+    @abstractmethod
+    def values(self) -> list[T]:
+        ...
+    @abstractmethod
+    def delete(self, id: str) -> None:
+        ...
+    @abstractmethod
+    def _clean(self) -> None:
+        ...
+    @abstractmethod
+    def items(self) -> list[tuple[str, T]]:
+        ...
+
 
 # This is an abstract class that decides on
 # if we should initiate a TangoRemoteDictionary or TangoNativeDictionary
 # Since there are no abstract classes in Python, we use a simple method
 
 
-def TangoDictionary(object_name):
-    if Config.USE_REDIS:
-        return TangoRemoteDictionary(object_name)
-    else:
-        return TangoNativeDictionary()
+# def TangoDictionary(object_name):
+#     if Config.USE_REDIS:
+#         return TangoRemoteDictionary(object_name)
+#     else:
+#         return TangoNativeDictionary()
+
+
 
 
-class TangoRemoteDictionary(object):
+class TangoRemoteDictionary(TangoDictionary[T]):
     def __init__(self, object_name):
         self.r = getRedisConnection()
-        self.hash_name = object_name
+        self._hash_name = object_name
+
+    @property
+    def hash_name(self) -> str:
+        return self._hash_name
 
     def __contains__(self, id):
         return self.r.hexists(self.hash_name, str(id))
@@ -506,10 +556,14 @@ def items(self):
         )
 
 
-class TangoNativeDictionary(object):
+class TangoNativeDictionary(TangoDictionary[T]):
     def __init__(self):
         self.dict = {}
 
+    @property
+    def hash_name(self) -> str:
+        raise ValueError("TangoNativeDictionary does not have a hash name")
+
     def __repr__(self):
         return str(self.dict)
 
diff --git a/tests/validate.py b/tests/validate.py
index 546db78b..fe0ab3bf 100644
--- a/tests/validate.py
+++ b/tests/validate.py
@@ -4,6 +4,8 @@
 # directories.
 #
 
+from typing import List
+
 try:
     import pyflakes
 
@@ -46,7 +48,7 @@
 args = parser.parse_args()
 
 # Setup
-skip_paths = []
+skip_paths: List[str] = []
 
 # Stats
 file_count = 0
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 1e21da79..0e7297a8 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -22,7 +22,9 @@
 
 import config
 from tangoObjects import TangoMachine
-from typing import Optional
+from typing import Optional, Literal, List
+from mypy_boto3_ec2 import EC2ServiceResource
+from mypy_boto3_ec2.service_resource import Instance
 
 # suppress most boto logging
 logging.getLogger("boto3").setLevel(logging.CRITICAL)
@@ -193,7 +195,7 @@ def __init__(self, accessKeyId=None, accessKey=None):
         self.img2ami = {}
         self.images = []
         try:
-            self.boto3resource: boto3.EC2.ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION)
+            self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION)
             self.boto3client = boto3.client("ec2", config.Config.EC2_REGION)
 
             # Get images from ec2
@@ -276,26 +278,26 @@ def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict:
     def createKeyPair(self):
         # TODO: SUPPORT
         raise
-        # try to delete the key to avoid collision
-        self.key_pair_path = "%s/%s.pem" % (
-            config.Config.DYNAMIC_SECURITY_KEY_PATH,
-            self.key_pair_name,
-        )
-        self.deleteKeyPair()
-        key = self.connection.create_key_pair(self.key_pair_name)
-        key.save(config.Config.DYNAMIC_SECURITY_KEY_PATH)
-        # change the SSH_FLAG accordingly
-        self.ssh_flags[1] = self.key_pair_path
+        # # try to delete the key to avoid collision
+        # self.key_pair_path: str = "%s/%s.pem" % (
+        #     config.Config.DYNAMIC_SECURITY_KEY_PATH,
+        #     self.key_pair_name,
+        # )
+        # self.deleteKeyPair()
+        # key = self.connection.create_key_pair(self.key_pair_name)
+        # key.save(config.Config.DYNAMIC_SECURITY_KEY_PATH)
+        # # change the SSH_FLAG accordingly
+        # self.ssh_flags[1] = self.key_pair_path
 
     def deleteKeyPair(self):
         # TODO: SUPPORT
         raise
-        self.boto3client.delete_key_pair(self.key_pair_name)
-        # try to delete may not exist key file
-        try:
-            os.remove(self.key_pair_path)
-        except OSError:
-            pass
+        # self.boto3client.delete_key_pair(self.key_pair_name)
+        # # try to delete may not exist key file
+        # try:
+        #     os.remove(self.key_pair_path)
+        # except OSError:
+        #     pass
 
     def createSecurityGroup(self):
         try:
@@ -329,7 +331,7 @@ def createSecurityGroup(self):
     #
     # VMMS API functions
     #
-    def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]:
+    def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
         """initializeVM - Tell EC2 to create a new VM instance.
 
         Returns a boto.ec2.instance.Instance object.
@@ -343,8 +345,8 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]:
             # ensure that security group exists
             self.createSecurityGroup()
             if self.useDefaultKeyPair:
-                self.key_pair_name = config.Config.SECURITY_KEY_NAME
-                self.key_pair_path = config.Config.SECURITY_KEY_PATH
+                self.key_pair_name: str = config.Config.SECURITY_KEY_NAME
+                self.key_pair_path: str = config.Config.SECURITY_KEY_PATH
             else:
                 # TODO: SUPPORT
                 raise
@@ -358,7 +360,7 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]:
                     "InstanceInterruptionBehavior": "terminate"
                 }
             }
-            reservation: list[boto3.ec2.Instance] = self.boto3resource.create_instances(
+            reservation: List[Instance] = self.boto3resource.create_instances(
                 ImageId=ec2instance["ami"],
                 KeyName=self.key_pair_name,
                 SecurityGroups=[config.Config.DEFAULT_SECURITY_GROUP],
@@ -440,7 +442,7 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]:
             vm.domain_name = newInstance.public_ip_address
             vm.instance_id = newInstance.id
             self.log.debug("VM %s: %s" % (instanceName, newInstance))
-            return vm
+            return 0
 
         except Exception as e:
             self.log.debug("initializeVM Failed: %s" % e)
@@ -455,10 +457,10 @@ def initializeVM(self, vm: TangoMachine) -> Optional[TangoMachine]:
                     self.log.error(
                         "Exception handling failed for %s: %s" % (vm.name, e)
                     )
-                    return None
-            return None
+                    return -1
+            return -1
 
-    def waitVM(self, vm, max_secs) -> 0 | -1:
+    def waitVM(self, vm, max_secs) -> Literal[0, -1]:
         """waitVM - Wait at most max_secs for a VM to become
         ready. Return error if it takes too long.
 
diff --git a/vmms/interface.py b/vmms/interface.py
new file mode 100644
index 00000000..d71500f9
--- /dev/null
+++ b/vmms/interface.py
@@ -0,0 +1,42 @@
+from typing import Protocol, Optional, Literal, List
+from tangoObjects import TangoMachine
+from abc import abstractmethod
+
+
+class VMMSInterface(Protocol):
+    @abstractmethod
+    def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
+        ...
+
+    @abstractmethod
+    def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
+        ...
+
+    @abstractmethod
+    def copyIn(self, vm: TangoMachine, inputFiles: List[str], job_id: Optional[int] = None) -> Literal[0, -1]:
+        ...
+
+    @abstractmethod
+    def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: # -1 to infinity
+        ...
+
+    @abstractmethod
+    def copyOut(self, vm: TangoMachine, destFile: str) -> Literal[0, -1]:
+        ...
+
+    @abstractmethod
+    def destroyVM(self, vm: TangoMachine) -> Literal[0, -1]:
+        ...
+    
+    @abstractmethod
+    def safeDestroyVM(self, vm: TangoMachine) -> Literal[0, -1]:
+        ...
+
+    @abstractmethod
+    def getVMs(self) -> List[TangoMachine]:
+        ...
+
+    @abstractmethod
+    def existsVM(self, vm: TangoMachine) -> bool:
+        ...
+
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index 1a1e1dd0..ada77c09 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -117,7 +117,7 @@ def domainName(self, vm):
     #
     def initializeVM(self, vm):
         """initializeVM -  Nothing to do for initializeVM"""
-        return vm
+        return 0
 
     def waitVM(self, vm, max_secs):
         """waitVM - Nothing to do for waitVM"""

From 029c1a9f386551c886d01d6393e93faf30db6c64 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 16 Sep 2025 16:21:04 -0400
Subject: [PATCH 09/39] finished mypy stuff up to not using
 --check-untyped-defs

---
 Dockerfile       |  3 +++
 requirements.txt | 49 +++++++++++++++++++++++++++++++++++++++++-------
 vmms/ec2SSH.py   | 45 ++++++++++++++++++++++----------------------
 3 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c120e15f..d6c7dac8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -70,5 +70,8 @@ RUN mkdir -p /var/log/docker /var/log/supervisor
 RUN cp /opt/TangoService/Tango/deployment/config/nginx.conf /etc/nginx/nginx.conf
 RUN cp /opt/TangoService/Tango/deployment/config/supervisord.conf /etc/supervisor/supervisord.conf
 
+# Set up PYTHONPATH
+ENV PYTHONPATH="/opt/TangoService/Tango"
+
 # Reload new config scripts
 CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
diff --git a/requirements.txt b/requirements.txt
index 548d9625..8ebee4d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,47 @@
+async-timeout==5.0.1
+backoff==2.2.1
 backports.ssl-match-hostname==3.7.0.1
-boto3
+boto3==1.37.38
+boto3-stubs==1.40.31
+botocore==1.37.38
+botocore-stubs==1.38.30
+certifi==2025.8.3
+cffi==1.17.1
+charset-normalizer==3.4.3
+cryptography==45.0.7
+distlib==0.4.0
+docker==5.0.3
+filelock==3.16.1
+idna==3.10
+jmespath==1.0.1
+mypy==1.14.1
+mypy-boto3-ec2==1.40.24
+mypy-extensions==1.1.0
+platformdirs==4.3.6
+pycparser==2.23
 pyflakes==2.1.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.2
 redis==4.4.4
 requests==2.31.0
-# needed for tashi, rpyc==4.1.4
+s3transfer==0.11.5
+six==1.17.0
+supervisor==4.1.0
+tomli==2.2.1
 tornado==6.4.1
-urllib3==1.26.19
-docker==5.0.3
-backoff==2.2.1
-pytz
-pyyaml
+types-awscrt==0.27.6
+types-cffi==1.16.0.20241221
+types-docker==7.1.0.20241229
+types-jmespath==1.0.2.20240106
+types-pyflakes==3.2.0.20240813
+types-pyOpenSSL==24.1.0.20240722
+types-PyYAML==6.0.12.20241230
+types-redis==4.6.0.20241004
+types-requests==2.32.0.20241016
+types-s3transfer==0.13.1
+types-setuptools==75.8.0.20250110
+typing-extensions==4.13.2
+urllib3==2.2.3
+virtualenv==20.34.0
+websocket-client==1.8.0
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 0e7297a8..8f59e1d0 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -22,9 +22,10 @@
 
 import config
 from tangoObjects import TangoMachine
-from typing import Optional, Literal, List
+from typing import Optional, Literal, List, Sequence
 from mypy_boto3_ec2 import EC2ServiceResource
 from mypy_boto3_ec2.service_resource import Instance
+from mypy_boto3_ec2.type_defs import FilterTypeDef
 
 # suppress most boto logging
 logging.getLogger("boto3").setLevel(logging.CRITICAL)
@@ -189,7 +190,14 @@ def __init__(self, accessKeyId=None, accessKey=None):
         self.useDefaultKeyPair = True
 
         # key pair settings, for now, use default security key
-
+        if self.useDefaultKeyPair:
+            self.key_pair_name: str = config.Config.SECURITY_KEY_NAME
+            self.key_pair_path: str = config.Config.SECURITY_KEY_PATH
+        else:
+            # TODO: SUPPORT. Know that this if/else block used to be under initializeVM, using vm for a unique identifier
+            raise
+            # self.key_pair_name = self.keyPairName(vm.id, vm.name)
+            # self.createKeyPair()
         # create boto3resource
 
         self.img2ami = {}
@@ -336,7 +344,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
 
         Returns a boto.ec2.instance.Instance object.
         """
-        newInstance = None
+        newInstance: Optional[Instance] = None
         # Create the instance and obtain the reservation
         try:
             instanceName = self.instanceName(vm.id, vm.name)
@@ -344,22 +352,8 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
             self.log.debug("instanceName: %s" % instanceName)
             # ensure that security group exists
             self.createSecurityGroup()
-            if self.useDefaultKeyPair:
-                self.key_pair_name: str = config.Config.SECURITY_KEY_NAME
-                self.key_pair_path: str = config.Config.SECURITY_KEY_PATH
-            else:
-                # TODO: SUPPORT
-                raise
-                self.key_pair_name = self.keyPairName(vm.id, vm.name)
-                self.createKeyPair()
-
-            instance_market_options = {
-                "MarketType": "spot",
-                "SpotOptions": {
-                    "SpotInstanceType": "one-time",
-                    "InstanceInterruptionBehavior": "terminate"
-                }
-            }
+
+
             reservation: List[Instance] = self.boto3resource.create_instances(
                 ImageId=ec2instance["ami"],
                 KeyName=self.key_pair_name,
@@ -367,7 +361,14 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
                 InstanceType=ec2instance["instance_type"],
                 MaxCount=1,
                 MinCount=1,
-                InstanceMarketOptions=instance_market_options,
+                InstanceMarketOptions=
+                        {
+                    "MarketType": "spot",
+                    "SpotOptions": {
+                        "SpotInstanceType": "one-time",
+                        "InstanceInterruptionBehavior": "terminate"
+                    }
+                },
             )
 
             # Sleep for a while to prevent random transient errors observed
@@ -376,7 +377,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
 
             # reservation is a list of instances created. there is only
             # one instance created so get index 0.
-            newInstance: boto3.ec2.Instance = reservation[0]
+            newInstance = reservation[0]
             if not newInstance:
                 # TODO: when does this happen?
                 raise ValueError("Cannot find new instance for %s" % vm.name)
@@ -385,7 +386,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
             start_time = time.time()
             while True:
 
-                filters = [
+                filters: Sequence[FilterTypeDef] = [
                     {"Name": "instance-state-name", "Values": ["running"]}
                 ]
                 instances = self.boto3resource.instances.filter(Filters=filters)

From 85bdf7c2398092bef6d15def3915721c6e57dfdf Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Mon, 22 Sep 2025 17:44:24 -0400
Subject: [PATCH 10/39] doesn't quite work, but wanted a checkpoint before
 muddling with the preallocator

---
 jobQueue.py          | 11 +++++-----
 mypy.ini             |  3 +++
 preallocator.py      | 11 ++++++----
 tangoObjects.py      |  7 +++---
 tests/testObjects.py |  2 +-
 vmms/ec2SSH.py       | 10 ++++++---
 vmms/localDocker.py  |  4 +++-
 worker.py            | 51 ++++++++++++++++++++++++++++++++++++++------
 8 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index 2d0a9934..ee5b2956 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -53,8 +53,8 @@ def __init__(self, preallocator):
         is needed since there are multiple worker threads and they might be
         using the makeUnassigned api.
         """
-        self.liveJobs = TangoDictionary("liveJobs")
-        self.deadJobs = TangoDictionary("deadJobs")
+        self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs")
+        self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs")
         self.unassignedJobs = TangoQueue("unassignedLiveJobs")
         self.queueLock = threading.Lock()
         self.preallocator = preallocator
@@ -187,7 +187,7 @@ def addDead(self, job):
 
         self.log.info("addDead|Unassigning job %s" % str(job.id))
         job.makeUnassigned()
-        job.retries = 0
+        job.resetRetries()
 
         self.log.debug("addDead|Acquiring lock to job queue.")
         self.queueLock.acquire()
@@ -248,12 +248,13 @@ def get(self, id):
 
     # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. 
     # TODO: It does assign the job to a particular VM though.
-    def assignJob(self, jobId, vm=None):
+    # Precondition: jobId is in self.liveJobs
+    def assignJob(self, jobId, vm=None) -> None:
         """assignJob - marks a job to be assigned"""
         self.queueLock.acquire()
         self.log.debug("assignJob| Acquired lock to job queue.")
 
-        job = self.liveJobs.get(jobId)
+        job = self.liveJobs.getExn(jobId)
         print(str(job), jobId)
         # print(str(self.unassignedJobs))
         # print(str(self.liveJobs))
diff --git a/mypy.ini b/mypy.ini
index db776d15..f6332a89 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -2,4 +2,7 @@
 explicit_package_bases = True
 
 [mypy-vmms.tashiSSH]
+ignore_errors = True
+
+[mypy-preallocator]
 ignore_errors = True
\ No newline at end of file
diff --git a/preallocator.py b/preallocator.py
index 93e9c822..6607c552 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -6,7 +6,7 @@
 import time
 import copy
 
-from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue
+from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine
 from config import Config
 
 #
@@ -22,7 +22,7 @@
 
 class Preallocator(object):
     def __init__(self, vmms):
-        self.machines = TangoDictionary("machines")
+        self.machines: TangoDictionary[TangoMachine] = TangoDictionary.create("machines")
         self.lock = threading.Lock()
         self.nextID = TangoIntValue("nextID", 1000)
         self.vmms = vmms
@@ -33,7 +33,7 @@ def poolSize(self, vmName):
         if vmName not in self.machines:
             return 0
         else:
-            return len(self.machines.get(vmName)[0])
+            return len(self.machines.getExn(vmName)[0])
 
     def update(self, vm, num):
         """update - Updates the number of machines of a certain type
@@ -47,7 +47,10 @@ def update(self, vm, num):
         self.lock.acquire()
         if vm.name not in self.machines:
             self.machines.set(vm.name, [[], TangoQueue(vm.name)])
-            self.machines.get(vm.name)[1].make_empty()
+            self.machines.getExn(vm.name)[1].make_empty() # TODO: oh bruh this is incorrect.
+            # TODO: when used with a TangoRemoteDictionary, this will create a transient copy of the queue (from the redis pickle),
+            # TODO: then modify it, and then discard it :). self.machines will not be updated by a .get/.getExn call.
+            # TODO: TangoDictionary's should have value semantics, so the value type should probably be a tuple
             self.log.debug("Creating empty pool of %s instances" % (vm.name))
         self.lock.release()
 
diff --git a/tangoObjects.py b/tangoObjects.py
index a5554e1e..9bf60b65 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -246,7 +246,7 @@ def setId(self, new_id):
         if self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
-            dictionary = TangoDictionary.create(dict_hash)
+            dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash)
             dictionary.delete(key)
             self._remoteLocation = dict_hash + ":" + str(new_id)
             self.updateRemote()
@@ -274,7 +274,7 @@ def syncRemote(self):
         if Config.USE_REDIS and self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
-            dictionary = TangoDictionary(dict_hash)
+            dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash)
             temp_job = dictionary.get(key) # Key should be in dictionary
             if temp_job is None:
                 print(f"Job {key} not found in dictionary {dict_hash}") # TODO: add better error handling for TangoJob
@@ -285,7 +285,7 @@ def updateRemote(self):
         if Config.USE_REDIS and self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
-            dictionary = TangoDictionary(dict_hash)
+            dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash)
             dictionary.set(key, self)
             
     def deleteFromDict(self, dictionary : TangoDictionary) -> None:
@@ -440,6 +440,7 @@ def make_empty(self):
                 break
 
 T = TypeVar('T')
+# Dictionary from string to T
 class TangoDictionary(Protocol[T]):
 
     @staticmethod
diff --git a/tests/testObjects.py b/tests/testObjects.py
index f0b08ea6..d477073f 100644
--- a/tests/testObjects.py
+++ b/tests/testObjects.py
@@ -18,7 +18,7 @@ def setUp(self):
         }
 
     def runDictionaryTests(self):
-        test_dict = TangoDictionary("test")
+        test_dict = TangoDictionary.create("test")
         self.assertEqual(test_dict.keys(), [])
         self.assertEqual(test_dict.values(), [])
 
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 8f59e1d0..02a3f577 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -27,6 +27,9 @@
 from mypy_boto3_ec2.service_resource import Instance
 from mypy_boto3_ec2.type_defs import FilterTypeDef
 
+from vmms.interface import VMMSInterface
+
+
 # suppress most boto logging
 logging.getLogger("boto3").setLevel(logging.CRITICAL)
 logging.getLogger("botocore").setLevel(logging.CRITICAL)
@@ -143,7 +146,7 @@ class ec2CallError(Exception):
     pass
 
 
-class Ec2SSH(object):
+class Ec2SSH(VMMSInterface):
     _SSH_FLAGS = [
         "-i",
         config.Config.SECURITY_KEY_PATH,
@@ -200,10 +203,11 @@ def __init__(self, accessKeyId=None, accessKey=None):
             # self.createKeyPair()
         # create boto3resource
 
-        self.img2ami = {}
+        self.img2ami = {} # this is a bad name, should really be img_name to img
         self.images = []
         try:
-            self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION)
+            # This is a service resource
+            self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) # TODO: rename this ot self.ec2resource
             self.boto3client = boto3.client("ec2", config.Config.EC2_REGION)
 
             # Get images from ec2
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index ada77c09..8ccfb7d5 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -14,6 +14,8 @@
 import config
 from tangoObjects import TangoMachine
 
+from vmms.interface import VMMSInterface
+
 
 def timeout(command, time_out=1):
     """timeout - Run a unix command with a timeout. Return -1 on
@@ -72,7 +74,7 @@ def timeoutWithReturnStatus(command, time_out, returnValue=0):
 #
 
 
-class LocalDocker(object):
+class LocalDocker(VMMSInterface):
     def __init__(self):
         """Checks if the machine is ready to run docker containers.
         Initialize boot2docker if running on OS X.
diff --git a/worker.py b/worker.py
index 59a17239..14da23f6 100644
--- a/worker.py
+++ b/worker.py
@@ -6,7 +6,7 @@
 import logging
 import tempfile
 import requests
-from requests.packages.urllib3.util.retry import Retry
+from urllib3.util.retry import Retry
 from requests.adapters import HTTPAdapter
 import os
 import shutil
@@ -41,6 +41,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM):
     #
     # Worker helper functions
     #
+    # TODO: These should not have default values in my opinion
     def detachVM(self, return_vm=False, replace_vm=False):
         """detachVM - Detach the VM from this worker. The options are
         to return it to the pool's free list (return_vm), destroy it
@@ -106,8 +107,8 @@ def rescheduleJob(self, hdrfile, ret, err):
             )
             self.appendMsg(
                 hdrfile,
-                "Job status: waitVM=%s copyIn=%s runJob=%s copyOut=%s"
-                % (ret["waitvm"], ret["copyin"], ret["runjob"], ret["copyout"]),
+                "Job status: waitVM=%s initializeVM=%s copyIn=%s runJob=%s copyOut=%s"
+                % (ret["waitvm"], ret["initializevm"], ret["copyin"], ret["runjob"], ret["copyout"]),
             )
 
             self.catFiles(hdrfile, self.job.outputFile)
@@ -165,7 +166,10 @@ def notifyServer(self, job):
         except Exception as e:
             self.log.debug("Error in notifyServer: %s" % str(e))
 
-    def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True):
+    def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): 
+        # TODO: I don't think killVM is a good variable name, it can refer to either returning or destroying the VM
+        # TODO: Also, what does it mean to not kill the VM? (i.e. not returning it)? It only gets called when we have a job.stopBefore.
+        # TODO: This directly contradicts the documentation of detachVM ("The worker must always call this function before returning.")
         self.jobQueue.makeDead(self.job, msg)
         
         # Update the text that users see in the autodriver output file
@@ -188,6 +192,7 @@ def run(self):
             # Hash of return codes for each step
             ret = {}
             ret["waitvm"] = None
+            ret["initializevm"] = None
             ret["copyin"] = None
             ret["runjob"] = None
             ret["copyout"] = None
@@ -201,6 +206,7 @@ def run(self):
 
             # Assigning job to a preallocated VM
             if self.preVM:  # self.preVM:
+                assert not Config.VMMS_NAME == "ec2ssh", "Unimplemented"
                 self.log.debug("Assigning job to preallocated VM")
                 self.job.makeVM(self.preVM)
                 self.log.info(
@@ -221,6 +227,7 @@ def run(self):
                     )
                 )
                 self.log.debug("Assigned job to preallocated VM")
+                ret["initializevm"] = 0 # Vacuous success since it doesn't happen
             # Assigning job to a new VM
             else:
                 self.log.debug("Assigning job to a new VM")
@@ -247,7 +254,15 @@ def run(self):
                 )
 
                 # Host name returned from EC2 is stored in the vm object
-                self.vmms.initializeVM(self.job.vm) # TODO: This can return None if the step fails, check for that
+                ret["initializevm"] = self.vmms.initializeVM(self.job.vm)
+                if ret["initializevm"] != 0:
+                    self.rescheduleJob(
+                        hdrfile,
+                        ret,
+                        "Internal error: initializeVM failed"
+                    )
+                    return
+                
                 self.log.debug("Asigned job to a new VM")
 
             vm = self.job.vm
@@ -321,8 +336,12 @@ def run(self):
                 msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"])
                 self.job.vm.notes = str(self.job.id) + "_" + self.job.name
                 self.job.vm.keep_for_debugging = True
-                self.afterJobExecution(hdrfile, msg, False)
-                # TODO: no reschedule?
+                self.log.debug(msg)
+                self.rescheduleJob(
+                    hdrfile,
+                    ret,
+                    msg
+                )
                 return
 
             self.log.info(
@@ -350,6 +369,13 @@ def run(self):
                 Config.runjob_errors += 1
                 if ret["runjob"] == -1:
                     Config.runjob_timeouts += 1
+                self.rescheduleJob(
+                    hdrfile,
+                    ret,
+                    "Internal error: runJob failed"
+                )
+                return
+            
             self.log.info(
                 "Job %s:%d executed [status=%s]"
                 % (self.job.name, self.job.id, ret["runjob"])
@@ -368,6 +394,13 @@ def run(self):
             ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile)
             if ret["copyout"] != 0:
                 Config.copyout_errors += 1
+                self.rescheduleJob(
+                    hdrfile,
+                    ret,
+                    "Internal error: copyOut failed"
+                )
+                return
+            
             self.log.info(
                 "Output copied for job %s:%d [status=%d]"
                 % (self.job.name, self.job.id, ret["copyout"])
@@ -382,6 +415,10 @@ def run(self):
             # normal termination and doesn't reschedule the job.
             self.log.info("Success: job %s:%d finished" % (self.job.name, self.job.id))
 
+            for status in ret.values():
+                assert status == 0, "Should not get to the success point if any stage failed"
+                # TODO: test this, then remove everything below this point
+                
             # Move the job from the live queue to the dead queue
             # with an explanatory message
             msg = "Success: Autodriver returned normally"

From 7e6751e61e71b7f7d8f5bb2859c76c567580a520 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Mon, 22 Sep 2025 21:08:06 -0400
Subject: [PATCH 11/39] more typing changes, streamlining the used of the
 TangoDictionary in preallocator (do note that my initial worry about
 incorrectness was unfounded due to weird Redis sharing behavior)

---
 jobManager.py        |  8 +++++---
 jobQueue.py          |  2 +-
 preallocator.py      | 15 ++++++++-------
 tangoObjects.py      | 42 ++++++++++++++++++++++++++++++++----------
 tests/testObjects.py |  2 +-
 5 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/jobManager.py b/jobManager.py
index 65d62116..c21d316f 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -20,7 +20,8 @@
 import tango  # Written this way to avoid circular imports
 from config import Config
 from jobQueue import JobQueue
-from tangoObjects import TangoJob, TangoQueue
+from tangoObjects import TangoJob, TangoQueue, TangoMachine
+from typing import List, Tuple
 from worker import Worker
 
 
@@ -143,14 +144,15 @@ def __manage(self) -> None:
         tango_server.log.debug("Resetting Tango VMs")
         tango_server.resetTango(tango_server.preallocator.vmms)
         for key in tango_server.preallocator.machines.keys():
-            tango_server.preallocator.machines.set(key, [[], TangoQueue(key)])
+            machine: Tuple[List[TangoMachine], TangoQueue] = ([], TangoQueue.create(key))
+            machine[1].make_empty()
+            tango_server.preallocator.machines.set(key, machine)
 
             # The above call sets the total pool empty.  But the free pool which
             # is a queue in redis, may not be empty.  When the job manager restarts,
             # resetting the free queue using the key doesn't change its content.
             # Therefore we empty the queue, thus the free pool, to keep it consistent
             # with the total pool.
-            tango_server.preallocator.machines.get(key)[1].make_empty()
         jobs = JobManager(tango_server.jobQueue)
 
         print("Starting the stand-alone Tango JobManager")
diff --git a/jobQueue.py b/jobQueue.py
index ee5b2956..34c3ae5b 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -55,7 +55,7 @@ def __init__(self, preallocator):
         """
         self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs")
         self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs")
-        self.unassignedJobs = TangoQueue("unassignedLiveJobs")
+        self.unassignedJobs = TangoQueue.create("unassignedLiveJobs")
         self.queueLock = threading.Lock()
         self.preallocator = preallocator
         self.log = logging.getLogger("JobQueue")
diff --git a/preallocator.py b/preallocator.py
index 6607c552..69291933 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -8,6 +8,7 @@
 
 from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine
 from config import Config
+from typing import Tuple, List
 
 #
 # Preallocator - This class maintains a pool of active VMs for future
@@ -22,7 +23,7 @@
 
 class Preallocator(object):
     def __init__(self, vmms):
-        self.machines: TangoDictionary[TangoMachine] = TangoDictionary.create("machines")
+        self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue]] = TangoDictionary.create("machines")
         self.lock = threading.Lock()
         self.nextID = TangoIntValue("nextID", 1000)
         self.vmms = vmms
@@ -46,11 +47,9 @@ def update(self, vm, num):
         """
         self.lock.acquire()
         if vm.name not in self.machines:
-            self.machines.set(vm.name, [[], TangoQueue(vm.name)])
-            self.machines.getExn(vm.name)[1].make_empty() # TODO: oh bruh this is incorrect.
-            # TODO: when used with a TangoRemoteDictionary, this will create a transient copy of the queue (from the redis pickle),
-            # TODO: then modify it, and then discard it :). self.machines will not be updated by a .get/.getExn call.
-            # TODO: TangoDictionary's should have value semantics, so the value type should probably be a tuple
+            initial_queue = TangoQueue.create(vm.name)
+            initial_queue.make_empty()
+            self.machines.set(vm.name, ([], initial_queue))
             self.log.debug("Creating empty pool of %s instances" % (vm.name))
         self.lock.release()
 
@@ -209,7 +208,9 @@ def destroyVM(self, vmName, id):
             for i in range(size):
                 vm = self.machines.get(vmName)[1].get_nowait()
                 if vm.id != id:
-                    self.machines.get(vmName)[1].put(vm)
+                    machine = self.machines.get(vmName)
+                    machine[1].put(vm)
+                    self.machines.set(vmName, machine)
                 else:
                     dieVM = vm
         self.lock.release()
diff --git a/tangoObjects.py b/tangoObjects.py
index 9bf60b65..7d0dfc55 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -342,16 +342,35 @@ def get(self):
     def set(self, val):
         self.val = val
         return val
+    
 
+class TangoQueue(Protocol):
+    @staticmethod
+    def create(key_name: str) -> TangoQueue:
+        if Config.USE_REDIS:
+            return TangoRemoteQueue(key_name)
+        else:
+            return ExtendedQueue()
 
-def TangoQueue(object_name):
-    if Config.USE_REDIS:
-        return TangoRemoteQueue(object_name)
-    else:
-        return ExtendedQueue()
-
+    @abstractmethod
+    def qsize(self) -> int:
+        ...
+    def empty(self) -> bool:
+        ...
+    def put(self, item) -> None:
+        ...
+    def get(self, block=True, timeout=None) -> Optional[T]:
+        ...
+    def get_nowait(self) -> Optional[T]:
+        ...
+    def remove(self, item) -> None:
+        ...
+    def _clean(self) -> None:
+        ...
+    def make_empty(self) -> None:
+        ...
 
-class ExtendedQueue(Queue):
+class ExtendedQueue(Queue, TangoQueue):
     """Python Thread safe Queue with the remove and clean function added"""
 
     def test(self):
@@ -369,9 +388,12 @@ def remove(self, value):
     def _clean(self):
         with self.mutex:
             self.queue.clear()
+            
+    def make_empty(self):
+        self._clean()
+            
 
-
-class TangoRemoteQueue(object):
+class TangoRemoteQueue(TangoQueue):
 
     """Simple Queue with Redis Backend"""
 
@@ -433,7 +455,7 @@ def remove(self, item):
     def _clean(self):
         self.__db.delete(self.key)
 
-    def make_empty(self):
+    def make_empty(self) -> None:
         while True:
             item = self.__db.lpop(self.key)
             if item is None:
diff --git a/tests/testObjects.py b/tests/testObjects.py
index d477073f..8ba81270 100644
--- a/tests/testObjects.py
+++ b/tests/testObjects.py
@@ -69,7 +69,7 @@ def addAllToQueue(self):
             self.assertEqual(self.testQueue.qsize(), self.expectedSize)
 
     def runQueueTests(self):
-        self.testQueue = TangoQueue("self.testQueue")
+        self.testQueue = TangoQueue.create("self.testQueue")
         self.expectedSize = 0
         self.assertEqual(self.testQueue.qsize(), self.expectedSize)
         self.assertTrue(self.testQueue.empty())

From 03e1f3181e23e26d738cb06ce06fb10f82ded926 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 28 Sep 2025 14:08:25 -0400
Subject: [PATCH 12/39] refactored afterJobExecution and detachVM

---
 worker.py | 92 +++++++++++++++++++++++++++----------------------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/worker.py b/worker.py
index 14da23f6..271a9b7f 100644
--- a/worker.py
+++ b/worker.py
@@ -10,6 +10,7 @@
 from requests.adapters import HTTPAdapter
 import os
 import shutil
+from enum import Enum
 
 from datetime import datetime
 from config import Config
@@ -25,6 +26,11 @@
 # anything else in the system.
 #
 
+class DetachMethod(Enum):
+    RETURN_TO_POOL = "return_to_pool"
+    DESTROY_WITHOUT_REPLACEMENT = "destroy_without_replacement"
+    DESTROY_AND_REPLACE = "replace"
+
 
 class Worker(threading.Thread):
     def __init__(self, job, vmms, jobQueue, preallocator, preVM):
@@ -37,12 +43,18 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM):
         self.preVM = preVM
         threading.Thread.__init__(self)
         self.log = logging.getLogger("Worker")
+        self.cleanupStatus = False
 
     #
     # Worker helper functions
     #
-    # TODO: These should not have default values in my opinion
-    def detachVM(self, return_vm=False, replace_vm=False):
+    def __del__(self):
+        if self.job.stopBefore == "": # We don't want to cleanup the VM if we are stopping early for debugging 
+            assert self.cleanupStatus, "Worker must call detachVM before returning"
+    
+    
+    # TODO: Return_vm should not have default values in my opinion
+    def detachVM(self, detachMethod: DetachMethod):
         """detachVM - Detach the VM from this worker. The options are
         to return it to the pool's free list (return_vm), destroy it
         (not return_vm), and if destroying it, whether to replace it
@@ -50,20 +62,25 @@ def detachVM(self, return_vm=False, replace_vm=False):
         this function before returning.
         """
         # job-owned instance, simply destroy after job is completed
+        self.cleanupStatus = True
         if self.job.vm.ec2_vmms:
             self.vmms.safeDestroyVM(self.job.vm)
-        elif return_vm:
-            self.preallocator.freeVM(self.job.vm)
         else:
-            self.vmms.safeDestroyVM(self.job.vm)
-            if replace_vm:
+            if detachMethod == DetachMethod.RETURN_TO_POOL:
+                self.preallocator.freeVM(self.job.vm)
+            elif detachMethod == DetachMethod.DESTROY_WITHOUT_REPLACEMENT:
+                self.vmms.safeDestroyVM(self.job.vm)
+                self.preallocator.removeVM(self.job.vm)
+            elif detachMethod == DetachMethod.DESTROY_AND_REPLACE:
+                self.vmms.safeDestroyVM(self.job.vm)
                 self.preallocator.createVM(self.job.vm)
-
-            # Important: don't remove the VM from the pool until its
-            # replacement has been created. Otherwise there is a
-            # potential race where the job manager thinks that the
-            # pool is empty and creates a spurious vm.
-            self.preallocator.removeVM(self.job.vm)
+                # Important: don't remove the VM from the pool until its
+                # replacement has been created. Otherwise there is a
+                # potential race where the job manager thinks that the
+                # pool is empty and creates a spurious vm.
+                self.preallocator.removeVM(self.job.vm)
+            else:
+                raise ValueError(f"Invalid detach method: {detachMethod}")
 
     # TODO: figure out what hdrfile, ret and err are
     def rescheduleJob(self, hdrfile, ret, err):
@@ -88,32 +105,18 @@ def rescheduleJob(self, hdrfile, ret, err):
                 os.remove(hdrfile)
             except OSError:
                 pass
-            self.detachVM(return_vm=False, replace_vm=True)
+            self.detachVM(DetachMethod.DESTROY_AND_REPLACE)
             self.jobQueue.unassignJob(self.job.id)
 
         # Here is where we give up
         else:
-            self.log.error("Giving up on job %s:%d" % (self.job.name, self.job.id))
+            full_err = f"Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}"
+            self.log.error(f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err))
             self.job.appendTrace(
-                "%s|Giving up on job %s:%d"
-                % (datetime.now().ctime(), self.job.name, self.job.id)
+                "%s|Giving up on job %s:%d. %s"
+                % (datetime.now().ctime(), self.job.name, self.job.id, full_err)
             )
-            self.jobQueue.makeDead(self.job, err) # Note: this reports the error that caused the last call to rescheduleJob to fail
-
-            self.appendMsg(
-                hdrfile,
-                "Internal error: Unable to complete job after %d tries. Pleae resubmit"
-                % (Config.JOB_RETRIES),
-            )
-            self.appendMsg(
-                hdrfile,
-                "Job status: waitVM=%s initializeVM=%s copyIn=%s runJob=%s copyOut=%s"
-                % (ret["waitvm"], ret["initializevm"], ret["copyin"], ret["runjob"], ret["copyout"]),
-            )
-
-            self.catFiles(hdrfile, self.job.outputFile)
-            self.detachVM(return_vm=False, replace_vm=True)
-            self.notifyServer(self.job)
+            self.afterJobExecution(hdrfile, full_err, DetachMethod.DESTROY_AND_REPLACE)
 
     def appendMsg(self, filename, msg):
         """appendMsg - Append a timestamped Tango message to a file"""
@@ -166,10 +169,7 @@ def notifyServer(self, job):
         except Exception as e:
             self.log.debug("Error in notifyServer: %s" % str(e))
 
-    def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True): 
-        # TODO: I don't think killVM is a good variable name, it can refer to either returning or destroying the VM
-        # TODO: Also, what does it mean to not kill the VM? (i.e. not returning it)? It only gets called when we have a job.stopBefore.
-        # TODO: This directly contradicts the documentation of detachVM ("The worker must always call this function before returning.")
+    def afterJobExecution(self, hdrfile, msg, detachMethod: Optional[DetachMethod]): 
         self.jobQueue.makeDead(self.job, msg)
         
         # Update the text that users see in the autodriver output file
@@ -178,8 +178,8 @@ def afterJobExecution(self, hdrfile, msg, returnVM, killVM=True):
         os.chmod(self.job.outputFile, 0o644)
         
         # Thread exit after termination
-        if killVM:
-            self.detachVM(return_vm=returnVM)
+        if detachMethod is not None:
+            self.detachVM(detachMethod)
         self.notifyServer(self.job)
         return
 
@@ -286,7 +286,7 @@ def run(self):
             if self.job.stopBefore == "waitvm":
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 returnVM = True
-                self.afterJobExecution(hdrfile, msg, returnVM, False)
+                self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT)
 
@@ -324,7 +324,7 @@ def run(self):
             if (self.job.stopBefore == "copyin"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 returnVM = True
-                self.afterJobExecution(hdrfile, msg, returnVM, False)
+                self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             # Copy input files to VM
             self.log.debug(f"Before copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}")
@@ -356,7 +356,7 @@ def run(self):
             if (self.job.stopBefore == "runjob"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 returnVM = True
-                self.afterJobExecution(hdrfile, msg, returnVM, False)
+                self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             # Run the job on the virtual machine
             ret["runjob"] = self.vmms.runJob(
@@ -388,7 +388,7 @@ def run(self):
             if (self.job.stopBefore == "copyout"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 returnVM = True
-                self.afterJobExecution(hdrfile, msg, returnVM, False)
+                self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             # Copy the output back.
             ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile)
@@ -422,7 +422,7 @@ def run(self):
             # Move the job from the live queue to the dead queue
             # with an explanatory message
             msg = "Success: Autodriver returned normally"
-            (returnVM, replaceVM) = (True, False)
+            detachMethod = DetachMethod.RETURN_TO_POOL
             if ret["copyin"] != 0:
                 msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"])
             elif ret["runjob"] != 0:
@@ -436,7 +436,7 @@ def run(self):
                     # and do not retry the job since the job may have damaged
                     # the VM.
                     msg = "Error: OS error while running job on VM"
-                    (returnVM, replaceVM) = (False, True)
+                    detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT
                     self.job.vm.notes = str(self.job.id) + "_" + self.job.name
                     self.job.vm.keep_for_debugging = True
                 else:  # This should never happen
@@ -447,7 +447,7 @@ def run(self):
             elif ret["copyout"] != 0:
                 msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"])
 
-            self.afterJobExecution(hdrfile, msg, returnVM)
+            self.afterJobExecution(hdrfile, msg, detachMethod)
             return
 
         #
@@ -463,4 +463,4 @@ def run(self):
             if self.preVM and not vm:
                 vm = self.job.vm = self.preVM
             if vm:
-                self.detachVM(return_vm=False, replace_vm=True)
+                self.detachVM(DetachMethod.DESTROY_AND_REPLACE)

From bc8c8a7c0b33fb33c49af83bcd9c33a8be1ef66b Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 28 Sep 2025 15:48:00 -0400
Subject: [PATCH 13/39] error messages

---
 worker.py | 81 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/worker.py b/worker.py
index 271a9b7f..a57e9e5f 100644
--- a/worker.py
+++ b/worker.py
@@ -53,7 +53,6 @@ def __del__(self):
             assert self.cleanupStatus, "Worker must call detachVM before returning"
     
     
-    # TODO: Return_vm should not have default values in my opinion
     def detachVM(self, detachMethod: DetachMethod):
         """detachVM - Detach the VM from this worker. The options are
         to return it to the pool's free list (return_vm), destroy it
@@ -65,6 +64,7 @@ def detachVM(self, detachMethod: DetachMethod):
         self.cleanupStatus = True
         if self.job.vm.ec2_vmms:
             self.vmms.safeDestroyVM(self.job.vm)
+            # TODO: what about the preallocator?
         else:
             if detachMethod == DetachMethod.RETURN_TO_POOL:
                 self.preallocator.freeVM(self.job.vm)
@@ -110,7 +110,7 @@ def rescheduleJob(self, hdrfile, ret, err):
 
         # Here is where we give up
         else:
-            full_err = f"Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}"
+            full_err = f"Internal Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}"
             self.log.error(f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err))
             self.job.appendTrace(
                 "%s|Giving up on job %s:%d. %s"
@@ -333,7 +333,7 @@ def run(self):
 
             if ret["copyin"] != 0:
                 Config.copyin_errors += 1
-                msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"])
+                msg = "Copy in to VM failed (status=%d)" % (ret["copyin"])
                 self.job.vm.notes = str(self.job.id) + "_" + self.job.name
                 self.job.vm.keep_for_debugging = True
                 self.log.debug(msg)
@@ -366,13 +366,31 @@ def run(self):
                 self.job.disableNetwork,
             )
             if ret["runjob"] != 0:
-                Config.runjob_errors += 1
-                if ret["runjob"] == -1:
+                if ret["runjob"] == 1:  # This should never happen
+                    msg = "RunJob: Autodriver usage error (status=%d)" % (ret["runjob"])
+                elif ret["runjob"] == 2:
+                    msg = "RunJob: Job timed out after %d seconds" % (self.job.timeout)
+                elif ret["runjob"] == 3:  # EXIT_OSERROR in Autodriver
+                    # Abnormal job termination (Autodriver encountered an OS
+                    # error).  Assume that the VM is damaged. Destroy this VM
+                    # and do not retry the job since the job may have damaged
+                    # the VM.
+                    msg = "RunJob: OS error while running job on VM"
+                    # TODO: do we need to not reschedule the job?
+                    self.job.vm.notes = str(self.job.id) + "_" + self.job.name
+                    self.job.vm.keep_for_debugging = True
+                elif ret["runjob"] == -1:
                     Config.runjob_timeouts += 1
+                    # TODO: difference between 2 and -1?
+                else:  # This should never happen
+                    msg = "RunJob: Unknown autodriver error (status=%d)" % (
+                        ret["runjob"]
+                    )
+                Config.runjob_errors += 1
                 self.rescheduleJob(
                     hdrfile,
                     ret,
-                    "Internal error: runJob failed"
+                    msg
                 )
                 return
             
@@ -397,7 +415,7 @@ def run(self):
                 self.rescheduleJob(
                     hdrfile,
                     ret,
-                    "Internal error: copyOut failed"
+                    f"Internal error: copyOut failed (status={ret['copyout']})"
                 )
                 return
             
@@ -422,32 +440,31 @@ def run(self):
             # Move the job from the live queue to the dead queue
             # with an explanatory message
             msg = "Success: Autodriver returned normally"
-            detachMethod = DetachMethod.RETURN_TO_POOL
-            if ret["copyin"] != 0:
-                msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"])
-            elif ret["runjob"] != 0:
-                if ret["runjob"] == 1:  # This should never happen
-                    msg = "Error: Autodriver usage error (status=%d)" % (ret["runjob"])
-                elif ret["runjob"] == 2:
-                    msg = "Error: Job timed out after %d seconds" % (self.job.timeout)
-                elif ret["runjob"] == 3:  # EXIT_OSERROR in Autodriver
-                    # Abnormal job termination (Autodriver encountered an OS
-                    # error).  Assume that the VM is damaged. Destroy this VM
-                    # and do not retry the job since the job may have damaged
-                    # the VM.
-                    msg = "Error: OS error while running job on VM"
-                    detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT
-                    self.job.vm.notes = str(self.job.id) + "_" + self.job.name
-                    self.job.vm.keep_for_debugging = True
-                else:  # This should never happen
-                    msg = "Error: Unknown autodriver error (status=%d)" % (
-                        ret["runjob"]
-                    )
-
-            elif ret["copyout"] != 0:
-                msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"])
+            self.afterJobExecution(hdrfile, msg, DetachMethod.RETURN_TO_POOL)
+            # if ret["copyin"] != 0:
+            #     msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"])
+            # elif ret["runjob"] != 0:
+            #     if ret["runjob"] == 1:  # This should never happen
+            #         msg = "Error: Autodriver usage error (status=%d)" % (ret["runjob"])
+            #     elif ret["runjob"] == 2:
+            #         msg = "Error: Job timed out after %d seconds" % (self.job.timeout)
+            #     elif ret["runjob"] == 3:  # EXIT_OSERROR in Autodriver
+            #         # Abnormal job termination (Autodriver encountered an OS
+            #         # error).  Assume that the VM is damaged. Destroy this VM
+            #         # and do not retry the job since the job may have damaged
+            #         # the VM.
+            #         msg = "Error: OS error while running job on VM"
+            #         detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT
+            #         self.job.vm.notes = str(self.job.id) + "_" + self.job.name
+            #         self.job.vm.keep_for_debugging = True
+            #     else:  # This should never happen
+            #         msg = "Error: Unknown autodriver error (status=%d)" % (
+            #             ret["runjob"]
+            #         )
+
+            # elif ret["copyout"] != 0:
+            #     msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"])
 
-            self.afterJobExecution(hdrfile, msg, detachMethod)
             return
 
         #

From 20b833b0017de3f25161c59d54087de89850a81c Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 28 Sep 2025 16:55:49 -0400
Subject: [PATCH 14/39] code cleanup: worker always gets initialized with a
 preallocated VM (else preVM.name will fail)

---
 vmms/ec2SSH.py |  1 +
 worker.py      | 88 ++++++++++++++------------------------------------
 2 files changed, 25 insertions(+), 64 deletions(-)

diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 02a3f577..73e63a1b 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -347,6 +347,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
         """initializeVM - Tell EC2 to create a new VM instance.
 
         Returns a boto.ec2.instance.Instance object.
+        Reads from vm's id and name, writes to vm's instance_id and domain_name
         """
         newInstance: Optional[Instance] = None
         # Create the instance and obtain the reservation
diff --git a/worker.py b/worker.py
index a57e9e5f..8bc84d5f 100644
--- a/worker.py
+++ b/worker.py
@@ -15,6 +15,8 @@
 from datetime import datetime
 from config import Config
 from jobQueue import JobQueue
+from typing import Optional
+from tangoObjects import TangoMachine
 #
 # Worker - The worker class is very simple and very dumb. The goal is
 # to walk through the VMMS interface, track the job's progress, and if
@@ -32,8 +34,9 @@ class DetachMethod(Enum):
     DESTROY_AND_REPLACE = "replace"
 
 
+# We always preallocate a VM for the worker to use
 class Worker(threading.Thread):
-    def __init__(self, job, vmms, jobQueue, preallocator, preVM):
+    def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine):
         threading.Thread.__init__(self)
         self.daemon = True
         self.job = job
@@ -204,69 +207,30 @@ def run(self):
             hdrfile = tempfile.mktemp()
             self.appendMsg(hdrfile, "Received job %s:%d" % (self.job.name, self.job.id))
 
-            # Assigning job to a preallocated VM
-            if self.preVM:  # self.preVM:
-                assert not Config.VMMS_NAME == "ec2ssh", "Unimplemented"
-                self.log.debug("Assigning job to preallocated VM")
-                self.job.makeVM(self.preVM)
-                self.log.info(
-                    "Assigned job %s:%d existing VM %s"
-                    % (
-                        self.job.name,
-                        self.job.id,
-                        self.vmms.instanceName(self.preVM.id, self.preVM.name),
-                    )
-                )
-                self.job.appendTrace(
-                    "%s|Assigned job %s:%d existing VM %s"
-                    % (
-                        datetime.now().ctime(),
-                        self.job.name,
-                        self.job.id,
-                        self.vmms.instanceName(self.preVM.id, self.preVM.name),
-                    )
-                )
-                self.log.debug("Assigned job to preallocated VM")
-                ret["initializevm"] = 0 # Vacuous success since it doesn't happen
-            # Assigning job to a new VM
-            else:
-                self.log.debug("Assigning job to a new VM")
-                self.job.syncRemote()
-                self.job.vm.id = self.job.id
-                self.job.updateRemote()
-
-                self.log.info(
-                    "Assigned job %s:%d new VM %s"
-                    % (
-                        self.job.name,
-                        self.job.id,
-                        self.vmms.instanceName(self.job.vm.id, self.job.vm.name),
-                    )
+            # Assigning job to the preallocated VM
+            self.log.debug("Assigning job to preallocated VM")
+            self.job.makeVM(self.preVM)
+            self.log.info(
+                "Assigned job %s:%d existing VM %s"
+                % (
+                    self.job.name,
+                    self.job.id,
+                    self.vmms.instanceName(self.preVM.id, self.preVM.name),
                 )
-                self.job.appendTrace(
-                    "%s|Assigned job %s:%d new VM %s"
-                    % (
-                        datetime.now().ctime(),
-                        self.job.name,
-                        self.job.id,
-                        self.vmms.instanceName(self.job.vm.id, self.job.vm.name),
-                    )
+            )
+            self.job.appendTrace(
+                "%s|Assigned job %s:%d existing VM %s"
+                % (
+                    datetime.now().ctime(),
+                    self.job.name,
+                    self.job.id,
+                    self.vmms.instanceName(self.preVM.id, self.preVM.name),
                 )
-
-                # Host name returned from EC2 is stored in the vm object
-                ret["initializevm"] = self.vmms.initializeVM(self.job.vm)
-                if ret["initializevm"] != 0:
-                    self.rescheduleJob(
-                        hdrfile,
-                        ret,
-                        "Internal error: initializeVM failed"
-                    )
-                    return
-                
-                self.log.debug("Asigned job to a new VM")
+            )
+            self.log.debug("Assigned job to preallocated VM")
+            ret["initializevm"] = 0 # Vacuous success since it doesn't happen
 
             vm = self.job.vm
-            returnVM = True
 
             # Wait for the instance to be ready
             self.log.debug(
@@ -285,7 +249,6 @@ def run(self):
             self.log.debug("Waiting for VM")
             if self.job.stopBefore == "waitvm":
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                returnVM = True
                 self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT)
@@ -323,7 +286,6 @@ def run(self):
 
             if (self.job.stopBefore == "copyin"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                returnVM = True
                 self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             # Copy input files to VM
@@ -355,7 +317,6 @@ def run(self):
 
             if (self.job.stopBefore == "runjob"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                returnVM = True
                 self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             # Run the job on the virtual machine
@@ -405,7 +366,6 @@ def run(self):
 
             if (self.job.stopBefore == "copyout"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                returnVM = True
                 self.afterJobExecution(hdrfile, msg, detachMethod=None)
                 return
             # Copy the output back.

From 138e33299479eca5ccf10c60bded480a5fcd488c Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 26 Oct 2025 22:17:55 -0400
Subject: [PATCH 15/39] fixed logging type safety

---
 jobManager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jobManager.py b/jobManager.py
index 635df83e..e4e4cb54 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -135,7 +135,7 @@ def __manage(self) -> None:
 if __name__ == "__main__":
 
     if not Config.USE_REDIS:
-        tango.log.error(
+        print(
             "You need to have Redis running to be able to initiate stand-alone\
          JobManager"
         )
@@ -155,5 +155,5 @@ def __manage(self) -> None:
             # with the total pool.
         jobs = JobManager(tango_server.jobQueue)
 
-        tango.log.info("Starting the stand-alone Tango JobManager")
+        tango_server.log.info("Starting the stand-alone Tango JobManager")
         jobs.run()

From d1a8b27aa7c5725b33f13c3f8104fbddc8f319df Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Sun, 26 Oct 2025 22:25:14 -0400
Subject: [PATCH 16/39] always assert that detachVM is called (taken care of
 with .keep_for_debuggin)

---
 worker.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/worker.py b/worker.py
index 2c3e078a..569bb89d 100644
--- a/worker.py
+++ b/worker.py
@@ -51,8 +51,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine):
     # Worker helper functions
     #
     def __del__(self):
-        if self.job.stopBefore == "": # We don't want to cleanup the VM if we are stopping early for debugging 
-            assert self.cleanupStatus, "Worker must call detachVM before returning"
+        assert self.cleanupStatus, "Worker must call detachVM before returning"
     
     
     def detachVM(self, detachMethod: DetachMethod):

From de383297745dcba1b27d70aaf55aeae62a683857 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Mon, 27 Oct 2025 01:12:56 -0400
Subject: [PATCH 17/39] more todos

---
 tangoObjects.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tangoObjects.py b/tangoObjects.py
index 63ed8f99..adfa71db 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -106,7 +106,7 @@ def __init__(
         stopBefore="",
     ):
         self._assigned = False
-        self._retries = 0
+        self._retries: int = 0
 
         self._vm = vm
         if input is None:
@@ -130,6 +130,7 @@ def __repr__(self):
         self.syncRemote()
         return f"ID: {self.id} - Name: {self.name}"
     
+    # TODO: reduce code size/duplication by setting TangoJob as a dataclass
     # Getters for private variables
     @property
     def assigned(self):

From 299e9998eaa2b19f664042e51de6f4b6050946b4 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 28 Oct 2025 15:21:47 -0400
Subject: [PATCH 18/39] stop Before bug fix

---
 tangoObjects.py |  7 ++++++-
 worker.py       | 16 ++++++++--------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/tangoObjects.py b/tangoObjects.py
index adfa71db..b0ce2da0 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -124,7 +124,7 @@ def __init__(
         self._accessKeyId = accessKeyId
         self._accessKey = accessKey
         self._disableNetwork = disableNetwork
-        self._stopBefore = "stopBefore"
+        self._stopBefore = stopBefore
 
     def __repr__(self):
         self.syncRemote()
@@ -257,6 +257,11 @@ def setTimeout(self, new_timeout):
         self._timeout = new_timeout
         self.updateRemote()
 
+    def setKeepForDebugging(self, keep_for_debugging: bool):
+        self.syncRemote()
+        self._vm.keep_for_debugging = keep_for_debugging
+        self.updateRemote()
+
     # Private method
     def __updateSelf(self, other_job):
         self._assigned = other_job._assigned
diff --git a/worker.py b/worker.py
index 569bb89d..c68400f0 100644
--- a/worker.py
+++ b/worker.py
@@ -246,7 +246,7 @@ def run(self):
             self.log.debug("Waiting for VM")
             if self.job.stopBefore == "waitvm":
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                self.job.vm.keep_for_debugging = True
+                self.job.setKeepForDebugging(True)
                 self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
                 return
             ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT)
@@ -281,11 +281,11 @@ def run(self):
                     self.job.id,
                 )
             )
-
             if (self.job.stopBefore == "copyin"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                self.job.vm.keep_for_debugging = True
+                self.job.setKeepForDebugging(True)
                 self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
+                self.log.debug(msg)
                 return
             # Copy input files to VM
             self.log.debug(f"Before copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}")
@@ -296,7 +296,7 @@ def run(self):
                 Config.copyin_errors += 1
                 msg = "Copy in to VM failed (status=%d)" % (ret["copyin"])
                 self.job.vm.notes = str(self.job.id) + "_" + self.job.name
-                self.job.vm.keep_for_debugging = True
+                self.job.setKeepForDebugging(True)
                 self.log.debug(msg)
                 self.rescheduleJob(
                     hdrfile,
@@ -316,7 +316,7 @@ def run(self):
 
             if (self.job.stopBefore == "runjob"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                self.job.vm.keep_for_debugging = True
+                self.job.setKeepForDebugging(True)
                 self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
                 return
             # Run the job on the virtual machine
@@ -339,7 +339,7 @@ def run(self):
                     msg = "RunJob: OS error while running job on VM"
                     # TODO: do we need to not reschedule the job?
                     self.job.vm.notes = str(self.job.id) + "_" + self.job.name
-                    self.job.vm.keep_for_debugging = True
+                    self.job.setKeepForDebugging(True)
                 elif ret["runjob"] == -1:
                     Config.runjob_timeouts += 1
                     # TODO: difference between 2 and -1?
@@ -366,7 +366,7 @@ def run(self):
 
             if (self.job.stopBefore == "copyout"):
                 msg = "Execution stopped before %s" % self.job.stopBefore
-                self.job.vm.keep_for_debugging = True
+                self.job.setKeepForDebugging(True)
                 self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
                 return
             # Copy the output back.
@@ -417,7 +417,7 @@ def run(self):
             #         msg = "Error: OS error while running job on VM"
             #         detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT
             #         self.job.vm.notes = str(self.job.id) + "_" + self.job.name
-            #         self.job.vm.keep_for_debugging = True
+            #         self.job.setKeepForDebugging(True)
             #     else:  # This should never happen
             #         msg = "Error: Unknown autodriver error (status=%d)" % (
             #             ret["runjob"]

From f44735e10af65112bf7b3837e72ad003fbcb7a69 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Thu, 6 Nov 2025 00:20:31 -0500
Subject: [PATCH 19/39] tangodictionary expanded key type

---
 jobQueue.py     |  5 +++--
 tangoObjects.py | 34 ++++++++++++++++++++++------------
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index 34c3ae5b..0cda7fd3 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -54,7 +54,7 @@ def __init__(self, preallocator):
         using the makeUnassigned api.
         """
         self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs")
-        self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs")
+        self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") # Servees as a record of both failed and completed jobs
         self.unassignedJobs = TangoQueue.create("unassignedLiveJobs")
         self.queueLock = threading.Lock()
         self.preallocator = preallocator
@@ -168,7 +168,8 @@ def add(self, job):
 
         return str(job.id)
 
-    def addDead(self, job):
+    # TODO: get rid of this return value, it is not used anywhere 
+    def addDead(self, job) -> int: 
         """addDead - add a job to the dead queue.
         Called by validateJob when a job validation fails.
         Returns -1 on failure and the job id on success
diff --git a/tangoObjects.py b/tangoObjects.py
index adfa71db..4dfc1170 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -7,7 +7,7 @@
 from queue import Queue
 import pickle
 import redis
-from typing import Optional, Protocol, TypeVar
+from typing import Optional, Protocol, TypeVar, Union
 from abc import abstractmethod
 
 redisConnection = None
@@ -125,10 +125,11 @@ def __init__(
         self._accessKey = accessKey
         self._disableNetwork = disableNetwork
         self._stopBefore = "stopBefore"
+        self._id: Optional[int] = None # uninitialized until it gets added to either the live or dead queue
 
     def __repr__(self):
         self.syncRemote()
-        return f"ID: {self.id} - Name: {self.name}"
+        return f"ID: {self._id} - Name: {self.name}"
     
     # TODO: reduce code size/duplication by setting TangoJob as a dataclass
     # Getters for private variables
@@ -207,6 +208,11 @@ def stopBefore(self):
         self.syncRemote()
         return self._stopBefore
     
+    @property
+    def id(self) -> int:
+        self.syncRemote()
+        assert self._id is not None, "Job ID is not set, add it to the job queue first"
+        return self._id
 
     def makeAssigned(self):
         self.syncRemote()
@@ -242,8 +248,8 @@ def appendTrace(self, trace_str):
         self._trace.append(trace_str)
         self.updateRemote()
 
-    def setId(self, new_id):
-        self.id = new_id
+    def setId(self, new_id: int) -> None:
+        self._id = new_id
         if self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
@@ -269,6 +275,7 @@ def __updateSelf(self, other_job):
         self._timeout = other_job._timeout
         self._trace = other_job._trace
         self._maxOutputFileSize = other_job._maxOutputFileSize
+        self._id = other_job._id
 
 
     def syncRemote(self):
@@ -290,14 +297,16 @@ def updateRemote(self):
             dictionary.set(key, self)
             
     def deleteFromDict(self, dictionary : TangoDictionary) -> None:
-        dictionary.delete(self.id)
+        assert self._id is not None
+        dictionary.delete(self._id)
         self._remoteLocation = None
         
     def addToDict(self, dictionary : TangoDictionary) -> None:
-        dictionary.set(self.id, self)
+        assert self._id is not None
+        dictionary.set(self._id, self)
         assert self._remoteLocation is None, "Job already has a remote location"
         if Config.USE_REDIS:
-            self._remoteLocation = dictionary.hash_name + ":" + str(self.id)
+            self._remoteLocation = dictionary.hash_name + ":" + str(self._id)
             self.updateRemote()
         
 
@@ -460,6 +469,7 @@ def make_empty(self) -> None:
         self.__db.delete(self.key)
 
 T = TypeVar('T')
+KeyType = Union[str, int]
 # Dictionary from string to T
 class TangoDictionary(Protocol[T]):
 
@@ -476,16 +486,16 @@ def hash_name(self) -> str:
         ...
         
     @abstractmethod
-    def __contains__(self, id: str) -> bool:
+    def __contains__(self, id: KeyType) -> bool:
         ...
     @abstractmethod
-    def set(self, id: str, obj: T) -> str:
+    def set(self, id: KeyType, obj: T) -> str:
         ...
     @abstractmethod
-    def get(self, id: str) -> Optional[T]:
+    def get(self, id: KeyType) -> Optional[T]:
         ...
     @abstractmethod
-    def getExn(self, id: str) -> T:
+    def getExn(self, id: KeyType) -> T:
         ...
     @abstractmethod
     def keys(self) -> list[str]:
@@ -494,7 +504,7 @@ def keys(self) -> list[str]:
     def values(self) -> list[T]:
         ...
     @abstractmethod
-    def delete(self, id: str) -> None:
+    def delete(self, id: KeyType) -> None:
         ...
     @abstractmethod
     def _clean(self) -> None:

From f7b4acbe22fbddb63258619043b6ffb99df1145e Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Thu, 6 Nov 2025 00:39:25 -0500
Subject: [PATCH 20/39] more no-op type annotations

---
 tangoObjects.py | 11 ++++++-----
 vmms/ec2SSH.py  | 10 +++++-----
 worker.py       |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/tangoObjects.py b/tangoObjects.py
index 82a08a45..8c49b3d3 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -264,9 +264,10 @@ def setTimeout(self, new_timeout):
         self.updateRemote()
 
     def setKeepForDebugging(self, keep_for_debugging: bool):
-        self.syncRemote()
-        self._vm.keep_for_debugging = keep_for_debugging
-        self.updateRemote()
+        if (self._vm is not None):
+            self.syncRemote()
+            self._vm.keep_for_debugging = keep_for_debugging
+            self.updateRemote()
 
     # Private method
     def __updateSelf(self, other_job):
@@ -283,7 +284,7 @@ def __updateSelf(self, other_job):
         self._id = other_job._id
 
 
-    def syncRemote(self):
+    def syncRemote(self) -> None:
         if Config.USE_REDIS and self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
@@ -294,7 +295,7 @@ def syncRemote(self):
                 return
             self.__updateSelf(temp_job)
 
-    def updateRemote(self):
+    def updateRemote(self) -> None:
         if Config.USE_REDIS and self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 914de719..6753d757 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -22,9 +22,9 @@
 
 import config
 from tangoObjects import TangoMachine
-from typing import Optional, Literal, List, Sequence
+from typing import Optional, Literal, List, Dict, Sequence
 from mypy_boto3_ec2 import EC2ServiceResource
-from mypy_boto3_ec2.service_resource import Instance
+from mypy_boto3_ec2.service_resource import Instance, Image
 from mypy_boto3_ec2.type_defs import FilterTypeDef
 
 from vmms.interface import VMMSInterface
@@ -170,7 +170,7 @@ def release_vm_semaphore():
         Ec2SSH._vm_semaphore.release()
 
     # TODO: the arguments accessKeyId and accessKey don't do anything
-    def __init__(self, accessKeyId=None, accessKey=None):
+    def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = None) -> None:
         """log - logger for the instance
         connection - EC2Connection object that stores the connection
         info to the EC2 network
@@ -203,8 +203,8 @@ def __init__(self, accessKeyId=None, accessKey=None):
             # self.createKeyPair()
         # create boto3resource
 
-        self.img2ami = {} # this is a bad name, should really be img_name to img
-        self.images = []
+        self.img2ami: Dict[str, Image] = {} # this is a bad name, should really be img_name to img
+        self.images: List[Image] = []
         try:
             # This is a service resource
             self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) # TODO: rename this ot self.ec2resource
diff --git a/worker.py b/worker.py
index c68400f0..692a30de 100644
--- a/worker.py
+++ b/worker.py
@@ -439,6 +439,6 @@ def run(self):
             # if vm is not set but self.preVM is set, we still need
             # to return the VM, but have to initialize self.job.vm first
             if self.preVM and not vm:
-                vm = self.job.vm = self.preVM
+                vm = self.job.vm = self.preVM 
             if vm:
                 self.detachVM(DetachMethod.DESTROY_AND_REPLACE)

From b643a9862ca5c2c942a230b91f9d1ae190ab318f Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Thu, 6 Nov 2025 01:01:14 -0500
Subject: [PATCH 21/39] 'no issues found in 25 source files'

---
 jobQueue.py        |  9 +++++----
 tango.py           |  5 +++--
 tangoObjects.py    | 15 ++++++++-------
 vmms/distDocker.py |  2 +-
 vmms/tashiSSH.py   |  4 ++--
 5 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index 0cda7fd3..acbb31c8 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -14,6 +14,7 @@
 from datetime import datetime
 from tangoObjects import TangoDictionary, TangoJob, TangoQueue
 from config import Config
+from preallocator import Preallocator
 
 #
 # JobQueue - This class defines the job queue and the functions for
@@ -31,7 +32,7 @@
 
 
 class JobQueue(object):
-    def __init__(self, preallocator):
+    def __init__(self, preallocator: Preallocator) -> None:
         """
         Here we maintain several data structures used to keep track of the
         jobs present for the autograder.
@@ -55,9 +56,9 @@ def __init__(self, preallocator):
         """
         self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs")
         self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") # Servees as a record of both failed and completed jobs
-        self.unassignedJobs = TangoQueue.create("unassignedLiveJobs")
+        self.unassignedJobs: TangoQueue[int] = TangoQueue.create("unassignedLiveJobs")
         self.queueLock = threading.Lock()
-        self.preallocator = preallocator
+        self.preallocator: Preallocator = preallocator
         self.log = logging.getLogger("JobQueue")
         self.nextID = 1
 
@@ -357,6 +358,7 @@ def getNextPendingJob(self) -> TangoJob:
         """
         # Blocks till the next item is added
         id = self.unassignedJobs.get()
+        assert id is not None, ".get with default arguments should block and never return None"
 
         self.log.debug("_getNextPendingJob|Acquiring lock to job queue.")
         self.queueLock.acquire()
@@ -366,7 +368,6 @@ def getNextPendingJob(self) -> TangoJob:
         job = self.liveJobs.get(id)
         if job is None:
             raise Exception("Cannot find unassigned job in live jobs")
-
         self.log.debug("getNextPendingJob| Releasing lock to job queue.")
         self.queueLock.release()
         self.log.debug("getNextPendingJob| Released lock to job queue.")
diff --git a/tango.py b/tango.py
index 3f94cd12..925413f0 100755
--- a/tango.py
+++ b/tango.py
@@ -48,16 +48,17 @@
 from jobQueue import JobQueue
 from tangoObjects import TangoJob
 from config import Config
+from vmms.interface import VMMSInterface
 
 
 class TangoServer(object):
 
     """TangoServer - Implements the API functions that the server accepts"""
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.daemon = True
 
-        vmms = None
+        vmms: VMMSInterface
         if Config.VMMS_NAME == "tashiSSH":
             from vmms.tashiSSH import TashiSSH
 
diff --git a/tangoObjects.py b/tangoObjects.py
index 8c49b3d3..8dd4d888 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -360,9 +360,10 @@ def set(self, val):
         return val
     
 
-class TangoQueue(Protocol):
+QueueElem = TypeVar('QueueElem')
+class TangoQueue(Protocol[QueueElem]):
     @staticmethod
-    def create(key_name: str) -> TangoQueue:
+    def create(key_name: str) -> TangoQueue[QueueElem]:
         if Config.USE_REDIS:
             return TangoRemoteQueue(key_name)
         else:
@@ -373,20 +374,20 @@ def qsize(self) -> int:
         ...
     def empty(self) -> bool:
         ...
-    def put(self, item) -> None:
+    def put(self, item: QueueElem) -> None:
         ...
-    def get(self, block=True, timeout=None) -> Optional[T]:
+    def get(self, block=True, timeout=None) -> Optional[QueueElem]:
         ...
-    def get_nowait(self) -> Optional[T]:
+    def get_nowait(self) -> Optional[QueueElem]:
         ...
-    def remove(self, item) -> None:
+    def remove(self, item: QueueElem) -> None:
         ...
     def _clean(self) -> None:
         ...
     def make_empty(self) -> None:
         ...
 
-class ExtendedQueue(Queue, TangoQueue):
+class ExtendedQueue(Queue, TangoQueue[QueueElem]):
     """Python Thread safe Queue with the remove and clean function added"""
 
     def test(self):
diff --git a/vmms/distDocker.py b/vmms/distDocker.py
index b6d498e5..1dc93bb6 100644
--- a/vmms/distDocker.py
+++ b/vmms/distDocker.py
@@ -187,7 +187,7 @@ def waitVM(self, vm, max_secs):
             # Sleep a bit before trying again
             time.sleep(config.Config.TIMER_POLL_INTERVAL)
 
-    def copyIn(self, vm, inputFiles):
+    def copyIn(self, vm, inputFiles, job_id=None):
         """copyIn - Create a directory to be mounted as a volume
         for the docker containers on the host machine for this VM.
         Copy input files to this directory on the host machine.
diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py
index e0e7c58e..74113755 100644
--- a/vmms/tashiSSH.py
+++ b/vmms/tashiSSH.py
@@ -254,7 +254,7 @@ def waitVM(self, vm, max_secs):
             # Sleep a bit before trying again
             time.sleep(config.Config.TIMER_POLL_INTERVAL)
 
-    def copyIn(self, vm, inputFiles):
+    def copyIn(self, vm, inputFiles, job_id=None):
         """copyIn - Copy input files to VM"""
         domain_name = self.domainName(vm.id, vm.name)
         self.log.debug("Creating autolab directory on VM")
@@ -292,7 +292,7 @@ def copyIn(self, vm, inputFiles):
                 return ret
         return 0
 
-    def runJob(self, vm, runTimeout, maxOutputFileSize):
+    def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
         """runJob - Run the make command on a VM using SSH and
         redirect output to file "output".
         """

From 5439c159a525982bb71f64eb2f2be49673ecabbf Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Thu, 6 Nov 2025 13:04:02 -0500
Subject: [PATCH 22/39] changed ret in worker.py to not have Nones

---
 clients/tango-cli.py |  1 -
 jobQueue.py          | 10 +++++-----
 mypy.ini             |  6 ++++++
 preallocator.py      |  8 ++++----
 tangoObjects.py      |  6 +++---
 vmms/distDocker.py   |  3 ++-
 vmms/localDocker.py  |  9 ++++++---
 worker.py            | 14 +++++---------
 8 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/clients/tango-cli.py b/clients/tango-cli.py
index a2a47ba0..bfa2c14b 100755
--- a/clients/tango-cli.py
+++ b/clients/tango-cli.py
@@ -274,7 +274,6 @@ def tango_upload():
 
 def tango_addJob():
     try:
-        requestObj = {}
         res = checkKey() + checkCourselab() + checkInfiles()
         if res != 0:
             raise Exception("Invalid usage: [addJob] " + addJob_help)
diff --git a/jobQueue.py b/jobQueue.py
index acbb31c8..6633adab 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -12,10 +12,10 @@
 import time
 
 from datetime import datetime
-from tangoObjects import TangoDictionary, TangoJob, TangoQueue
+from tangoObjects import TangoDictionary, TangoJob, TangoQueue, TangoMachine
 from config import Config
 from preallocator import Preallocator
-
+from typing import Optional
 #
 # JobQueue - This class defines the job queue and the functions for
 # manipulating it. The actual queue is made up of two smaller
@@ -251,7 +251,7 @@ def get(self, id):
     # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. 
     # TODO: It does assign the job to a particular VM though.
     # Precondition: jobId is in self.liveJobs
-    def assignJob(self, jobId, vm=None) -> None:
+    def assignJob(self, jobId, vm=None):
         """assignJob - marks a job to be assigned"""
         self.queueLock.acquire()
         self.log.debug("assignJob| Acquired lock to job queue.")
@@ -274,7 +274,7 @@ def assignJob(self, jobId, vm=None) -> None:
         # return job
 
     # TODO: Rename this job to be more accurate in its description
-    def unassignJob(self, jobId):
+    def unassignJob(self, jobId: int) -> None:
         """unassignJob - marks a job to be unassigned
         Note: We assume here that a job is to be rescheduled or
         'retried' when you unassign it. This retry is done by
@@ -284,7 +284,7 @@ def unassignJob(self, jobId):
         self.log.debug("unassignJob| Acquired lock to job queue.")
 
         # Get the current job
-        job = self.liveJobs.get(jobId)
+        job = self.liveJobs.getExn(jobId)
 
         # Increment the number of retires
         if job.retries is None:
diff --git a/mypy.ini b/mypy.ini
index f6332a89..7fe2ea10 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -4,5 +4,11 @@ explicit_package_bases = True
 [mypy-vmms.tashiSSH]
 ignore_errors = True
 
+[mypy-vmms.distDocker]
+ignore_errors = True
+
+[mypy-tests.*]
+ignore_errors = True
+
 [mypy-preallocator]
 ignore_errors = True
\ No newline at end of file
diff --git a/preallocator.py b/preallocator.py
index 69291933..acfd9a91 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -9,7 +9,7 @@
 from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine
 from config import Config
 from typing import Tuple, List
-
+from vmms.interface import VMMSInterface
 #
 # Preallocator - This class maintains a pool of active VMs for future
 # job requests.  The pool is stored in dictionary called
@@ -22,11 +22,11 @@
 
 
 class Preallocator(object):
-    def __init__(self, vmms):
-        self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue]] = TangoDictionary.create("machines")
+    def __init__(self, vmms: VMMSInterface) -> None:
+        self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue[TangoMachine]]] = TangoDictionary.create("machines")
         self.lock = threading.Lock()
         self.nextID = TangoIntValue("nextID", 1000)
-        self.vmms = vmms
+        self.vmms: VMMSInterface = vmms
         self.log = logging.getLogger("Preallocator")
 
     def poolSize(self, vmName):
diff --git a/tangoObjects.py b/tangoObjects.py
index 8dd4d888..49b05ea9 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -93,7 +93,7 @@ class TangoJob(object):
 
     def __init__(
         self,
-        vm: Optional[TangoMachine] = None,
+        vm: TangoMachine,
         outputFile=None,
         name=None,
         input=None,
@@ -108,7 +108,7 @@ def __init__(
         self._assigned = False
         self._retries: int = 0
 
-        self._vm = vm
+        self._vm: TangoMachine = vm
         if input is None:
             self._input = []
         else:
@@ -229,7 +229,7 @@ def incrementRetries(self):
         self._retries += 1
         self.updateRemote()
 
-    def makeVM(self, vm):
+    def makeVM(self, vm: TangoMachine) -> None:
         self.syncRemote()
         self._vm = vm
         self.updateRemote()
diff --git a/vmms/distDocker.py b/vmms/distDocker.py
index 1dc93bb6..ec0e475c 100644
--- a/vmms/distDocker.py
+++ b/vmms/distDocker.py
@@ -21,6 +21,7 @@
 import socket
 import config
 from tangoObjects import TangoMachine
+from vmms.interface import VMMSInterface
 
 
 def timeout(command, time_out=1):
@@ -75,7 +76,7 @@ def timeoutWithReturnStatus(command, time_out, returnValue=0):
     return ret
 
 
-class DistDocker(object):
+class DistDocker(VMMSInterface):
 
     _SSH_FLAGS = ["-q", "-o", "BatchMode=yes"]
     _SSH_AUTH_FLAGS = [
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index 8ccfb7d5..fe2ad73c 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -13,11 +13,11 @@
 import shutil
 import config
 from tangoObjects import TangoMachine
-
+from typing import List
 from vmms.interface import VMMSInterface
 
 
-def timeout(command, time_out=1):
+def timeout(command: List[str], time_out: float = 1) -> int:
     """timeout - Run a unix command with a timeout. Return -1 on
     timeout, otherwise return the return value from the command, which
     is typically 0 for success, 1-255 for failure.
@@ -35,6 +35,7 @@ def timeout(command, time_out=1):
         t += config.Config.TIMER_POLL_INTERVAL
 
     # Determine why the while loop terminated
+    returncode: int
     if p.poll() is None:
         try:
             os.kill(p.pid, 9)
@@ -42,7 +43,9 @@ def timeout(command, time_out=1):
             pass
         returncode = -1
     else:
-        returncode = p.poll()
+        poll_result = p.poll()
+        assert poll_result is not None
+        returncode = poll_result
     return returncode
 
 
diff --git a/worker.py b/worker.py
index 692a30de..cf6e5a03 100644
--- a/worker.py
+++ b/worker.py
@@ -16,6 +16,8 @@
 from config import Config
 from jobQueue import JobQueue
 from tangoObjects import TangoMachine
+from typing import Dict, Optional
+from vmms.interface import VMMSInterface
 #
 # Worker - The worker class is very simple and very dumb. The goal is
 # to walk through the VMMS interface, track the job's progress, and if
@@ -39,7 +41,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine):
         threading.Thread.__init__(self)
         self.daemon = True
         self.job = job
-        self.vmms = vmms
+        self.vmms: VMMSInterface = vmms
         self.jobQueue : JobQueue = jobQueue
         self.preallocator = preallocator
         self.preVM = preVM
@@ -154,7 +156,7 @@ def notifyServer(self, job):
                 self.log.debug("Sending request to %s" % job.notifyURL)
                 with requests.session() as s:
                     # urllib3 retry, allow POST to be retried, use backoffs
-                    r = Retry(total=10, allowed_methods=False, backoff_factor=1)
+                    r = Retry(total=10, allowed_methods=None, backoff_factor=1)
                     s.mount("http://", HTTPAdapter(max_retries=r))
                     s.mount("https://", HTTPAdapter(max_retries=r))
                     response = s.post(
@@ -190,13 +192,7 @@ def run(self):
         """run - Step a job through its execution sequence"""
         try:
             # Hash of return codes for each step
-            ret = {}
-            ret["waitvm"] = None
-            ret["initializevm"] = None
-            ret["copyin"] = None
-            ret["runjob"] = None
-            ret["copyout"] = None
-            print("HELLO")
+            ret: Dict[str, int] = {}
             self.log.debug("Run worker")
             vm = None
 

From 2dc0ee744a48f0972a75f7b39dbdbc89f3a6385a Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 11 Nov 2025 12:27:41 -0500
Subject: [PATCH 23/39] moved timeout into VMMSUtils

---
 jobManager.py       |  2 ++
 preallocator.py     |  6 ++--
 vmms/distDocker.py  | 81 +++++++++------------------------------------
 vmms/ec2SSH.py      | 58 ++------------------------------
 vmms/interface.py   |  3 ++
 vmms/localDocker.py | 63 +++--------------------------------
 vmms/tashiSSH.py    | 34 -------------------
 7 files changed, 31 insertions(+), 216 deletions(-)

diff --git a/jobManager.py b/jobManager.py
index e4e4cb54..50cd9553 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -23,6 +23,7 @@
 from tangoObjects import TangoJob, TangoQueue, TangoMachine
 from typing import List, Tuple
 from worker import Worker
+from vmms.interface import VMMSInterface
 
 
 
@@ -73,6 +74,7 @@ def __manage(self) -> None:
                     # Sleep for a bit and then check again
                     time.sleep(Config.DISPATCH_PERIOD)
 
+            vmms: VMMSInterface
             try:
                 # if the job is a ec2 vmms job
                 # spin up an ec2 instance for that job
diff --git a/preallocator.py b/preallocator.py
index acfd9a91..16b800b3 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -8,7 +8,7 @@
 
 from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue, TangoMachine
 from config import Config
-from typing import Tuple, List
+from typing import Tuple, List, Dict
 from vmms.interface import VMMSInterface
 #
 # Preallocator - This class maintains a pool of active VMs for future
@@ -22,11 +22,11 @@
 
 
 class Preallocator(object):
-    def __init__(self, vmms: VMMSInterface) -> None:
+    def __init__(self, vmms: Dict[str, VMMSInterface]) -> None:
         self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue[TangoMachine]]] = TangoDictionary.create("machines")
         self.lock = threading.Lock()
         self.nextID = TangoIntValue("nextID", 1000)
-        self.vmms: VMMSInterface = vmms
+        self.vmms: Dict[str, VMMSInterface] = vmms
         self.log = logging.getLogger("Preallocator")
 
     def poolSize(self, vmName):
diff --git a/vmms/distDocker.py b/vmms/distDocker.py
index ec0e475c..ab90bcb6 100644
--- a/vmms/distDocker.py
+++ b/vmms/distDocker.py
@@ -22,61 +22,10 @@
 import config
 from tangoObjects import TangoMachine
 from vmms.interface import VMMSInterface
+from vmms.sharedUtils import VMMSUtils
 
 
-def timeout(command, time_out=1):
-    """timeout - Run a unix command with a timeout. Return -1 on
-    timeout, otherwise return the return value from the command, which
-    is typically 0 for success, 1-255 for failure.
-    """
-
-    # Launch the command
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-
-    # Wait for the command to complete
-    t = 0.0
-    while t < time_out and p.poll() is None:
-        time.sleep(config.Config.TIMER_POLL_INTERVAL)
-        t += config.Config.TIMER_POLL_INTERVAL
-
-    # Determine why the while loop terminated
-    if p.poll() is None:
-        try:
-            os.kill(p.pid, 9)
-        except OSError:
-            pass
-        returncode = -1
-    else:
-        returncode = p.poll()
-    return returncode
-
-
-def timeoutWithReturnStatus(command, time_out, returnValue=0):
-    """timeoutWithReturnStatus - Run a Unix command with a timeout,
-    until the expected value is returned by the command; On timeout,
-    return last error code obtained from the command.
-    """
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-    t = 0.0
-    while t < time_out:
-        ret = p.poll()
-        if ret is None:
-            time.sleep(config.Config.TIMER_POLL_INTERVAL)
-            t += config.Config.TIMER_POLL_INTERVAL
-        elif ret == returnValue:
-            return ret
-        else:
-            p = subprocess.Popen(
-                command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-            )
-    return ret
-
-
-class DistDocker(VMMSInterface):
+class DistDocker(VMMSInterface, VMMSUtils):
 
     _SSH_FLAGS = ["-q", "-o", "BatchMode=yes"]
     _SSH_AUTH_FLAGS = [
@@ -172,7 +121,7 @@ def waitVM(self, vm, max_secs):
             # If the call to ssh returns timeout (-1) or ssh error
             # (255), then success. Otherwise, keep trying until we run
             # out of time.
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -197,7 +146,7 @@ def copyIn(self, vm, inputFiles, job_id=None):
         volumePath = self.getVolumePath(instanceName)
 
         if vm.use_ssh_master:
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -209,7 +158,7 @@ def copyIn(self, vm, inputFiles, job_id=None):
                 return ret
 
         # Create a fresh volume
-        ret = timeout(
+        ret = VMMSUtils.timeout(
             ["ssh"]
             + DistDocker._SSH_FLAGS
             + vm.ssh_flags
@@ -225,7 +174,7 @@ def copyIn(self, vm, inputFiles, job_id=None):
             return ret
 
         for file in inputFiles:
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["scp"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -261,7 +210,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
         volumePath = self.getVolumePath(instanceName)
 
         if vm.use_ssh_master:
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -303,7 +252,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
 
         self.log.debug("Running job: %s" % args)
 
-        ret = timeout(
+        ret = VMMSUtils.timeout(
             ["ssh"]
             + DistDocker._SSH_FLAGS
             + vm.ssh_flags
@@ -324,7 +273,7 @@ def copyOut(self, vm, destFile):
         volumePath = self.getVolumePath(instanceName)
 
         if vm.use_ssh_master:
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -335,7 +284,7 @@ def copyOut(self, vm, destFile):
                 self.log.debug("Lost persistent SSH connection")
                 return ret
 
-        ret = timeout(
+        ret = VMMSUtils.timeout(
             ["scp"]
             + DistDocker._SSH_FLAGS
             + vm.ssh_flags
@@ -356,7 +305,7 @@ def destroyVM(self, vm):
         instanceName = self.instanceName(vm.id, vm.image)
         volumePath = self.getVolumePath(instanceName)
         if vm.use_ssh_master:
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -372,7 +321,7 @@ def destroyVM(self, vm):
         # Do a hard kill on corresponding docker container.
         # Return status does not matter.
         args = "(docker rm -f %s)" % (instanceName)
-        timeout(
+        VMMSUtils.timeout(
             ["ssh"]
             + DistDocker._SSH_FLAGS
             + vm.ssh_flags
@@ -380,7 +329,7 @@ def destroyVM(self, vm):
             config.Config.DOCKER_RM_TIMEOUT,
         )
         # Destroy corresponding volume if it exists.
-        timeout(
+        VMMSUtils.timeout(
             ["ssh"]
             + DistDocker._SSH_FLAGS
             + vm.ssh_flags
@@ -389,7 +338,7 @@ def destroyVM(self, vm):
         )
         self.log.debug("Deleted volume %s" % instanceName)
         if vm.use_ssh_master:
-            timeout(
+            VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
@@ -490,7 +439,7 @@ def getPartialOutput(self, vm):
         instanceName = self.instanceName(vm.id, vm.image)
 
         if vm.use_ssh_master:
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + DistDocker._SSH_FLAGS
                 + vm.ssh_flags
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 6753d757..27134f6a 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -28,7 +28,7 @@
 from mypy_boto3_ec2.type_defs import FilterTypeDef
 
 from vmms.interface import VMMSInterface
-
+from vmms.sharedUtils import VMMSUtils
 
 # suppress most boto logging
 logging.getLogger("boto3").setLevel(logging.CRITICAL)
@@ -36,35 +36,6 @@
 logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL)
 
 
-def timeout(command, time_out=1):
-    """timeout - Run a unix command with a timeout. Return -1 on
-    timeout, otherwise return the return value from the command, which
-    is typically 0 for success, 1-255 for failure.
-    """
-
-    # Launch the command
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-
-    # Wait for the command to complete
-    t = 0.0
-    while t < time_out and p.poll() is None:
-        time.sleep(config.Config.TIMER_POLL_INTERVAL)
-        t += config.Config.TIMER_POLL_INTERVAL
-    if t >= time_out:
-        print("ERROR: timeout trying ", command)
-    # Determine why the while loop terminated
-    if p.poll() is None:
-        try:
-            os.kill(p.pid, 9)
-        except OSError:
-            pass
-        returncode = -1
-    else:
-        returncode = p.poll()
-    return returncode
-
 
 def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2):
     """timeout - Run a unix command with a timeout. Return -1 on
@@ -108,29 +79,6 @@ def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2):
             return returncode
 
 
-def timeoutWithReturnStatus(command, time_out, returnValue=0):
-    """timeoutWithReturnStatus - Run a Unix command with a timeout,
-    until the expected value is returned by the command; On timeout,
-    return last error code obtained from the command.
-    """
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-    t = 0.0
-    while t < time_out:
-        ret = p.poll()
-        if ret is None:
-            time.sleep(config.Config.TIMER_POLL_INTERVAL)
-            t += config.Config.TIMER_POLL_INTERVAL
-        elif ret == returnValue:
-            return ret
-        else:
-            p = subprocess.Popen(
-                command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-            )
-            return ret
-
-
 @backoff.on_exception(backoff.expo, ClientError, max_tries=3, jitter=None)
 def try_load_instance(newInstance):
     newInstance.load()
@@ -146,7 +94,7 @@ class ec2CallError(Exception):
     pass
 
 
-class Ec2SSH(VMMSInterface):
+class Ec2SSH(VMMSInterface, VMMSUtils):
     _SSH_FLAGS = [
         "-i",
         config.Config.SECURITY_KEY_PATH,
@@ -517,7 +465,7 @@ def waitVM(self, vm, max_secs) -> Literal[0, -1]:
             # If the call to ssh returns timeout (-1) or ssh error
             # (255), then success. Otherwise, keep trying until we run
             # out of time.
-            ret = timeout(
+            ret = VMMSUtils.timeout(
                 ["ssh"]
                 + self.ssh_flags
                 + ["%s@%s" % (self.ec2User, domain_name), "(:)"],
diff --git a/vmms/interface.py b/vmms/interface.py
index d71500f9..04043c13 100644
--- a/vmms/interface.py
+++ b/vmms/interface.py
@@ -40,3 +40,6 @@ def getVMs(self) -> List[TangoMachine]:
     def existsVM(self, vm: TangoMachine) -> bool:
         ...
 
+    @abstractmethod
+    def instanceName(self, id: int, name: str) -> str:
+        ...
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index fe2ad73c..b002a06e 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -15,69 +15,16 @@
 from tangoObjects import TangoMachine
 from typing import List
 from vmms.interface import VMMSInterface
+from vmms.sharedUtils import VMMSUtils
 
 
-def timeout(command: List[str], time_out: float = 1) -> int:
-    """timeout - Run a unix command with a timeout. Return -1 on
-    timeout, otherwise return the return value from the command, which
-    is typically 0 for success, 1-255 for failure.
-    """
-
-    # Launch the command
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-
-    # Wait for the command to complete
-    t = 0.0
-    while t < time_out and p.poll() is None:
-        time.sleep(config.Config.TIMER_POLL_INTERVAL)
-        t += config.Config.TIMER_POLL_INTERVAL
-
-    # Determine why the while loop terminated
-    returncode: int
-    if p.poll() is None:
-        try:
-            os.kill(p.pid, 9)
-        except OSError:
-            pass
-        returncode = -1
-    else:
-        poll_result = p.poll()
-        assert poll_result is not None
-        returncode = poll_result
-    return returncode
-
-
-def timeoutWithReturnStatus(command, time_out, returnValue=0):
-    """timeoutWithReturnStatus - Run a Unix command with a timeout,
-    until the expected value is returned by the command; On timeout,
-    return last error code obtained from the command.
-    """
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-    t = 0.0
-    while t < time_out:
-        ret = p.poll()
-        if ret is None:
-            time.sleep(config.Config.TIMER_POLL_INTERVAL)
-            t += config.Config.TIMER_POLL_INTERVAL
-        elif ret == returnValue:
-            return ret
-        else:
-            p = subprocess.Popen(
-                command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-            )
-    return ret
-
 
 #
 # User defined exceptions
 #
 
 
-class LocalDocker(VMMSInterface):
+class LocalDocker(VMMSInterface, VMMSUtils):
     def __init__(self):
         """Checks if the machine is ready to run docker containers.
         Initialize boot2docker if running on OS X.
@@ -188,7 +135,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
         ]
 
         self.log.debug("Running job: %s" % str(args))
-        ret = timeout(args, runTimeout * 2)
+        ret = VMMSUtils.timeout(args, runTimeout * 2)
         self.log.debug("runJob returning %d" % ret)
 
         return ret
@@ -212,7 +159,7 @@ def destroyVM(self, vm):
         volumePath = self.getVolumePath("")
         # Do a hard kill on corresponding docker container.
         # Return status does not matter.
-        timeout(["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT)
+        VMMSUtils.timeout(["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT)
         # Destroy corresponding volume if it exists.
         if instanceName in os.listdir(volumePath):
             shutil.rmtree(volumePath + instanceName)
@@ -254,7 +201,7 @@ def existsVM(self, vm):
         a non-zero status upon not finding a container.
         """
         instanceName = self.instanceName(vm.id, vm.name)
-        ret = timeout(["docker", "inspect", instanceName])
+        ret = VMMSUtils.timeout(["docker", "inspect", instanceName])
         return ret == 0
 
     def getImages(self):
diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py
index 74113755..93486bae 100644
--- a/vmms/tashiSSH.py
+++ b/vmms/tashiSSH.py
@@ -54,40 +54,6 @@ def timeout(command, time_out=1):
     return returncode
 
 
-def timeoutWithReturnStatus(command, time_out, returnValue=0):
-    """timeoutWithReturnStatus - Run a Unix command with a timeout,
-    until the expected value is returned by the command; On timeout,
-    return last error code obtained from the command.
-    """
-    if (config.Config.LOGLEVEL is logging.DEBUG) and (
-        "ssh" in command or "scp" in command
-    ):
-        out = sys.stdout
-        err = sys.stderr
-    else:
-        out = open("/dev/null", "w")
-        err = sys.stdout
-
-    # Launch the command
-    p = subprocess.Popen(
-        command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-    )
-
-    t = 0.0
-    while t < time_out:
-        ret = p.poll()
-        if ret is None:
-            time.sleep(config.Config.TIMER_POLL_INTERVAL)
-            t += config.Config.TIMER_POLL_INTERVAL
-        elif ret == returnValue:
-            return ret
-        else:
-            p = subprocess.Popen(
-                command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
-            )
-    return ret
-
-
 #
 # User defined exceptions
 #

From bababf95e39b9d02180faa8fb57aa2895860634b Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 11 Nov 2025 12:48:54 -0500
Subject: [PATCH 24/39] tidied code: return statement in timeout_with_retries

---
 vmms/ec2SSH.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 27134f6a..e4692c03 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -37,12 +37,12 @@
 
 
 
-def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2):
+def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2) -> int:
     """timeout - Run a unix command with a timeout. Return -1 on
     timeout, otherwise return the return value from the command, which
     is typically 0 for success, 1-255 for failure.
     """
-    for attempt in range(retries + 1):
+    for _ in range(retries + 1):
         # Launch the command
         p = subprocess.Popen(
             command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
@@ -57,26 +57,24 @@ def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2):
             print("ERROR: timeout trying ", command)
 
         # Determine why the while loop terminated
-        if p.poll() is None:
+        poll_result: Optional[int] = p.poll()
+        if poll_result is None:
             try:
                 os.kill(p.pid, 9)
             except OSError:
                 pass
             returncode = -1
         else:
-            returncode = p.poll()
+            returncode = poll_result
 
         # try to retry the command on a timeout
         if returncode == -1:
-            if attempt < retries:
-                print(f"Retrying in {retry_delay} seconds...")
-                time.sleep(retry_delay)
-            else:
-                # attempt == retries -> failure
-                print("All retries exhausted.")
-                return -1
+            print(f"Retrying in {retry_delay} seconds...")
+            time.sleep(retry_delay)
         else:
             return returncode
+    print("All retries exhausted.")
+    return -1
 
 
 @backoff.on_exception(backoff.expo, ClientError, max_tries=3, jitter=None)
@@ -566,7 +564,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
         )
         # no logging for now
 
-        ret = timeout(
+        ret = VMMSUtils.timeout(
             ["ssh"]
             + self.ssh_flags
             + ["%s@%s" % (self.ec2User, domain_name), runcmd],
@@ -625,7 +623,7 @@ def copyOut(self, vm, destFile):
                 # Error copying out the timing data (probably runJob failed)
                 pass
 
-        return timeout(
+        return VMMSUtils.timeout(
             ["scp"]
             + self.ssh_flags
             + [

From ee7d71e4033c4c093ddace6f6e47ffe6d1758f5a Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 11 Nov 2025 12:59:18 -0500
Subject: [PATCH 25/39] added to the VMMSInterfae

---
 vmms/interface.py | 8 ++++++++
 vmms/tashiSSH.py  | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/vmms/interface.py b/vmms/interface.py
index 04043c13..6235e227 100644
--- a/vmms/interface.py
+++ b/vmms/interface.py
@@ -43,3 +43,11 @@ def existsVM(self, vm: TangoMachine) -> bool:
     @abstractmethod
     def instanceName(self, id: int, name: str) -> str:
         ...
+
+    @abstractmethod
+    def getImages(self) -> List[str]:
+        ...
+    
+    @abstractmethod
+    def getPartialOutput(self, vm: TangoMachine) -> str:
+        ...
diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py
index 93486bae..6562abfe 100644
--- a/vmms/tashiSSH.py
+++ b/vmms/tashiSSH.py
@@ -64,7 +64,7 @@ class tashiCallError(Exception):
     pass
 
 
-class TashiSSH(object):
+class TashiSSH(VMMSInterface):
     _SSH_FLAGS = [
         "-q",
         "-i",

From 6abab6cab4381fe0212986d1547ed78fd724588d Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 11 Nov 2025 14:14:10 -0500
Subject: [PATCH 26/39] all the changes that are basically no-ops

---
 restful_tango/tangoREST.py |  8 ++++++--
 tango.py                   |  9 ---------
 tangoObjects.py            |  1 +
 vmms/ec2SSH.py             | 17 +++++++++--------
 4 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/restful_tango/tangoREST.py b/restful_tango/tangoREST.py
index 0e67b0b2..d246daba 100644
--- a/restful_tango/tangoREST.py
+++ b/restful_tango/tangoREST.py
@@ -306,7 +306,9 @@ def upload(self, key, courselab, file, tempfile, fileMD5):
                     os.unlink(tempfile)
                     return self.status.wrong_courselab
             except Exception as e:
-                exc_type, exc_obj, exc_tb = sys.exc_info()
+                exc_type, _, exc_tb = sys.exc_info()
+                assert exc_type is not None
+                assert exc_tb is not None # currently handling an exception
                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                 print(exc_type, fname, exc_tb.tb_lineno)
                 self.log.error("upload request failed: %s" % str(e))
@@ -335,7 +337,9 @@ def addJob(self, key, courselab, jobStr):
                 result["jobId"] = jobId
                 return result
             except Exception as e:
-                exc_type, exc_obj, exc_tb = sys.exc_info()
+                exc_type, _, exc_tb = sys.exc_info()
+                assert exc_type is not None
+                assert exc_tb is not None # currently handling an exception
                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                 print(exc_type, fname, exc_tb.tb_lineno)
                 self.log.error("addJob request failed: %s" % str(e))
diff --git a/tango.py b/tango.py
index 925413f0..a8cfa7dd 100755
--- a/tango.py
+++ b/tango.py
@@ -355,15 +355,6 @@ def __validateJob(self, job, vmms):
                 )
                 errors += 1
 
-        # Check for max output file size parameter
-        if not job.maxOutputFileSize:
-            self.log.debug(
-                "validateJob: Setting job.maxOutputFileSize "
-                "to default value: %d bytes",
-                Config.MAX_OUTPUT_FILE_SIZE,
-            )
-            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE
-
         # Check the list of input files
         hasMakefile = False
         for inputFile in job.input:
diff --git a/tangoObjects.py b/tangoObjects.py
index 49b05ea9..9297db13 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -91,6 +91,7 @@ class TangoJob(object):
     TangoJob - A job that is to be run on a TangoMachine
     """
 
+    # TODO: do we really want all of these default values?
     def __init__(
         self,
         vm: TangoMachine,
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index e4692c03..b3aa7197 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -260,7 +260,7 @@ def deleteKeyPair(self):
     def createSecurityGroup(self):
         try:
             # Check if the security group already exists
-            response = self.boto3client.describe_security_groups(
+            description_response = self.boto3client.describe_security_groups(
                 Filters=[
                     {
                         "Name": "group-name",
@@ -268,18 +268,18 @@ def createSecurityGroup(self):
                     }
                 ]
             )
-            if response["SecurityGroups"]:
-                security_group_id = response["SecurityGroups"][0]["GroupId"]
+            if description_response["SecurityGroups"]:
+                security_group_id = description_response["SecurityGroups"][0]["GroupId"]
                 return
         except Exception as e:
             self.log.debug("ERROR checking for existing security group: %s", e)
 
         try:
-            response = self.boto3resource.create_security_group(
+            security_group_response = self.boto3resource.create_security_group(
                 GroupName=config.Config.DEFAULT_SECURITY_GROUP,
                 Description="Autolab security group - allowing all traffic",
             )
-            security_group_id = response["GroupId"]
+            security_group_id = security_group_response["GroupId"]
             self.boto3resource.authorize_security_group_ingress(
                 GroupId=security_group_id
             )
@@ -619,9 +619,10 @@ def copyOut(self, vm, destFile):
 
             except subprocess.CalledProcessError as xxx_todo_changeme:
                 # Error copying out the timing data (probably runJob failed)
-                re.error = xxx_todo_changeme
+                # re.error = xxx_todo_changeme
                 # Error copying out the timing data (probably runJob failed)
                 pass
+                
 
         return VMMSUtils.timeout(
             ["scp"]
@@ -689,7 +690,7 @@ def getVMs(self):
         """
         try:
             vms = list()
-            filters = [
+            filters: Sequence[FilterTypeDef] = [
                 {
                     "Name": "instance-state-name",
                     "Values": ["running", "pending"],
@@ -741,7 +742,7 @@ def getVMs(self):
     def existsVM(self, vm):
         """existsVM - Checks whether a VM exists in the vmms."""
         # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrationec2.html
-        filters = [{"Name": "instance-state-name", "Values": ["running"]}]
+        filters: Sequence[FilterTypeDef] = [{"Name": "instance-state-name", "Values": ["running"]}]
         # gets all running instances
         instances = self.boto3resource.instances.filter(Filters=filters)
         for instance in instances:

From 2325e54236133135eaeed797211b2440cb18cedd Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 11 Nov 2025 15:36:35 -0500
Subject: [PATCH 27/39] empty line

---
 vmms/localDocker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index 8ccfb7d5..338bc6d4 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -17,6 +17,7 @@
 from vmms.interface import VMMSInterface
 
 
+
 def timeout(command, time_out=1):
     """timeout - Run a unix command with a timeout. Return -1 on
     timeout, otherwise return the return value from the command, which

From 5b34e24f76e39cc19cc2a6fb48328d67ef311f1f Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 18 Nov 2025 12:52:13 -0500
Subject: [PATCH 28/39] replaced _clean with make_empty for both TangoQueue and
 TangoDictionary

---
 jobQueue.py     |  8 ++++----
 tangoObjects.py | 17 ++++-------------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/jobQueue.py b/jobQueue.py
index 34c3ae5b..d00969fc 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -341,13 +341,13 @@ def getInfo(self):
 
         return info
 
-    def reset(self):
+    def reset(self) -> None:
         """reset - resets and clears all the internal dictionaries
         and queues
         """
-        self.liveJobs._clean()
-        self.deadJobs._clean()
-        self.unassignedJobs._clean()
+        self.liveJobs.make_empty()
+        self.deadJobs.make_empty()
+        self.unassignedJobs.make_empty()
 
     def getNextPendingJob(self) -> TangoJob:
         """Gets the next unassigned live job. Note that this is a
diff --git a/tangoObjects.py b/tangoObjects.py
index b0ce2da0..191bf141 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -371,8 +371,6 @@ def get_nowait(self) -> Optional[T]:
         ...
     def remove(self, item) -> None:
         ...
-    def _clean(self) -> None:
-        ...
     def make_empty(self) -> None:
         ...
 
@@ -391,14 +389,10 @@ def remove(self, value):
         with self.mutex:
             self.queue.remove(value)
 
-    def _clean(self):
+    def make_empty(self):
         with self.mutex:
             self.queue.clear()
             
-    def make_empty(self):
-        self._clean()
-            
-
 class TangoRemoteQueue(TangoQueue):
 
     """Simple Queue with Redis Backend"""
@@ -458,9 +452,6 @@ def remove(self, item):
         pickled_item = pickle.dumps(item)
         return self.__db.lrem(self.key, 0, pickled_item)
 
-    def _clean(self):
-        self.__db.delete(self.key)
-
     def make_empty(self) -> None:
         self.__db.delete(self.key)
 
@@ -502,7 +493,7 @@ def values(self) -> list[T]:
     def delete(self, id: str) -> None:
         ...
     @abstractmethod
-    def _clean(self) -> None:
+    def make_empty(self) -> None:
         ...
     @abstractmethod
     def items(self) -> list[tuple[str, T]]:
@@ -568,7 +559,7 @@ def values(self):
     def delete(self, id):
         self.r.hdel(self.hash_name, id)
 
-    def _clean(self):
+    def make_empty(self):
         # only for testing
         self.r.delete(self.hash_name)
 
@@ -629,6 +620,6 @@ def items(self):
             ]
         )
 
-    def _clean(self):
+    def make_empty(self):
         # only for testing
         return

From ee45c4d9c024271e8dfc1398277e493965404811 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 18 Nov 2025 12:52:59 -0500
Subject: [PATCH 29/39] removed dead code

---
 worker.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/worker.py b/worker.py
index c68400f0..cf62b61f 100644
--- a/worker.py
+++ b/worker.py
@@ -402,30 +402,6 @@ def run(self):
             # with an explanatory message
             msg = "Success: Autodriver returned normally"
             self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.RETURN_TO_POOL)
-            # if ret["copyin"] != 0:
-            #     msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"])
-            # elif ret["runjob"] != 0:
-            #     if ret["runjob"] == 1:  # This should never happen
-            #         msg = "Error: Autodriver usage error (status=%d)" % (ret["runjob"])
-            #     elif ret["runjob"] == 2:
-            #         msg = "Error: Job timed out after %d seconds" % (self.job.timeout)
-            #     elif ret["runjob"] == 3:  # EXIT_OSERROR in Autodriver
-            #         # Abnormal job termination (Autodriver encountered an OS
-            #         # error).  Assume that the VM is damaged. Destroy this VM
-            #         # and do not retry the job since the job may have damaged
-            #         # the VM.
-            #         msg = "Error: OS error while running job on VM"
-            #         detachMethod = DetachMethod.DESTROY_WITHOUT_REPLACEMENT
-            #         self.job.vm.notes = str(self.job.id) + "_" + self.job.name
-            #         self.job.setKeepForDebugging(True)
-            #     else:  # This should never happen
-            #         msg = "Error: Unknown autodriver error (status=%d)" % (
-            #             ret["runjob"]
-            #         )
-
-            # elif ret["copyout"] != 0:
-            #     msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"])
-
             return
 
         #

From ff65d39a85e1b841386a636fd682154c8f7aa241 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Mon, 1 Dec 2025 23:51:32 -0500
Subject: [PATCH 30/39] changed some dead code in createSecurityGroup to allow
 for type safety

---
 vmms/ec2SSH.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index b3aa7197..edb68280 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -257,6 +257,8 @@ def deleteKeyPair(self):
         # except OSError:
         #     pass
 
+    # Creates a security group if it doesn't exist.
+    # ^ Note: strangely, the security group id is never used.
     def createSecurityGroup(self):
         try:
             # Check if the security group already exists
@@ -273,14 +275,16 @@ def createSecurityGroup(self):
                 return
         except Exception as e:
             self.log.debug("ERROR checking for existing security group: %s", e)
-
+        # ! Note: We've never encountered the lines below before (there was a type error),
+        # ! because we've always had a security group. 
+        # ! Difficult to test because it involves deleting all security groups.
         try:
-            security_group_response = self.boto3resource.create_security_group(
+            security_group_response = self.boto3client.create_security_group(
                 GroupName=config.Config.DEFAULT_SECURITY_GROUP,
                 Description="Autolab security group - allowing all traffic",
             )
             security_group_id = security_group_response["GroupId"]
-            self.boto3resource.authorize_security_group_ingress(
+            self.boto3client.authorize_security_group_ingress(
                 GroupId=security_group_id
             )
         except Exception as e:

From 0c7d45baf25ccec95f068a12e796738788d3415d Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 09:58:08 -0500
Subject: [PATCH 31/39] removed unused variable

---
 vmms/ec2SSH.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index edb68280..6681a488 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -726,7 +726,6 @@ def getVMs(self):
 
                 vm.name = instName
                 vm.id = int(instName.split("-")[1])
-                vm.pool = instName.split("-")[2]
                 vm.name = instName
 
                 # needed for SSH

From 40b7ace15b1fe9ba99a25bdd912bf0b9111c115d Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 09:59:11 -0500
Subject: [PATCH 32/39] fixed error logging in server.py

---
 restful_tango/server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/restful_tango/server.py b/restful_tango/server.py
index b31f0e8b..1a177ce0 100755
--- a/restful_tango/server.py
+++ b/restful_tango/server.py
@@ -46,7 +46,7 @@ def prepare(self):
         if not os.path.exists(tempdir):
             os.mkdir(tempdir, 0o700)
         if os.path.exists(tempdir) and not os.path.isdir(tempdir):
-            tangoREST.log("Cannot process uploads, %s is not a directory" % (tempdir,))
+            tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,))
             return self.send_error()
         self.tempfile = NamedTemporaryFile(prefix="upload", dir=tempdir, delete=False)
         self.hasher = hashlib.md5()
@@ -129,7 +129,7 @@ def prepare(self):
         if not os.path.exists(tempdir):
             os.mkdir(tempdir, 0o700)
         if os.path.exists(tempdir) and not os.path.isdir(tempdir):
-            tangoREST.log("Cannot process uploads, %s is not a directory" % (tempdir,))
+            tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,))
             return self.send_error()
         self.tempfile = NamedTemporaryFile(prefix="docker", dir=tempdir, delete=False)
 

From 25a9cf267a439958455c9570baf26ab59b4a84dc Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 10:01:45 -0500
Subject: [PATCH 33/39] proper decoding of bytes

---
 worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/worker.py b/worker.py
index cf6e5a03..fc439d59 100644
--- a/worker.py
+++ b/worker.py
@@ -164,7 +164,7 @@ def notifyServer(self, job):
                     )
                 self.log.info(
                     "Response from callback to %s:%s"
-                    % (job.notifyURL, response.content)
+                    % (job.notifyURL, response.content.decode())
                 )
                 fh.close()
             else:

From d4a8c5562ee935ab85360887a16a5a7e25db3c63 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 10:22:32 -0500
Subject: [PATCH 34/39] preallocator changes

---
 mypy.ini        |  3 ---
 preallocator.py | 47 ++++++++++++++++++++++++++---------------------
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/mypy.ini b/mypy.ini
index 7fe2ea10..f607e0a5 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -9,6 +9,3 @@ ignore_errors = True
 
 [mypy-tests.*]
 ignore_errors = True
-
-[mypy-preallocator]
-ignore_errors = True
\ No newline at end of file
diff --git a/preallocator.py b/preallocator.py
index 16b800b3..a5ba8e63 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -47,13 +47,13 @@ def update(self, vm, num):
         """
         self.lock.acquire()
         if vm.name not in self.machines:
-            initial_queue = TangoQueue.create(vm.name)
+            initial_queue: TangoQueue[TangoMachine] = TangoQueue.create(vm.name)
             initial_queue.make_empty()
             self.machines.set(vm.name, ([], initial_queue))
             self.log.debug("Creating empty pool of %s instances" % (vm.name))
         self.lock.release()
 
-        delta = num - len(self.machines.get(vm.name)[0])
+        delta = num - len(self.machines.getExn(vm.name)[0])
         if delta > 0:
             # We need more self.machines, spin them up.
             self.log.debug("update: Creating %d new %s instances" % (delta, vm.name))
@@ -75,8 +75,8 @@ def allocVM(self, vmName):
         if vmName in self.machines:
             self.lock.acquire()
 
-        if not self.machines.get(vmName)[1].empty():
-            vm = self.machines.get(vmName)[1].get_nowait()
+        if not self.machines.getExn(vmName)[1].empty():
+            vm = self.machines.getExn(vmName)[1].get_nowait()
 
         self.lock.release()
 
@@ -92,8 +92,8 @@ def freeVM(self, vm):
         # still a member of the pool.
         not_found = False
         self.lock.acquire()
-        if vm and vm.id in self.machines.get(vm.name)[0]:
-            machine = self.machines.get(vm.name)
+        if vm and vm.id in self.machines.getExn(vm.name)[0]:
+            machine = self.machines.getExn(vm.name)
             machine[1].put(vm)
             self.machines.set(vm.name, machine)
         else:
@@ -108,7 +108,7 @@ def freeVM(self, vm):
     def addVM(self, vm):
         """addVM - add a particular VM instance to the pool"""
         self.lock.acquire()
-        machine = self.machines.get(vm.name)
+        machine = self.machines.getExn(vm.name)
         machine[0].append(vm.id)
         self.machines.set(vm.name, machine)
         self.lock.release()
@@ -116,7 +116,7 @@ def addVM(self, vm):
     def removeVM(self, vm):
         """removeVM - remove a particular VM instance from the pool"""
         self.lock.acquire()
-        machine = self.machines.get(vm.name)
+        machine = self.machines.getExn(vm.name)
         machine[0].remove(vm.id)
         self.machines.set(vm.name, machine)
         self.lock.release()
@@ -167,7 +167,7 @@ def __destroy(self, vm):
         the free list is empty.
         """
         self.lock.acquire()
-        dieVM = self.machines.get(vm.name)[1].get_nowait()
+        dieVM = self.machines.getExn(vm.name)[1].get_nowait()
         self.lock.release()
 
         if dieVM:
@@ -203,12 +203,14 @@ def destroyVM(self, vmName, id):
 
         dieVM = None
         self.lock.acquire()
-        size = self.machines.get(vmName)[1].qsize()
-        if size == len(self.machines.get(vmName)[0]):
-            for i in range(size):
-                vm = self.machines.get(vmName)[1].get_nowait()
+        size = self.machines.getExn(vmName)[1].qsize()
+        if size == len(self.machines.getExn(vmName)[0]):
+            for _ in range(size):
+                vm = self.machines.getExn(vmName)[1].get_nowait()
+                if vm is None:
+                    break
                 if vm.id != id:
-                    machine = self.machines.get(vmName)
+                    machine = self.machines.getExn(vmName)
                     machine[1].put(vm)
                     self.machines.set(vmName, machine)
                 else:
@@ -217,7 +219,7 @@ def destroyVM(self, vmName, id):
 
         if dieVM:
             self.removeVM(dieVM)
-            vmms = self.vmms[vm.vmms]
+            vmms = self.vmms[dieVM.vmms]
             vmms.safeDestroyVM(dieVM)
             return 0
         else:
@@ -229,9 +231,10 @@ def getAllPools(self):
             result[vmName] = self.getPool(vmName)
         return result
 
+    # TODO: replace with a named tuple
     def getPool(self, vmName):
         """getPool - returns the members of a pool and its free list"""
-        result = {}
+        result: Dict[str, List[TangoMachine]] = {}
         if vmName not in self.machines:
             return result
 
@@ -239,15 +242,17 @@ def getPool(self, vmName):
         result["free"] = []
         free_list = []
         self.lock.acquire()
-        size = self.machines.get(vmName)[1].qsize()
-        for i in range(size):
-            vm = self.machines.get(vmName)[1].get_nowait()
+        size = self.machines.getExn(vmName)[1].qsize()
+        for _ in range(size):
+            vm = self.machines.getExn(vmName)[1].get_nowait()
+            if vm is None:
+                break
             free_list.append(vm.id)
-            machine = self.machines.get(vmName)
+            machine = self.machines.getExn(vmName)
             machine[1].put(vm)
             self.machines.set(vmName, machine)
         self.lock.release()
 
-        result["total"] = self.machines.get(vmName)[0]
+        result["total"] = self.machines.getExn(vmName)[0]
         result["free"] = free_list
         return result

From e71b5b1120192e6aa61fda6b3a2ee52849118f72 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 11:05:20 -0500
Subject: [PATCH 35/39] fixes

---
 vmms/sharedUtils.py | 38 ++++++++++++++++++++++++++++++++++++++
 worker.py           |  1 -
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 vmms/sharedUtils.py

diff --git a/vmms/sharedUtils.py b/vmms/sharedUtils.py
new file mode 100644
index 00000000..55e16195
--- /dev/null
+++ b/vmms/sharedUtils.py
@@ -0,0 +1,38 @@
+import subprocess
+import time
+import os
+from typing import List
+import config
+
+class VMMSUtils:
+    @staticmethod
+    def timeout(command: List[str], time_out: float = 1) -> int:
+        """timeout - Run a unix command with a timeout. Return -1 on
+        timeout, otherwise return the return value from the command, which
+        is typically 0 for success, 1-255 for failure.
+        """
+
+        # Launch the command
+        p = subprocess.Popen(
+            command, stdout=open("/dev/null", "w"), stderr=subprocess.STDOUT
+        )
+
+        # Wait for the command to complete
+        t = 0.0
+        while t < time_out and p.poll() is None:
+            time.sleep(config.Config.TIMER_POLL_INTERVAL)
+            t += config.Config.TIMER_POLL_INTERVAL
+
+        # Determine why the while loop terminated
+        returncode: int
+        poll_result = p.poll()
+        if poll_result is None:
+            try:
+                os.kill(p.pid, 9)
+            except OSError:
+                pass
+            returncode = -1
+        else:
+            returncode = poll_result
+        return returncode
+
diff --git a/worker.py b/worker.py
index fc439d59..33b2f12d 100644
--- a/worker.py
+++ b/worker.py
@@ -284,7 +284,6 @@ def run(self):
                 self.log.debug(msg)
                 return
             # Copy input files to VM
-            self.log.debug(f"Before copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}")
             ret["copyin"] = self.vmms.copyIn(vm, self.job.input, self.job.id)
             self.log.debug(f"After copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}")
 

From 620b949369fa58bd76464be136c6c8761026f2a9 Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 13:27:25 -0500
Subject: [PATCH 36/39] added type annotations to everything such that mypy .
 doesn't give any --check-untyped-defs suggestions

---
 preallocator.py   | 22 ++++++-------
 vmms/ec2SSH.py    | 80 ++++++++++++++++++++++-------------------------
 vmms/interface.py | 10 +++---
 worker.py         | 22 +++++++------
 4 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/preallocator.py b/preallocator.py
index a5ba8e63..eb2d27c9 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -29,14 +29,14 @@ def __init__(self, vmms: Dict[str, VMMSInterface]) -> None:
         self.vmms: Dict[str, VMMSInterface] = vmms
         self.log = logging.getLogger("Preallocator")
 
-    def poolSize(self, vmName):
+    def poolSize(self, vmName: str):
         """poolSize - returns the size of the vmName pool, for external callers"""
         if vmName not in self.machines:
             return 0
         else:
             return len(self.machines.getExn(vmName)[0])
 
-    def update(self, vm, num):
+    def update(self, vm: TangoMachine, num: int):
         """update - Updates the number of machines of a certain type
         to be preallocated.
 
@@ -69,7 +69,7 @@ def update(self, vm, num):
 
         # If delta == 0 then we are the perfect number!
 
-    def allocVM(self, vmName):
+    def allocVM(self, vmName: str):
         """allocVM - Allocate a VM from the free list"""
         vm = None
         if vmName in self.machines:
@@ -86,7 +86,7 @@ def allocVM(self, vmName):
 
         return vm
 
-    def freeVM(self, vm):
+    def freeVM(self, vm: TangoMachine):
         """freeVM - Returns a VM instance to the free list"""
         # Sanity check: Return a VM to the free list only if it is
         # still a member of the pool.
@@ -105,7 +105,7 @@ def freeVM(self, vm):
             vmms = self.vmms[vm.vmms]
             vmms.safeDestroyVM(vm)
 
-    def addVM(self, vm):
+    def addVM(self, vm: TangoMachine):
         """addVM - add a particular VM instance to the pool"""
         self.lock.acquire()
         machine = self.machines.getExn(vm.name)
@@ -113,7 +113,7 @@ def addVM(self, vm):
         self.machines.set(vm.name, machine)
         self.lock.release()
 
-    def removeVM(self, vm):
+    def removeVM(self, vm: TangoMachine):
         """removeVM - remove a particular VM instance from the pool"""
         self.lock.acquire()
         machine = self.machines.getExn(vm.name)
@@ -137,7 +137,7 @@ def _getNextID(self):
         self.lock.release()
         return id
 
-    def __create(self, vm, cnt):
+    def __create(self, vm: TangoMachine, cnt: int):
         """__create - Creates count VMs and adds them to the pool
 
         This function should always be called in a thread since it
@@ -157,7 +157,7 @@ def __create(self, vm, cnt):
             self.freeVM(newVM)
             self.log.debug("__create: Added vm %s to pool %s " % (newVM.id, newVM.name))
 
-    def __destroy(self, vm):
+    def __destroy(self, vm: TangoMachine):
         """__destroy - Removes a VM from the pool
 
         If the user asks for fewer preallocated VMs, then we will
@@ -175,7 +175,7 @@ def __destroy(self, vm):
             vmms = self.vmms[vm.vmms]
             vmms.safeDestroyVM(dieVM)
 
-    def createVM(self, vm):
+    def createVM(self, vm: TangoMachine):
         """createVM - Called in non-thread context to create a single
         VM and add it to the pool
         """
@@ -192,7 +192,7 @@ def createVM(self, vm):
         self.freeVM(newVM)
         self.log.debug("createVM: Added vm %s to pool %s" % (newVM.id, newVM.name))
 
-    def destroyVM(self, vmName, id):
+    def destroyVM(self, vmName: str, id: int):
         """destroyVM - Called by the delVM API function to remove and
         destroy a particular VM instance from a pool. We only allow
         this function when the system is queiscent (pool size == free
@@ -232,7 +232,7 @@ def getAllPools(self):
         return result
 
     # TODO: replace with a named tuple
-    def getPool(self, vmName):
+    def getPool(self, vmName: str):
         """getPool - returns the members of a pool and its free list"""
         result: Dict[str, List[TangoMachine]] = {}
         if vmName not in self.machines:
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 6681a488..2c928650 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -2,11 +2,7 @@
 # ec2SSH.py - Implements the Tango VMMS interface to run Tango jobs on Amazon EC2.
 #
 # This implementation uses the AWS EC2 SDK to manage the virtual machines and
-# ssh and scp to access them. The following excecption are raised back
-# to the caller:
-#
-#   Ec2Exception - EC2 raises this if it encounters any problem
-#   ec2CallError - raised by ec2Call() function
+# ssh and scp to access them.
 #
 
 import logging
@@ -21,11 +17,13 @@
 from botocore.exceptions import ClientError
 
 import config
-from tangoObjects import TangoMachine
-from typing import Optional, Literal, List, Dict, Sequence
+from tangoObjects import TangoMachine, InputFile
+from typing import Optional, Literal, List, Dict, Sequence, Set, get_args
+from typing_extensions import TypeGuard
 from mypy_boto3_ec2 import EC2ServiceResource
+from mypy_boto3_ec2.literals import InstanceTypeType
 from mypy_boto3_ec2.service_resource import Instance, Image
-from mypy_boto3_ec2.type_defs import FilterTypeDef
+from mypy_boto3_ec2.type_defs import FilterTypeDef, TagTypeDef
 
 from vmms.interface import VMMSInterface
 from vmms.sharedUtils import VMMSUtils
@@ -35,9 +33,11 @@
 logging.getLogger("botocore").setLevel(logging.CRITICAL)
 logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL)
 
+valid_instance_types: Set[InstanceTypeType] = set(get_args(InstanceTypeType))
+def check_instance_type(instance_type: str) -> TypeGuard[InstanceTypeType]:
+    return instance_type in valid_instance_types
 
-
-def timeout_with_retries(command, time_out=1, retries=3, retry_delay=2) -> int:
+def timeout_with_retries(command: List[str], time_out: float = 1, retries: int = 3, retry_delay: float = 2) -> int:
     """timeout - Run a unix command with a timeout. Return -1 on
     timeout, otherwise return the return value from the command, which
     is typically 0 for success, 1-255 for failure.
@@ -82,16 +82,6 @@ def try_load_instance(newInstance):
     newInstance.load()
 
 
-#
-# User defined exceptions
-#
-# ec2Call() exception
-
-
-class ec2CallError(Exception):
-    pass
-
-
 class Ec2SSH(VMMSInterface, VMMSUtils):
     _SSH_FLAGS = [
         "-i",
@@ -106,12 +96,12 @@ class Ec2SSH(VMMSInterface, VMMSUtils):
     _vm_semaphore = threading.Semaphore(config.Config.MAX_EC2_VMS)
 
     @staticmethod
-    def acquire_vm_semaphore():
+    def acquire_vm_semaphore() -> None:
         """Blocks until a VM is available to limit load"""
         Ec2SSH._vm_semaphore.acquire()  # This blocks until a slot is available
 
     @staticmethod
-    def release_vm_semaphore():
+    def release_vm_semaphore() -> None:
         """Releases the VM sempahore"""
         Ec2SSH._vm_semaphore.release()
 
@@ -188,18 +178,18 @@ def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] =
                 % str(ignoredAMIs)
             )
 
-    def instanceName(self, id, name):
+    def instanceName(self, id: int, name: str) -> str:
         """instanceName - Constructs a VM instance name. Always use
         this function when you need a VM instance name. Never generate
         instance names manually.
         """
         return "%s-%d-%s" % (config.Config.PREFIX, id, name)
 
-    def keyPairName(self, id, name):
+    def keyPairName(self, id: int, name: str) -> str:
         """keyPairName - Constructs a unique key pair name."""
         return "%s-%d-%s" % (config.Config.PREFIX, id, name)
 
-    def domainName(self, vm):
+    def domainName(self, vm: TangoMachine) -> str:
         """Returns the domain name that is stored in the vm
         instance.
         """
@@ -209,7 +199,8 @@ def domainName(self, vm):
     # VMMS helper methods
     #
 
-    def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict:
+    # TODO: return a dataclass with the instance_type member of type InstanceTypeType of type str
+    def tangoMachineToEC2Instance(self, vm: TangoMachine) -> Dict[str, str]:
         """tangoMachineToEC2Instance - returns an object with EC2 instance
         type and AMI. Only general-purpose instances are used. Defalt AMI
         is currently used.
@@ -233,7 +224,7 @@ def tangoMachineToEC2Instance(self, vm: TangoMachine) -> dict:
         self.log.info("tangoMachineToEC2Instance: %s" % str(ec2instance))
         return ec2instance
 
-    def createKeyPair(self):
+    def createKeyPair(self) -> None:
         # TODO: SUPPORT
         raise
         # # try to delete the key to avoid collision
@@ -247,7 +238,7 @@ def createKeyPair(self):
         # # change the SSH_FLAG accordingly
         # self.ssh_flags[1] = self.key_pair_path
 
-    def deleteKeyPair(self):
+    def deleteKeyPair(self) -> None:
         # TODO: SUPPORT
         raise
         # self.boto3client.delete_key_pair(self.key_pair_name)
@@ -259,7 +250,7 @@ def deleteKeyPair(self):
 
     # Creates a security group if it doesn't exist.
     # ^ Note: strangely, the security group id is never used.
-    def createSecurityGroup(self):
+    def createSecurityGroup(self) -> None:
         try:
             # Check if the security group already exists
             description_response = self.boto3client.describe_security_groups(
@@ -304,6 +295,9 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
         try:
             instanceName = self.instanceName(vm.id, vm.name)
             ec2instance = self.tangoMachineToEC2Instance(vm)
+            instance_type = ec2instance["instance_type"]
+            if not check_instance_type(instance_type):
+                raise ValueError(f"Invalid instance type: {instance_type}")
             self.log.debug("instanceName: %s" % instanceName)
             # ensure that security group exists
             self.createSecurityGroup()
@@ -313,7 +307,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
                 ImageId=ec2instance["ami"],
                 KeyName=self.key_pair_name,
                 SecurityGroups=[config.Config.DEFAULT_SECURITY_GROUP],
-                InstanceType=ec2instance["instance_type"],
+                InstanceType=instance_type,
                 MaxCount=1,
                 MinCount=1,
                 InstanceMarketOptions=
@@ -416,7 +410,7 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
                     return -1
             return -1
 
-    def waitVM(self, vm, max_secs) -> Literal[0, -1]:
+    def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
         """waitVM - Wait at most max_secs for a VM to become
         ready. Return error if it takes too long.
 
@@ -482,7 +476,7 @@ def waitVM(self, vm, max_secs) -> Literal[0, -1]:
             # Sleep a bit before trying again
             time.sleep(config.Config.TIMER_POLL_INTERVAL)
 
-    def copyIn(self, vm, inputFiles, job_id=None):
+    def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int:
         """copyIn - Copy input files to VM
         Args:
         - vm is a TangoMachine object
@@ -546,7 +540,7 @@ def copyIn(self, vm, inputFiles, job_id=None):
 
         return 0
 
-    def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
+    def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int:
         """runJob - Run the make command on a VM using SSH and
         redirect output to file "output".
         """
@@ -578,7 +572,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
         # runTimeout * 2 is a temporary hack. The driver will handle the timout
         return ret
 
-    def copyOut(self, vm, destFile):
+    def copyOut(self, vm: TangoMachine, destFile: str) -> int:
         """copyOut - Copy the file output on the VM to the file
         outputFile on the Tango host.
         """
@@ -638,7 +632,7 @@ def copyOut(self, vm, destFile):
             config.Config.COPYOUT_TIMEOUT,
         )
 
-    def destroyVM(self, vm):
+    def destroyVM(self, vm: TangoMachine) -> None:
         """destroyVM - Removes a VM from the system"""
         self.log.info(
             "destroyVM: %s %s %s %s"
@@ -685,15 +679,15 @@ def destroyVM(self, vm):
 
         Ec2SSH.release_vm_semaphore()
 
-    def safeDestroyVM(self, vm):
+    def safeDestroyVM(self, vm: TangoMachine) -> None:
         return self.destroyVM(vm)
 
-    def getVMs(self):
+    def getVMs(self) -> List[TangoMachine]:
         """getVMs - Returns the complete list of VMs on this account. Each
         list entry is a boto.ec2.instance.Instance object.
         """
         try:
-            vms = list()
+            vms: List[TangoMachine] = []
             filters: Sequence[FilterTypeDef] = [
                 {
                     "Name": "instance-state-name",
@@ -742,7 +736,7 @@ def getVMs(self):
 
         return vms
 
-    def existsVM(self, vm):
+    def existsVM(self, vm: TangoMachine) -> bool:
         """existsVM - Checks whether a VM exists in the vmms."""
         # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrationec2.html
         filters: Sequence[FilterTypeDef] = [{"Name": "instance-state-name", "Values": ["running"]}]
@@ -757,19 +751,19 @@ def existsVM(self, vm):
         # for instance in instances.filter(InstanceIds)
         return False
 
-    def getImages(self):
+    def getImages(self) -> List[str]:
         """getImages - return a constant; actually use the ami specified in config"""
         return [key for key in self.img2ami]
 
-    # getTag: to do later
-    def getTag(self, tagList, tagKey):
+    # TODO: later
+    def getTag(self, tagList: List[TagTypeDef], tagKey: str) -> Optional[str]:
         if tagList:
             for tag in tagList:
                 if tag["Key"] == tagKey:
                     return tag["Value"]
         return None
 
-    def getPartialOutput(self, vm):
+    def getPartialOutput(self, vm: TangoMachine) -> str:
         domain_name = self.domainName(vm)
 
         runcmd = "head -c %s /home/autograde/output.log" % (
diff --git a/vmms/interface.py b/vmms/interface.py
index 6235e227..a437401a 100644
--- a/vmms/interface.py
+++ b/vmms/interface.py
@@ -1,5 +1,5 @@
 from typing import Protocol, Optional, Literal, List
-from tangoObjects import TangoMachine
+from tangoObjects import TangoMachine, InputFile
 from abc import abstractmethod
 
 
@@ -13,7 +13,7 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
         ...
 
     @abstractmethod
-    def copyIn(self, vm: TangoMachine, inputFiles: List[str], job_id: Optional[int] = None) -> Literal[0, -1]:
+    def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int:
         ...
 
     @abstractmethod
@@ -21,15 +21,15 @@ def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disa
         ...
 
     @abstractmethod
-    def copyOut(self, vm: TangoMachine, destFile: str) -> Literal[0, -1]:
+    def copyOut(self, vm: TangoMachine, destFile: str) -> int:
         ...
 
     @abstractmethod
-    def destroyVM(self, vm: TangoMachine) -> Literal[0, -1]:
+    def destroyVM(self, vm: TangoMachine) -> None:
         ...
     
     @abstractmethod
-    def safeDestroyVM(self, vm: TangoMachine) -> Literal[0, -1]:
+    def safeDestroyVM(self, vm: TangoMachine) -> None:
         ...
 
     @abstractmethod
diff --git a/worker.py b/worker.py
index 36f81b97..8d92123c 100644
--- a/worker.py
+++ b/worker.py
@@ -15,9 +15,10 @@
 from datetime import datetime
 from config import Config
 from jobQueue import JobQueue
-from tangoObjects import TangoMachine
+from tangoObjects import TangoMachine, TangoJob
 from typing import Dict, Optional
 from vmms.interface import VMMSInterface
+from preallocator import Preallocator
 #
 # Worker - The worker class is very simple and very dumb. The goal is
 # to walk through the VMMS interface, track the job's progress, and if
@@ -37,7 +38,7 @@ class DetachMethod(Enum):
 
 # We always preallocate a VM for the worker to use
 class Worker(threading.Thread):
-    def __init__(self, job, vmms, jobQueue, preallocator, preVM: TangoMachine):
+    def __init__(self, job: TangoJob, vmms: VMMSInterface, jobQueue: JobQueue, preallocator: Preallocator, preVM: TangoMachine):
         threading.Thread.__init__(self)
         self.daemon = True
         self.job = job
@@ -85,8 +86,7 @@ def detachVM(self, detachMethod: DetachMethod):
             else:
                 raise ValueError(f"Invalid detach method: {detachMethod}")
 
-    # TODO: figure out what hdrfile, ret and err are
-    def rescheduleJob(self, hdrfile, ret, err):
+    def rescheduleJob(self, hdrfile: str, ret: Dict[str, int], err: str) -> None:
         """rescheduleJob - Reschedule a job that has failed because
         of a system error, such as a VM timing out or a connection
         failure.
@@ -121,13 +121,13 @@ def rescheduleJob(self, hdrfile, ret, err):
             )
             self.afterJobExecution(hdrfile, full_err, DetachMethod.DESTROY_AND_REPLACE)
 
-    def appendMsg(self, filename, msg):
+    def appendMsg(self, filename: str, msg: str) -> None:
         """appendMsg - Append a timestamped Tango message to a file"""
         f = open(filename, "a")
         f.write("Autograder [%s]: %s\n" % (datetime.now().ctime(), msg))
         f.close()
 
-    def catFiles(self, f1, f2):
+    def catFiles(self, f1: str, f2: str) -> None:
         """catFiles - cat f1 f2 > f2, where f1 is the Tango header
         and f2 is the output from the Autodriver
         """
@@ -146,7 +146,7 @@ def catFiles(self, f1, f2):
         os.rename(tmpname, f2)
         os.remove(f1)
 
-    def notifyServer(self, job):
+    def notifyServer(self, job: TangoJob) -> None:
         try:
             if job.notifyURL:
                 outputFileName = job.outputFile.split("/")[-1]  # get filename from path
@@ -172,7 +172,7 @@ def notifyServer(self, job):
         except Exception as e:
             self.log.debug("Error in notifyServer: %s" % str(e))
 
-    def afterJobExecution(self, hdrfile, msg, detachMethod: DetachMethod): 
+    def afterJobExecution(self, hdrfile: str, msg: str, detachMethod: DetachMethod) -> None: 
         self.jobQueue.makeDead(self.job, msg)
         
         # Update the text that users see in the autodriver output file
@@ -188,7 +188,7 @@ def afterJobExecution(self, hdrfile, msg, detachMethod: DetachMethod):
     #
     # Main worker function
     #
-    def run(self):
+    def run(self) -> None:
         """run - Step a job through its execution sequence"""
         try:
             # Hash of return codes for each step
@@ -409,7 +409,9 @@ def run(self):
             # and detachVM can be run
             # if vm is not set but self.preVM is set, we still need
             # to return the VM, but have to initialize self.job.vm first
+            # TODO: move self.job.makeVM to the start of the try block, so it should be an error if vm fails to be set
             if self.preVM and not vm:
-                vm = self.job.vm = self.preVM 
+                self.job.makeVM(self.preVM)
+                vm = self.preVM 
             if vm:
                 self.detachVM(DetachMethod.DESTROY_AND_REPLACE)

From 8eed4dbffdf669404c9be849e8bea5a413f99b0e Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 14:34:45 -0500
Subject: [PATCH 37/39] local docker type annotations

---
 vmms/ec2SSH.py      |  9 ++-------
 vmms/localDocker.py | 49 ++++++++++++++++++++-------------------------
 vmms/sharedUtils.py |  8 ++++++++
 3 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 2c928650..4b2e4090 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -178,13 +178,6 @@ def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] =
                 % str(ignoredAMIs)
             )
 
-    def instanceName(self, id: int, name: str) -> str:
-        """instanceName - Constructs a VM instance name. Always use
-        this function when you need a VM instance name. Never generate
-        instance names manually.
-        """
-        return "%s-%d-%s" % (config.Config.PREFIX, id, name)
-
     def keyPairName(self, id: int, name: str) -> str:
         """keyPairName - Constructs a unique key pair name."""
         return "%s-%d-%s" % (config.Config.PREFIX, id, name)
@@ -198,6 +191,8 @@ def domainName(self, vm: TangoMachine) -> str:
     #
     # VMMS helper methods
     #
+    def instanceName(self, id: int, name: str) -> str:
+        return VMMSUtils.constructInstanceName(id, name)
 
     # TODO: return a dataclass with the instance_type member of type InstanceTypeType of type str
     def tangoMachineToEC2Instance(self, vm: TangoMachine) -> Dict[str, str]:
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index b002a06e..fa727bb6 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -12,8 +12,8 @@
 import sys
 import shutil
 import config
-from tangoObjects import TangoMachine
-from typing import List
+from tangoObjects import TangoMachine, InputFile
+from typing import List, Literal, Optional
 from vmms.interface import VMMSInterface
 from vmms.sharedUtils import VMMSUtils
 
@@ -40,25 +40,21 @@ def __init__(self):
             self.log.error(str(e))
             exit(1)
 
-    def instanceName(self, id, name):
-        """instanceName - Constructs a VM instance name. Always use
-        this function when you need a VM instance name. Never generate
-        instance names manually.
-        """
-        return "%s-%s-%s" % (config.Config.PREFIX, id, name)
+    def instanceName(self, id: int, name: str) -> str:
+        return VMMSUtils.constructInstanceName(id, name)
 
-    def getVolumePath(self, instanceName):
+    def getVolumePath(self, instanceName: str) -> str:
         volumePath = config.Config.DOCKER_VOLUME_PATH
         # Last empty string to cause trailing '/'
         volumePath = os.path.join(volumePath, instanceName, "")
         return volumePath
 
-    def getDockerVolumePath(self, dockerPath, instanceName):
+    def getDockerVolumePath(self, dockerPath: str, instanceName: str) -> str:
         # Last empty string to cause trailing '/'
         volumePath = os.path.join(dockerPath, instanceName, "")
         return volumePath
 
-    def domainName(self, vm):
+    def domainName(self, vm: TangoMachine) -> str:
         """Returns the domain name that is stored in the vm
         instance.
         """
@@ -67,15 +63,15 @@ def domainName(self, vm):
     #
     # VMMS API functions
     #
-    def initializeVM(self, vm):
+    def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
         """initializeVM -  Nothing to do for initializeVM"""
         return 0
 
-    def waitVM(self, vm, max_secs):
+    def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
         """waitVM - Nothing to do for waitVM"""
-        return
+        return 0
 
-    def copyIn(self, vm, inputFiles, job_id=None):
+    def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int:
         """copyIn - Create a directory to be mounted as a volume
         for the docker containers. Copy input files to this directory.
         """
@@ -94,7 +90,7 @@ def copyIn(self, vm, inputFiles, job_id=None):
             )
         return 0
 
-    def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
+    def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int:
         """runJob - Run a docker container by doing the follows:
         - mount directory corresponding to this job to /home/autolab
           in the container
@@ -103,10 +99,9 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
         """
         instanceName = self.instanceName(vm.id, vm.image)
         volumePath = self.getVolumePath(instanceName)
-        if os.getenv("DOCKER_TANGO_HOST_VOLUME_PATH"):
-            volumePath = self.getDockerVolumePath(
-                os.getenv("DOCKER_TANGO_HOST_VOLUME_PATH"), instanceName
-            )
+        host_volume_path = os.getenv("DOCKER_TANGO_HOST_VOLUME_PATH")
+        if host_volume_path:
+            volumePath = self.getDockerVolumePath(host_volume_path, instanceName)
         args = ["docker", "run", "--name", instanceName, "-v"]
         args = args + ["%s:%s" % (volumePath, "/home/mount")]
         if vm.cores:
@@ -140,7 +135,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize, disableNetwork):
 
         return ret
 
-    def copyOut(self, vm, destFile):
+    def copyOut(self, vm: TangoMachine, destFile: str) -> int:
         """copyOut - Copy the autograder feedback from container to
         destFile on the Tango host. Then, destroy that container.
         Containers are never reused.
@@ -153,7 +148,7 @@ def copyOut(self, vm, destFile):
 
         return 0
 
-    def destroyVM(self, vm):
+    def destroyVM(self, vm: TangoMachine) -> None:
         """destroyVM - Delete the docker container."""
         instanceName = self.instanceName(vm.id, vm.image)
         volumePath = self.getVolumePath("")
@@ -166,7 +161,7 @@ def destroyVM(self, vm):
             self.log.debug("Deleted volume %s" % instanceName)
         return
 
-    def safeDestroyVM(self, vm):
+    def safeDestroyVM(self, vm: TangoMachine) -> None:
         """safeDestroyVM - Delete the docker container and make
         sure it is removed.
         """
@@ -178,7 +173,7 @@ def safeDestroyVM(self, vm):
             self.destroyVM(vm)
         return
 
-    def getVMs(self):
+    def getVMs(self) -> List[TangoMachine]:
         """getVMs - Executes and parses `docker ps`. This function
         is a lot of parsing and can break easily.
         """
@@ -196,7 +191,7 @@ def getVMs(self):
                 machines.append(machine)
         return machines
 
-    def existsVM(self, vm):
+    def existsVM(self, vm: TangoMachine) -> bool:
         """existsVM - Executes `docker inspect CONTAINER`, which returns
         a non-zero status upon not finding a container.
         """
@@ -204,7 +199,7 @@ def existsVM(self, vm):
         ret = VMMSUtils.timeout(["docker", "inspect", instanceName])
         return ret == 0
 
-    def getImages(self):
+    def getImages(self) -> List[str]:
         """getImages - Executes `docker images` and returns a list of
         images that can be used to boot a docker container with. This
         function is a lot of parsing and so can break easily.
@@ -221,7 +216,7 @@ def getImages(self):
             result.add(re.sub(r".*/([^/]*)", r"\1", row_l[0]))
         return list(result)
 
-    def getPartialOutput(self, vm):
+    def getPartialOutput(self, vm: TangoMachine) -> str:
         """getPartialOutput - Get the partial output of a job.
         It does not check if the docker container exists before executing
         as the command will not fail even if the container does not exist.
diff --git a/vmms/sharedUtils.py b/vmms/sharedUtils.py
index 55e16195..0165b313 100644
--- a/vmms/sharedUtils.py
+++ b/vmms/sharedUtils.py
@@ -5,6 +5,14 @@
 import config
 
 class VMMSUtils:
+    @staticmethod
+    def constructInstanceName(id: int, name: str) -> str:
+        """instanceName - Constructs a VM instance name. Always use
+        this function when you need a VM instance name. Never generate
+        instance names manually.
+        """
+        return "%s-%d-%s" % (config.Config.PREFIX, id, name)
+    
     @staticmethod
     def timeout(command: List[str], time_out: float = 1) -> int:
         """timeout - Run a unix command with a timeout. Return -1 on

From 9b1d8980b43f8471decd3035a0542dc025b2af5f Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Tue, 2 Dec 2025 16:12:41 -0500
Subject: [PATCH 38/39] linting fix

---
 clients/tango-cli.py                  | 305 ++++++++++++++++----------
 jobManager.py                         |  22 +-
 jobQueue.py                           |  23 +-
 preallocator.py                       |   5 +-
 restful_tango/server.py               |   8 +-
 restful_tango/tangoREST.py            |   6 +-
 tangoObjects.py                       |  97 ++++----
 tests/sample_test/expected_output.txt |   2 +-
 tests/stressTest.py                   | 118 +++++++---
 vmms/ec2SSH.py                        | 115 +++++-----
 vmms/interface.py                     |  19 +-
 vmms/localDocker.py                   |  20 +-
 vmms/sharedUtils.py                   |   4 +-
 worker.py                             |  95 +++++---
 14 files changed, 518 insertions(+), 321 deletions(-)

diff --git a/clients/tango-cli.py b/clients/tango-cli.py
index 78c5155f..d51d2da3 100755
--- a/clients/tango-cli.py
+++ b/clients/tango-cli.py
@@ -28,6 +28,7 @@ def get_arg(name, default=None):
 @dataclass
 class RequestObj:
     """Dataclass for job request objects"""
+
     image: str
     files: str
     timeout: int
@@ -46,6 +47,7 @@ class RequestObj:
 @dataclass
 class VmObj:
     """Dataclass for VM allocation objects"""
+
     vmms: str
     cores: int
     memory: int
@@ -161,45 +163,48 @@ class VmObj:
 parser.add_argument("--accessKeyId", default="", help="AWS account access key ID")
 parser.add_argument("--accessKey", default="", help="AWS account access key content")
 parser.add_argument("--instanceType", default="", help="AWS EC2 instance type")
-parser.add_argument("--stopBefore", default="", help="Stops the worker before a function is executed")
+parser.add_argument(
+    "--stopBefore", default="", help="Stops the worker before a function is executed"
+)
+
 
 def checkKey():
-    if get_arg('key') is None:
+    if get_arg("key") is None:
         print("Key must be specified with -k")
         return -1
     return 0
 
 
 def checkCourselab():
-    if get_arg('courselab') is None:
+    if get_arg("courselab") is None:
         print("Courselab must be specified with -l")
         return -1
     return 0
 
 
 def checkFilename():
-    if get_arg('filename') is None:
+    if get_arg("filename") is None:
         print("Filename must be specified with --filename")
         return -1
     return 0
 
 
 def checkInfiles():
-    if get_arg('infiles') is None:
+    if get_arg("infiles") is None:
         print("Input files must be specified with --infiles")
         return -1
     return 0
 
 
 def checkDeadjobs():
-    if get_arg('deadJobs') is None:
+    if get_arg("deadJobs") is None:
         print("Deadjobs must be specified with --deadJobs")
         return -1
     return 0
 
 
 def checkImageName():
-    if get_arg('imageName') is None:
+    if get_arg("imageName") is None:
         print("Image name must be specified with --imageName")
         return -1
     return 0
@@ -218,18 +223,24 @@ def tango_open():
 
         response = requests.get(
             "%s://%s:%d/open/%s/%s/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'))
+            % (
+                _tango_protocol,
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+            )
         )
         print(
             "Sent request to %s:%d/open/%s/%s/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("courselab"))
         )
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/open/%s/%s/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("courselab"))
         )
         print(str(err))
         sys.exit(0)
@@ -244,28 +255,46 @@ def tango_upload():
         if res != 0:
             raise Exception("Invalid usage: [upload] " + upload_help)
 
-        dirs = get_arg('filename').split("/")
+        dirs = get_arg("filename").split("/")
         filename = dirs[len(dirs) - 1]
         header = {"Filename": filename}
 
-        f = open(get_arg('filename'), 'rb')
+        f = open(get_arg("filename"), "rb")
         response = requests.post(
             "%s://%s:%d/upload/%s/%s/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')),
+            % (
+                _tango_protocol,
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+            ),
             data=f.read(),
             headers=header,
         )
         f.close()
         print(
             "Sent request to %s:%d/upload/%s/%s/ filename=%s"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), get_arg('filename'))
+            % (
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                get_arg("filename"),
+            )
         )
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/upload/%s/%s/ filename=%s"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), get_arg('filename'))
+            % (
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                get_arg("filename"),
+            )
         )
         print(str(err))
         sys.exit(0)
@@ -281,36 +310,54 @@ def tango_addJob():
             raise Exception("Invalid usage: [addJob] " + addJob_help)
 
         requestObj = RequestObj(
-            image=get_arg('image'),
-            files=get_arg('infiles'),
-            timeout=get_arg('timeout'),
-            max_kb=get_arg('maxsize'),
-            output_file=get_arg('outputFile'),
-            jobName=get_arg('jobname'),
-            accessKeyId=get_arg('accessKeyId'),
-            accessKey=get_arg('accessKey'),
-            disable_network=get_arg('disableNetwork'),
-            instanceType=get_arg('instanceType'),
-            stopBefore=get_arg('stopBefore'),
-            notifyURL=get_arg('notifyURL'),
-            callback_url=get_arg('callbackURL'),
+            image=get_arg("image"),
+            files=get_arg("infiles"),
+            timeout=get_arg("timeout"),
+            max_kb=get_arg("maxsize"),
+            output_file=get_arg("outputFile"),
+            jobName=get_arg("jobname"),
+            accessKeyId=get_arg("accessKeyId"),
+            accessKey=get_arg("accessKey"),
+            disable_network=get_arg("disableNetwork"),
+            instanceType=get_arg("instanceType"),
+            stopBefore=get_arg("stopBefore"),
+            notifyURL=get_arg("notifyURL"),
+            callback_url=get_arg("callbackURL"),
         )
 
         response = requests.post(
             "%s://%s:%d/addJob/%s/%s/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab')),
+            % (
+                _tango_protocol,
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+            ),
             data=json.dumps(asdict(requestObj)),
         )
         print(
             "Sent request to %s:%d/addJob/%s/%s/ \t jobObj=%s"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), json.dumps(asdict(requestObj)))
+            % (
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                json.dumps(asdict(requestObj)),
+            )
         )
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/addJob/%s/%s/ \t jobObj=%s"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('courselab'), json.dumps(asdict(requestObj)) if 'requestObj' in locals() else 'N/A')
+            % (
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                json.dumps(asdict(requestObj)) if "requestObj" in locals() else "N/A",
+            )
         )
         print(str(err))
         sys.exit(0)
@@ -325,19 +372,19 @@ def tango_getPartialOutput():
             "%s://%s:%d/getPartialOutput/%s/%s/"
             % (
                 _tango_protocol,
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('jobid'),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("jobid"),
             )
         )
         print(
             "Sent request to %s:%d/getPartialOutput/%s/%s/"
             % (
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('jobid'),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("jobid"),
             )
         )
         print(response.text)
@@ -345,10 +392,10 @@ def tango_getPartialOutput():
         print(
             "Failed to send request to %s:%d/getPartialOutput/%s/%s/"
             % (
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('jobid'),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("jobid"),
             )
         )
         print(str(err))
@@ -368,21 +415,21 @@ def tango_poll():
             "%s://%s:%d/poll/%s/%s/%s/"
             % (
                 _tango_protocol,
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('courselab'),
-                urllib.parse.quote(get_arg('outputFile')),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                urllib.parse.quote(get_arg("outputFile")),
             )
         )
         print(
             "Sent request to %s:%d/poll/%s/%s/%s/"
             % (
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('courselab'),
-                urllib.parse.quote(get_arg('outputFile')),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                urllib.parse.quote(get_arg("outputFile")),
             )
         )
         print(response.text)
@@ -391,11 +438,11 @@ def tango_poll():
         print(
             "Failed to send request to %s:%d/poll/%s/%s/%s/"
             % (
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('courselab'),
-                urllib.parse.quote(get_arg('outputFile')),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("courselab"),
+                urllib.parse.quote(get_arg("outputFile")),
             )
         )
         print(str(err))
@@ -412,15 +459,19 @@ def tango_info():
             raise Exception("Invalid usage: [info] " + info_help)
 
         response = requests.get(
-            "%s://%s:%d/info/%s/" % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'))
+            "%s://%s:%d/info/%s/"
+            % (_tango_protocol, get_arg("server"), get_arg("port"), get_arg("key"))
+        )
+        print(
+            "Sent request to %s:%d/info/%s/"
+            % (get_arg("server"), get_arg("port"), get_arg("key"))
         )
-        print("Sent request to %s:%d/info/%s/" % (get_arg('server'), get_arg('port'), get_arg('key')))
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/info/%s/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"))
         )
         print(str(err))
         sys.exit(0)
@@ -437,18 +488,24 @@ def tango_jobs():
 
         response = requests.get(
             "%s://%s:%d/jobs/%s/%d/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('deadJobs'))
+            % (
+                _tango_protocol,
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("deadJobs"),
+            )
         )
         print(
             "Sent request to %s:%d/jobs/%s/%d/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('deadJobs'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("deadJobs"))
         )
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/jobs/%s/%d/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('deadJobs'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("deadJobs"))
         )
         print(str(err))
         sys.exit(0)
@@ -465,18 +522,24 @@ def tango_pool():
 
         response = requests.get(
             "%s://%s:%d/pool/%s/%s/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image'))
+            % (
+                _tango_protocol,
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("image"),
+            )
         )
         print(
             "Sent request to %s:%d/pool/%s/%s/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("image"))
         )
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/pool/%s/%s/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"), get_arg("image"))
         )
         print(str(err))
         sys.exit(0)
@@ -492,23 +555,30 @@ def tango_prealloc():
         if res != 0:
             raise Exception("Invalid usage: [prealloc] " + prealloc_help)
 
-        vmObj["vmms"] = get_arg('vmms')
-        vmObj["cores"] = get_arg('cores')
-        vmObj["memory"] = get_arg('memory')
+        vmObj["vmms"] = get_arg("vmms")
+        vmObj["cores"] = get_arg("cores")
+        vmObj["memory"] = get_arg("memory")
 
         response = requests.post(
             "%s://%s:%d/prealloc/%s/%s/%s/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key'), get_arg('image'), get_arg('num')),
+            % (
+                _tango_protocol,
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("image"),
+                get_arg("num"),
+            ),
             data=json.dumps(vmObj),
         )
         print(
             "Sent request to %s:%d/prealloc/%s/%s/%s/ \t vmObj=%s"
             % (
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('image'),
-                get_arg('num'),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("image"),
+                get_arg("num"),
                 json.dumps(vmObj),
             )
         )
@@ -518,11 +588,11 @@ def tango_prealloc():
         print(
             "Failed to send request to %s:%d/prealloc/%s/%s/%s/ \t vmObj=%s"
             % (
-                get_arg('server'),
-                get_arg('port'),
-                get_arg('key'),
-                get_arg('image'),
-                get_arg('num'),
+                get_arg("server"),
+                get_arg("port"),
+                get_arg("key"),
+                get_arg("image"),
+                get_arg("num"),
                 json.dumps(vmObj),
             )
         )
@@ -548,21 +618,24 @@ def tango_build():
         if res != 0:
             raise Exception("Invalid usage: [build] " + build_help)
 
-        f = open(get_arg('filename'), "rb")
-        header = {"imageName": get_arg('imageName')}
+        f = open(get_arg("filename"), "rb")
+        header = {"imageName": get_arg("imageName")}
         response = requests.post(
             "%s://%s:%d/build/%s/"
-            % (_tango_protocol, get_arg('server'), get_arg('port'), get_arg('key')),
+            % (_tango_protocol, get_arg("server"), get_arg("port"), get_arg("key")),
             data=f.read(),
             headers=header,
         )
-        print("Sent request to %s:%d/build/%s/" % (get_arg('server'), get_arg('port'), get_arg('key')))
+        print(
+            "Sent request to %s:%d/build/%s/"
+            % (get_arg("server"), get_arg("port"), get_arg("key"))
+        )
         print(response.text)
 
     except Exception as err:
         print(
             "Failed to send request to %s:%d/build/%s/"
-            % (get_arg('server'), get_arg('port'), get_arg('key'))
+            % (get_arg("server"), get_arg("port"), get_arg("key"))
         )
         print(str(err))
         sys.exit(0)
@@ -572,11 +645,11 @@ def tango_build():
 
 
 def tango_runJob():
-    if get_arg('runJob') is None:
+    if get_arg("runJob") is None:
         print("Invalid usage: [runJob]")
         sys.exit(0)
 
-    dir = get_arg('runJob')
+    dir = get_arg("runJob")
     infiles = [
         file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))
     ]
@@ -585,7 +658,7 @@ def tango_runJob():
 
     args.jobname += "-0"
     args.outputFile += "-0"
-    for i in range(1, get_arg('numJobs') + 1):
+    for i in range(1, get_arg("numJobs") + 1):
         print(
             "----------------------------------------- STARTING JOB "
             + str(i)
@@ -608,27 +681,27 @@ def tango_runJob():
 
 
 def router():
-    if get_arg('open'):
+    if get_arg("open"):
         tango_open()
-    elif get_arg('upload'):
+    elif get_arg("upload"):
         tango_upload()
-    elif get_arg('addJob'):
+    elif get_arg("addJob"):
         tango_addJob()
-    elif get_arg('poll'):
+    elif get_arg("poll"):
         tango_poll()
-    elif get_arg('info'):
+    elif get_arg("info"):
         tango_info()
-    elif get_arg('jobs'):
+    elif get_arg("jobs"):
         tango_jobs()
-    elif get_arg('pool'):
+    elif get_arg("pool"):
         tango_pool()
-    elif get_arg('prealloc'):
+    elif get_arg("prealloc"):
         tango_prealloc()
-    elif get_arg('runJob'):
+    elif get_arg("runJob"):
         tango_runJob()
-    elif get_arg('getPartialOutput'):
+    elif get_arg("getPartialOutput"):
         tango_getPartialOutput()
-    elif get_arg('build'):
+    elif get_arg("build"):
         tango_build()
 
 
@@ -637,32 +710,34 @@ def router():
 #
 args = parser.parse_args()
 if (
-    not get_arg('open')
-    and not get_arg('upload')
-    and not get_arg('addJob')
-    and not get_arg('poll')
-    and not get_arg('info')
-    and not get_arg('jobs')
-    and not get_arg('pool')
-    and not get_arg('prealloc')
-    and not get_arg('runJob')
-    and not get_arg('getPartialOutput')
-    and not get_arg('build')
+    not get_arg("open")
+    and not get_arg("upload")
+    and not get_arg("addJob")
+    and not get_arg("poll")
+    and not get_arg("info")
+    and not get_arg("jobs")
+    and not get_arg("pool")
+    and not get_arg("prealloc")
+    and not get_arg("runJob")
+    and not get_arg("getPartialOutput")
+    and not get_arg("build")
 ):
     parser.print_help()
     sys.exit(0)
 
-if get_arg('ssl'):
+if get_arg("ssl"):
     _tango_protocol = "https"
-    if get_arg('port') == 3000:
+    if get_arg("port") == 3000:
         args.port = 443
 
 
 try:
-    response = requests.get("%s://%s:%d/" % (_tango_protocol, get_arg('server'), get_arg('port')))
+    response = requests.get(
+        "%s://%s:%d/" % (_tango_protocol, get_arg("server"), get_arg("port"))
+    )
     response.raise_for_status()
 except BaseException:
-    print("Tango not reachable on %s:%d!\n" % (get_arg('server'), get_arg('port')))
+    print("Tango not reachable on %s:%d!\n" % (get_arg("server"), get_arg("port")))
     sys.exit(0)
 
 router()
diff --git a/jobManager.py b/jobManager.py
index 50cd9553..a9f85f46 100644
--- a/jobManager.py
+++ b/jobManager.py
@@ -26,7 +26,6 @@
 from vmms.interface import VMMSInterface
 
 
-
 class JobManager(object):
     def __init__(self, queue: JobQueue):
         self.daemon = True
@@ -67,7 +66,9 @@ def __manage(self) -> None:
             # Blocks until we get a next job
             job: TangoJob = self.jobQueue.getNextPendingJob()
             if not job.accessKey and Config.REUSE_VMS:
-                self.log.info(f"job has access key {job.accessKey} and is calling reuseVM")
+                self.log.info(
+                    f"job has access key {job.accessKey} and is calling reuseVM"
+                )
                 vm = None
                 while vm is None:
                     vm = self.jobQueue.reuseVM(job)
@@ -92,9 +93,7 @@ def __manage(self) -> None:
                         self.log.error("ERROR initialization VM: %s", e)
                         self.log.error(traceback.format_exc())
                     if preVM is None:
-                        raise Exception(
-                            "EC2 SSH VM initialization failed: see log"
-                        )
+                        raise Exception("EC2 SSH VM initialization failed: see log")
                 else:
                     self.log.info(f"job {job.id} is not an ec2 vmms job")
                     # Try to find a vm on the free list and allocate it to
@@ -122,15 +121,15 @@ def __manage(self) -> None:
                 )
                 # Mark the job assigned
                 self.jobQueue.assignJob(job.id, preVM)
-                Worker(
-                    job, vmms, self.jobQueue, self.preallocator, preVM
-                ).start()
+                Worker(job, vmms, self.jobQueue, self.preallocator, preVM).start()
 
             except Exception as err:
                 if job is None:
                     self.log.info("job_manager: job is None")
                 else:
-                    self.log.error("job failed during creation %d %s" % (job.id, str(err)))
+                    self.log.error(
+                        "job failed during creation %d %s" % (job.id, str(err))
+                    )
                     self.jobQueue.makeDead(job, str(err))
 
 
@@ -146,7 +145,10 @@ def __manage(self) -> None:
         tango_server.log.debug("Resetting Tango VMs")
         tango_server.resetTango(tango_server.preallocator.vmms)
         for key in tango_server.preallocator.machines.keys():
-            machine: Tuple[List[TangoMachine], TangoQueue] = ([], TangoQueue.create(key))
+            machine: Tuple[List[TangoMachine], TangoQueue] = (
+                [],
+                TangoQueue.create(key),
+            )
             machine[1].make_empty()
             tango_server.preallocator.machines.set(key, machine)
 
diff --git a/jobQueue.py b/jobQueue.py
index 9e9c664d..56bc2ad7 100644
--- a/jobQueue.py
+++ b/jobQueue.py
@@ -16,6 +16,7 @@
 from config import Config
 from preallocator import Preallocator
 from typing import Optional
+
 #
 # JobQueue - This class defines the job queue and the functions for
 # manipulating it. The actual queue is made up of two smaller
@@ -55,7 +56,9 @@ def __init__(self, preallocator: Preallocator) -> None:
         using the makeUnassigned api.
         """
         self.liveJobs: TangoDictionary[TangoJob] = TangoDictionary.create("liveJobs")
-        self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create("deadJobs") # Servees as a record of both failed and completed jobs
+        self.deadJobs: TangoDictionary[TangoJob] = TangoDictionary.create(
+            "deadJobs"
+        )  # Servees as a record of both failed and completed jobs
         self.unassignedJobs: TangoQueue[int] = TangoQueue.create("unassignedLiveJobs")
         self.queueLock = threading.Lock()
         self.preallocator: Preallocator = preallocator
@@ -137,7 +140,7 @@ def add(self, job):
 
         # Since we assume that the job is new, we set the number of retries
         # of this job to 0
-        assert(job.retries == 0)
+        assert job.retries == 0
 
         # Add the job to the queue. Careful not to append the trace until we
         # know the job has actually been added to the queue.
@@ -169,8 +172,8 @@ def add(self, job):
 
         return str(job.id)
 
-    # TODO: get rid of this return value, it is not used anywhere 
-    def addDead(self, job) -> int: 
+    # TODO: get rid of this return value, it is not used anywhere
+    def addDead(self, job) -> int:
         """addDead - add a job to the dead queue.
         Called by validateJob when a job validation fails.
         Returns -1 on failure and the job id on success
@@ -248,7 +251,7 @@ def get(self, id):
         self.log.debug("get| Released lock to job queue.")
         return job
 
-    # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker. 
+    # TODO: this function is a little weird. It sets the state of job to be "assigned", but not to which worker.
     # TODO: It does assign the job to a particular VM though.
     # Precondition: jobId is in self.liveJobs
     def assignJob(self, jobId, vm=None):
@@ -315,11 +318,11 @@ def makeDead(self, job: TangoJob, reason):
         if job.id not in self.liveJobs:
             self.log.error("makeDead| Job ID: %s not found in live jobs" % (job.id))
             return -1
-        
+
         self.log.info("makeDead| Found job ID: %s in the live queue" % (job.id))
         status = 0
         self.log.info("Terminated job %s:%s: %s" % (job.name, job.id, reason))
-        
+
         # Remove the job from the live jobs dictionary
         job.deleteFromDict(self.liveJobs)
         # Add the job to the dead jobs dictionary
@@ -358,7 +361,9 @@ def getNextPendingJob(self) -> TangoJob:
         """
         # Blocks till the next item is added
         id = self.unassignedJobs.get()
-        assert id is not None, ".get with default arguments should block and never return None"
+        assert (
+            id is not None
+        ), ".get with default arguments should block and never return None"
 
         self.log.debug("_getNextPendingJob|Acquiring lock to job queue.")
         self.queueLock.acquire()
@@ -395,5 +400,3 @@ def reuseVM(self, job):
                 return job.vm
             else:
                 raise Exception("Job assigned without vm")
-
-
diff --git a/preallocator.py b/preallocator.py
index eb2d27c9..da7f8b1d 100644
--- a/preallocator.py
+++ b/preallocator.py
@@ -10,6 +10,7 @@
 from config import Config
 from typing import Tuple, List, Dict
 from vmms.interface import VMMSInterface
+
 #
 # Preallocator - This class maintains a pool of active VMs for future
 # job requests.  The pool is stored in dictionary called
@@ -23,7 +24,9 @@
 
 class Preallocator(object):
     def __init__(self, vmms: Dict[str, VMMSInterface]) -> None:
-        self.machines: TangoDictionary[Tuple[List[TangoMachine], TangoQueue[TangoMachine]]] = TangoDictionary.create("machines")
+        self.machines: TangoDictionary[
+            Tuple[List[TangoMachine], TangoQueue[TangoMachine]]
+        ] = TangoDictionary.create("machines")
         self.lock = threading.Lock()
         self.nextID = TangoIntValue("nextID", 1000)
         self.vmms: Dict[str, VMMSInterface] = vmms
diff --git a/restful_tango/server.py b/restful_tango/server.py
index 1a177ce0..5b4c495d 100755
--- a/restful_tango/server.py
+++ b/restful_tango/server.py
@@ -46,7 +46,9 @@ def prepare(self):
         if not os.path.exists(tempdir):
             os.mkdir(tempdir, 0o700)
         if os.path.exists(tempdir) and not os.path.isdir(tempdir):
-            tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,))
+            tangoREST.log.error(
+                "Cannot process uploads, %s is not a directory" % (tempdir,)
+            )
             return self.send_error()
         self.tempfile = NamedTemporaryFile(prefix="upload", dir=tempdir, delete=False)
         self.hasher = hashlib.md5()
@@ -129,7 +131,9 @@ def prepare(self):
         if not os.path.exists(tempdir):
             os.mkdir(tempdir, 0o700)
         if os.path.exists(tempdir) and not os.path.isdir(tempdir):
-            tangoREST.log.error("Cannot process uploads, %s is not a directory" % (tempdir,))
+            tangoREST.log.error(
+                "Cannot process uploads, %s is not a directory" % (tempdir,)
+            )
             return self.send_error()
         self.tempfile = NamedTemporaryFile(prefix="docker", dir=tempdir, delete=False)
 
diff --git a/restful_tango/tangoREST.py b/restful_tango/tangoREST.py
index d246daba..1b0f9881 100644
--- a/restful_tango/tangoREST.py
+++ b/restful_tango/tangoREST.py
@@ -195,7 +195,7 @@ def convertJobObj(self, dirName, jobObj):
             accessKey=accessKey,
             accessKeyId=accessKeyId,
             disableNetwork=disableNetwork,
-            stopBefore=stopBefore
+            stopBefore=stopBefore,
         )
 
         self.log.debug("inputFiles: %s" % [file.localFile for file in input])
@@ -308,7 +308,7 @@ def upload(self, key, courselab, file, tempfile, fileMD5):
             except Exception as e:
                 exc_type, _, exc_tb = sys.exc_info()
                 assert exc_type is not None
-                assert exc_tb is not None # currently handling an exception
+                assert exc_tb is not None  # currently handling an exception
                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                 print(exc_type, fname, exc_tb.tb_lineno)
                 self.log.error("upload request failed: %s" % str(e))
@@ -339,7 +339,7 @@ def addJob(self, key, courselab, jobStr):
             except Exception as e:
                 exc_type, _, exc_tb = sys.exc_info()
                 assert exc_type is not None
-                assert exc_tb is not None # currently handling an exception
+                assert exc_tb is not None  # currently handling an exception
                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                 print(exc_type, fname, exc_tb.tb_lineno)
                 self.log.error("addJob request failed: %s" % str(e))
diff --git a/tangoObjects.py b/tangoObjects.py
index e679bce0..37dbf0dd 100644
--- a/tangoObjects.py
+++ b/tangoObjects.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+
 # tangoREST.py
 #
 # Implements objects used to pass state within Tango.
@@ -118,7 +119,9 @@ def __init__(
         self._outputFile = outputFile
         self._name = name
         self._notifyURL = notifyURL
-        self._timeout = timeout # How long to run the autodriver on the job for before timing out.
+        self._timeout = (
+            timeout  # How long to run the autodriver on the job for before timing out.
+        )
         self._trace: list[str] = []
         self._maxOutputFileSize = maxOutputFileSize
         self._remoteLocation: Optional[str] = None
@@ -126,19 +129,21 @@ def __init__(
         self._accessKey = accessKey
         self._disableNetwork = disableNetwork
         self._stopBefore = stopBefore
-        self._id: Optional[int] = None # uninitialized until it gets added to either the live or dead queue
+        self._id: Optional[
+            int
+        ] = None  # uninitialized until it gets added to either the live or dead queue
 
     def __repr__(self):
         self.syncRemote()
         return f"ID: {self._id} - Name: {self.name}"
-    
+
     # TODO: reduce code size/duplication by setting TangoJob as a dataclass
     # Getters for private variables
     @property
     def assigned(self):
-        self.syncRemote() # Is it necessary to sync here?
+        self.syncRemote()  # Is it necessary to sync here?
         return self._assigned
-    
+
     @property
     def retries(self):
         self.syncRemote()
@@ -148,67 +153,67 @@ def retries(self):
     def vm(self):
         self.syncRemote()
         return self._vm
-    
+
     @property
     def input(self):
         self.syncRemote()
         return self._input
-    
+
     @property
     def outputFile(self):
         self.syncRemote()
         return self._outputFile
-    
+
     @property
     def name(self):
         self.syncRemote()
         return self._name
-    
+
     @property
     def notifyURL(self):
         self.syncRemote()
         return self._notifyURL
-    
+
     @property
     def timeout(self):
         self.syncRemote()
         return self._timeout
-    
+
     @property
     def trace(self):
         self.syncRemote()
         return self._trace
-    
+
     @property
     def maxOutputFileSize(self):
         self.syncRemote()
         return self._maxOutputFileSize
-    
+
     @property
     def remoteLocation(self):
         self.syncRemote()
         return self._remoteLocation
-    
+
     @property
     def accessKeyId(self):
         self.syncRemote()
         return self._accessKeyId
-    
+
     @property
     def accessKey(self):
         self.syncRemote()
         return self._accessKey
-    
+
     @property
     def disableNetwork(self):
         self.syncRemote()
         return self._disableNetwork
-    
+
     @property
     def stopBefore(self):
         self.syncRemote()
         return self._stopBefore
-    
+
     @property
     def id(self) -> int:
         self.syncRemote()
@@ -224,7 +229,7 @@ def resetRetries(self):
         self.syncRemote()
         self._retries = 0
         self.updateRemote()
-        
+
     def incrementRetries(self):
         self.syncRemote()
         self._retries += 1
@@ -258,14 +263,14 @@ def setId(self, new_id: int) -> None:
             dictionary.delete(key)
             self._remoteLocation = dict_hash + ":" + str(new_id)
             self.updateRemote()
-            
+
     def setTimeout(self, new_timeout):
         self.syncRemote()
         self._timeout = new_timeout
         self.updateRemote()
 
     def setKeepForDebugging(self, keep_for_debugging: bool):
-        if (self._vm is not None):
+        if self._vm is not None:
             self.syncRemote()
             self._vm.keep_for_debugging = keep_for_debugging
             self.updateRemote()
@@ -284,15 +289,16 @@ def __updateSelf(self, other_job):
         self._maxOutputFileSize = other_job._maxOutputFileSize
         self._id = other_job._id
 
-
     def syncRemote(self) -> None:
         if Config.USE_REDIS and self._remoteLocation is not None:
             dict_hash = self._remoteLocation.split(":")[0]
             key = self._remoteLocation.split(":")[1]
             dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash)
-            temp_job = dictionary.get(key) # Key should be in dictionary
+            temp_job = dictionary.get(key)  # Key should be in dictionary
             if temp_job is None:
-                print(f"Job {key} not found in dictionary {dict_hash}") # TODO: add better error handling for TangoJob
+                print(
+                    f"Job {key} not found in dictionary {dict_hash}"
+                )  # TODO: add better error handling for TangoJob
                 return
             self.__updateSelf(temp_job)
 
@@ -302,20 +308,19 @@ def updateRemote(self) -> None:
             key = self._remoteLocation.split(":")[1]
             dictionary: TangoDictionary[TangoJob] = TangoDictionary.create(dict_hash)
             dictionary.set(key, self)
-            
-    def deleteFromDict(self, dictionary : TangoDictionary) -> None:
+
+    def deleteFromDict(self, dictionary: TangoDictionary) -> None:
         assert self._id is not None
         dictionary.delete(self._id)
         self._remoteLocation = None
-        
-    def addToDict(self, dictionary : TangoDictionary) -> None:
+
+    def addToDict(self, dictionary: TangoDictionary) -> None:
         assert self._id is not None
         dictionary.set(self._id, self)
         assert self._remoteLocation is None, "Job already has a remote location"
         if Config.USE_REDIS:
             self._remoteLocation = dictionary.hash_name + ":" + str(self._id)
             self.updateRemote()
-        
 
 
 def TangoIntValue(object_name, obj):
@@ -359,9 +364,11 @@ def get(self):
     def set(self, val):
         self.val = val
         return val
-    
 
-QueueElem = TypeVar('QueueElem')
+
+QueueElem = TypeVar("QueueElem")
+
+
 class TangoQueue(Protocol[QueueElem]):
     @staticmethod
     def create(key_name: str) -> TangoQueue[QueueElem]:
@@ -373,19 +380,26 @@ def create(key_name: str) -> TangoQueue[QueueElem]:
     @abstractmethod
     def qsize(self) -> int:
         ...
+
     def empty(self) -> bool:
         ...
+
     def put(self, item: QueueElem) -> None:
         ...
+
     def get(self, block=True, timeout=None) -> Optional[QueueElem]:
         ...
+
     def get_nowait(self) -> Optional[QueueElem]:
         ...
+
     def remove(self, item: QueueElem) -> None:
         ...
+
     def make_empty(self) -> None:
         ...
 
+
 class ExtendedQueue(Queue, TangoQueue[QueueElem]):
     """Python Thread safe Queue with the remove and clean function added"""
 
@@ -404,7 +418,8 @@ def remove(self, value):
     def make_empty(self):
         with self.mutex:
             self.queue.clear()
-            
+
+
 class TangoRemoteQueue(TangoQueue):
 
     """Simple Queue with Redis Backend"""
@@ -467,47 +482,55 @@ def remove(self, item):
     def make_empty(self) -> None:
         self.__db.delete(self.key)
 
-T = TypeVar('T')
+
+T = TypeVar("T")
 KeyType = Union[str, int]
 # Dictionary from string to T
 class TangoDictionary(Protocol[T]):
-
     @staticmethod
     def create(dictionary_name: str) -> TangoDictionary[T]:
         if Config.USE_REDIS:
             return TangoRemoteDictionary(dictionary_name)
         else:
             return TangoNativeDictionary()
-        
+
     @property
     @abstractmethod
     def hash_name(self) -> str:
         ...
-        
+
     @abstractmethod
     def __contains__(self, id: KeyType) -> bool:
         ...
+
     @abstractmethod
     def set(self, id: KeyType, obj: T) -> str:
         ...
+
     @abstractmethod
     def get(self, id: KeyType) -> Optional[T]:
         ...
+
     @abstractmethod
     def getExn(self, id: KeyType) -> T:
         ...
+
     @abstractmethod
     def keys(self) -> list[str]:
         ...
+
     @abstractmethod
     def values(self) -> list[T]:
         ...
+
     @abstractmethod
     def delete(self, id: KeyType) -> None:
         ...
+
     @abstractmethod
     def make_empty(self) -> None:
         ...
+
     @abstractmethod
     def items(self) -> list[tuple[str, T]]:
         ...
@@ -525,8 +548,6 @@ def items(self) -> list[tuple[str, T]]:
 #         return TangoNativeDictionary()
 
 
-
-
 class TangoRemoteDictionary(TangoDictionary[T]):
     def __init__(self, object_name):
         self.r = getRedisConnection()
diff --git a/tests/sample_test/expected_output.txt b/tests/sample_test/expected_output.txt
index 70c379b6..802992c4 100644
--- a/tests/sample_test/expected_output.txt
+++ b/tests/sample_test/expected_output.txt
@@ -1 +1 @@
-Hello world
\ No newline at end of file
+Hello world
diff --git a/tests/stressTest.py b/tests/stressTest.py
index 6494a8c4..eeb46bfa 100644
--- a/tests/stressTest.py
+++ b/tests/stressTest.py
@@ -14,7 +14,17 @@
 start_time = time.time()
 expected_output = ""
 
-def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
+
+def printProgressBar(
+    iteration,
+    total,
+    prefix="",
+    suffix="",
+    decimals=1,
+    length=100,
+    fill="█",
+    printEnd="\r",
+):
     """
     Call in a loop to create terminal progress bar
     @params:
@@ -29,39 +39,65 @@ def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1,
     """
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filledLength = int(length * iteration // total)
-    bar = fill * filledLength + '-' * (length - filledLength)
-    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
+    bar = fill * filledLength + "-" * (length - filledLength)
+    print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=printEnd)
     # Print New Line on Complete
-    if iteration == total: 
+    if iteration == total:
         print()
 
-def run_stress_test(num_submissions, submission_delay, autograder_image, output_file, tango_port, cli_path, 
-                    job_name, job_path, instance_type, timeout, ec2):
-    printProgressBar(0, num_submissions, prefix = 'Jobs Added:', suffix = 'Complete', length = 50)
-    with open(output_file, 'a') as f:
+
+def run_stress_test(
+    num_submissions,
+    submission_delay,
+    autograder_image,
+    output_file,
+    tango_port,
+    cli_path,
+    job_name,
+    job_path,
+    instance_type,
+    timeout,
+    ec2,
+):
+    printProgressBar(
+        0, num_submissions, prefix="Jobs Added:", suffix="Complete", length=50
+    )
+    with open(output_file, "a") as f:
         f.write(f"Stress testing with {num_submissions} submissions\n")
-        
+
         for i in range(1, num_submissions + 1):
             command = [
-                'python3', cli_path,
-                '-P', str(tango_port),
-                '-k', 'test',
-                '-l', job_name,
-                '--runJob', job_path,
-                '--image', autograder_image,
-                '--instanceType', instance_type,
-                '--timeout', str(timeout),
-                '--callbackURL', ("http://localhost:8888/autograde_done?id=%d" % (i))
+                "python3",
+                cli_path,
+                "-P",
+                str(tango_port),
+                "-k",
+                "test",
+                "-l",
+                job_name,
+                "--runJob",
+                job_path,
+                "--image",
+                autograder_image,
+                "--instanceType",
+                instance_type,
+                "--timeout",
+                str(timeout),
+                "--callbackURL",
+                ("http://localhost:8888/autograde_done?id=%d" % (i)),
             ]
             if ec2:
-                command += ['--ec2']
+                command += ["--ec2"]
             subprocess.run(command, stdout=f, stderr=f)
             f.write(f"Submission {i} completed\n")
-            printProgressBar(i, num_submissions, prefix = 'Jobs Added:', suffix = 'Complete', length = 50)
+            printProgressBar(
+                i, num_submissions, prefix="Jobs Added:", suffix="Complete", length=50
+            )
             if submission_delay > 0:
                 time.sleep(submission_delay)
         print()
 
+
 class AutogradeDoneHandler(tornado.web.RequestHandler):
     def post(self):
         global finished_tests
@@ -71,10 +107,16 @@ def post(self):
         id = self.get_query_argument("id")
         fileBody = self.request.files["file"][0]["body"].decode()
         scoreJson = fileBody.split("\n")[-2]
-        with open(os.path.join(test_dir, "output", "output%s.txt" % id), 'w') as f:
+        with open(os.path.join(test_dir, "output", "output%s.txt" % id), "w") as f:
             f.write(fileBody)
         finished_tests[str(id)] = scoreJson
-        printProgressBar(len(finished_tests), sub_num, prefix = 'Tests Done:', suffix = 'Complete', length = 50)
+        printProgressBar(
+            len(finished_tests),
+            sub_num,
+            prefix="Tests Done:",
+            suffix="Complete",
+            length=50,
+        )
         self.write("ok")
         self.flush()
 
@@ -87,6 +129,7 @@ def on_finish(self):
             print("\nShutting down server...")
             tornado.ioloop.IOLoop.current().stop()
 
+
 def create_summary():
     success = []
     failed = []
@@ -95,7 +138,7 @@ def create_summary():
             success.append(i)
         else:
             failed.append(i)
-    with open(os.path.join(test_dir, "summary.txt"), 'w') as f:
+    with open(os.path.join(test_dir, "summary.txt"), "w") as f:
         f.write("Total Time: %d seconds\n" % (time.time() - start_time))
         f.write("Total Succeeded: %d / %d\n" % (len(success), sub_num))
         f.write("Total Failed: %d / %d\n" % (len(failed), sub_num))
@@ -107,21 +150,28 @@ def create_summary():
         for i in range(0, len(failed)):
             f.write("Test Case #%d: %s\n" % (failed[i], finished_tests[str(failed[i])]))
 
+
 def make_app():
-    return tornado.web.Application([
-        (r"/autograde_done", AutogradeDoneHandler),
-    ])
+    return tornado.web.Application(
+        [
+            (r"/autograde_done", AutogradeDoneHandler),
+        ]
+    )
+
 
 def notifyServer():
     global shutdown_event
     app = make_app()
     app.listen(8888)
-    printProgressBar(0, sub_num, prefix = 'Tests Done:', suffix = 'Complete', length = 50)
+    printProgressBar(0, sub_num, prefix="Tests Done:", suffix="Complete", length=50)
     tornado.ioloop.IOLoop.current().start()
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Stress test script for Tango")
-    parser.add_argument('--test_dir', type=str, required=True, help="Directory to run the test in")
+    parser.add_argument(
+        "--test_dir", type=str, required=True, help="Directory to run the test in"
+    )
 
     args = parser.parse_args()
 
@@ -129,10 +179,10 @@ def notifyServer():
 
     test_dir = args.test_dir
 
-    with open(os.path.join(args.test_dir, dirname + '.yaml'), 'r') as f:
+    with open(os.path.join(args.test_dir, dirname + ".yaml"), "r") as f:
         data = yaml.load(f, Loader=yaml.SafeLoader)
-    
-    with open(os.path.join(args.test_dir, data["expected_output"]), 'r') as f:
+
+    with open(os.path.join(args.test_dir, data["expected_output"]), "r") as f:
         expected_output = f.read()
 
     sub_num = data["num_submissions"]
@@ -152,10 +202,10 @@ def notifyServer():
         data["tango_port"],
         data["cli_path"],
         dirname,
-        os.path.join(args.test_dir, 'input'),
+        os.path.join(args.test_dir, "input"),
         data["instance_type"],
         data["timeout"],
-        data["ec2"]
+        data["ec2"],
     )
 
-    notifyServer()
\ No newline at end of file
+    notifyServer()
diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 4b2e4090..533a8cd0 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -34,10 +34,15 @@
 logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL)
 
 valid_instance_types: Set[InstanceTypeType] = set(get_args(InstanceTypeType))
+
+
 def check_instance_type(instance_type: str) -> TypeGuard[InstanceTypeType]:
     return instance_type in valid_instance_types
 
-def timeout_with_retries(command: List[str], time_out: float = 1, retries: int = 3, retry_delay: float = 2) -> int:
+
+def timeout_with_retries(
+    command: List[str], time_out: float = 1, retries: int = 3, retry_delay: float = 2
+) -> int:
     """timeout - Run a unix command with a timeout. Return -1 on
     timeout, otherwise return the return value from the command, which
     is typically 0 for success, 1-255 for failure.
@@ -106,7 +111,9 @@ def release_vm_semaphore() -> None:
         Ec2SSH._vm_semaphore.release()
 
     # TODO: the arguments accessKeyId and accessKey don't do anything
-    def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = None) -> None:
+    def __init__(
+        self, accessKeyId: Optional[str] = None, accessKey: Optional[str] = None
+    ) -> None:
         """log - logger for the instance
         connection - EC2Connection object that stores the connection
         info to the EC2 network
@@ -139,11 +146,15 @@ def __init__(self, accessKeyId: Optional[str] = None, accessKey: Optional[str] =
             # self.createKeyPair()
         # create boto3resource
 
-        self.img2ami: Dict[str, Image] = {} # this is a bad name, should really be img_name to img
+        self.img2ami: Dict[
+            str, Image
+        ] = {}  # this is a bad name, should really be img_name to img
         self.images: List[Image] = []
         try:
             # This is a service resource
-            self.boto3resource: EC2ServiceResource = boto3.resource("ec2", config.Config.EC2_REGION) # TODO: rename this ot self.ec2resource
+            self.boto3resource: EC2ServiceResource = boto3.resource(
+                "ec2", config.Config.EC2_REGION
+            )  # TODO: rename this ot self.ec2resource
             self.boto3client = boto3.client("ec2", config.Config.EC2_REGION)
 
             # Get images from ec2
@@ -262,7 +273,7 @@ def createSecurityGroup(self) -> None:
         except Exception as e:
             self.log.debug("ERROR checking for existing security group: %s", e)
         # ! Note: We've never encountered the lines below before (there was a type error),
-        # ! because we've always had a security group. 
+        # ! because we've always had a security group.
         # ! Difficult to test because it involves deleting all security groups.
         try:
             security_group_response = self.boto3client.create_security_group(
@@ -270,9 +281,7 @@ def createSecurityGroup(self) -> None:
                 Description="Autolab security group - allowing all traffic",
             )
             security_group_id = security_group_response["GroupId"]
-            self.boto3client.authorize_security_group_ingress(
-                GroupId=security_group_id
-            )
+            self.boto3client.authorize_security_group_ingress(GroupId=security_group_id)
         except Exception as e:
             self.log.debug("ERROR in creating security group: %s", e)
 
@@ -297,7 +306,6 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
             # ensure that security group exists
             self.createSecurityGroup()
 
-
             reservation: List[Instance] = self.boto3resource.create_instances(
                 ImageId=ec2instance["ami"],
                 KeyName=self.key_pair_name,
@@ -305,13 +313,12 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
                 InstanceType=instance_type,
                 MaxCount=1,
                 MinCount=1,
-                InstanceMarketOptions=
-                        {
+                InstanceMarketOptions={
                     "MarketType": "spot",
                     "SpotOptions": {
                         "SpotInstanceType": "one-time",
-                        "InstanceInterruptionBehavior": "terminate"
-                    }
+                        "InstanceInterruptionBehavior": "terminate",
+                    },
                 },
             )
 
@@ -339,18 +346,13 @@ def initializeVM(self, vm: TangoMachine) -> Literal[0, -1]:
                 # reload the state of the new instance
                 try_load_instance(newInstance)
                 for inst in instances.filter(InstanceIds=[newInstance.id]):
-                    self.log.debug(
-                        "VM %s %s: is running" % (vm.name, newInstance.id)
-                    )
+                    self.log.debug("VM %s %s: is running" % (vm.name, newInstance.id))
                     instanceRunning = True
 
                 if instanceRunning:
                     break
 
-                if (
-                    time.time() - start_time
-                    > config.Config.INITIALIZEVM_TIMEOUT
-                ):
+                if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT:
                     raise ValueError(
                         "VM %s %s: timeout (%d seconds) before reaching 'running' state"
                         % (
@@ -471,18 +473,21 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
             # Sleep a bit before trying again
             time.sleep(config.Config.TIMER_POLL_INTERVAL)
 
-    def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int:
+    def copyIn(
+        self,
+        vm: TangoMachine,
+        inputFiles: List[InputFile],
+        job_id: Optional[int] = None,
+    ) -> int:
         """copyIn - Copy input files to VM
         Args:
         - vm is a TangoMachine object
-        - inputFiles is a list of objects with attributes localFile and destFile. 
+        - inputFiles is a list of objects with attributes localFile and destFile.
             localFile is the file on the host, destFile is the file on the VM.
-        - job_id is the job id of the job being run on the VM. 
+        - job_id is the job id of the job being run on the VM.
             It is used for logging purposes only.
         """
-        self.log.info(
-            "copyIn %s - writing files" % self.instanceName(vm.id, vm.name)
-        )
+        self.log.info("copyIn %s - writing files" % self.instanceName(vm.id, vm.name))
 
         domain_name = self.domainName(vm)
 
@@ -504,18 +509,14 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional
             self.log.info("%s for job %s" % (line, job_id))
         self.log.info("Return Code: %s, job: %s" % (result.returncode, job_id))
         if result.stderr != 0:
-            self.log.info(
-                "Standard Error: %s, job: %s" % (result.stderr, job_id)
-            )
+            self.log.info("Standard Error: %s, job: %s" % (result.stderr, job_id))
 
         # Validate inputFiles structure
         if not inputFiles or not all(
             hasattr(file, "localFile") and hasattr(file, "destFile")
             for file in inputFiles
         ):
-            self.log.info(
-                "Error: Invalid inputFiles Structure, job: %s" % job_id
-            )
+            self.log.info("Error: Invalid inputFiles Structure, job: %s" % job_id)
 
         for file in inputFiles:
             self.log.info("%s - %s" % (file.localFile, file.destFile))
@@ -524,8 +525,7 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional
                 + self.ssh_flags
                 + [
                     file.localFile,
-                    "%s@%s:~/autolab/%s"
-                    % (self.ec2User, domain_name, file.destFile),
+                    "%s@%s:~/autolab/%s" % (self.ec2User, domain_name, file.destFile),
                 ],
                 config.Config.COPYIN_TIMEOUT,
             )
@@ -535,7 +535,13 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional
 
         return 0
 
-    def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int:
+    def runJob(
+        self,
+        vm: TangoMachine,
+        runTimeout: int,
+        maxOutputFileSize: int,
+        disableNetwork: bool,
+    ) -> int:
         """runJob - Run the make command on a VM using SSH and
         redirect output to file "output".
         """
@@ -558,9 +564,7 @@ def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disa
         # no logging for now
 
         ret = VMMSUtils.timeout(
-            ["ssh"]
-            + self.ssh_flags
-            + ["%s@%s" % (self.ec2User, domain_name), runcmd],
+            ["ssh"] + self.ssh_flags + ["%s@%s" % (self.ec2User, domain_name), runcmd],
             runTimeout * 2,
         )
 
@@ -615,7 +619,6 @@ def copyOut(self, vm: TangoMachine, destFile: str) -> int:
                 # re.error = xxx_todo_changeme
                 # Error copying out the timing data (probably runJob failed)
                 pass
-                
 
         return VMMSUtils.timeout(
             ["scp"]
@@ -639,9 +642,7 @@ def destroyVM(self, vm: TangoMachine) -> None:
                 InstanceIds=[vm.instance_id]
             )
             if not instances:
-                self.log.debug(
-                    "no instances found with instance id %s", vm.instance_id
-                )
+                self.log.debug("no instances found with instance id %s", vm.instance_id)
             # Keep the vm and mark with meaningful tags for debugging
             if (
                 hasattr(config.Config, "KEEP_VM_AFTER_FAILURE")
@@ -668,9 +669,7 @@ def destroyVM(self, vm: TangoMachine) -> None:
             if not self.useDefaultKeyPair:
                 self.deleteKeyPair()
         except Exception as e:
-            self.log.error(
-                "destroyVM failed: %s for vm %s" % (e, vm.instance_id)
-            )
+            self.log.error("destroyVM failed: %s for vm %s" % (e, vm.instance_id))
 
         Ec2SSH.release_vm_semaphore()
 
@@ -704,13 +703,8 @@ def getVMs(self) -> List[TangoMachine]:
                     instance.tags, "Name"
                 )  # inst name PREFIX-serial-IMAGE
                 # Name tag is the standard form of prefix-serial-image
-                if not (
-                    instName
-                    and re.match("%s-" % config.Config.PREFIX, instName)
-                ):
-                    self.log.debug(
-                        "getVMs: Instance id %s skipped" % vm.instance_id
-                    )
+                if not (instName and re.match("%s-" % config.Config.PREFIX, instName)):
+                    self.log.debug("getVMs: Instance id %s skipped" % vm.instance_id)
                     continue  # instance without name tag or proper prefix
 
                 vm.name = instName
@@ -723,8 +717,7 @@ def getVMs(self) -> List[TangoMachine]:
 
                 vms.append(vm)
                 self.log.debug(
-                    "getVMs: Instance id %s, name %s"
-                    % (vm.instance_id, vm.name)
+                    "getVMs: Instance id %s, name %s" % (vm.instance_id, vm.name)
                 )
         except Exception as e:
             self.log.debug("getVMs Failed: %s" % e)
@@ -734,7 +727,9 @@ def getVMs(self) -> List[TangoMachine]:
     def existsVM(self, vm: TangoMachine) -> bool:
         """existsVM - Checks whether a VM exists in the vmms."""
         # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrationec2.html
-        filters: Sequence[FilterTypeDef] = [{"Name": "instance-state-name", "Values": ["running"]}]
+        filters: Sequence[FilterTypeDef] = [
+            {"Name": "instance-state-name", "Values": ["running"]}
+        ]
         # gets all running instances
         instances = self.boto3resource.instances.filter(Filters=filters)
         for instance in instances:
@@ -766,13 +761,11 @@ def getPartialOutput(self, vm: TangoMachine) -> str:
         )
 
         sshcmd = (
-            ["ssh"]
-            + self.ssh_flags
-            + ["%s@%s" % (self.ec2User, domain_name), runcmd]
+            ["ssh"] + self.ssh_flags + ["%s@%s" % (self.ec2User, domain_name), runcmd]
         )
 
-        output = subprocess.check_output(
-            sshcmd, stderr=subprocess.STDOUT
-        ).decode("utf-8")
+        output = subprocess.check_output(sshcmd, stderr=subprocess.STDOUT).decode(
+            "utf-8"
+        )
 
         return output
diff --git a/vmms/interface.py b/vmms/interface.py
index a437401a..33454f13 100644
--- a/vmms/interface.py
+++ b/vmms/interface.py
@@ -13,11 +13,22 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
         ...
 
     @abstractmethod
-    def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int:
+    def copyIn(
+        self,
+        vm: TangoMachine,
+        inputFiles: List[InputFile],
+        job_id: Optional[int] = None,
+    ) -> int:
         ...
 
     @abstractmethod
-    def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int: # -1 to infinity
+    def runJob(
+        self,
+        vm: TangoMachine,
+        runTimeout: int,
+        maxOutputFileSize: int,
+        disableNetwork: bool,
+    ) -> int:  # -1 to infinity
         ...
 
     @abstractmethod
@@ -27,7 +38,7 @@ def copyOut(self, vm: TangoMachine, destFile: str) -> int:
     @abstractmethod
     def destroyVM(self, vm: TangoMachine) -> None:
         ...
-    
+
     @abstractmethod
     def safeDestroyVM(self, vm: TangoMachine) -> None:
         ...
@@ -47,7 +58,7 @@ def instanceName(self, id: int, name: str) -> str:
     @abstractmethod
     def getImages(self) -> List[str]:
         ...
-    
+
     @abstractmethod
     def getPartialOutput(self, vm: TangoMachine) -> str:
         ...
diff --git a/vmms/localDocker.py b/vmms/localDocker.py
index fa727bb6..5617b807 100644
--- a/vmms/localDocker.py
+++ b/vmms/localDocker.py
@@ -18,7 +18,6 @@
 from vmms.sharedUtils import VMMSUtils
 
 
-
 #
 # User defined exceptions
 #
@@ -71,7 +70,12 @@ def waitVM(self, vm: TangoMachine, max_secs: int) -> Literal[0, -1]:
         """waitVM - Nothing to do for waitVM"""
         return 0
 
-    def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional[int] = None) -> int:
+    def copyIn(
+        self,
+        vm: TangoMachine,
+        inputFiles: List[InputFile],
+        job_id: Optional[int] = None,
+    ) -> int:
         """copyIn - Create a directory to be mounted as a volume
         for the docker containers. Copy input files to this directory.
         """
@@ -90,7 +94,13 @@ def copyIn(self, vm: TangoMachine, inputFiles: List[InputFile], job_id: Optional
             )
         return 0
 
-    def runJob(self, vm: TangoMachine, runTimeout: int, maxOutputFileSize: int, disableNetwork: bool) -> int:
+    def runJob(
+        self,
+        vm: TangoMachine,
+        runTimeout: int,
+        maxOutputFileSize: int,
+        disableNetwork: bool,
+    ) -> int:
         """runJob - Run a docker container by doing the follows:
         - mount directory corresponding to this job to /home/autolab
           in the container
@@ -154,7 +164,9 @@ def destroyVM(self, vm: TangoMachine) -> None:
         volumePath = self.getVolumePath("")
         # Do a hard kill on corresponding docker container.
         # Return status does not matter.
-        VMMSUtils.timeout(["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT)
+        VMMSUtils.timeout(
+            ["docker", "rm", "-f", instanceName], config.Config.DOCKER_RM_TIMEOUT
+        )
         # Destroy corresponding volume if it exists.
         if instanceName in os.listdir(volumePath):
             shutil.rmtree(volumePath + instanceName)
diff --git a/vmms/sharedUtils.py b/vmms/sharedUtils.py
index 0165b313..15cb2777 100644
--- a/vmms/sharedUtils.py
+++ b/vmms/sharedUtils.py
@@ -4,6 +4,7 @@
 from typing import List
 import config
 
+
 class VMMSUtils:
     @staticmethod
     def constructInstanceName(id: int, name: str) -> str:
@@ -12,7 +13,7 @@ def constructInstanceName(id: int, name: str) -> str:
         instance names manually.
         """
         return "%s-%d-%s" % (config.Config.PREFIX, id, name)
-    
+
     @staticmethod
     def timeout(command: List[str], time_out: float = 1) -> int:
         """timeout - Run a unix command with a timeout. Return -1 on
@@ -43,4 +44,3 @@ def timeout(command: List[str], time_out: float = 1) -> int:
         else:
             returncode = poll_result
         return returncode
-
diff --git a/worker.py b/worker.py
index 8d92123c..8b7b115b 100644
--- a/worker.py
+++ b/worker.py
@@ -19,6 +19,7 @@
 from typing import Dict, Optional
 from vmms.interface import VMMSInterface
 from preallocator import Preallocator
+
 #
 # Worker - The worker class is very simple and very dumb. The goal is
 # to walk through the VMMS interface, track the job's progress, and if
@@ -30,6 +31,7 @@
 # anything else in the system.
 #
 
+
 class DetachMethod(Enum):
     RETURN_TO_POOL = "return_to_pool"
     DESTROY_WITHOUT_REPLACEMENT = "destroy_without_replacement"
@@ -38,12 +40,19 @@ class DetachMethod(Enum):
 
 # We always preallocate a VM for the worker to use
 class Worker(threading.Thread):
-    def __init__(self, job: TangoJob, vmms: VMMSInterface, jobQueue: JobQueue, preallocator: Preallocator, preVM: TangoMachine):
+    def __init__(
+        self,
+        job: TangoJob,
+        vmms: VMMSInterface,
+        jobQueue: JobQueue,
+        preallocator: Preallocator,
+        preVM: TangoMachine,
+    ):
         threading.Thread.__init__(self)
         self.daemon = True
         self.job = job
         self.vmms: VMMSInterface = vmms
-        self.jobQueue : JobQueue = jobQueue
+        self.jobQueue: JobQueue = jobQueue
         self.preallocator = preallocator
         self.preVM = preVM
         threading.Thread.__init__(self)
@@ -55,8 +64,7 @@ def __init__(self, job: TangoJob, vmms: VMMSInterface, jobQueue: JobQueue, preal
     #
     def __del__(self):
         assert self.cleanupStatus, "Worker must call detachVM before returning"
-    
-    
+
     def detachVM(self, detachMethod: DetachMethod):
         """detachVM - Detach the VM from this worker. The options are
         to return it to the pool's free list (return_vm), destroy it
@@ -99,7 +107,10 @@ def rescheduleJob(self, hdrfile: str, ret: Dict[str, int], err: str) -> None:
 
         # Try a few times before giving up
         if self.job.retries < Config.JOB_RETRIES:
-            self.log.error("Retrying job %s:%d, retries: %d" % (self.job.name, self.job.id, self.job.retries))
+            self.log.error(
+                "Retrying job %s:%d, retries: %d"
+                % (self.job.name, self.job.id, self.job.retries)
+            )
             self.job.appendTrace(
                 "%s|Retrying job %s:%d, retries: %d"
                 % (datetime.now().ctime(), self.job.name, self.job.id, self.job.retries)
@@ -114,7 +125,9 @@ def rescheduleJob(self, hdrfile: str, ret: Dict[str, int], err: str) -> None:
         # Here is where we give up
         else:
             full_err = f"Internal Error: {err}. Unable to complete job after {Config.JOB_RETRIES} tries. Please resubmit.\nJob status: waitVM={ret['waitvm']} initializeVM={ret['initializevm']} copyIn={ret['copyin']} runJob={ret['runjob']} copyOut={ret['copyout']}"
-            self.log.error(f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err))
+            self.log.error(
+                f"Giving up on job %s:%d. %s" % (self.job.name, self.job.id, full_err)
+            )
             self.job.appendTrace(
                 "%s|Giving up on job %s:%d. %s"
                 % (datetime.now().ctime(), self.job.name, self.job.id, full_err)
@@ -168,18 +181,22 @@ def notifyServer(self, job: TangoJob) -> None:
                 )
                 fh.close()
             else:
-                self.log.info("No callback URL for job %s:%d" % (self.job.name, self.job.id))
+                self.log.info(
+                    "No callback URL for job %s:%d" % (self.job.name, self.job.id)
+                )
         except Exception as e:
             self.log.debug("Error in notifyServer: %s" % str(e))
 
-    def afterJobExecution(self, hdrfile: str, msg: str, detachMethod: DetachMethod) -> None: 
+    def afterJobExecution(
+        self, hdrfile: str, msg: str, detachMethod: DetachMethod
+    ) -> None:
         self.jobQueue.makeDead(self.job, msg)
-        
+
         # Update the text that users see in the autodriver output file
         self.appendMsg(hdrfile, msg)
         self.catFiles(hdrfile, self.job.outputFile)
         os.chmod(self.job.outputFile, 0o644)
-        
+
         # Thread exit after termination
         self.detachVM(detachMethod)
         self.notifyServer(self.job)
@@ -221,7 +238,7 @@ def run(self) -> None:
                 )
             )
             self.log.debug("Assigned job to preallocated VM")
-            ret["initializevm"] = 0 # Vacuous success since it doesn't happen
+            ret["initializevm"] = 0  # Vacuous success since it doesn't happen
 
             vm = self.job.vm
 
@@ -243,7 +260,9 @@ def run(self) -> None:
             if self.job.stopBefore == "waitvm":
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 self.job.setKeepForDebugging(True)
-                self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
+                self.afterJobExecution(
+                    hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE
+                )
                 return
             ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT)
 
@@ -277,15 +296,19 @@ def run(self) -> None:
                     self.job.id,
                 )
             )
-            if (self.job.stopBefore == "copyin"):
+            if self.job.stopBefore == "copyin":
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 self.job.setKeepForDebugging(True)
-                self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
+                self.afterJobExecution(
+                    hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE
+                )
                 self.log.debug(msg)
                 return
             # Copy input files to VM
             ret["copyin"] = self.vmms.copyIn(vm, self.job.input, self.job.id)
-            self.log.debug(f"After copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}")
+            self.log.debug(
+                f"After copyIn: ret[copyin] = {ret['copyin']}, job_id: {str(self.job.id)}"
+            )
 
             if ret["copyin"] != 0:
                 Config.copyin_errors += 1
@@ -293,11 +316,7 @@ def run(self) -> None:
                 self.job.vm.notes = str(self.job.id) + "_" + self.job.name
                 self.job.setKeepForDebugging(True)
                 self.log.debug(msg)
-                self.rescheduleJob(
-                    hdrfile,
-                    ret,
-                    msg
-                )
+                self.rescheduleJob(hdrfile, ret, msg)
                 return
 
             self.log.info(
@@ -309,10 +328,12 @@ def run(self) -> None:
                 % (datetime.now().ctime(), self.job.name, self.job.id, ret["copyin"])
             )
 
-            if (self.job.stopBefore == "runjob"):
+            if self.job.stopBefore == "runjob":
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 self.job.setKeepForDebugging(True)
-                self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
+                self.afterJobExecution(
+                    hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE
+                )
                 return
             # Run the job on the virtual machine
             ret["runjob"] = self.vmms.runJob(
@@ -343,13 +364,9 @@ def run(self) -> None:
                         ret["runjob"]
                     )
                 Config.runjob_errors += 1
-                self.rescheduleJob(
-                    hdrfile,
-                    ret,
-                    msg
-                )
+                self.rescheduleJob(hdrfile, ret, msg)
                 return
-            
+
             self.log.info(
                 "Job %s:%d executed [status=%s]"
                 % (self.job.name, self.job.id, ret["runjob"])
@@ -359,10 +376,12 @@ def run(self) -> None:
                 % (datetime.now().ctime(), self.job.name, self.job.id, ret["runjob"])
             )
 
-            if (self.job.stopBefore == "copyout"):
+            if self.job.stopBefore == "copyout":
                 msg = "Execution stopped before %s" % self.job.stopBefore
                 self.job.setKeepForDebugging(True)
-                self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE)
+                self.afterJobExecution(
+                    hdrfile, msg, detachMethod=DetachMethod.DESTROY_AND_REPLACE
+                )
                 return
             # Copy the output back.
             ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile)
@@ -371,10 +390,10 @@ def run(self) -> None:
                 self.rescheduleJob(
                     hdrfile,
                     ret,
-                    f"Internal error: copyOut failed (status={ret['copyout']})"
+                    f"Internal error: copyOut failed (status={ret['copyout']})",
                 )
                 return
-            
+
             self.log.info(
                 "Output copied for job %s:%d [status=%d]"
                 % (self.job.name, self.job.id, ret["copyout"])
@@ -390,13 +409,17 @@ def run(self) -> None:
             self.log.info("Success: job %s:%d finished" % (self.job.name, self.job.id))
 
             for status in ret.values():
-                assert status == 0, "Should not get to the success point if any stage failed"
+                assert (
+                    status == 0
+                ), "Should not get to the success point if any stage failed"
                 # TODO: test this, then remove everything below this point
-                
+
             # Move the job from the live queue to the dead queue
             # with an explanatory message
             msg = "Success: Autodriver returned normally"
-            self.afterJobExecution(hdrfile, msg, detachMethod=DetachMethod.RETURN_TO_POOL)
+            self.afterJobExecution(
+                hdrfile, msg, detachMethod=DetachMethod.RETURN_TO_POOL
+            )
             return
 
         #
@@ -412,6 +435,6 @@ def run(self) -> None:
             # TODO: move self.job.makeVM to the start of the try block, so it should be an error if vm fails to be set
             if self.preVM and not vm:
                 self.job.makeVM(self.preVM)
-                vm = self.preVM 
+                vm = self.preVM
             if vm:
                 self.detachVM(DetachMethod.DESTROY_AND_REPLACE)

From b0bb611a66f8955f16f775a63d683808407dd15f Mon Sep 17 00:00:00 2001
From: Anthony Yip <anthony.ch.yip@gmail.com>
Date: Mon, 16 Mar 2026 13:47:08 -0400
Subject: [PATCH 39/39] clean up worker.py

---
 vmms/ec2SSH.py |  1 -
 worker.py      | 14 +++-----------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py
index 533a8cd0..f55c3c05 100644
--- a/vmms/ec2SSH.py
+++ b/vmms/ec2SSH.py
@@ -745,7 +745,6 @@ def getImages(self) -> List[str]:
         """getImages - return a constant; actually use the ami specified in config"""
         return [key for key in self.img2ami]
 
-    # TODO: later
     def getTag(self, tagList: List[TagTypeDef], tagKey: str) -> Optional[str]:
         if tagList:
             for tag in tagList:
diff --git a/worker.py b/worker.py
index 8b7b115b..aab2e46b 100644
--- a/worker.py
+++ b/worker.py
@@ -38,7 +38,7 @@ class DetachMethod(Enum):
     DESTROY_AND_REPLACE = "replace"
 
 
-# We always preallocate a VM for the worker to use
+# We always preallocate a VM for the worker to use, hence it isn't Optional
 class Worker(threading.Thread):
     def __init__(
         self,
@@ -428,13 +428,5 @@ def run(self) -> None:
         except Exception as err:
             self.log.exception("Internal Error")
             self.appendMsg(self.job.outputFile, "Internal Error: %s" % err)
-            # if vm is set, then the normal job assignment completed,
-            # and detachVM can be run
-            # if vm is not set but self.preVM is set, we still need
-            # to return the VM, but have to initialize self.job.vm first
-            # TODO: move self.job.makeVM to the start of the try block, so it should be an error if vm fails to be set
-            if self.preVM and not vm:
-                self.job.makeVM(self.preVM)
-                vm = self.preVM
-            if vm:
-                self.detachVM(DetachMethod.DESTROY_AND_REPLACE)
+            # error must have occurred after the job had its VM set
+            self.detachVM(DetachMethod.DESTROY_AND_REPLACE)