From 40733e18b949562b290cea1d3f8ba14dc64c24b6 Mon Sep 17 00:00:00 2001
From: Brandon Seah <brandon.seah@thuenen.de>
Date: Tue, 13 Feb 2024 12:21:16 +0100
Subject: [PATCH 1/5] Optionally run hmmscan scripts locally in parallel

---
 codetta.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/codetta.py b/codetta.py
index 2954bea..a6b395d 100755
--- a/codetta.py
+++ b/codetta.py
@@ -3,7 +3,8 @@
 import scipy.special
 import argparse
 import os
-from subprocess import call, Popen, PIPE
+from subprocess import call, run, Popen, PIPE, CalledProcessError
+from concurrent.futures import ProcessPoolExecutor, as_completed
 import numpy as np
 import sys
 import random
@@ -36,8 +37,13 @@ def argument_parsing():
     parser.add_argument('--align_output', help='prefix for files created by codetta_align and codetta_summary. This can include a path. (default: [SEQUENCE_FILE])')
     parser.add_argument('--inference_output', help='output file for codetta_infer step. Default is [ALIGN_OUTPUT].[PROFILES FILE].[inference parameters].genetic_code.out')
     parser.add_argument('--resource_directory', help='directory where resource files can be found (default: [script dir]/resources)', type=str)
-    #parser.add_argument('--parallelize_hmmscan', help='send hmmscan jobs to computing cluster, specify SLURM (s). Remember to modify the template file in resources directory accordingly.', type=str, choices=['s'])
-    
+    parser.add_argument(
+        '--parallelize_hmmscan',
+        help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster.  Remember to modify the template file in resources directory accordingly. Default: None',
+        nargs="?", choices=['s', 'l'], default=None
+    )
+    parser.add_argument('--nproc', help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.', default=2, type=int)
+
     return parser.parse_args()
 
 def initialize_globals():
@@ -153,6 +159,41 @@ def initialize_emissions_dict(resource_dir, profile_db):
             except KeyError:
                 pass
 
+
+def _exec_script(path):
+    """Chmod and execute shell script, returning True if success"""
+    try:
+        run(['chmod', '777', path], check=True)
+        run([path], check=True)
+        return True
+    except CalledProcessError as err:
+        print('error executing hmmscan shell script: %s' % err)
+        return False
+
+
+def _exec_script_parallel(paths, nproc):
+    """Execute list of shell scripts in parallel
+
+    Parameters
+    ----------
+    paths : list
+        Paths to shell scripts
+    nproc : int
+        Number of processes to run in parallel
+    """
+    with ProcessPoolExecutor(max_workers=nproc) as executor:
+        futures = {
+            executor.submit(_exec_script, path) : path
+            for path in paths
+        }
+        for future in as_completed(futures):
+            path = futures[future]
+            if future.result():
+                print('hmmscan shell script %s executed successfully' % path)
+            else:
+                print('hmmscan shell script %s failed' % path)
+
+
 class GeneticCode:
     """
     Class to store parameters and inference of the genetic code of a set of 
@@ -174,6 +215,7 @@ def __init__(self, args):
         self.hmmer_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), 'hmmer-3.3.2/bin'))
         self.identifier = args.identifier
         self.parallelize_hmmscan = args.parallelize_hmmscan
+        self.nproc = args.nproc
         if args.evalue != None and args.evalue < 0:
             sys.exit('ERROR: e-value threshold must be positive')
         else:
@@ -494,14 +536,19 @@ def processing_genome(self):
                     (self.hmmer_dir, preliminary_translation_file, seq_names_file, self.hmmer_dir, self.scratch_dir, shell_count, self.resource_dir, self.profiles))
             shell_count += 1
         
-        # Run hmmscan shell scripts one at a time
+        # Run hmmscan shell scripts serially
         if self.parallelize_hmmscan == None:
             for shell_i in range(shell_count):
                 print('Running hmmscan shell script %i out of %i' % (shell_i + 1, shell_count))
                 shell_script = '%s/hmmscan_%i.sh' % (self.scratch_dir, shell_i)
                 dum = call(["chmod", "777", shell_script])
                 dum = call([shell_script])
-        # If SLURM parallelization is turned on
+        # Run scripts locally in parallel
+        elif self.parallelize_hmmscan == 'l':
+            print('Running %i hmmscan shell scripts parallelized in %i processes' % (shell_count, self.nproc))
+            shell_scripts = ['%s/hmmscan_%i.sh' % (self.scratch_dir, shell_i) for shell_i in range(shell_count)]
+            _exec_script_parallel(shell_scripts, self.nproc)
+        # Submit jobs to SLURM array
         elif self.parallelize_hmmscan == 's':
             # remember to change the partition name in the resources/template_job_array.sh file!
             print('Submitting a SLURM job array of %i hmmscan jobs' % shell_count)
@@ -926,7 +973,6 @@ def main():
     
     args.identifier = None
     args.download_type = None
-    args.parallelize_hmmscan = None
 
     # initialize genetic code with command line args and download genome
     initialize_globals()

From 675761ef3dc1954ff5c85d6ded6f3d2d4487a7e5 Mon Sep 17 00:00:00 2001
From: Brandon Seah <brandon.seah@thuenen.de>
Date: Tue, 13 Feb 2024 12:25:55 +0100
Subject: [PATCH 2/5] Update command line args in codetta_align.py

---
 codetta_align.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/codetta_align.py b/codetta_align.py
index 84fe381..cbb649b 100755
--- a/codetta_align.py
+++ b/codetta_align.py
@@ -12,8 +12,13 @@ def argument_parsing():
                                                 to [ALIGN_OUTPUT].[PROFILES FILE].temp_files/ and output files will be written to \
                                                 [ALIGN_OUTPUT].[PROFILES FILE].[extensions]. (default: [SEQUENCE_FILE])')
     parser.add_argument('-p', '--profiles', help='profile HMM database file, must be in located in resource directory (default: Pfam-A_enone.hmm)')
-    parser.add_argument('--parallelize_hmmscan', help='send hmmscan jobs to computing cluster. Specify SLURM (s), and modify template file in resources directory accordingly.', type=str, choices=['s'])
     parser.add_argument('--resource_directory', help='directory where resource files can be found (default: [script dir]/resources)', type=str)
+    parser.add_argument(
+        '--parallelize_hmmscan',
+        help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster.  Remember to modify the template file in resources directory accordingly. Default: None',
+        nargs="?", choices=['s', 'l'], default=None
+    )
+    parser.add_argument('--nproc', help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.', default=2, type=int)
         
     return parser.parse_args()
 

From 183d9a1a27075ab48eb8c628ef5f49847d56946c Mon Sep 17 00:00:00 2001
From: Brandon Seah <brandon.seah@thuenen.de>
Date: Tue, 13 Feb 2024 12:32:53 +0100
Subject: [PATCH 3/5] Formatting

---
 codetta.py | 70 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/codetta.py b/codetta.py
index a6b395d..94385d6 100755
--- a/codetta.py
+++ b/codetta.py
@@ -17,35 +17,64 @@
 # silence numpy overflow errors (in np.exp, especially)
 np.seterr(over='ignore')
 
+
 def argument_parsing():
     parser = argparse.ArgumentParser(description="infer genetic code used by an organism from nucleotide sequence")
-    parser.add_argument('sequence_file', help='specify the input nucleotide sequence file in FASTA format.')
-    
+    parser.add_argument(
+        'sequence_file',
+        help='specify the input nucleotide sequence file in FASTA format.')
     # remaining arguments all are set optionally, otherwise default values
-    parser.add_argument('-p', '--profiles', help='profile HMM database file, must be in located in resource directory (default: Pfam-A_enone.hmm)')
-    parser.add_argument('-s', '--results_summary', help='file path to append one-line result summary ', type=str, default=None)
+    parser.add_argument(
+        '-p', '--profiles',
+        help='profile HMM database file, must be in located in resource directory (default: Pfam-A_enone.hmm)')
+    parser.add_argument(
+        '-s', '--results_summary', type=str, default=None,
+        help='file path to append one-line result summary ')
     #parser.add_argument('-i', '--identifier', help='GenBank genome assembly accession or GenBank nucleotide accession', type=str)
     #parser.add_argument('-d', '--download_type', help='specify whether download is for GenBank genome assembly accession (a) or GenBank nucleotide accession (c)', type=str, choices=['a', 'c'])
-    parser.add_argument('-e', '--evalue', help='profile HMM hit e-value threshold (default: 1e-10)', type=float, default=1e-10)
-    parser.add_argument('-r', '--probability_threshold', help='threshold for decoding probabilities (default: 0.9999)', type=float, default=0.9999)
-    parser.add_argument('-f', '--max_fraction', help='maximum fraction of observations for a codon coming from a single Pfam position (default: 0.01)', type=float, default=0.01)
-    parser.add_argument('-m', '--mito_pfams', help='flag to include Pfam domains commonly found in mitochondria', action="store_true", default=False)
-    parser.add_argument('-t', '--transposon_pfams', help='flag to include Pfam domains associated with transposons and other mobile genetic elements', action="store_true", default=False)
-    parser.add_argument('-v', '--viral_pfams', help='flag to include Pfam domains associated with viruses', action="store_true", default=False)
-    parser.add_argument('-u', '--selenocysteine_pfams', help='flag to include Pfam domains known to contain selenocysteine', action="store_true", default=False)
-    parser.add_argument('-y', '--pyrrolysine_pfams', help='flag to include Pfam domains known to contain pyrrolysine', action="store_true", default=False)
-    parser.add_argument('--align_output', help='prefix for files created by codetta_align and codetta_summary. This can include a path. (default: [SEQUENCE_FILE])')
-    parser.add_argument('--inference_output', help='output file for codetta_infer step. Default is [ALIGN_OUTPUT].[PROFILES FILE].[inference parameters].genetic_code.out')
-    parser.add_argument('--resource_directory', help='directory where resource files can be found (default: [script dir]/resources)', type=str)
     parser.add_argument(
-        '--parallelize_hmmscan',
-        help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster.  Remember to modify the template file in resources directory accordingly. Default: None',
-        nargs="?", choices=['s', 'l'], default=None
-    )
-    parser.add_argument('--nproc', help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.', default=2, type=int)
+        '-e', '--evalue', type=float, default=1e-10, 
+        help='profile HMM hit e-value threshold (default: 1e-10)')
+    parser.add_argument(
+        '-r', '--probability_threshold', type=float, default=0.9999,
+        help='threshold for decoding probabilities (default: 0.9999)')
+    parser.add_argument(
+        '-f', '--max_fraction', type=float, default=0.01,
+        help='maximum fraction of observations for a codon coming from a single Pfam position (default: 0.01)')
+    parser.add_argument(
+        '-m', '--mito_pfams', action="store_true", default=False,
+        help='flag to include Pfam domains commonly found in mitochondria')
+    parser.add_argument(
+        '-t', '--transposon_pfams', action="store_true", default=False,
+        help='flag to include Pfam domains associated with transposons and other mobile genetic elements')
+    parser.add_argument(
+        '-v', '--viral_pfams', action="store_true", default=False,
+        help='flag to include Pfam domains associated with viruses')
+    parser.add_argument(
+        '-u', '--selenocysteine_pfams', action="store_true", default=False,
+        help='flag to include Pfam domains known to contain selenocysteine')
+    parser.add_argument(
+        '-y', '--pyrrolysine_pfams', action="store_true", default=False,
+        help='flag to include Pfam domains known to contain pyrrolysine')
+    parser.add_argument(
+        '--align_output',
+        help='prefix for files created by codetta_align and codetta_summary. This can include a path. (default: [SEQUENCE_FILE])')
+    parser.add_argument(
+        '--inference_output',
+        help='output file for codetta_infer step. Default is [ALIGN_OUTPUT].[PROFILES FILE].[inference parameters].genetic_code.out')
+    parser.add_argument(
+        '--resource_directory', type=str,
+        help='directory where resource files can be found (default: [script dir]/resources)')
+    parser.add_argument(
+        '--parallelize_hmmscan', nargs="?", choices=['s', 'l'], default=None,
+        help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster.  Remember to modify the template file in resources directory accordingly. Default: None')
+    parser.add_argument(
+        '--nproc', default=2, type=int,
+        help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.')
 
     return parser.parse_args()
 
+
 def initialize_globals():
     # standard genetic code dictionary used for translating nucleotide triplets to amino acids
     global gencode 
@@ -86,6 +115,7 @@ def initialize_globals():
     global std_gen_code
     std_gen_code = ''.join([gencode[''.join(codon)] for codon in codons]).replace('_', '*')
 
+
 def initialize_emissions_dict(resource_dir, profile_db):
     # dictionary where keys are profile HMM names and values are emission probabilities for every position
     # load dictionary if it exists

From 3e7c2c44a0904fcb77146499be63109fab1f4054 Mon Sep 17 00:00:00 2001
From: Brandon Seah <brandon.seah@thuenen.de>
Date: Tue, 13 Feb 2024 12:36:35 +0100
Subject: [PATCH 4/5] Rename nproc as njobs, more accurate

---
 codetta.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/codetta.py b/codetta.py
index 94385d6..0afb1c4 100755
--- a/codetta.py
+++ b/codetta.py
@@ -69,7 +69,7 @@ def argument_parsing():
         '--parallelize_hmmscan', nargs="?", choices=['s', 'l'], default=None,
         help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster.  Remember to modify the template file in resources directory accordingly. Default: None')
     parser.add_argument(
-        '--nproc', default=2, type=int,
+        '--njobs', default=2, type=int,
         help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.')
 
     return parser.parse_args()
@@ -201,17 +201,17 @@ def _exec_script(path):
         return False
 
 
-def _exec_script_parallel(paths, nproc):
+def _exec_script_parallel(paths, njobs):
     """Execute list of shell scripts in parallel
 
     Parameters
     ----------
     paths : list
         Paths to shell scripts
-    nproc : int
-        Number of processes to run in parallel
+    jobs : int
+        Number of scripts to run in parallel
     """
-    with ProcessPoolExecutor(max_workers=nproc) as executor:
+    with ProcessPoolExecutor(max_workers=njobs) as executor:
         futures = {
             executor.submit(_exec_script, path) : path
             for path in paths
@@ -245,7 +245,7 @@ def __init__(self, args):
         self.hmmer_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), 'hmmer-3.3.2/bin'))
         self.identifier = args.identifier
         self.parallelize_hmmscan = args.parallelize_hmmscan
-        self.nproc = args.nproc
+        self.njobs = args.njobs
         if args.evalue != None and args.evalue < 0:
             sys.exit('ERROR: e-value threshold must be positive')
         else:
@@ -575,9 +575,9 @@ def processing_genome(self):
                 dum = call([shell_script])
         # Run scripts locally in parallel
         elif self.parallelize_hmmscan == 'l':
-            print('Running %i hmmscan shell scripts parallelized in %i processes' % (shell_count, self.nproc))
+            print('Running %i hmmscan shell scripts parallelized in batches of %i jobs' % (shell_count, self.njobs))
             shell_scripts = ['%s/hmmscan_%i.sh' % (self.scratch_dir, shell_i) for shell_i in range(shell_count)]
-            _exec_script_parallel(shell_scripts, self.nproc)
+            _exec_script_parallel(shell_scripts, self.njobs)
         # Submit jobs to SLURM array
         elif self.parallelize_hmmscan == 's':
             # remember to change the partition name in the resources/template_job_array.sh file!

From e018d41935e117a75a7dd85e088f02181222dbb4 Mon Sep 17 00:00:00 2001
From: Brandon Seah <brandon.seah@thuenen.de>
Date: Tue, 13 Feb 2024 12:36:44 +0100
Subject: [PATCH 5/5] Update Readme on parallelization

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index b1c83f8..532d585 100644
--- a/README.md
+++ b/README.md
@@ -252,6 +252,18 @@ modify the job array template file and the code in `codetta.py`
 (`processing_genome` function) that writes and sends the job array file.
 
 
+### Parallelizing the alignment step in a local machine
+
+If your local machine has sufficient resources, the `codetta_align` step can
+also be parallelized on the local machine. `hmmscan` jobs will be run in
+batches, instead of one by one.
+
+Use the option `--parallelize_hmmscan l` to parallelize locally.
+
+Specify the number of `hmmscan` jobs per batch with `--njobs`. Note that each
+job uses 2 CPUs.
+
+
 ### Building a custom profile HMM database
 
 Pfam domains are expected to align to