From 40733e18b949562b290cea1d3f8ba14dc64c24b6 Mon Sep 17 00:00:00 2001 From: Brandon Seah Date: Tue, 13 Feb 2024 12:21:16 +0100 Subject: [PATCH 1/5] Optionally run hmmscan scripts locally in parallel --- codetta.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/codetta.py b/codetta.py index 2954bea..a6b395d 100755 --- a/codetta.py +++ b/codetta.py @@ -3,7 +3,8 @@ import scipy.special import argparse import os -from subprocess import call, Popen, PIPE +from subprocess import call, run, Popen, PIPE, CalledProcessError +from concurrent.futures import ProcessPoolExecutor, as_completed import numpy as np import sys import random @@ -36,8 +37,13 @@ def argument_parsing(): parser.add_argument('--align_output', help='prefix for files created by codetta_align and codetta_summary. This can include a path. (default: [SEQUENCE_FILE])') parser.add_argument('--inference_output', help='output file for codetta_infer step. Default is [ALIGN_OUTPUT].[PROFILES FILE].[inference parameters].genetic_code.out') parser.add_argument('--resource_directory', help='directory where resource files can be found (default: [script dir]/resources)', type=str) - #parser.add_argument('--parallelize_hmmscan', help='send hmmscan jobs to computing cluster, specify SLURM (s). Remember to modify the template file in resources directory accordingly.', type=str, choices=['s']) - + parser.add_argument( + '--parallelize_hmmscan', + help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster. Remember to modify the template file in resources directory accordingly. Default: None', + nargs="?", choices=['s', 'l'], default=None + ) + parser.add_argument('--nproc', help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.', default=2, type=int) + return parser.parse_args() def initialize_globals(): @@ -153,6 +159,41 @@ def initialize_emissions_dict(resource_dir, profile_db): except KeyError: pass + +def _exec_script(path): + """Chmod and execute shell script, returning True if success""" + try: + run(['chmod', '777', path], check=True) + run([path], check=True) + return True + except CalledProcessError as err: + print('error executing hmmscan shell script: %s' % err) + return False + + +def _exec_script_parallel(paths, nproc): + """Execute list of shell scripts in parallel + + Parameters + ---------- + paths : list + Paths to shell scripts + nproc : int + Number of processes to run in parallel + """ + with ProcessPoolExecutor(max_workers=nproc) as executor: + futures = { + executor.submit(_exec_script, path) : path + for path in paths + } + for future in as_completed(futures): + path = futures[future] + if future.result(): + print('hmmscan shell script %s executed successfully' % path) + else: + print('hmmscan shell script %s failed' % path) + + class GeneticCode: """ Class to store parameters and inference of the genetic code of a set of @@ -174,6 +215,7 @@ def __init__(self, args): self.hmmer_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), 'hmmer-3.3.2/bin')) self.identifier = args.identifier self.parallelize_hmmscan = args.parallelize_hmmscan + self.nproc = args.nproc if args.evalue != None and args.evalue < 0: sys.exit('ERROR: e-value threshold must be positive') else: @@ -494,14 +536,19 @@ def processing_genome(self): (self.hmmer_dir, preliminary_translation_file, seq_names_file, self.hmmer_dir, self.scratch_dir, shell_count, self.resource_dir, self.profiles)) shell_count += 1 - # Run hmmscan shell scripts one at a time + # Run hmmscan shell scripts serially if self.parallelize_hmmscan == None: for shell_i in range(shell_count): print('Running hmmscan shell script %i out of %i' % (shell_i + 1, shell_count)) shell_script = '%s/hmmscan_%i.sh' % (self.scratch_dir, shell_i) dum = call(["chmod", "777", shell_script]) dum = call([shell_script]) - # If SLURM parallelization is turned on + # Run scripts locally in parallel + elif self.parallelize_hmmscan == 'l': + print('Running %i hmmscan shell scripts parallelized in %i processes' % (shell_count, self.nproc)) + shell_scripts = ['%s/hmmscan_%i.sh' % (self.scratch_dir, shell_i) for shell_i in range(shell_count)] + _exec_script_parallel(shell_scripts, self.nproc) + # Submit jobs to SLURM array elif self.parallelize_hmmscan == 's': # remember to change the partition name in the resources/template_job_array.sh file! print('Submitting a SLURM job array of %i hmmscan jobs' % shell_count) @@ -926,7 +973,6 @@ def main(): args.identifier = None args.download_type = None - args.parallelize_hmmscan = None # initialize genetic code with command line args and download genome initialize_globals() From 675761ef3dc1954ff5c85d6ded6f3d2d4487a7e5 Mon Sep 17 00:00:00 2001 From: Brandon Seah Date: Tue, 13 Feb 2024 12:25:55 +0100 Subject: [PATCH 2/5] Update command line args in codetta_align.py --- codetta_align.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/codetta_align.py b/codetta_align.py index 84fe381..cbb649b 100755 --- a/codetta_align.py +++ b/codetta_align.py @@ -12,8 +12,13 @@ def argument_parsing(): to [ALIGN_OUTPUT].[PROFILES FILE].temp_files/ and output files will be written to \ [ALIGN_OUTPUT].[PROFILES FILE].[extensions]. (default: [SEQUENCE_FILE])') parser.add_argument('-p', '--profiles', help='profile HMM database file, must be in located in resource directory (default: Pfam-A_enone.hmm)') - parser.add_argument('--parallelize_hmmscan', help='send hmmscan jobs to computing cluster. Specify SLURM (s), and modify template file in resources directory accordingly.', type=str, choices=['s']) parser.add_argument('--resource_directory', help='directory where resource files can be found (default: [script dir]/resources)', type=str) + parser.add_argument( + '--parallelize_hmmscan', + help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster. Remember to modify the template file in resources directory accordingly. Default: None', + nargs="?", choices=['s', 'l'], default=None + ) + parser.add_argument('--nproc', help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.', default=2, type=int) return parser.parse_args() From 183d9a1a27075ab48eb8c628ef5f49847d56946c Mon Sep 17 00:00:00 2001 From: Brandon Seah Date: Tue, 13 Feb 2024 12:32:53 +0100 Subject: [PATCH 3/5] Formatting --- codetta.py | 70 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/codetta.py b/codetta.py index a6b395d..94385d6 100755 --- a/codetta.py +++ b/codetta.py @@ -17,35 +17,64 @@ # silence numpy overflow errors (in np.exp, especially) np.seterr(over='ignore') + def argument_parsing(): parser = argparse.ArgumentParser(description="infer genetic code used by an organism from nucleotide sequence") - parser.add_argument('sequence_file', help='specify the input nucleotide sequence file in FASTA format.') - + parser.add_argument( + 'sequence_file', + help='specify the input nucleotide sequence file in FASTA format.') # remaining arguments all are set optionally, otherwise default values - parser.add_argument('-p', '--profiles', help='profile HMM database file, must be in located in resource directory (default: Pfam-A_enone.hmm)') - parser.add_argument('-s', '--results_summary', help='file path to append one-line result summary ', type=str, default=None) + parser.add_argument( + '-p', '--profiles', + help='profile HMM database file, must be in located in resource directory (default: Pfam-A_enone.hmm)') + parser.add_argument( + '-s', '--results_summary', type=str, default=None, + help='file path to append one-line result summary ') #parser.add_argument('-i', '--identifier', help='GenBank genome assembly accession or GenBank nucleotide accession', type=str) #parser.add_argument('-d', '--download_type', help='specify whether download is for GenBank genome assembly accession (a) or GenBank nucleotide accession (c)', type=str, choices=['a', 'c']) - parser.add_argument('-e', '--evalue', help='profile HMM hit e-value threshold (default: 1e-10)', type=float, default=1e-10) - parser.add_argument('-r', '--probability_threshold', help='threshold for decoding probabilities (default: 0.9999)', type=float, default=0.9999) - parser.add_argument('-f', '--max_fraction', help='maximum fraction of observations for a codon coming from a single Pfam position (default: 0.01)', type=float, default=0.01) - parser.add_argument('-m', '--mito_pfams', help='flag to include Pfam domains commonly found in mitochondria', action="store_true", default=False) - parser.add_argument('-t', '--transposon_pfams', help='flag to include Pfam domains associated with transposons and other mobile genetic elements', action="store_true", default=False) - parser.add_argument('-v', '--viral_pfams', help='flag to include Pfam domains associated with viruses', action="store_true", default=False) - parser.add_argument('-u', '--selenocysteine_pfams', help='flag to include Pfam domains known to contain selenocysteine', action="store_true", default=False) - parser.add_argument('-y', '--pyrrolysine_pfams', help='flag to include Pfam domains known to contain pyrrolysine', action="store_true", default=False) - parser.add_argument('--align_output', help='prefix for files created by codetta_align and codetta_summary. This can include a path. (default: [SEQUENCE_FILE])') - parser.add_argument('--inference_output', help='output file for codetta_infer step. Default is [ALIGN_OUTPUT].[PROFILES FILE].[inference parameters].genetic_code.out') - parser.add_argument('--resource_directory', help='directory where resource files can be found (default: [script dir]/resources)', type=str) parser.add_argument( - '--parallelize_hmmscan', - help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster. Remember to modify the template file in resources directory accordingly. Default: None', - nargs="?", choices=['s', 'l'], default=None - ) - parser.add_argument('--nproc', help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.', default=2, type=int) + '-e', '--evalue', type=float, default=1e-10, + help='profile HMM hit e-value threshold (default: 1e-10)') + parser.add_argument( + '-r', '--probability_threshold', type=float, default=0.9999, + help='threshold for decoding probabilities (default: 0.9999)') + parser.add_argument( + '-f', '--max_fraction', type=float, default=0.01, + help='maximum fraction of observations for a codon coming from a single Pfam position (default: 0.01)') + parser.add_argument( + '-m', '--mito_pfams', action="store_true", default=False, + help='flag to include Pfam domains commonly found in mitochondria') + parser.add_argument( + '-t', '--transposon_pfams', action="store_true", default=False, + help='flag to include Pfam domains associated with transposons and other mobile genetic elements') + parser.add_argument( + '-v', '--viral_pfams', action="store_true", default=False, + help='flag to include Pfam domains associated with viruses') + parser.add_argument( + '-u', '--selenocysteine_pfams', action="store_true", default=False, + help='flag to include Pfam domains known to contain selenocysteine') + parser.add_argument( + '-y', '--pyrrolysine_pfams', action="store_true", default=False, + help='flag to include Pfam domains known to contain pyrrolysine') + parser.add_argument( + '--align_output', + help='prefix for files created by codetta_align and codetta_summary. This can include a path. (default: [SEQUENCE_FILE])') + parser.add_argument( + '--inference_output', + help='output file for codetta_infer step. Default is [ALIGN_OUTPUT].[PROFILES FILE].[inference parameters].genetic_code.out') + parser.add_argument( + '--resource_directory', type=str, + help='directory where resource files can be found (default: [script dir]/resources)') + parser.add_argument( + '--parallelize_hmmscan', nargs="?", choices=['s', 'l'], default=None, + help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster. Remember to modify the template file in resources directory accordingly. Default: None') + parser.add_argument( + '--nproc', default=2, type=int, + help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.') return parser.parse_args() + def initialize_globals(): # standard genetic code dictionary used for translating nucleotide triplets to amino acids global gencode @@ -86,6 +115,7 @@ def initialize_globals(): global std_gen_code std_gen_code = ''.join([gencode[''.join(codon)] for codon in codons]).replace('_', '*') + def initialize_emissions_dict(resource_dir, profile_db): # dictionary where keys are profile HMM names and values are emission probabilities for every position # load dictionary if it exists From 3e7c2c44a0904fcb77146499be63109fab1f4054 Mon Sep 17 00:00:00 2001 From: Brandon Seah Date: Tue, 13 Feb 2024 12:36:35 +0100 Subject: [PATCH 4/5] Rename nproc as njobs, more accurate --- codetta.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/codetta.py b/codetta.py index 94385d6..0afb1c4 100755 --- a/codetta.py +++ b/codetta.py @@ -69,7 +69,7 @@ def argument_parsing(): '--parallelize_hmmscan', nargs="?", choices=['s', 'l'], default=None, help='parallelize hmmscan jobs [l]ocally or by sending to [s]LURM computing cluster. Remember to modify the template file in resources directory accordingly. Default: None') parser.add_argument( - '--nproc', default=2, type=int, + '--njobs', default=2, type=int, help='number of hmmscan jobs to run in parallel, if parallelizing locally. Note that each hmmscan job uses 2 CPUs.') return parser.parse_args() @@ -201,17 +201,17 @@ def _exec_script(path): return False -def _exec_script_parallel(paths, nproc): +def _exec_script_parallel(paths, njobs): """Execute list of shell scripts in parallel Parameters ---------- paths : list Paths to shell scripts - nproc : int - Number of processes to run in parallel + jobs : int + Number of scripts to run in parallel """ - with ProcessPoolExecutor(max_workers=nproc) as executor: + with ProcessPoolExecutor(max_workers=njobs) as executor: futures = { executor.submit(_exec_script, path) : path for path in paths @@ -245,7 +245,7 @@ def __init__(self, args): self.hmmer_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), 'hmmer-3.3.2/bin')) self.identifier = args.identifier self.parallelize_hmmscan = args.parallelize_hmmscan - self.nproc = args.nproc + self.njobs = args.njobs if args.evalue != None and args.evalue < 0: sys.exit('ERROR: e-value threshold must be positive') else: @@ -575,9 +575,9 @@ def processing_genome(self): dum = call([shell_script]) # Run scripts locally in parallel elif self.parallelize_hmmscan == 'l': - print('Running %i hmmscan shell scripts parallelized in %i processes' % (shell_count, self.nproc)) + print('Running %i hmmscan shell scripts parallelized in batches of %i jobs' % (shell_count, self.njobs)) shell_scripts = ['%s/hmmscan_%i.sh' % (self.scratch_dir, shell_i) for shell_i in range(shell_count)] - _exec_script_parallel(shell_scripts, self.nproc) + _exec_script_parallel(shell_scripts, self.njobs) # Submit jobs to SLURM array elif self.parallelize_hmmscan == 's': # remember to change the partition name in the resources/template_job_array.sh file! From e018d41935e117a75a7dd85e088f02181222dbb4 Mon Sep 17 00:00:00 2001 From: Brandon Seah Date: Tue, 13 Feb 2024 12:36:44 +0100 Subject: [PATCH 5/5] Update Readme on parallelization --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index b1c83f8..532d585 100644 --- a/README.md +++ b/README.md @@ -252,6 +252,18 @@ modify the job array template file and the code in `codetta.py` (`processing_genome` function) that writes and sends the job array file. +### Parallelizing the alignment step in a local machine + +If your local machine has sufficient resources, the `codetta_align` step can +also be parallelized on the local machine. `hmmscan` jobs will be run in +batches, instead of one by one. + +Use the option `--parallelize_hmmscan l` to parallelize locally. + +Specify the number of `hmmscan` jobs per batch with `--njobs`. Note that each +job uses 2 CPUs. + + ### Building a custom profile HMM database Pfam domains are expected to align to