diff --git a/app.py b/app.py index e195f7b..abc90f7 100644 --- a/app.py +++ b/app.py @@ -14,47 +14,70 @@ app.secret_key = 'development key' UPLOAD_FOLDER = './uploads' -# The type of file that needs to be uploaded to the server by user. -UPLOAD_FILES = ['in_fasta', 'in_gff'] # Route for submitting data on the production site. @app.route('/', methods=['GET', 'POST']) def submit(): + # The type of file that needs to be uploaded to the server by user. + UPLOAD_FILES = {'in_fasta': None, 'in_gff': None} form = SubmitJob(request.form) # Validate user input according to the validation rules defined in forms.py. if request.method == 'POST' and form.validate(): - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename) and request.form[ - 'position']: - flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') + # Get list of file names, and filter out empty files + downstream_fasta_files = [file for file in request.files.getlist('downstream_fasta') if file.filename] + upstream_fasta_files = [file for file in request.files.getlist('upstream_fasta') if file.filename] + in_fasta_files = [file for file in request.files.getlist('in_fasta') if file.filename] + in_gff_files = [file for file in request.files.getlist('in_gff') if file.filename] + if (downstream_fasta_files or upstream_fasta_files) and request.form['position']: + flash(f"Error: You must provide either the position, or the upstream and downstream sequences. Not all.", 'error') return redirect(url_for('submit')) - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename): - if not (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename): + if (downstream_fasta_files or upstream_fasta_files): + if not (downstream_fasta_files and upstream_fasta_files): flash("Error: Must enter both upstream and downstream", 'error') return redirect(url_for('submit')) # Verify position is a number + positions = None try: - if request.form['position']: - int(request.form['position']) - except ValueError: - flash("Error: Position must be an integer, like -1, 0, 1.", 'error') - return redirect(url_for('submit')) - # Verify that the name of the uploaded file is different - if request.form['position'] and (request.files['in_fasta'].filename == request.files['in_gff'].filename): - flash("Error: ref_fasta and ref_gff must be different files", 'error') + position_str = request.form['position'] + if position_str: + positions = process_position(position_str) + # print("Positions:", positions) + except ValueError as e: + flash(str(e), 'error') return redirect(url_for('submit')) - elif (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename): - filenames = [request.files['in_fasta'].filename, - request.files['in_gff'].filename, - request.files['downstream_fasta'].filename, - request.files['upstream_fasta'].filename] - if len(filenames) != len(set(filenames)): - flash("Error: ref_fasta, ref_gff, downstream_fasta, upstream_fasta must be different files", 'error') + # Verify that the name of the uploaded file is different by set() + # User upload in_fasta and in_gff + if in_fasta_files and in_gff_files: + all_in_filenames = [] + in_fasta_filenames = [file.filename for file in in_fasta_files] + in_gff_filenames = [file.filename for file in in_gff_files] + all_in_filenames.extend(in_fasta_filenames) + all_in_filenames.extend(in_gff_filenames) + # When position provide + if request.form['position'] 
and (len(all_in_filenames) != len(set(all_in_filenames))): + flash("Error: in_fasta and in_gff must be different files", 'error') return redirect(url_for('submit')) - - if not (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename) and not \ - request.form['position']: + # When position not provide, use upstream_fasta and downstream_fasta + elif downstream_fasta_files and upstream_fasta_files: + all_stream_fasta_filenames = [] + downstream_fasta_filenames = [file.filename for file in downstream_fasta_files] + upstream_fasta_filenames = [file.filename for file in upstream_fasta_files] + all_stream_fasta_filenames.extend(downstream_fasta_filenames) + all_stream_fasta_filenames.extend(upstream_fasta_filenames) + if len(all_stream_fasta_filenames) != len(set(all_stream_fasta_filenames)): + flash("Error: downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + # Check all uniqueness for all files + all_filenames = [] + all_filenames.extend(all_in_filenames) + all_filenames.extend(all_stream_fasta_filenames) + if len(all_filenames) != len(set(all_filenames)): + flash("Error: in_fasta, in_gff, downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + if not (downstream_fasta_files or upstream_fasta_files) and not request.form['position']: flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') return redirect(url_for('submit')) + else: # User Submits Job # # (1) Create unique ID for each submission @@ -68,20 +91,23 @@ def submit(): # (3) Upload files from user device to server # Verify all files are present before uploading - for files in UPLOAD_FILES: + for files in UPLOAD_FILES.keys(): verified = verify_uploads(files) if not verified: return redirect(url_for('submit')) # Upload Files to UPLOAD_DIR/timestamp/ if verified: - for files in UPLOAD_FILES: - upload(target_dir, files) + for key in UPLOAD_FILES.keys(): + UPLOAD_FILES[key] = format_paths(upload(target_dir, key), target_dir) + # Set defualt None to up/down stream fasta + for file_key in ['upstream_fasta', 'downstream_fasta']: + UPLOAD_FILES[file_key] = None + UPLOAD_FILES[file_key] = None - if not request.form['position'] and request.files['upstream_fasta'].filename and request.files[ - 'downstream_fasta'].filename: - upload(target_dir, 'upstream_fasta') - upload(target_dir, 'downstream_fasta') + if not request.form['position']: + UPLOAD_FILES['upstream_fasta'] = format_paths(upload(target_dir, 'upstream_fasta'), target_dir) + UPLOAD_FILES['downstream_fasta'] = format_paths(upload(target_dir, 'downstream_fasta'), target_dir) # (4) Send the job to the backend # Connect to the Redis server and intial a queue @@ -93,13 +119,13 @@ def submit(): timestamp, request.form['email'], request.form['chrom'], - request.files['upstream_fasta'].filename, - request.files['downstream_fasta'].filename, - request.form['position'], + UPLOAD_FILES['upstream_fasta'], + UPLOAD_FILES['downstream_fasta'], #request.files['upstream_fasta'].filename, + positions, #request.form['position'], request.form['ref_fasta'], request.form['ref_gff'], - request.files['in_fasta'].filename, - request.files['in_gff'].filename), + UPLOAD_FILES['in_fasta'], + UPLOAD_FILES['in_gff']), result_ttl=-1, job_timeout=3000 ) @@ -112,7 +138,8 @@ def submit(): # Route for submitting data on the test site @app.route('/test', methods=['GET', 'POST']) def submit_test(): - + # The type of file that needs to be 
uploaded to the server by user. + UPLOAD_FILES = {'in_fasta': None, 'in_gff': None} # Path for local files DEFAULT_FILES = { 'ref_fasta': './staticData/ref/test-ref.fa', @@ -125,34 +152,58 @@ def submit_test(): # Validate user input based on test site rule form = Testjob(request.form) if request.method == 'POST' and form.validate(): - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename) and request.form[ - 'position']: + downstream_fasta_files = [file for file in request.files.getlist('downstream_fasta') if file.filename] + upstream_fasta_files = [file for file in request.files.getlist('upstream_fasta') if file.filename] + in_fasta_files = [file for file in request.files.getlist('in_fasta') if file.filename] + in_gff_files = [file for file in request.files.getlist('in_gff') if file.filename] + if (downstream_fasta_files or upstream_fasta_files) and request.form['position']: flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') - return redirect(url_for('submit_test')) - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename): - if not (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename): + return redirect(url_for('submit')) + if (downstream_fasta_files or upstream_fasta_files): + if not (downstream_fasta_files and upstream_fasta_files): flash("Error: Must enter both upstream and downstream", 'error') - return redirect(url_for('submit_test')) + return redirect(url_for('submit')) # Verify position is a number + positions = None try: - if request.form['position']: - int(request.form['position']) - except ValueError: - flash("Error: Position must be an integer, like -1, 0, 1.", 'error') - return redirect(url_for('submit_test')) - # Verify that the name of the uploaded file is not empty and different - if request.form['position'] and (request.files['in_fasta'].filename and request.files['in_gff'].filename) \ - and (request.files['in_fasta'].filename == request.files['in_gff'].filename): - flash("Error: ref_fasta and ref_gff must be different files", 'error') - return redirect(url_for('submit_test')) - elif (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename and request.files['in_fasta'].filename and request.files['in_gff'].filename): - filenames_test = [request.files['in_fasta'].filename, - request.files['in_gff'].filename, - request.files['downstream_fasta'].filename, - request.files['upstream_fasta'].filename] - if len(filenames_test) != len(set(filenames_test)): - flash("Error: ref_fasta, ref_gff, downstream_fasta, upstream_fasta must be different files", 'error') - return redirect(url_for('submit_test')) + position_str = request.form['position'] + if position_str: + positions = process_position(position_str) + # print("Positions:", positions) + except ValueError as e: + flash(str(e), 'error') + return redirect(url_for('submit')) + # Ensure the uploaded file names are unique and not empty using set() + if in_fasta_files and in_gff_files: + all_in_filenames = [] + in_fasta_filenames = [file.filename for file in in_fasta_files] + in_gff_filenames = [file.filename for file in in_gff_files] + all_in_filenames.extend(in_fasta_filenames) + all_in_filenames.extend(in_gff_filenames) + # When position provide + if request.form['position'] and (len(all_in_filenames) != len(set(all_in_filenames))): + flash("Error: in_fasta and in_gff must be different files", 'error') + return redirect(url_for('submit')) + 
# When position not provide, use upstream_fasta and downstream_fasta + elif downstream_fasta_files and upstream_fasta_files: + all_stream_fasta_filenames = [] + downstream_fasta_filenames = [file.filename for file in downstream_fasta_files] + upstream_fasta_filenames = [file.filename for file in upstream_fasta_files] + all_stream_fasta_filenames.extend(downstream_fasta_filenames) + all_stream_fasta_filenames.extend(upstream_fasta_filenames) + if len(all_stream_fasta_filenames) != len(set(all_stream_fasta_filenames)): + flash("Error: downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + # Check all uniqueness for all files + all_filenames = [] + all_filenames.extend(all_in_filenames) + all_filenames.extend(all_stream_fasta_filenames) + if len(all_filenames) != len(set(all_filenames)): + flash("Error: in_fasta, in_gff, downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + if not (downstream_fasta_files or upstream_fasta_files) and not request.form['position']: + flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') + return redirect(url_for('submit')) else: # User Submits Job # # (1) Create unique ID for each submission @@ -162,7 +213,6 @@ def submit_test(): if not os.path.isfile('database.db'): db_create() - # (3) Upload files from user device to server # Verify all files are present before uploading for files in UPLOAD_FILES: @@ -172,19 +222,17 @@ def submit_test(): # Choose to upload new files or use local files if verified: - uploaded_files = {} - for file_key in UPLOAD_FILES: - uploaded_files[file_key] = upload_test(target_dir, file_key, DEFAULT_FILES) + for file_key in UPLOAD_FILES.keys(): + UPLOAD_FILES[file_key] = format_paths(upload_test(target_dir, file_key, DEFAULT_FILES), target_dir) # Set defualt None to up/down stream fasta for file_key in ['upstream_fasta', 'downstream_fasta']: - uploaded_files['upstream_fasta'] = None - uploaded_files['downstream_fasta'] = None + UPLOAD_FILES['upstream_fasta'] = None + UPLOAD_FILES['downstream_fasta'] = None # Uploaded upstream/downstream files when position is not provided if not request.form['position']: for file_key in ['upstream_fasta', 'downstream_fasta']: - uploaded_files[file_key] = upload_test(target_dir, file_key, DEFAULT_FILES) - + UPLOAD_FILES[file_key] = format_paths(upload_test(target_dir, file_key, DEFAULT_FILES), target_dir) # Replace Ref Sequence with local path if test files detected if request.form['ref_fasta'] == 'test-ref.fa': uploaded_files['ref_fasta'] = DEFAULT_FILES['ref_fasta'] @@ -193,9 +241,9 @@ def submit_test(): if request.form['ref_gff'] == 'test-ref.gtf': uploaded_files['ref_gff'] = DEFAULT_FILES['ref_gff'] else: - uploaded_files['ref_gff'] = request.form['ref_gff'] + UPLOAD_FILES['ref_gff'] = request.form['ref_gff'] - db_test_submit(request, uploaded_files, timestamp) + db_test_submit(request, UPLOAD_FILES, timestamp) # (4) Send job to the backend # Use the redis queue as same as production site @@ -207,13 +255,13 @@ def submit_test(): timestamp, request.form['email'], # ys4680@nyu.edu request.form['chrom'], # 1 - uploaded_files['upstream_fasta'], # by default - uploaded_files['downstream_fasta'], - request.form['position'], - uploaded_files['ref_fasta'], # by default - uploaded_files['ref_gff'], # by default - uploaded_files['in_fasta'], # by default - uploaded_files['in_gff'] # by default + UPLOAD_FILES['upstream_fasta'], # by default + 
UPLOAD_FILES['downstream_fasta'], + positions, + UPLOAD_FILES['ref_fasta'], # by default + UPLOAD_FILES['ref_gff'], # by default + UPLOAD_FILES['in_fasta'], # by default + UPLOAD_FILES['in_gff'] # by default ), result_ttl=-1, job_timeout=3000 diff --git a/forms.py b/forms.py index 89042ec..a927d4d 100644 --- a/forms.py +++ b/forms.py @@ -1,6 +1,6 @@ from wtforms import Form, StringField, FileField from wtforms.validators import URL, Optional, InputRequired, Email -from flask_wtf.file import FileRequired, FileAllowed +from flask_wtf.file import FileRequired, FileAllowed, MultipleFileField ALLOWED_EXTENSIONS = {'fa', 'gff', 'gff3', 'gtf', 'fasta', 'fna', 'tar', 'gz'} @@ -17,42 +17,42 @@ class SubmitJob(Form): ]) chrom = StringField('Chromosome', - description="ID of the chromosome to modify. Must match ID in FASTA file.", + description="ID of the chromosome to modify. Must match ID in FASTA file(s).", validators=[ InputRequired() ]) # POSITION position = StringField('Position', - description="Position in chromosome at which to insert . Can use -1 to add to end " - "of chromosome. Note: Position is 0-based", + description="Position(s) in chromosome at which to insert . Can use -1 to add to end " + "of chromosome. Note: Position(s) is 0-based and comma-separated.", validators=[Optional()]) # OR - upstream_fasta = FileField('Upstream Sequence', - description="FASTA file with upstream sequence.", + upstream_fasta = MultipleFileField('Upstream Sequence', + description="FASTA file(s) with upstream sequence.", validators=[ Optional() ]) - downstream_fasta = FileField('Downstream Sequence', - description="FASTA file with downstream sequence.", + downstream_fasta = MultipleFileField('Downstream Sequence', + description="FASTA file(s) with downstream sequence.", validators=[ Optional() ]) # Uploads - in_fasta = FileField('Inserted Sequence (FASTA)', - description="New sequence to be inserted into reference genome.", + in_fasta = MultipleFileField('Inserted Sequence (FASTA)', + description="New sequence(s) to be inserted into reference genome.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), FileRequired() ]) - in_gff = FileField('Inserted Reference (gff3 or gtf)', - description="GFF file describing new FASTA sequence to be inserted.", + in_gff = MultipleFileField('Inserted Reference (gff3 or gtf)', + description="GFF file(s) describing new FASTA sequence(s) to be inserted.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), - InputRequired() + FileRequired() ]) # Downloads ref_fasta = StringField('Reference Sequence (FASTA)', @@ -88,7 +88,7 @@ class Testjob(Form): default="ys4680@nyu.edu") # hard code email chrom = StringField('Chromosome', - description="ID of the chromosome to modify. Must match ID in FASTA file.", + description="ID of the chromosome to modify. Must match ID in FASTA file(s).", validators=[ InputRequired() ], @@ -96,35 +96,35 @@ class Testjob(Form): # POSITION position = StringField('Position', - description="Position in chromosome at which to insert . Can use -1 to add to end " - "of chromosome. Note: Position is 0-based", + description="Position(s) in chromosome at which to insert . Can use -1 to add to end " + "of chromosome. Note: Position(s) is 0-based and comma-separated.", validators=[Optional()]) # OR - upstream_fasta = FileField('Upstream Sequence', - description="FASTA file with upstream sequence. 
If no file is selected, the system will use 'test-up.fa' as a default.", + upstream_fasta = MultipleFileField('Upstream Sequence', + description="FASTA file(s) with upstream sequence. If no file(s) is selected, the system will use 'test-up.fa' as a default.", validators=[ Optional() ]) - downstream_fasta = FileField('Downstream Sequence', - description="FASTA file with downstream sequence. If no file is selected, the system will use 'test-down.fa' as a default.", + downstream_fasta = MultipleFileField('Downstream Sequence', + description="FASTA file(s) with downstream sequence. If no file(s) is selected, the system will use 'test-down.fa' as a default.", validators=[ Optional() ]) # Uploads - in_fasta = FileField('Inserted Sequence (FASTA)', - description="Please upload the new sequence to be inserted into the reference genome. If no file is selected, the system will use 'test-in.fa' as a default.", + in_fasta = MultipleFileField('Inserted Sequence (FASTA)', + description="Please upload the new sequence(s) to be inserted into the reference genome. If no file(s) is selected, the system will use 'test-in.fa' as a default.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), # FileRequired() ]) - in_gff = FileField('Inserted Reference (gff3 or gtf)', - description="Please upload the GFF file describing the new FASTA sequence to be inserted. If no file is selected, the system will use 'test-in.gff' as a default.", + in_gff = MultipleFileField('Inserted Reference (gff3 or gtf)', + description="Please upload the GFF file(s) describing the new FASTA sequence(s) to be inserted. If no file(s) is selected, the system will use 'test-in.gff' as a default.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), - # InputRequired() + # FileRequired() ]) # Downloads ref_fasta = StringField('Reference Sequence (FASTA)', diff --git a/job.py b/job.py index af7899a..b1c6390 100644 --- a/job.py +++ b/job.py @@ -23,15 +23,21 @@ def redisjob(target_dir, timestamp, email, chrom, upstream_fasta, downstream_fasta, position, ref_fastaURL, ref_gffURL, in_fasta, in_gff): + # Convert list of insert sequences to string + in_fasta_str = ','.join(in_fasta) + in_gff_str = ','.join(in_gff) if position: + command = "bash ./run.sh {} {} {} {} {} {} {} {} {}".format(target_dir, timestamp, email, chrom, - ref_fastaURL, ref_gffURL, in_fasta, - in_gff, position) + ref_fastaURL, ref_gffURL, in_fasta_str, + in_gff_str, position) else: + upstream_fasta_str = ','.join(upstream_fasta) + downstream_fasta_str = ','.join(downstream_fasta) command = "bash ./run.sh {} {} {} {} {} {} {} {} {} {}".format(target_dir, timestamp, email, chrom, - ref_fastaURL, ref_gffURL, in_fasta, - in_gff, upstream_fasta, - downstream_fasta) + ref_fastaURL, ref_gffURL, in_fasta_str, + in_gff_str, upstream_fasta_str, + downstream_fasta_str) try: #subprocess.run([command]) os.system(command) @@ -81,62 +87,75 @@ def allowed_file(filename): # Verify that the file is uploaded to the form and it is allowed def verify_uploads(file): - fileObj = request.files[file] - - if fileObj.filename == '': - flash('No ' + file + ' file selected for uploading', 'error') - return False - if fileObj and allowed_file(fileObj.filename): - return True - else: - flash('Invalid File Type for ' + file, 'error') - return False + fileObj_lists = request.files.getlist(file) + for uploaded_file in fileObj_lists: + if uploaded_file.filename == '': + flash('No ' + file + ' file selected for uploading', 'error') + return False + if 
uploaded_file and allowed_file(uploaded_file.filename):
+            continue
+        else:
+            flash('Invalid File Type for ' + file, 'error')
+            return False
+    return True


 # verify_uploads() for test site
 def verify_test_uploads(file):
-    fileObj = request.files[file]
-    # Not require user to upload files
-    if fileObj.filename == '':
-        # flash('No ' + file + ' file selected for uploading', 'error')
-        # return False
-        return True  # If no file is uploaded then the default file is used
-    if fileObj and allowed_file(fileObj.filename):
-        return True
-    else:
-        flash('Invalid File Type for ' + file, 'error')
-        return False
+    fileObj_lists = request.files.getlist(file)
+    for uploaded_file in fileObj_lists:
+        # Do not require the user to upload files
+        if uploaded_file.filename == '':
+            # flash('No ' + file + ' file selected for uploading', 'error')
+            # return False
+            continue  # If no file is uploaded then the default file is used
+        if uploaded_file and allowed_file(uploaded_file.filename):
+            continue
+        else:
+            flash('Invalid File Type for ' + file, 'error')
+            return False
+    return True


 # Upload files to uploads/timestamp
 def upload(target_dir, file):
-    fileObj = request.files[file]
+    fileObj_lists = request.files.getlist(file)
     # make the directory based on timestamp
     os.system('mkdir -p ' + target_dir)
     # save the file
-    fileObj.save(os.path.join(target_dir,
-                              secure_filename(fileObj.filename)))
+    filenames = []
+    for fileObj in fileObj_lists:
+        filename = secure_filename(fileObj.filename)
+        fileObj.save(os.path.join(target_dir, filename))
+        filenames.append(filename)
+    return filenames


 # upload file function for test site, return by filename
 def upload_test(target_dir, file_key, default_files):
     # if file is empty (indicated use default file), fileObj set to None
-    fileObj = request.files[file_key] if file_key in request.files else None
+    fileObj_lists = request.files.getlist(file_key) if file_key in request.files else []
     os.makedirs(target_dir, exist_ok=True)  # dirs for upload files
     # User provided new files in form
-    if fileObj:
-        # save the uploaded file
-        filename = secure_filename(fileObj.filename)
-        file_path = os.path.join(target_dir, filename)
-        fileObj.save(file_path)
-        return fileObj.filename
+    filenames = []
+    if fileObj_lists:
+        for fileObj in fileObj_lists:
+            # save the uploaded file under its sanitized name
+            filename = secure_filename(fileObj.filename)
+            file_path = os.path.join(target_dir, filename)
+            fileObj.save(file_path)
+            filenames.append(filename)
     else:
-        # Use default file if no file was uploaded
-        src = os.path.abspath(default_files[file_key])
-        dst = os.path.join(target_dir, os.path.basename(src))
-        # Create soft link in uploads/timestamp
-        if not os.path.exists(dst):
-            os.symlink(src, dst)
-        return os.path.basename(src)
+        # Use the default file(s) if nothing was uploaded;
+        # default_files[file_key] may be a single path or a list of paths
+        src_files = default_files[file_key]
+        if isinstance(src_files, str):
+            src_files = [src_files]
+        for src in src_files:
+            src = os.path.abspath(src)
+            dst = os.path.join(target_dir, os.path.basename(src))
+            # Create soft link in uploads/timestamp
+            if not os.path.exists(dst):
+                os.symlink(src, dst)
+            filenames.append(os.path.basename(src))
+    return filenames


 def download(target_dir, URL):
@@ -284,3 +303,20 @@ def db_update(timestamp, set_id, set_value):
     db.execute("UPDATE submissions SET " + set_id + "=? where timestamp=?
", (set_value, timestamp)) db.commit() db.close() + +def process_position(position_str): + try: + position = int(position_str) + return position_str # return Str + except ValueError: + try: + position_list = [pos.strip() for pos in position_str.split(',')] + for pos in position_list: + int(pos) + return position_str + except ValueError: + raise ValueError("Position must be an integer or a comma-separated list of integers.") + +# Format file path into: ./uploads/$timestamp/item +def format_paths(file_list, target_dir): + return [os.path.join(target_dir, filename) for filename in file_list] \ No newline at end of file diff --git a/reform.py b/reform.py index 659bb3c..82c78c0 100644 --- a/reform.py +++ b/reform.py @@ -1,446 +1,595 @@ import argparse import re import os +import gzip +import tempfile from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import generic_dna - +try: + import pgzip as gzip_module + print("Using pgzip for gzip operations.") +except ImportError: + import gzip as gzip_module + print("pgzip not found, falling back to gzip.") def main(): - ## Retrieve command line arguments - in_arg = get_input_args() - - ## Retrieve ouptut directory - output_dir = in_arg.output_dir - - ## Read the new fasta (to be inserted into the ref genome) - record = list(SeqIO.parse(in_arg.in_fasta, "fasta"))[0] - - ## Generate index of sequences from ref reference fasta - chrom_seqs = SeqIO.index(in_arg.ref_fasta, 'fasta') - - ## Obtain the sequence of the chromosome we want to modify - seq = chrom_seqs[in_arg.chrom] - seq_str = str(seq.seq) - - ## Get the position to insert the new sequence - positions = get_position(in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str) - position = positions['position'] - down_position = positions['down_position'] - if position != down_position: - print("Removing nucleotides from position {} - {}".format(position, down_position)) - print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" - .format(record.description, in_arg.in_fasta, position, in_arg.chrom)) - - ## Build the new chromosome sequence with the inserted_seq - ## If the chromosome sequence length is in the header, replace it with new length - new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] - chrom_length = str(len(seq_str)) - new_length = str(len(new_seq)) - new_record = SeqRecord( - Seq(new_seq, generic_dna), - id=seq.id, - description=seq.description.replace(chrom_length, new_length) - ) - - ## Create new fasta file with modified chromosome - ref_basename = os.path.basename(in_arg.ref_fasta) - ref_name = os.path.splitext(ref_basename)[0] - new_fasta = output_dir + ref_name + '_reformed.fa' - with open(new_fasta, "w") as f: - for s in chrom_seqs: - if s == seq.id: - SeqIO.write([new_record], f, "fasta") - else: - SeqIO.write([chrom_seqs[s]], f, "fasta") - - print("New fasta file created: ", new_fasta) - print("Preparing to create new annotation file") - - ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff) - - ## Create new gff file - annotation_basename = os.path.basename(in_arg.ref_gff) - (annotation_name, annotation_ext) = os.path.splitext(annotation_basename) - new_gff_name = output_dir + annotation_name + '_reformed' + annotation_ext - new_gff = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, - len(str(record.seq))) - print("New {} file created: {} ".format(annotation_ext.upper(), 
new_gff.name)) + ## Retrieve command line arguments and number of iterations + in_arg, iterations = get_input_args() + + ## Retrieve ouptut directory + output_dir = in_arg.output_dir + + ## List for previous postion and modification length + prev_modifications = [] + + ## Path for the files generated in sequential processing, mainly managed by tempfile. + prev_fasta_path = None + prev_gff_path = None + + ## Sequential processing + for index in range(iterations): + ## Read the new fasta (to be inserted into the ref genome) + try: + record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + except IndexError: + filename = in_arg.in_fasta[index].name + raise ValueError(f"Error: {filename} is not a valid FASTA file.") + except Exception as e: + raise ValueError(f"Error parsing FASTA file: {str(e)}") + + ## Generate index of sequences from ref reference fasta + if prev_fasta_path: + chrom_seqs = index_fasta(prev_fasta_path) + os.remove(prev_fasta_path) + else: + chrom_seqs = index_fasta(in_arg.ref_fasta) + + ## Obtain the sequence of the chromosome we want to modify + seq = chrom_seqs[in_arg.chrom] + seq_str = str(seq.seq) + + ## Get the position to insert the new sequence + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) + position = positions['position'] + down_position = positions['down_position'] + + ## Save current modification which include position(index) and length changed. + if position == down_position: + length_changed = len(str(record.seq)) + else: + length_changed = len(str(record.seq)) - (down_position - position - 1) + prev_modifications.append((position,length_changed)) + + if position != down_position: + print("Removing nucleotides from position {} - {}".format(position, down_position)) + print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" + .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) + + ## Build the new chromosome sequence with the inserted_seq + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] + chrom_length = str(len(seq_str)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq, generic_dna), + id=seq.id, + description=seq.description.replace(chrom_length, new_length) + ) + + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = output_dir + ref_name + '_reformed.fa' + with open(new_fasta, "w") as f: + for s in chrom_seqs: + if s == seq.id: + SeqIO.write([new_record], f, "fasta") + else: + SeqIO.write([chrom_seqs[s]], f, "fasta") + print("New fasta file created: ", new_fasta) + + print("Preparing to create new annotation file") + + ## Read in new GFF features from in_gff + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + print(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = 
create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + os.remove(prev_gff_path) + else: + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_name = output_dir + annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + prev_gff_path = new_gff_path + print("New {} file created: {} ".format(annotation_ext.upper(), prev_gff_path)) + + +def index_fasta(fasta_path): + ''' + Process incoming FASTA files, supporting both decompressed and uncompressed + formats. It takes a file path (fasta_path), decompresses the file into a + temporary file if it is a compressed file ending in .gz, and then indexes + the temporary file. Finally, the indexing result is returned. + ''' + if fasta_path.endswith('.gz'): + # Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f_path = tmp_f.name + ## Use pgzip or gzip to decompress parallely. Set thread=None means use all cores + with gzip_module.open(fasta_path, 'rt', thread=None) as f: + tmp_f.write(f.read()) + chrom_seqs = SeqIO.index(tmp_f_path, 'fasta') + # remove temp file + os.remove(tmp_f_path) + else: + chrom_seqs = SeqIO.index(fasta_path, 'fasta') + return chrom_seqs + +def get_ref_basename(filepath): + ''' + Takes a filepath and returns the filename and extension minus the .gz. + ''' + base = os.path.basename(filepath) + if base.endswith('.gz'): + base = base[:-3] # remove .gz + name, ext = os.path.splitext(base) + return name, ext def modify_gff_line(elements, start=None, end=None, comment=None): - ''' - Modifies an existing GFF line and returns the modified line. Currently, you can - override the start position, end position, and comment column. - - gff_out: an open file handle for writing new gff lines to - elements: a list containing each column (9 in total) of a single feature line in a gff file - start: if provided, overwrite the start position in elements with this start position - end: if provided, overwrite the end position in elements with this end position - comment: if provided, overwrite the comments column in elements with this comment - ''' - if start == None: - start = int(elements[3]) - if end == None: - end = int(elements[4]) - if comment == None: - comment = elements[8] - if not comment.endswith('\n'): - comment += '\n' - - ## Return the modified line - return ("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], - elements[6], elements[7], comment)) - - + ''' + Modifies an existing GFF line and returns the modified line. Currently, you can + override the start position, end position, and comment column. 
+ + gff_out: an open file handle for writing new gff lines to + elements: a list containing each column (9 in total) of a single feature line in a gff file + start: if provided, overwrite the start position in elements with this start position + end: if provided, overwrite the end position in elements with this end position + comment: if provided, overwrite the comments column in elements with this comment + ''' + if start == None: + start = int(elements[3]) + if end == None: + end = int(elements[4]) + if comment == None: + comment = elements[8] + if not comment.endswith('\n'): + comment += '\n' + + ## Return the modified line + return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) + def get_in_gff_lines(in_gff): - ''' - Takes a gff file and returns a list of lists where - each parent list item is a single line of the gff file - and the child elements are the columns of the line - ''' - with open(in_gff, "r") as f: - in_gff_lines = [] - for line in f: - line_elements = line.split('\t') - # Skip comment lines - if line.startswith("#"): - continue - # Ensure lines have 9 columns - if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) - print(line_elements) - exit() - in_gff_lines.append(line_elements) - return in_gff_lines - - -def get_position(position, upstream, downstream, chrom, seq_str): - ''' - Determine the position in seq_str to insert the new sequence given - the position, upstream, downstream, and chrom arguments. - Note that either position, or upstream AND downstream sequences must - be provided. - ''' - if position is not None and position >= -1: - print("Checking position validity") - if position > len(seq_str): - print("** ERROR: Position greater than length of chromosome.") - print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position)) - exit() - elif position == -1: - position = len(seq_str) - # At the moment we don't accept down position as a param - # so set down_position = position - down_position = position - print("Position valid") - else: - print("No valid position specified, checking for upstream and downstream sequence") - if upstream is not None and downstream is not None: - seq_str = seq_str.upper() - upstream_fasta = list(SeqIO.parse(upstream, "fasta")) - upstream_seq = str(upstream_fasta[0].seq).upper() - downstream_fasta = list(SeqIO.parse(downstream, "fasta")) - downstream_seq = str(downstream_fasta[0].seq).upper() - # Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die - upstream_seq_count = seq_str.count(upstream_seq) - downstream_seq_count = seq_str.count(downstream_seq) - if upstream_seq_count == 1 and downstream_seq_count == 1: - ## Obtain the starting position of the left_strand - index = seq_str.find(upstream_seq) - position = index + len(upstream_seq) - down_position = seq_str.find(downstream_seq) - else: - print( - "** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") - print("Chromosome: {}\n".format(chrom)) - print("Upstream sequence found {} times".format(upstream_seq_count)) - print("Downstream sequence found {} times".format(downstream_seq_count)) - exit() - else: - print("** ERROR: You must specify a valid position or upstream and downstream sequences.") - exit() - if down_position < position: - print("** ERROR: Upstream and Downstream sequences 
must not overlap. Exiting.") - exit() - return {'position': position, 'down_position': down_position} - - -def write_in_gff_lines(gff_out, in_gff_lines, position, split_features): - for l in in_gff_lines: - new_gff_line = modify_gff_line( - l, start=int(l[3]) + position, end=int(l[4]) + position) - gff_out.write(new_gff_line) - - ## If insertion caused any existing features to be split, add - ## the split features now immediately after adding the new features - for sf in split_features: - modified_line = modify_gff_line( - sf[0], start=sf[1], end=sf[2], comment=sf[3]) - gff_out.write(modified_line) - - ## Return True after writing the new GFF lines - return True - + ''' + Takes a gff file and returns a list of lists where + each parent list item is a single line of the gff file + and the child elements are the columns of the line + ''' + with open(in_gff, "r") as f: + in_gff_lines = [] + for line in f: + line_elements = line.split('\t') + # Skip comment lines + if line.startswith("#"): + continue + # Ensure lines have 9 columns + if len(line_elements) != 9: + print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(line_elements) + exit() + in_gff_lines.append(line_elements) + return in_gff_lines + +def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): + ''' + Determine the position in seq_str to insert the new sequence given + the position, upstream, downstream, and chrom arguments. + Note that either position, or upstream AND downstream sequences must + be provided. + ''' + if positions and index < len(positions) and positions[index] >= -1: + print("Checking position validity") + position = positions[index] + ## Update current postion based on previous modification + for pos, lc in prev_modifications: + ## Also ignore when postion == -1 + if position >= pos: + position += lc + if position > len(seq_str): + print("** ERROR: Position greater than length of chromosome.") + print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position)) + exit() + elif position == -1: + position = len(seq_str) + # At the moment we don't accept down position as a param + # so set down_position = position + down_position = position + print("Position valid") + else: + print("No valid position specified, checking for upstream and downstream sequence") + if index < len(upstream) and index < len(downstream): + seq_str = seq_str.upper() + upstream_fasta = list(SeqIO.parse(upstream[index], "fasta")) + upstream_seq = str(upstream_fasta[0].seq).upper() + downstream_fasta = list(SeqIO.parse(downstream[index], "fasta")) + downstream_seq = str(downstream_fasta[0].seq).upper() + # Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die + upstream_seq_count = seq_str.count(upstream_seq) + downstream_seq_count = seq_str.count(downstream_seq) + if upstream_seq_count == 1 and downstream_seq_count == 1: + ## Obtain the starting position of the left_strand + new_index = seq_str.find(upstream_seq) + position = new_index + len(upstream_seq) + down_position = seq_str.find(downstream_seq) + else: + print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") + print("Chromosome: {}\n".format(chrom)) + print("Upstream sequence found {} times".format(upstream_seq_count)) + print("Downstream sequence found {} times".format(downstream_seq_count)) + exit() + else: + print("** ERROR: You must specify a valid position or 
upstream and downstream sequences.") + exit() + if down_position < position: + print("** ERROR: Upstream and Downstream sequences must not overlap. Exiting.") + exit() + return {'position': position, 'down_position': down_position} + +def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom): + ## Replace the chromosome ID from in_gff with the correct chromosome ID + for l in in_gff_lines: + l[0] = chrom + # Handling of single-line comments + if len(in_gff_lines) == 1: + l = in_gff_lines[0] + ## Check length + ## l[3] is start position of fasta in in.gtf and l[4] is end position + seq_id = l[0] + if int(l[4]) - int(l[3]) + 1 != sequence_length: + print(f"** WARNING: Inconsistent length for {seq_id}. Correcting start position to 1 and end position to {sequence_length}.") + ## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta + new_gff_line = modify_gff_line( + l, start=1 + position, end=sequence_length + position) + gff_out.write(new_gff_line) + # Handling of multiple-line comments + else: + ### Step1: extract all start and end into corresponding set() + start_positions = set() + end_positions = set() + for l in in_gff_lines: + start_positions.add(int(l[3])) + end_positions.add(int(l[4])) + ### Step2: Find min start and max end + min_start = min(start_positions) + max_end = max(end_positions) + ### Step3: Check sequence length, validness of min_start and max_end + if max_end - min_start + 1 != sequence_length: + raise ValueError(f"Error: Annotation length does not match sequence length. " + f"Expected {sequence_length}, but got {max_end - min_start + 1}") + if min_start < 1: + raise ValueError(f"Error: Invalid min_start value: {min_start}. It must be a positive integer.") + if min_start > max_end: + raise ValueError(f"Error: min_start ({min_start}) cannot be greater than max_end ({max_end}).") + ### Step4: Adjust start and end. Offset will be 0 if no adjust need + offset = min_start - 1 + for l in in_gff_lines: + ### Step5: Correct start(l[3]) and end(l[4]) by minus offset + new_gff_line = modify_gff_line( + l, start = int(l[3]) - offset + position, end = int(l[4]) - offset + position) + gff_out.write(new_gff_line) + + ## If insertion caused any existing features to be split, add + ## the split features now immediately after adding the new features + for sf in split_features: + modified_line = modify_gff_line( + sf[0], start = sf[1], end = sf[2], comment = sf[3]) + gff_out.write(modified_line) + + ## Return True after writing the new GFF lines + return True def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): - ''' - Goes line by line through a gff file to remove existing features - (or parts of existing features) and/or insert new features. - In the process of adding new features, existing features - may be cut-off on either end or split into two. - This attempts to handle all cases. 
- new_gff_name: the name of the new gff file to create - ref_gff: the reference gff file to modify - in_gff_lines: a list of lists where each nested list is a list of - columns (in gff format) associated with each new feature to insert - position: start position of removal of existing sequence - down_position: end position of removal of existing sequence - chrom_id: the ID of the chromosome to modify - new_seq_length: the length of the new sequence being added to the chromosome - ''' - gff_out = open(new_gff_name, "w") - in_gff_lines_appended = False - split_features = [] - last_seen_chrom_id = None - gff_ext = new_gff_name.split('.')[-1] - with open(ref_gff, "r") as f: - for line in f: - line_elements = line.split('\t') - if line.startswith("#"): - if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: - ## Edit the length of the chromosome - original_length = int(line_elements[3]) - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - else: - gff_chrom_id = line_elements[0] - gff_feat_start = int(line_elements[3]) - gff_feat_end = int(line_elements[4]) - gff_feat_type = line_elements[2] - gff_feat_strand = line_elements[6] - gff_comments = line_elements[8].strip() - - # If we've seen at least one chromosome - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id) - # and now we're on a new chromosome in the original gff file - # and the in_gff_lines have not yet been appended: - # we assume they need to be appended to the end of the chromosome - # so append them before proceeding - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and gff_chrom_id != last_seen_chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features) - - last_seen_chrom_id = gff_chrom_id - - # If this is not the chromosome of interest - # Or if the current feature ends before any - # modification (which occurs at position) - # Then simply write the feature as is (no modification) - # (remember gff co-ordinates are 1 based and position - # is 0 based, therefor check less than *or equal to*) - if gff_chrom_id != chrom_id or gff_feat_end <= position: - gff_out.write(line) - - # Modify chromosome feature length (if feature is - # "chromosome" or "region") - elif gff_feat_type in ['chromosome', 'region']: - original_length = gff_feat_end - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - - # Split feature into 2 if feature starts before position - # and ends after down_position - elif gff_feat_start <= position and gff_feat_end > down_position: - print("Feature split") - # Which side of the feature depends on the strand (we add this as a comment) - (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") - - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(x), - gff_ext - ) - # Modified feature ends at 'position' - modified_line = modify_gff_line( - line_elements, - end=position, - comment=gff_comments + new_comment - ) - gff_out.write(modified_line) - - # The downstream flank(s) will be added immediately after the - # in_gff (new) features are written. 
- # First, attempt to rename IDs (to indicate split) - renamed_id_attributes = rename_id(line) - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(y), - gff_ext - ) - split_features.append( - ( - line_elements, - position + new_seq_length + 1, - gff_feat_end + new_seq_length - (down_position - position), - renamed_id_attributes + new_comment - ) - ) - - # Change end position of feature to cut off point (position) if the - # feature ends within the deletion (between position & down_position) - elif gff_feat_start <= position and gff_feat_end <= down_position: - # Which side of the feature depends on the strand (we add this as a comment) - x = "3" if gff_feat_strand == "+" else "5" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - end=position, - comment=gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Skip this feature if it falls entirely within the deletion - elif gff_feat_start > position and gff_feat_end <= down_position: - print("Skip feature (this feature was removed from sequence)") - continue - - else: - if not in_gff_lines_appended: - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features) - - # Change start position of feature to after cutoff point if - # the feature starts within the deletion - if (gff_feat_start > position - and gff_feat_start <= down_position - and gff_feat_end > down_position): - x = "5" if gff_feat_strand == "+" else "3" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - start=position + new_seq_length + 1, - end=gff_feat_end + new_seq_length - (down_position - position), - comment=gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Offset all downstream feature positions by offset length - elif gff_feat_start > down_position: - offset_length = new_seq_length - (down_position - position) - modified_line = modify_gff_line( - line_elements, - start=gff_feat_start + offset_length, - end=gff_feat_end + offset_length - ) - gff_out.write(modified_line) - - else: - print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) - exit() - - # If we've iterated over the entire original gff - # (i.e. out of the for loop which iterates over it) - # and still haven't written in_gff_lines - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): - # we assume they need to be appended to the end of the genome - # (i.e. end of the last chromosome) - # so append them now - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features) - - # Checking to ensure in_gff_lines written - if not in_gff_lines_appended: - print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") - exit() - - gff_out.close() - - return gff_out - + ''' + Goes line by line through a gff file to remove existing features + (or parts of existing features) and/or insert new features. 
+ In the process of adding new features, existing features + may be cut-off on either end or split into two. + This attempts to handle all cases. + new_gff_name: the name of the new gff file to create + ref_gff: the reference gff file to modify + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + position: start position of removal of existing sequence + down_position: end position of removal of existing sequence + chrom_id: the ID of the chromosome to modify + new_seq_length: the length of the new sequence being added to the chromosome + ''' + with open(new_gff_name, "w") as gff_out: + in_gff_lines_appended = False + split_features = [] + last_seen_chrom_id = None + gff_ext = new_gff_name.split('.')[-1] + ref_gff_path = ref_gff + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + # Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f.write(f.read()) + ref_gff_path = tmp_f.name + with open(ref_gff_path, "r") as f: + for line in f: + line_elements = line.split('\t') + if line.startswith("#"): + if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: + ## Edit the length of the chromosome + original_length = int(line_elements[3]) + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + else: + gff_chrom_id = line_elements[0] + gff_feat_start = int(line_elements[3]) + gff_feat_end = int(line_elements[4]) + gff_feat_type = line_elements[2] + gff_feat_strand = line_elements[6] + gff_comments = line_elements[8].strip() + + # If we've seen at least one chromosome + # and the last chromosome seen was the chromosome of interest (i.e. 
chrom_id) + # and now we're on a new chromosome in the original gff file + # and the in_gff_lines have not yet been appended: + # we assume they need to be appended to the end of the chromosome + # so append them before proceeding + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and gff_chrom_id != last_seen_chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + last_seen_chrom_id = gff_chrom_id + + # If this is not the chromosome of interest + # Or if the current feature ends before any + # modification (which occurs at position) + # Then simply write the feature as is (no modification) + # (remember gff co-ordinates are 1 based and position + # is 0 based, therefor check less than *or equal to*) + if gff_chrom_id != chrom_id or gff_feat_end <= position: + gff_out.write(line) + + # Modify chromosome feature length (if feature is + # "chromosome" or "region") + elif gff_feat_type in ['chromosome', 'region']: + original_length = gff_feat_end + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + + # Split feature into 2 if feature starts before position + # and ends after down_position + elif gff_feat_start <= position and gff_feat_end > down_position: + print("Feature split") + # Which side of the feature depends on the strand (we add this as a comment) + (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") + + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(x), + gff_ext + ) + # Modified feature ends at 'position' + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # The downstream flank(s) will be added immediately after the + # in_gff (new) features are written. 
+ # First, attempt to rename IDs (to indicate split) + renamed_id_attributes = rename_id(line) + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(y), + gff_ext + ) + split_features.append( + ( + line_elements, + position + new_seq_length + 1, + gff_feat_end + new_seq_length - (down_position - position), + renamed_id_attributes + new_comment + ) + ) + + # Change end position of feature to cut off point (position) if the + # feature ends within the deletion (between position & down_position) + elif gff_feat_start <= position and gff_feat_end <= down_position: + # Which side of the feature depends on the strand (we add this as a comment) + x = "3" if gff_feat_strand == "+" else "5" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Skip this feature if it falls entirely within the deletion + elif gff_feat_start > position and gff_feat_end <= down_position: + print("Skip feature (this feature was removed from sequence)") + continue + + else: + if not in_gff_lines_appended: + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Change start position of feature to after cutoff point if + # the feature starts within the deletion + if (gff_feat_start > position + and gff_feat_start <= down_position + and gff_feat_end > down_position): + x = "5" if gff_feat_strand == "+" else "3" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + start = position + new_seq_length + 1, + end = gff_feat_end + new_seq_length - (down_position - position), + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Offset all downstream feature positions by offset length + elif gff_feat_start > down_position: + offset_length = new_seq_length - (down_position - position) + modified_line = modify_gff_line( + line_elements, + start = gff_feat_start + offset_length, + end = gff_feat_end + offset_length + ) + gff_out.write(modified_line) + + else: + print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) + exit() + + # If we've iterated over the entire original gff + # (i.e. out of the for loop which iterates over it) + # and still haven't written in_gff_lines + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): + # we assume they need to be appended to the end of the genome + # (i.e. end of the last chromosome) + # so append them now + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Checking to ensure in_gff_lines written + if not in_gff_lines_appended: + print("** Error: Something went wrong, in_gff not added to reference gff. 
Exiting") + exit() + # Remove temp gtf file + if ref_gff.endswith('.gz'): + os.remove(ref_gff_path) + return new_gff_name def format_comment(comment, ext): - ''' - Format comment according to ext (GFF or GTF) and return - ''' - if ext.lower() == 'gtf': - new_comment = 'reform_comment "{}";'.format(comment) - elif ext.lower().startswith('gff'): - new_comment = "; reform_comment={}".format(comment) - else: - print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext)) - exit() - return new_comment - - + ''' + Format comment according to ext (GFF or GTF) and return + ''' + if ext.lower() == 'gtf': + new_comment = 'reform_comment "{}";'.format(comment) + elif ext.lower().startswith('gff'): + new_comment = "; reform_comment={}".format(comment) + else: + print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext)) + exit() + return new_comment + def rename_id(line): - ''' - Given a gff or gtf line, this function will append the string "_split" - to the end of the ID/gene_id attribute - ''' - attributes = line.split('\t')[8].strip() - elements = attributes.split(';') - if elements[0].startswith("ID="): - print("Renaming split feature {} --> {}_split".format(elements[0], elements[0])) - return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) - elif elements[0].startswith("gene_id "): - gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] - print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id)) - return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) - - else: - print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line) - return attributes - - + ''' + Given a gff or gtf line, this function will append the string "_split" + to the end of the ID/gene_id attribute + ''' + attributes = line.split('\t')[8].strip() + elements = attributes.split(';') + if elements[0].startswith("ID="): + print("Renaming split feature {} --> {}_split".format(elements[0], elements[0])) + return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) + elif elements[0].startswith("gene_id "): + gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] + print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id)) + return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) + + else: + print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line) + return attributes + def get_input_args(): - parser = argparse.ArgumentParser() - - parser.add_argument('--chrom', type=str, required=True, - help="Chromosome name (String)") - parser.add_argument('--in_fasta', type=str, required=True, - help="Path to new sequence to be inserted into reference genome in fasta format") - parser.add_argument('--in_gff', type=str, required=True, - help="Path to GFF file describing new fasta sequence to be inserted") - parser.add_argument('--upstream_fasta', type=str, default=None, - help="Path to Fasta file with upstream sequence. Either position, or upstream AND downstream sequence must be provided.") - parser.add_argument('--downstream_fasta', type=str, default=None, - help="Path to Fasta file with downstream sequence. Either position, or upstream AND downstream sequence must be provided.") - parser.add_argument('--position', type=int, default=None, - help="Position at which to insert new sequence. Note: Position is 0-based. 
-    parser.add_argument('--ref_fasta', type=str, required=True,
-        help="Path to reference fasta file")
-    parser.add_argument('--ref_gff', type=str, required=True,
-        help="Path to reference gff file")
-    parser.add_argument('--output_dir', type=str, default=None,
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--chrom', type=str, required=True,
+        help="Chromosome name (String)")
+    parser.add_argument('--in_fasta', type=str, required=True,
+        help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format")
+    parser.add_argument('--in_gff', type=str, required=True,
+        help="Path(s) to GFF file(s) describing new fasta sequence(s) to be inserted")
+    parser.add_argument('--upstream_fasta', type=str, default=None,
+        help="Path(s) to Fasta file(s) with upstream sequence. Either position, or upstream AND downstream sequence must be provided.")
+    parser.add_argument('--downstream_fasta', type=str, default=None,
+        help="Path(s) to Fasta file(s) with downstream sequence. Either position, or upstream AND downstream sequence must be provided.")
+    parser.add_argument('--position', type=str, default=None,
+        help="Comma-separated positions at which to insert the new sequence(s). Note: positions are 0-based and the list must contain no spaces. Either position, or upstream AND downstream sequence must be provided.")
+    parser.add_argument('--ref_fasta', type=str, required=True,
+        help="Path to reference fasta file")
+    parser.add_argument('--ref_gff', type=str, required=True,
+        help="Path to reference gff file")
+    parser.add_argument('--output_dir', type=str, default=None,
         help="Path to output to")
-
-    in_args = parser.parse_args()
-
-    if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
-        print("** Error: You must provide either the position, or the upstream and downstream sequences.")
-        exit()
-
-    return in_args
-
-
+
+    in_args = parser.parse_args()
+    in_args.in_fasta = in_args.in_fasta.split(',')
+    in_args.in_gff = in_args.in_gff.split(',')
+    if in_args.upstream_fasta:
+        in_args.upstream_fasta = in_args.upstream_fasta.split(',')
+    if in_args.downstream_fasta:
+        in_args.downstream_fasta = in_args.downstream_fasta.split(',')
+
+    if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
+        print("** Error: You must provide either the position, or the upstream and downstream sequences.")
+        exit()
+
+    if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta):
+        print("** Error: The number of upstream_fasta and downstream_fasta files does not match.")
+        exit()
+
+    if in_args.position is not None:
+        try:
+            in_args.position = list(map(int, in_args.position.split(',')))
+            iterations = len(in_args.position)
+        except ValueError:
+            print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.")
+            exit()
+    else:
+        iterations = len(in_args.upstream_fasta)
+
+    if (len(in_args.in_fasta) != len(in_args.in_gff)) or (len(in_args.in_fasta) != iterations):
+        print("** Error: The number of in_fasta files, in_gff files, and insertion sites (positions or upstream/downstream pairs) must all match.")
+        exit()
+
+    return in_args, iterations
+
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/run.sh b/run.sh
index cb68f2e..77886ae 100644
--- a/run.sh
+++ b/run.sh
@@ -85,24 +85,25 @@ else
 fi
 # Run reform.py
+# ./uploads/$timestamp/
 if [ ! -z "$position" ]; then
-z "$position" ]; then - echo /home/reform/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + echo $HOME/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" - /home/reform/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + $HOME/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" 2>&1 | tee ./results/$timestamp/$timestamp-worker-err.log else - echo /home/reform/venv/bin/python reform.py --chrom $chrom --upstream_fasta ./uploads/$timestamp/$upstream_fasta \ - --downstream_fasta ./uploads/$timestamp/$downstream_fasta --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + echo $HOME/venv/bin/python reform.py --chrom $chrom --upstream_fasta $upstream_fasta \ + --downstream_fasta $downstream_fasta --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" - /home/reform/venv/bin/python reform.py --chrom $chrom --upstream_fasta ./uploads/$timestamp/$upstream_fasta \ - --downstream_fasta ./uploads/$timestamp/$downstream_fasta --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + $HOME/venv/bin/python reform.py --chrom $chrom --upstream_fasta $upstream_fasta \ + --downstream_fasta $downstream_fasta --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" 2>&1 | tee ./results/$timestamp/$timestamp-worker-err.log fi