From 9aaf3cb1ce751380ff45b896b20a7d0733a6f06e Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Thu, 8 Aug 2024 16:46:59 -0400 Subject: [PATCH 1/2] update with sequential processing --- app.py | 217 +++++++----- forms.py | 30 +- job.py | 122 ++++--- reform.py | 959 ++++++++++++++++++++++++++++++------------------------ run.sh | 21 +- 5 files changed, 774 insertions(+), 575 deletions(-) diff --git a/app.py b/app.py index 8730353..7a6494d 100644 --- a/app.py +++ b/app.py @@ -15,7 +15,7 @@ UPLOAD_FOLDER = './uploads' # The type of file that needs to be uploaded to the server by user. -UPLOAD_FILES = ['in_fasta', 'in_gff'] +UPLOAD_FILES = {'in_fasta': None, 'in_gff': None} # Route for submitting data on the production site. @app.route('/', methods=['GET', 'POST']) @@ -23,38 +23,61 @@ def submit(): form = SubmitJob(request.form) # Validate user input according to the validation rules defined in forms.py. if request.method == 'POST' and form.validate(): - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename) and request.form[ - 'position']: - flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') + # Get list of file names, and filter out empty files + downstream_fasta_files = [file for file in request.files.getlist('downstream_fasta') if file.filename] + upstream_fasta_files = [file for file in request.files.getlist('upstream_fasta') if file.filename] + in_fasta_files = [file for file in request.files.getlist('in_fasta') if file.filename] + in_gff_files = [file for file in request.files.getlist('in_gff') if file.filename] + if (downstream_fasta_files or upstream_fasta_files) and request.form['position']: + flash(f"Error: You must provide either the position, or the upstream and downstream sequences. 
Not all.", 'error') return redirect(url_for('submit')) - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename): - if not (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename): + if (downstream_fasta_files or upstream_fasta_files): + if not (downstream_fasta_files and upstream_fasta_files): flash("Error: Must enter both upstream and downstream", 'error') return redirect(url_for('submit')) # Verify position is a number + positions = None try: - if request.form['position']: - int(request.form['position']) - except ValueError: - flash("Error: Position must be an integer, like -1, 0, 1.", 'error') - return redirect(url_for('submit')) - # Verify that the name of the uploaded file is different - if request.form['position'] and (request.files['in_fasta'].filename == request.files['in_gff'].filename): - flash("Error: ref_fasta and ref_gff must be different files", 'error') + position_str = request.form['position'] + if position_str: + positions = process_position(position_str) + # print("Positions:", positions) + except ValueError as e: + flash(str(e), 'error') return redirect(url_for('submit')) - elif (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename): - filenames = [request.files['in_fasta'].filename, - request.files['in_gff'].filename, - request.files['downstream_fasta'].filename, - request.files['upstream_fasta'].filename] - if len(filenames) != len(set(filenames)): - flash("Error: ref_fasta, ref_gff, downstream_fasta, upstream_fasta must be different files", 'error') + # Verify that the name of the uploaded file is different by set() + # User upload in_fasta and in_gff + if in_fasta_files and in_gff_files: + all_in_filenames = [] + in_fasta_filenames = [file.filename for file in in_fasta_files] + in_gff_filenames = [file.filename for file in in_gff_files] + all_in_filenames.extend(in_fasta_filenames) + all_in_filenames.extend(in_gff_filenames) + # When position provide + if request.form['position'] and (len(all_in_filenames) != len(set(all_in_filenames))): + flash("Error: in_fasta and in_gff must be different files", 'error') return redirect(url_for('submit')) - - if not (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename) and not \ - request.form['position']: + # When position not provide, use upstream_fasta and downstream_fasta + elif downstream_fasta_files and upstream_fasta_files: + all_stream_fasta_filenames = [] + downstream_fasta_filenames = [file.filename for file in downstream_fasta_files] + upstream_fasta_filenames = [file.filename for file in upstream_fasta_files] + all_stream_fasta_filenames.extend(downstream_fasta_filenames) + all_stream_fasta_filenames.extend(upstream_fasta_filenames) + if len(all_stream_fasta_filenames) != len(set(all_stream_fasta_filenames)): + flash("Error: downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + # Check all uniqueness for all files + all_filenames = [] + all_filenames.extend(all_in_filenames) + all_filenames.extend(all_stream_fasta_filenames) + if len(all_filenames) != len(set(all_filenames)): + flash("Error: in_fasta, in_gff, downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + if not (downstream_fasta_files or upstream_fasta_files) and not request.form['position']: flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') return 
redirect(url_for('submit')) + else: # User Submits Job # # (1) Create unique ID for each submission @@ -68,20 +91,23 @@ def submit(): # (3) Upload files from user device to server # Verify all files are present before uploading - for files in UPLOAD_FILES: + for files in UPLOAD_FILES.keys(): verified = verify_uploads(files) if not verified: return redirect(url_for('submit')) - # Upload Files to UPLOAD_DIR/timestamp/ + # Upload Files to UPLOAD_DIR/timestamp/, and convert format to./uploads/$timestamp/item if verified: - for files in UPLOAD_FILES: - upload(target_dir, files) + for key in UPLOAD_FILES.keys(): + UPLOAD_FILES[key] = format_paths(upload(target_dir, key), target_dir) + # Set defualt None to up/down stream fasta + for file_key in ['upstream_fasta', 'downstream_fasta']: + UPLOAD_FILES[file_key] = None + UPLOAD_FILES[file_key] = None - if not request.form['position'] and request.files['upstream_fasta'].filename and request.files[ - 'downstream_fasta'].filename: - upload(target_dir, 'upstream_fasta') - upload(target_dir, 'downstream_fasta') + if not request.form['position']: + UPLOAD_FILES['upstream_fasta'] = format_paths(upload(target_dir, 'upstream_fasta'), target_dir) + UPLOAD_FILES['downstream_fasta'] = format_paths(upload(target_dir, 'downstream_fasta'), target_dir) # (4) Send the job to the backend # Connect to the Redis server and intial a queue @@ -93,13 +119,13 @@ def submit(): timestamp, request.form['email'], request.form['chrom'], - request.files['upstream_fasta'].filename, - request.files['downstream_fasta'].filename, - request.form['position'], + UPLOAD_FILES['upstream_fasta'], + UPLOAD_FILES['downstream_fasta'], #request.files['upstream_fasta'].filename, + positions, #request.form['position'], request.form['ref_fasta'], request.form['ref_gff'], - request.files['in_fasta'].filename, - request.files['in_gff'].filename), + UPLOAD_FILES['in_fasta'], + UPLOAD_FILES['in_gff']), result_ttl=-1, job_timeout=3000 ) @@ -117,42 +143,67 @@ def submit_test(): DEFAULT_FILES = { 'ref_fasta': './staticData/ref/Mus_musculus.GRCm38.dna.toplevel.fa', 'ref_gff': './staticData/ref/Mus_musculus.GRCm38.88.gff3', - 'in_fasta': './staticData/inserted/test-in.fa', - 'in_gff': './staticData/inserted/test-in.gtf', - 'upstream_fasta': './staticData/up-down-seq/test-up.fa', - 'downstream_fasta': './staticData/up-down-seq/test-down.fa' + 'in_fasta': ['./staticData/inserted/test-in.fa'], + 'in_gff': ['./staticData/inserted/test-in.gtf'], + 'upstream_fasta': ['./staticData/up-down-seq/test-up.fa'], + 'downstream_fasta': ['./staticData/up-down-seq/test-down.fa'] } # Validate user input based on test site rule form = Testjob(request.form) if request.method == 'POST' and form.validate(): - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename) and request.form[ - 'position']: + downstream_fasta_files = [file for file in request.files.getlist('downstream_fasta') if file.filename] + upstream_fasta_files = [file for file in request.files.getlist('upstream_fasta') if file.filename] + in_fasta_files = [file for file in request.files.getlist('in_fasta') if file.filename] + in_gff_files = [file for file in request.files.getlist('in_gff') if file.filename] + if (downstream_fasta_files or upstream_fasta_files) and request.form['position']: flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') - return redirect(url_for('submit_test')) - if (request.files['downstream_fasta'].filename or request.files['upstream_fasta'].filename): 
- if not (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename): + return redirect(url_for('submit')) + if (downstream_fasta_files or upstream_fasta_files): + if not (downstream_fasta_files and upstream_fasta_files): flash("Error: Must enter both upstream and downstream", 'error') - return redirect(url_for('submit_test')) + return redirect(url_for('submit')) # Verify position is a number + positions = None try: - if request.form['position']: - int(request.form['position']) - except ValueError: - flash("Error: Position must be an integer, like -1, 0, 1.", 'error') - return redirect(url_for('submit_test')) - # Verify that the name of the uploaded file is not empty and different - if request.form['position'] and (request.files['in_fasta'].filename and request.files['in_gff'].filename) \ - and (request.files['in_fasta'].filename == request.files['in_gff'].filename): - flash("Error: ref_fasta and ref_gff must be different files", 'error') - return redirect(url_for('submit_test')) - elif (request.files['downstream_fasta'].filename and request.files['upstream_fasta'].filename and request.files['in_fasta'].filename and request.files['in_gff'].filename): - filenames_test = [request.files['in_fasta'].filename, - request.files['in_gff'].filename, - request.files['downstream_fasta'].filename, - request.files['upstream_fasta'].filename] - if len(filenames_test) != len(set(filenames_test)): - flash("Error: ref_fasta, ref_gff, downstream_fasta, upstream_fasta must be different files", 'error') - return redirect(url_for('submit_test')) + position_str = request.form['position'] + if position_str: + positions = process_position(position_str) + # print("Positions:", positions) + except ValueError as e: + flash(str(e), 'error') + return redirect(url_for('submit')) + # Verify that the name of the uploaded file is different when user upload (filename is not empty ) + # Verify that the name of the uploaded file is different by set() + if in_fasta_files and in_gff_files: + all_in_filenames = [] + in_fasta_filenames = [file.filename for file in in_fasta_files] + in_gff_filenames = [file.filename for file in in_gff_files] + all_in_filenames.extend(in_fasta_filenames) + all_in_filenames.extend(in_gff_filenames) + # When position provide + if request.form['position'] and (len(all_in_filenames) != len(set(all_in_filenames))): + flash("Error: in_fasta and in_gff must be different files", 'error') + return redirect(url_for('submit')) + # When position not provide, use upstream_fasta and downstream_fasta + elif downstream_fasta_files and upstream_fasta_files: + all_stream_fasta_filenames = [] + downstream_fasta_filenames = [file.filename for file in downstream_fasta_files] + upstream_fasta_filenames = [file.filename for file in upstream_fasta_files] + all_stream_fasta_filenames.extend(downstream_fasta_filenames) + all_stream_fasta_filenames.extend(upstream_fasta_filenames) + if len(all_stream_fasta_filenames) != len(set(all_stream_fasta_filenames)): + flash("Error: downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + # Check all uniqueness for all files + all_filenames = [] + all_filenames.extend(all_in_filenames) + all_filenames.extend(all_stream_fasta_filenames) + if len(all_filenames) != len(set(all_filenames)): + flash("Error: in_fasta, in_gff, downstream_fasta, upstream_fasta must be different files", 'error') + return redirect(url_for('submit')) + if not (downstream_fasta_files or upstream_fasta_files) and not 
request.form['position']: + flash("Error: You must provide either the position, or the upstream and downstream sequences.", 'error') + return redirect(url_for('submit')) else: # User Submits Job # # (1) Create unique ID for each submission @@ -162,7 +213,6 @@ def submit_test(): if not os.path.isfile('database.db'): db_create() - # (3) Upload files from user device to server # Verify all files are present before uploading for files in UPLOAD_FILES: @@ -172,30 +222,29 @@ def submit_test(): # Choose to upload new files or use local files if verified: - uploaded_files = {} - for file_key in UPLOAD_FILES: - uploaded_files[file_key] = upload_test(target_dir, file_key, DEFAULT_FILES) + for file_key in UPLOAD_FILES.keys(): + UPLOAD_FILES[file_key] = format_paths(upload_test(target_dir, file_key, DEFAULT_FILES), target_dir) # Set defualt None to up/down stream fasta for file_key in ['upstream_fasta', 'downstream_fasta']: - uploaded_files['upstream_fasta'] = None - uploaded_files['downstream_fasta'] = None + UPLOAD_FILES['upstream_fasta'] = None + UPLOAD_FILES['downstream_fasta'] = None # Uploaded upstream/downstream files when position is not provided if not request.form['position']: for file_key in ['upstream_fasta', 'downstream_fasta']: - uploaded_files[file_key] = upload_test(target_dir, file_key, DEFAULT_FILES) + UPLOAD_FILES[file_key] = format_paths(upload_test(target_dir, file_key, DEFAULT_FILES), target_dir) # Replace Ref Sequence with local path if example ftp detected if request.form['ref_fasta'] == 'ftp://ftp.ensembl.org/pub/release-88/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.toplevel.fa.gz': - uploaded_files['ref_fasta'] = DEFAULT_FILES['ref_fasta'] + UPLOAD_FILES['ref_fasta'] = DEFAULT_FILES['ref_fasta'] else: - uploaded_files['ref_fasta'] = request.form['ref_fasta'] + UPLOAD_FILES['ref_fasta'] = request.form['ref_fasta'] if request.form['ref_gff'] == 'ftp://ftp.ensembl.org/pub/release-88/gff3/mus_musculus/Mus_musculus.GRCm38.88.gff3.gz': - uploaded_files['ref_gff'] = DEFAULT_FILES['ref_gff'] + UPLOAD_FILES['ref_gff'] = DEFAULT_FILES['ref_gff'] else: - uploaded_files['ref_gff'] = request.form['ref_gff'] + UPLOAD_FILES['ref_gff'] = request.form['ref_gff'] - db_test_submit(request, uploaded_files, timestamp) + db_test_submit(request, UPLOAD_FILES, timestamp) # (4) Send job to the backend # Use the redis queue as same as production site @@ -207,13 +256,13 @@ def submit_test(): timestamp, request.form['email'], # ys4680@nyu.edu request.form['chrom'], # 1 - uploaded_files['upstream_fasta'], # by default - uploaded_files['downstream_fasta'], - request.form['position'], - uploaded_files['ref_fasta'], # by default - uploaded_files['ref_gff'], # by default - uploaded_files['in_fasta'], # by default - uploaded_files['in_gff'] # by default + UPLOAD_FILES['upstream_fasta'], # by default + UPLOAD_FILES['downstream_fasta'], + positions, + UPLOAD_FILES['ref_fasta'], # by default + UPLOAD_FILES['ref_gff'], # by default + UPLOAD_FILES['in_fasta'], # by default + UPLOAD_FILES['in_gff'] # by default ), result_ttl=-1, job_timeout=3000 diff --git a/forms.py b/forms.py index be4783c..07fbbcc 100644 --- a/forms.py +++ b/forms.py @@ -1,6 +1,6 @@ from wtforms import Form, StringField, FileField from wtforms.validators import URL, Optional, InputRequired, Email -from flask_wtf.file import FileRequired, FileAllowed +from flask_wtf.file import FileRequired, FileAllowed, MultipleFileField ALLOWED_EXTENSIONS = {'fa', 'gff', 'gff3', 'gtf', 'fasta', 'fna', 'tar', 'gz'} @@ -28,31 +28,31 @@ class SubmitJob(Form): 
"of chromosome. Note: Position is 0-based", validators=[Optional()]) # OR - upstream_fasta = FileField('Upstream Sequence', - description="FASTA file with upstream sequence.", + upstream_fasta = MultipleFileField('Upstream Sequence', + description="FASTA files with upstream sequence.", validators=[ Optional() ]) - downstream_fasta = FileField('Downstream Sequence', - description="FASTA file with downstream sequence.", + downstream_fasta = MultipleFileField('Downstream Sequence', + description="FASTA files with downstream sequence.", validators=[ Optional() ]) # Uploads - in_fasta = FileField('Inserted Sequence (FASTA)', - description="New sequence to be inserted into reference genome.", + in_fasta = MultipleFileField('Inserted Sequence (FASTA)', + description="New sequences to be inserted into reference genome.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), FileRequired() ]) - in_gff = FileField('Inserted Reference (gff3 or gtf)', - description="GFF file describing new FASTA sequence to be inserted.", + in_gff = MultipleFileField('Inserted Reference (gff3 or gtf)', + description="GFF files describing new FASTA sequence to be inserted.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), - InputRequired() + FileRequired() ]) # Downloads ref_fasta = StringField('Reference Sequence (FASTA)', @@ -100,31 +100,31 @@ class Testjob(Form): "of chromosome. Note: Position is 0-based", validators=[Optional()]) # OR - upstream_fasta = FileField('Upstream Sequence', + upstream_fasta = MultipleFileField('Upstream Sequence', description="FASTA file with upstream sequence. If no file is selected, the system will use 'test-up.fa' as a default.", validators=[ Optional() ]) - downstream_fasta = FileField('Downstream Sequence', + downstream_fasta = MultipleFileField('Downstream Sequence', description="FASTA file with downstream sequence. If no file is selected, the system will use 'test-down.fa' as a default.", validators=[ Optional() ]) # Uploads - in_fasta = FileField('Inserted Sequence (FASTA)', + in_fasta = MultipleFileField('Inserted Sequence (FASTA)', description="Please upload the new sequence to be inserted into the reference genome. If no file is selected, the system will use 'test-in.fa' as a default.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), # FileRequired() ]) - in_gff = FileField('Inserted Reference (gff3 or gtf)', + in_gff = MultipleFileField('Inserted Reference (gff3 or gtf)', description="Please upload the GFF file describing the new FASTA sequence to be inserted. 
If no file is selected, the system will use 'test-in.gff' as a default.", validators=[ Optional(), FileAllowed([ALLOWED_EXTENSIONS], 'Invalid File Type'), - # InputRequired() + # FileRequired() ]) # Downloads ref_fasta = StringField('Reference Sequence (FASTA)', diff --git a/job.py b/job.py index af7899a..b1c6390 100644 --- a/job.py +++ b/job.py @@ -23,15 +23,21 @@ def redisjob(target_dir, timestamp, email, chrom, upstream_fasta, downstream_fasta, position, ref_fastaURL, ref_gffURL, in_fasta, in_gff): + # Convert list of insert sequences to string + in_fasta_str = ','.join(in_fasta) + in_gff_str = ','.join(in_gff) if position: + command = "bash ./run.sh {} {} {} {} {} {} {} {} {}".format(target_dir, timestamp, email, chrom, - ref_fastaURL, ref_gffURL, in_fasta, - in_gff, position) + ref_fastaURL, ref_gffURL, in_fasta_str, + in_gff_str, position) else: + upstream_fasta_str = ','.join(upstream_fasta) + downstream_fasta_str = ','.join(downstream_fasta) command = "bash ./run.sh {} {} {} {} {} {} {} {} {} {}".format(target_dir, timestamp, email, chrom, - ref_fastaURL, ref_gffURL, in_fasta, - in_gff, upstream_fasta, - downstream_fasta) + ref_fastaURL, ref_gffURL, in_fasta_str, + in_gff_str, upstream_fasta_str, + downstream_fasta_str) try: #subprocess.run([command]) os.system(command) @@ -81,62 +87,75 @@ def allowed_file(filename): # Verify that the file is uploaded to the form and it is allowed def verify_uploads(file): - fileObj = request.files[file] - - if fileObj.filename == '': - flash('No ' + file + ' file selected for uploading', 'error') - return False - if fileObj and allowed_file(fileObj.filename): - return True - else: - flash('Invalid File Type for ' + file, 'error') - return False + fileObj_lists = request.files.getlist(file) + for uploaded_file in fileObj_lists: + if uploaded_file.filename == '': + flash('No ' + file + ' file selected for uploading', 'error') + return False + if uploaded_file and allowed_file(uploaded_file.filename): + continue + else: + flash('Invalid File Type for ' + file, 'error') + return False + return True # verify_uploads() for test site def verify_test_uploads(file): - fileObj = request.files[file] - # Not require user to upload files - if fileObj.filename == '': - # flash('No ' + file + ' file selected for uploading', 'error') - # return False - return True # If no file is uploaded then the default file is used - if fileObj and allowed_file(fileObj.filename): - return True - else: - flash('Invalid File Type for ' + file, 'error') - return False + fileObj_lists = request.files.getlist(file) + for uploaded_file in fileObj_lists: + # Not require user to upload files + if uploaded_file.filename == '': + # flash('No ' + file + ' file selected for uploading', 'error') + # return False + continue # If no file is uploaded then the default file is used + if uploaded_file and allowed_file(uploaded_file.filename): + continue + else: + flash('Invalid File Type for ' + file, 'error') + return False + return True # Upload files to uploads/timestamp def upload(target_dir, file): - fileObj = request.files[file] + fileObj_lists = request.files.getlist(file) # make the directory based on timestamp os.system('mkdir -p ' + target_dir) # save the file - fileObj.save(os.path.join(target_dir, - secure_filename(fileObj.filename))) + filenames = [] + for fileObj in fileObj_lists: + filename = secure_filename(fileObj.filename) + fileObj.save(os.path.join(target_dir, filename)) + filenames.append(filename) + return filenames # upload file function for test site, return by filename 
 def upload_test(target_dir, file_key, default_files):
     # if file is empty (indicated use default file), fileObj set to None
-    fileObj = request.files[file_key] if file_key in request.files else None
+    fileObj_lists = request.files.getlist(file_key) if file_key in request.files else []
     os.makedirs(target_dir, exist_ok=True) # dirs for upload files
     # User provided new files in form
-    if fileObj:
-        # save the uploaded file
-        filename = secure_filename(fileObj.filename)
-        file_path = os.path.join(target_dir, filename)
-        fileObj.save(file_path)
-        return fileObj.filename
+    filenames = []
+    if fileObj_lists:
+        for fileObj in fileObj_lists:
+            # save the uploaded file
+            filename = secure_filename(fileObj.filename)
+            file_path = os.path.join(target_dir, filename)
+            fileObj.save(file_path)
+            filenames.append(fileObj.filename)
     else:
-        # Use default file if no file was uploaded
-        src = os.path.abspath(default_files[file_key])
-        dst = os.path.join(target_dir, os.path.basename(src))
-        # Create soft link in uploads/timestamp
-        if not os.path.exists(dst):
-            os.symlink(src, dst)
-        return os.path.basename(src)
+        # default_files[file_key] is a list of default file paths
+        src_files = default_files[file_key]
+        for src in src_files:
+            # Use default file if no file was uploaded
+            src = os.path.abspath(src)
+            dst = os.path.join(target_dir, os.path.basename(src))
+            # Create soft link in uploads/timestamp
+            if not os.path.exists(dst):
+                os.symlink(src, dst)
+            filenames.append(os.path.basename(src))
+    return filenames
 
 
 def download(target_dir, URL):
@@ -284,3 +303,20 @@ def db_update(timestamp, set_id, set_value):
     db.execute("UPDATE submissions SET " + set_id + "=? where timestamp=? ", (set_value, timestamp))
     db.commit()
     db.close()
+
+def process_position(position_str):
+    try:
+        int(position_str)
+        return position_str  # a single integer; return the original string
+    except ValueError:
+        try:
+            position_list = [pos.strip() for pos in position_str.split(',')]
+            for pos in position_list:
+                int(pos)
+            return position_str
+        except ValueError:
+            raise ValueError("Position must be an integer or a comma-separated list of integers.")
+
+# Format file path into: ./uploads/$timestamp/item
+def format_paths(file_list, target_dir):
+    return [os.path.join(target_dir, filename) for filename in file_list]
\ No newline at end of file
diff --git a/reform.py b/reform.py
index 659bb3c..9288bb2 100644
--- a/reform.py
+++ b/reform.py
@@ -1,446 +1,559 @@
 import argparse
 import re
 import os
+import gzip
+import tempfile
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio.Alphabet import generic_dna
 
-
 def main():
-    ## Retrieve command line arguments
-    in_arg = get_input_args()
-
-    ## Retrieve ouptut directory
-    output_dir = in_arg.output_dir
-
-    ## Read the new fasta (to be inserted into the ref genome)
-    record = list(SeqIO.parse(in_arg.in_fasta, "fasta"))[0]
-
-    ## Generate index of sequences from ref reference fasta
-    chrom_seqs = SeqIO.index(in_arg.ref_fasta, 'fasta')
-
-    ## Obtain the sequence of the chromosome we want to modify
-    seq = chrom_seqs[in_arg.chrom]
-    seq_str = str(seq.seq)
-
-    ## Get the position to insert the new sequence
-    positions = get_position(in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str)
-    position = positions['position']
-    down_position = positions['down_position']
-    if position != down_position:
-        print("Removing nucleotides from position {} - {}".format(position, down_position))
-    print("Proceeding to insert sequence '{}' from {} at position {} on 
chromsome {}" - .format(record.description, in_arg.in_fasta, position, in_arg.chrom)) - - ## Build the new chromosome sequence with the inserted_seq - ## If the chromosome sequence length is in the header, replace it with new length - new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] - chrom_length = str(len(seq_str)) - new_length = str(len(new_seq)) - new_record = SeqRecord( - Seq(new_seq, generic_dna), - id=seq.id, - description=seq.description.replace(chrom_length, new_length) - ) - - ## Create new fasta file with modified chromosome - ref_basename = os.path.basename(in_arg.ref_fasta) - ref_name = os.path.splitext(ref_basename)[0] - new_fasta = output_dir + ref_name + '_reformed.fa' - with open(new_fasta, "w") as f: - for s in chrom_seqs: - if s == seq.id: - SeqIO.write([new_record], f, "fasta") - else: - SeqIO.write([chrom_seqs[s]], f, "fasta") - - print("New fasta file created: ", new_fasta) - print("Preparing to create new annotation file") - - ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff) - - ## Create new gff file - annotation_basename = os.path.basename(in_arg.ref_gff) - (annotation_name, annotation_ext) = os.path.splitext(annotation_basename) - new_gff_name = output_dir + annotation_name + '_reformed' + annotation_ext - new_gff = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, - len(str(record.seq))) - print("New {} file created: {} ".format(annotation_ext.upper(), new_gff.name)) + ## Retrieve command line arguments and number of iterations + in_arg, iterations = get_input_args() + + ## Retrieve ouptut directory + output_dir = in_arg.output_dir + + # TODO: modify the postion from up.fa and down.fa + + ## List for previous postion and modification length + prev_modifications = [] + + ## Path for the files generated in sequential processing, mainly managed by tempfile. + prev_fasta_path = None + prev_gff_path = None + + ## Sequential processing + for index in range(iterations): + ## Read the new fasta (to be inserted into the ref genome) + record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + + ## Generate index of sequences from ref reference fasta + if prev_fasta_path: + chrom_seqs = index_fasta(prev_fasta_path) + os.remove(prev_fasta_path) + else: + chrom_seqs = index_fasta(in_arg.ref_fasta) + + ## Obtain the sequence of the chromosome we want to modify + seq = chrom_seqs[in_arg.chrom] + seq_str = str(seq.seq) + + ## Get the position to insert the new sequence + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) + position = positions['position'] + down_position = positions['down_position'] + + ## Save current modification which include position(index) and length changed. 
+ if position == down_position: + length_changed = len(str(record.seq)) + else: + length_changed = len(str(record.seq)) - (down_position - position - 1) + prev_modifications.append((position,length_changed)) + + if position != down_position: + print("Removing nucleotides from position {} - {}".format(position, down_position)) + print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" + .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) + + ## Build the new chromosome sequence with the inserted_seq + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] + chrom_length = str(len(seq_str)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq, generic_dna), + id=seq.id, + description=seq.description.replace(chrom_length, new_length) + ) + + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = output_dir + ref_name + '_reformed.fa' + with open(new_fasta, "w") as f: + for s in chrom_seqs: + if s == seq.id: + SeqIO.write([new_record], f, "fasta") + else: + SeqIO.write([chrom_seqs[s]], f, "fasta") + print("New fasta file created: ", new_fasta) + + print("Preparing to create new annotation file") + + ## Read in new GFF features from in_gff + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + print(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + os.remove(prev_gff_path) + else: + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + prev_gff_path = new_gff_path + print("New {} file created: {} ".format(annotation_ext.upper(), prev_gff_path)) + + +def index_fasta(fasta_path): + ''' + Process incoming FASTA files, supporting both decompressed and uncompressed + formats. It takes a file path (fasta_path), decompresses the file into a + temporary file if it is a compressed file ending in .gz, and then indexes + the temporary file. Finally, the indexing result is returned. 
+ ''' + if fasta_path.endswith('.gz'): + with gzip.open(fasta_path, 'rt') as f: + # Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f.write(f.read()) + tmp_f_path = tmp_f.name + chrom_seqs = SeqIO.index(tmp_f_path, 'fasta') + # remove temp file + os.remove(tmp_f_path) + else: + chrom_seqs = SeqIO.index(fasta_path, 'fasta') + return chrom_seqs + +def get_ref_basename(filepath): + ''' + Takes a filepath and returns the filename and extension minus the .gz. + ''' + base = os.path.basename(filepath) + if base.endswith('.gz'): + base = base[:-3] # remove .gz + name, ext = os.path.splitext(base) + return name, ext def modify_gff_line(elements, start=None, end=None, comment=None): - ''' - Modifies an existing GFF line and returns the modified line. Currently, you can - override the start position, end position, and comment column. - - gff_out: an open file handle for writing new gff lines to - elements: a list containing each column (9 in total) of a single feature line in a gff file - start: if provided, overwrite the start position in elements with this start position - end: if provided, overwrite the end position in elements with this end position - comment: if provided, overwrite the comments column in elements with this comment - ''' - if start == None: - start = int(elements[3]) - if end == None: - end = int(elements[4]) - if comment == None: - comment = elements[8] - if not comment.endswith('\n'): - comment += '\n' - - ## Return the modified line - return ("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], - elements[6], elements[7], comment)) - - + ''' + Modifies an existing GFF line and returns the modified line. Currently, you can + override the start position, end position, and comment column. + + gff_out: an open file handle for writing new gff lines to + elements: a list containing each column (9 in total) of a single feature line in a gff file + start: if provided, overwrite the start position in elements with this start position + end: if provided, overwrite the end position in elements with this end position + comment: if provided, overwrite the comments column in elements with this comment + ''' + if start == None: + start = int(elements[3]) + if end == None: + end = int(elements[4]) + if comment == None: + comment = elements[8] + if not comment.endswith('\n'): + comment += '\n' + + ## Return the modified line + return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) + def get_in_gff_lines(in_gff): - ''' - Takes a gff file and returns a list of lists where - each parent list item is a single line of the gff file - and the child elements are the columns of the line - ''' - with open(in_gff, "r") as f: - in_gff_lines = [] - for line in f: - line_elements = line.split('\t') - # Skip comment lines - if line.startswith("#"): - continue - # Ensure lines have 9 columns - if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) - print(line_elements) - exit() - in_gff_lines.append(line_elements) - return in_gff_lines - - -def get_position(position, upstream, downstream, chrom, seq_str): - ''' - Determine the position in seq_str to insert the new sequence given - the position, upstream, downstream, and chrom arguments. - Note that either position, or upstream AND downstream sequences must - be provided. 
- ''' - if position is not None and position >= -1: - print("Checking position validity") - if position > len(seq_str): - print("** ERROR: Position greater than length of chromosome.") - print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position)) - exit() - elif position == -1: - position = len(seq_str) - # At the moment we don't accept down position as a param - # so set down_position = position - down_position = position - print("Position valid") - else: - print("No valid position specified, checking for upstream and downstream sequence") - if upstream is not None and downstream is not None: - seq_str = seq_str.upper() - upstream_fasta = list(SeqIO.parse(upstream, "fasta")) - upstream_seq = str(upstream_fasta[0].seq).upper() - downstream_fasta = list(SeqIO.parse(downstream, "fasta")) - downstream_seq = str(downstream_fasta[0].seq).upper() - # Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die - upstream_seq_count = seq_str.count(upstream_seq) - downstream_seq_count = seq_str.count(downstream_seq) - if upstream_seq_count == 1 and downstream_seq_count == 1: - ## Obtain the starting position of the left_strand - index = seq_str.find(upstream_seq) - position = index + len(upstream_seq) - down_position = seq_str.find(downstream_seq) - else: - print( - "** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") - print("Chromosome: {}\n".format(chrom)) - print("Upstream sequence found {} times".format(upstream_seq_count)) - print("Downstream sequence found {} times".format(downstream_seq_count)) - exit() - else: - print("** ERROR: You must specify a valid position or upstream and downstream sequences.") - exit() - if down_position < position: - print("** ERROR: Upstream and Downstream sequences must not overlap. Exiting.") - exit() - return {'position': position, 'down_position': down_position} - + ''' + Takes a gff file and returns a list of lists where + each parent list item is a single line of the gff file + and the child elements are the columns of the line + ''' + with open(in_gff, "r") as f: + in_gff_lines = [] + for line in f: + line_elements = line.split('\t') + # Skip comment lines + if line.startswith("#"): + continue + # Ensure lines have 9 columns + if len(line_elements) != 9: + print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(line_elements) + exit() + in_gff_lines.append(line_elements) + return in_gff_lines + +def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): + ''' + Determine the position in seq_str to insert the new sequence given + the position, upstream, downstream, and chrom arguments. + Note that either position, or upstream AND downstream sequences must + be provided. 
+ ''' + if positions and index < len(positions) and positions[index] >= -1: + print("Checking position validity") + position = positions[index] + ## Update current postion based on previous modification + for pos, lc in prev_modifications: + ## Also ignore when postion == -1 + if position >= pos: + position += lc + if position > len(seq_str): + print("** ERROR: Position greater than length of chromosome.") + print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position)) + exit() + elif position == -1: + position = len(seq_str) + # At the moment we don't accept down position as a param + # so set down_position = position + down_position = position + print("Position valid") + else: + print("No valid position specified, checking for upstream and downstream sequence") + if index < len(upstream) and index < len(downstream): + seq_str = seq_str.upper() + upstream_fasta = list(SeqIO.parse(upstream[index], "fasta")) + upstream_seq = str(upstream_fasta[0].seq).upper() + downstream_fasta = list(SeqIO.parse(downstream[index], "fasta")) + downstream_seq = str(downstream_fasta[0].seq).upper() + # Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die + upstream_seq_count = seq_str.count(upstream_seq) + downstream_seq_count = seq_str.count(downstream_seq) + ### TODO: Update postion based on previous modifications + if upstream_seq_count == 1 and downstream_seq_count == 1: + ## Obtain the starting position of the left_strand + new_index = seq_str.find(upstream_seq) + position = new_index + len(upstream_seq) + down_position = seq_str.find(downstream_seq) + else: + print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") + print("Chromosome: {}\n".format(chrom)) + print("Upstream sequence found {} times".format(upstream_seq_count)) + print("Downstream sequence found {} times".format(downstream_seq_count)) + exit() + else: + print("** ERROR: You must specify a valid position or upstream and downstream sequences.") + exit() + if down_position < position: + print("** ERROR: Upstream and Downstream sequences must not overlap. 
Exiting.") + exit() + return {'position': position, 'down_position': down_position} def write_in_gff_lines(gff_out, in_gff_lines, position, split_features): - for l in in_gff_lines: - new_gff_line = modify_gff_line( - l, start=int(l[3]) + position, end=int(l[4]) + position) - gff_out.write(new_gff_line) - - ## If insertion caused any existing features to be split, add - ## the split features now immediately after adding the new features - for sf in split_features: - modified_line = modify_gff_line( - sf[0], start=sf[1], end=sf[2], comment=sf[3]) - gff_out.write(modified_line) - - ## Return True after writing the new GFF lines - return True - + for l in in_gff_lines: + new_gff_line = modify_gff_line( + l, start = int(l[3]) + position, end = int(l[4]) + position) + gff_out.write(new_gff_line) + + ## If insertion caused any existing features to be split, add + ## the split features now immediately after adding the new features + for sf in split_features: + modified_line = modify_gff_line( + sf[0], start = sf[1], end = sf[2], comment = sf[3]) + gff_out.write(modified_line) + + ## Return True after writing the new GFF lines + return True def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): - ''' - Goes line by line through a gff file to remove existing features - (or parts of existing features) and/or insert new features. - In the process of adding new features, existing features - may be cut-off on either end or split into two. - This attempts to handle all cases. - new_gff_name: the name of the new gff file to create - ref_gff: the reference gff file to modify - in_gff_lines: a list of lists where each nested list is a list of - columns (in gff format) associated with each new feature to insert - position: start position of removal of existing sequence - down_position: end position of removal of existing sequence - chrom_id: the ID of the chromosome to modify - new_seq_length: the length of the new sequence being added to the chromosome - ''' - gff_out = open(new_gff_name, "w") - in_gff_lines_appended = False - split_features = [] - last_seen_chrom_id = None - gff_ext = new_gff_name.split('.')[-1] - with open(ref_gff, "r") as f: - for line in f: - line_elements = line.split('\t') - if line.startswith("#"): - if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: - ## Edit the length of the chromosome - original_length = int(line_elements[3]) - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - else: - gff_chrom_id = line_elements[0] - gff_feat_start = int(line_elements[3]) - gff_feat_end = int(line_elements[4]) - gff_feat_type = line_elements[2] - gff_feat_strand = line_elements[6] - gff_comments = line_elements[8].strip() - - # If we've seen at least one chromosome - # and the last chromosome seen was the chromosome of interest (i.e. 
chrom_id) - # and now we're on a new chromosome in the original gff file - # and the in_gff_lines have not yet been appended: - # we assume they need to be appended to the end of the chromosome - # so append them before proceeding - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and gff_chrom_id != last_seen_chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features) - - last_seen_chrom_id = gff_chrom_id - - # If this is not the chromosome of interest - # Or if the current feature ends before any - # modification (which occurs at position) - # Then simply write the feature as is (no modification) - # (remember gff co-ordinates are 1 based and position - # is 0 based, therefor check less than *or equal to*) - if gff_chrom_id != chrom_id or gff_feat_end <= position: - gff_out.write(line) - - # Modify chromosome feature length (if feature is - # "chromosome" or "region") - elif gff_feat_type in ['chromosome', 'region']: - original_length = gff_feat_end - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - - # Split feature into 2 if feature starts before position - # and ends after down_position - elif gff_feat_start <= position and gff_feat_end > down_position: - print("Feature split") - # Which side of the feature depends on the strand (we add this as a comment) - (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") - - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(x), - gff_ext - ) - # Modified feature ends at 'position' - modified_line = modify_gff_line( - line_elements, - end=position, - comment=gff_comments + new_comment - ) - gff_out.write(modified_line) - - # The downstream flank(s) will be added immediately after the - # in_gff (new) features are written. 
- # First, attempt to rename IDs (to indicate split) - renamed_id_attributes = rename_id(line) - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(y), - gff_ext - ) - split_features.append( - ( - line_elements, - position + new_seq_length + 1, - gff_feat_end + new_seq_length - (down_position - position), - renamed_id_attributes + new_comment - ) - ) - - # Change end position of feature to cut off point (position) if the - # feature ends within the deletion (between position & down_position) - elif gff_feat_start <= position and gff_feat_end <= down_position: - # Which side of the feature depends on the strand (we add this as a comment) - x = "3" if gff_feat_strand == "+" else "5" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - end=position, - comment=gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Skip this feature if it falls entirely within the deletion - elif gff_feat_start > position and gff_feat_end <= down_position: - print("Skip feature (this feature was removed from sequence)") - continue - - else: - if not in_gff_lines_appended: - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features) - - # Change start position of feature to after cutoff point if - # the feature starts within the deletion - if (gff_feat_start > position - and gff_feat_start <= down_position - and gff_feat_end > down_position): - x = "5" if gff_feat_strand == "+" else "3" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - start=position + new_seq_length + 1, - end=gff_feat_end + new_seq_length - (down_position - position), - comment=gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Offset all downstream feature positions by offset length - elif gff_feat_start > down_position: - offset_length = new_seq_length - (down_position - position) - modified_line = modify_gff_line( - line_elements, - start=gff_feat_start + offset_length, - end=gff_feat_end + offset_length - ) - gff_out.write(modified_line) - - else: - print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) - exit() - - # If we've iterated over the entire original gff - # (i.e. out of the for loop which iterates over it) - # and still haven't written in_gff_lines - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): - # we assume they need to be appended to the end of the genome - # (i.e. end of the last chromosome) - # so append them now - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features) - - # Checking to ensure in_gff_lines written - if not in_gff_lines_appended: - print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") - exit() - - gff_out.close() - - return gff_out - + ''' + Goes line by line through a gff file to remove existing features + (or parts of existing features) and/or insert new features. 
+ In the process of adding new features, existing features + may be cut-off on either end or split into two. + This attempts to handle all cases. + new_gff_name: the name of the new gff file to create + ref_gff: the reference gff file to modify + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + position: start position of removal of existing sequence + down_position: end position of removal of existing sequence + chrom_id: the ID of the chromosome to modify + new_seq_length: the length of the new sequence being added to the chromosome + ''' + with open(new_gff_name, "w") as gff_out: + in_gff_lines_appended = False + split_features = [] + last_seen_chrom_id = None + gff_ext = new_gff_name.split('.')[-1] + ref_gff_path = ref_gff + if ref_gff.endswith('.gz'): + with gzip.open(ref_gff, 'rt') as f: + # Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f.write(f.read()) + ref_gff_path = tmp_f.name + with open(ref_gff_path, "r") as f: + for line in f: + line_elements = line.split('\t') + if line.startswith("#"): + if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: + ## Edit the length of the chromosome + original_length = int(line_elements[3]) + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + else: + gff_chrom_id = line_elements[0] + gff_feat_start = int(line_elements[3]) + gff_feat_end = int(line_elements[4]) + gff_feat_type = line_elements[2] + gff_feat_strand = line_elements[6] + gff_comments = line_elements[8].strip() + + # If we've seen at least one chromosome + # and the last chromosome seen was the chromosome of interest (i.e. 
chrom_id) + # and now we're on a new chromosome in the original gff file + # and the in_gff_lines have not yet been appended: + # we assume they need to be appended to the end of the chromosome + # so append them before proceeding + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and gff_chrom_id != last_seen_chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features) + + last_seen_chrom_id = gff_chrom_id + + # If this is not the chromosome of interest + # Or if the current feature ends before any + # modification (which occurs at position) + # Then simply write the feature as is (no modification) + # (remember gff co-ordinates are 1 based and position + # is 0 based, therefor check less than *or equal to*) + if gff_chrom_id != chrom_id or gff_feat_end <= position: + gff_out.write(line) + + # Modify chromosome feature length (if feature is + # "chromosome" or "region") + elif gff_feat_type in ['chromosome', 'region']: + original_length = gff_feat_end + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + + # Split feature into 2 if feature starts before position + # and ends after down_position + elif gff_feat_start <= position and gff_feat_end > down_position: + print("Feature split") + # Which side of the feature depends on the strand (we add this as a comment) + (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") + + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(x), + gff_ext + ) + # Modified feature ends at 'position' + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # The downstream flank(s) will be added immediately after the + # in_gff (new) features are written. 
+ # First, attempt to rename IDs (to indicate split) + renamed_id_attributes = rename_id(line) + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(y), + gff_ext + ) + split_features.append( + ( + line_elements, + position + new_seq_length + 1, + gff_feat_end + new_seq_length - (down_position - position), + renamed_id_attributes + new_comment + ) + ) + + # Change end position of feature to cut off point (position) if the + # feature ends within the deletion (between position & down_position) + elif gff_feat_start <= position and gff_feat_end <= down_position: + # Which side of the feature depends on the strand (we add this as a comment) + x = "3" if gff_feat_strand == "+" else "5" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Skip this feature if it falls entirely within the deletion + elif gff_feat_start > position and gff_feat_end <= down_position: + print("Skip feature (this feature was removed from sequence)") + continue + + else: + if not in_gff_lines_appended: + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features) + + # Change start position of feature to after cutoff point if + # the feature starts within the deletion + if (gff_feat_start > position + and gff_feat_start <= down_position + and gff_feat_end > down_position): + x = "5" if gff_feat_strand == "+" else "3" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + start = position + new_seq_length + 1, + end = gff_feat_end + new_seq_length - (down_position - position), + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Offset all downstream feature positions by offset length + elif gff_feat_start > down_position: + offset_length = new_seq_length - (down_position - position) + modified_line = modify_gff_line( + line_elements, + start = gff_feat_start + offset_length, + end = gff_feat_end + offset_length + ) + gff_out.write(modified_line) + + else: + print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) + exit() + + # If we've iterated over the entire original gff + # (i.e. out of the for loop which iterates over it) + # and still haven't written in_gff_lines + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): + # we assume they need to be appended to the end of the genome + # (i.e. end of the last chromosome) + # so append them now + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features) + + # Checking to ensure in_gff_lines written + if not in_gff_lines_appended: + print("** Error: Something went wrong, in_gff not added to reference gff. 
Exiting") + exit() + # Remove temp gtf file + if ref_gff.endswith('.gz'): + os.remove(ref_gff_path) + return new_gff_name def format_comment(comment, ext): - ''' - Format comment according to ext (GFF or GTF) and return - ''' - if ext.lower() == 'gtf': - new_comment = 'reform_comment "{}";'.format(comment) - elif ext.lower().startswith('gff'): - new_comment = "; reform_comment={}".format(comment) - else: - print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext)) - exit() - return new_comment - - + ''' + Format comment according to ext (GFF or GTF) and return + ''' + if ext.lower() == 'gtf': + new_comment = 'reform_comment "{}";'.format(comment) + elif ext.lower().startswith('gff'): + new_comment = "; reform_comment={}".format(comment) + else: + print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext)) + exit() + return new_comment + def rename_id(line): - ''' - Given a gff or gtf line, this function will append the string "_split" - to the end of the ID/gene_id attribute - ''' - attributes = line.split('\t')[8].strip() - elements = attributes.split(';') - if elements[0].startswith("ID="): - print("Renaming split feature {} --> {}_split".format(elements[0], elements[0])) - return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) - elif elements[0].startswith("gene_id "): - gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] - print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id)) - return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) - - else: - print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line) - return attributes - - + ''' + Given a gff or gtf line, this function will append the string "_split" + to the end of the ID/gene_id attribute + ''' + attributes = line.split('\t')[8].strip() + elements = attributes.split(';') + if elements[0].startswith("ID="): + print("Renaming split feature {} --> {}_split".format(elements[0], elements[0])) + return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) + elif elements[0].startswith("gene_id "): + gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] + print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id)) + return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) + + else: + print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line) + return attributes + def get_input_args(): - parser = argparse.ArgumentParser() - - parser.add_argument('--chrom', type=str, required=True, - help="Chromosome name (String)") - parser.add_argument('--in_fasta', type=str, required=True, - help="Path to new sequence to be inserted into reference genome in fasta format") - parser.add_argument('--in_gff', type=str, required=True, - help="Path to GFF file describing new fasta sequence to be inserted") - parser.add_argument('--upstream_fasta', type=str, default=None, - help="Path to Fasta file with upstream sequence. Either position, or upstream AND downstream sequence must be provided.") - parser.add_argument('--downstream_fasta', type=str, default=None, - help="Path to Fasta file with downstream sequence. Either position, or upstream AND downstream sequence must be provided.") - parser.add_argument('--position', type=int, default=None, - help="Position at which to insert new sequence. Note: Position is 0-based. 
 def get_input_args():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument('--chrom', type=str, required=True,
-                        help="Chromosome name (String)")
-    parser.add_argument('--in_fasta', type=str, required=True,
-                        help="Path to new sequence to be inserted into reference genome in fasta format")
-    parser.add_argument('--in_gff', type=str, required=True,
-                        help="Path to GFF file describing new fasta sequence to be inserted")
-    parser.add_argument('--upstream_fasta', type=str, default=None,
-                        help="Path to Fasta file with upstream sequence. Either position, or upstream AND downstream sequence must be provided.")
-    parser.add_argument('--downstream_fasta', type=str, default=None,
-                        help="Path to Fasta file with downstream sequence. Either position, or upstream AND downstream sequence must be provided.")
-    parser.add_argument('--position', type=int, default=None,
-                        help="Position at which to insert new sequence. Note: Position is 0-based. Either position, or upstream AND downstream sequence must be provided.")
-    parser.add_argument('--ref_fasta', type=str, required=True,
-                        help="Path to reference fasta file")
-    parser.add_argument('--ref_gff', type=str, required=True,
-                        help="Path to reference gff file")
-    parser.add_argument('--output_dir', type=str, default=None,
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--chrom', type = str, required = True,
+                        help = "Chromosome name (String)")
+    parser.add_argument('--in_fasta', type=str, required=True,
+                        help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format")
+    parser.add_argument('--in_gff', type=str, required=True,
+                        help="Path(s) to GFF file(s) describing new fasta sequence(s) to be inserted")
+    parser.add_argument('--upstream_fasta', type=str, default=None,
+                        help="Path(s) to Fasta file(s) with upstream sequence. Either position, or upstream AND downstream sequence must be provided.")
+    parser.add_argument('--downstream_fasta', type=str, default=None,
+                        help="Path(s) to Fasta file(s) with downstream sequence. Either position, or upstream AND downstream sequence must be provided.")
+    parser.add_argument('--position', type=str, default=None,
+                        help="Comma-separated positions at which to insert new sequences. Note: positions are 0-based, with no spaces around the commas. Either position, or upstream AND downstream sequence must be provided.")
+    parser.add_argument('--ref_fasta', type = str, required = True,
+                        help = "Path to reference fasta file")
+    parser.add_argument('--ref_gff', type = str, required = True,
+                        help = "Path to reference gff file")
+    parser.add_argument('--output_dir', type=str, default=None,
                         help="Path to output to")
-
-    in_args = parser.parse_args()
-
-    if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
-        print("** Error: You must provide either the position, or the upstream and downstream sequences.")
-        exit()
-
-    return in_args
-
-
+
+    in_args = parser.parse_args()
+    in_args.in_fasta = in_args.in_fasta.split(',')
+    in_args.in_gff = in_args.in_gff.split(',')
+    if in_args.upstream_fasta:
+        in_args.upstream_fasta = in_args.upstream_fasta.split(',')
+    if in_args.downstream_fasta:
+        in_args.downstream_fasta = in_args.downstream_fasta.split(',')
+
+    if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
+        print("** Error: You must provide either the position, or the upstream and downstream sequences.")
+        exit()
+
+    if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta):
+        print("** Error: The number of upstream_fasta and downstream_fasta files does not match.")
+        exit()
+
+    if in_args.position is not None:
+        try:
+            in_args.position = list(map(int, in_args.position.split(',')))
+            iterations = len(in_args.position)
+        except ValueError:
+            print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.")
+            exit()
+    else:
+        iterations = len(in_args.upstream_fasta)
+
+    if (len(in_args.in_fasta) != len(in_args.in_gff)) or (len(in_args.in_fasta) != iterations):
+        print("** Error: The number of in_fasta files must match the number of in_gff files and the number of insertions (positions or upstream/downstream pairs).")
+        exit()
+
+    return in_args, iterations
+
 if __name__ == "__main__":
-    main()
+    main()
+    # Temp code for step 1 testing, will be removed later
+    # args, iterations = get_input_args()
+    # print("iterations:", iterations)
+    # print("Chromosome:", args.chrom)
+    # print("Input FASTA files:", args.in_fasta)
+    # print("Input GFF files:", args.in_gff)
+    # print("Reference FASTA file:", args.ref_fasta)
+    # print("Reference GFF file:", args.ref_gff)
+    # print("Upstream FASTA files:", args.upstream_fasta)
+    # print("Downstream FASTA files:", args.downstream_fasta)
+    # print("Position:", list(map(int, args.position.split(','))))
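For orientation, get_input_args() now returns the parsed arguments together with an iteration count so the caller can apply one insertion per position (or per upstream/downstream pair). The loop below is a minimal sketch of how a caller such as main() might consume that tuple; the patch does not show main(), so the loop structure and variable names here are assumptions for illustration only.

args, iterations = get_input_args()
for i in range(iterations):
    in_fasta = args.in_fasta[i]
    in_gff = args.in_gff[i]
    position = args.position[i] if args.position else None
    upstream_fasta = args.upstream_fasta[i] if args.upstream_fasta else None
    downstream_fasta = args.downstream_fasta[i] if args.downstream_fasta else None
    # Each insertion would then be applied to the output of the previous one
    # (sequential processing), using either `position` or the
    # upstream/downstream pair to locate the insertion site.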
files:", args.in_fasta) + # print("Input GFF files:", args.in_gff) + # print("Reference FASTA file:", args.ref_fasta) + # print("Reference GFF file:", args.ref_gff) + # print("Upstream FASTA files:", args.upstream_fasta) + # print("Downstream FASTA files:", args.downstream_fasta) + # print("Position:", list(map(int, args.position.split(',')))) diff --git a/run.sh b/run.sh index cb68f2e..1b8739e 100644 --- a/run.sh +++ b/run.sh @@ -85,24 +85,25 @@ else fi # Run reform.py +# ./uploads/$timestamp/ if [ ! -z "$position" ]; then - echo /home/reform/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + echo /home/reform/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" - /home/reform/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + /home/reform/venv/bin/python reform.py --chrom $chrom --position $position --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" 2>&1 | tee ./results/$timestamp/$timestamp-worker-err.log else - echo /home/reform/venv/bin/python reform.py --chrom $chrom --upstream_fasta ./uploads/$timestamp/$upstream_fasta \ - --downstream_fasta ./uploads/$timestamp/$downstream_fasta --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + echo /home/reform/venv/bin/python reform.py --chrom $chrom --upstream_fasta $upstream_fasta \ + --downstream_fasta $downstream_fasta --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" - /home/reform/venv/bin/python reform.py --chrom $chrom --upstream_fasta ./uploads/$timestamp/$upstream_fasta \ - --downstream_fasta ./uploads/$timestamp/$downstream_fasta --in_fasta ./uploads/$timestamp/$in_fasta \ - --in_gff ./uploads/$timestamp/$in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ + /home/reform/venv/bin/python reform.py --chrom $chrom --upstream_fasta $upstream_fasta \ + --downstream_fasta $downstream_fasta --in_fasta $in_fasta \ + --in_gff $in_gff --ref_fasta "$ref_fasta_path" --ref_gff "$ref_gff_path" \ --output_dir "./results/$timestamp/" 2>&1 | tee ./results/$timestamp/$timestamp-worker-err.log fi From 38e5d73374e6fc95e1de91c1a4f24985562ba069 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 4 Oct 2024 22:20:47 -0400 Subject: [PATCH 2/2] Pass compressed files directly to reform --- run.sh | 61 +++++++++------------------------------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/run.sh b/run.sh index 1b8739e..2054bcf 100644 --- a/run.sh +++ b/run.sh @@ -28,61 +28,18 @@ mkdir -p ./results/$timestamp log_file="./results/$timestamp/$timestamp-worker-out.log" exec > >(tee -a "$log_file") -# Function to determine if a path is a URL -is_url() { - if [[ $1 =~ ^https?:// ]] || [[ $1 =~ ^ftp:// ]]; then - return 0 # It's a URL - else - return 1 # It's a file path - fi -} - -# Function to process downloading and decompressing files -download_and_decompress() { - local file_url=$1 - local target_path=$2 - 
-    local file_name=$(basename "$file_url")
-
-    echo "Downloading $file_url"
-    wget --no-check-certificate -nv "$file_url" -O "$target_path/$file_name"
-    if [[ ${file_name: -3} == ".gz" ]]; then
-        echo "pigz -d $target_path/$file_name"
-        pigz -d "$target_path/$file_name"
-        file_name=${file_name::-3}
-    fi
-}
-
 # Create the upload directories
 echo "mkdir -p ./$target_dir"
 mkdir -p "./$target_dir"
 
 # Variables to hold the final paths to be used in the reform.py command
-ref_fasta_path="$ref_fasta"
-ref_gff_path="$ref_gff"
-
-# Check and process the reference fasta file
-if is_url "$ref_fasta"; then
-    download_and_decompress "$ref_fasta" "./$target_dir"
-    ref_fasta_path="./$target_dir/$(basename "$ref_fasta")"
-    if [[ ${ref_fasta_path: -3} == ".gz" ]]; then
-        ref_fasta_path="${ref_fasta_path::-3}"
-    fi
-else
-    echo "Using local file: $ref_fasta"
-    ref_fasta_path="$ref_fasta"
-fi
+echo "Downloading $ref_fasta"
+wget --no-check-certificate -nv "$ref_fasta" -O "$target_dir/$(basename "$ref_fasta")"
+ref_fasta_path="$target_dir/$(basename "$ref_fasta")"
 
-# Check and process the reference gff file
-if is_url "$ref_gff"; then
-    download_and_decompress "$ref_gff" "./$target_dir"
-    ref_gff_path="./$target_dir/$(basename "$ref_gff")"
-    if [[ ${ref_gff_path: -3} == ".gz" ]]; then
-        ref_gff_path="${ref_gff_path::-3}"
-    fi
-else
-    echo "Using local file: $ref_gff"
-    ref_gff_path="$ref_gff"
-fi
+echo "Downloading $ref_gff"
+wget --no-check-certificate -nv "$ref_gff" -O "$target_dir/$(basename "$ref_gff")"
+ref_gff_path="$target_dir/$(basename "$ref_gff")"
 
 # Run reform.py
 # ./uploads/$timestamp/
@@ -107,9 +64,9 @@ else
     --output_dir "./results/$timestamp/" 2>&1 | tee ./results/$timestamp/$timestamp-worker-err.log
 fi
 
-# remove upload folder
-echo "rm -Rf ./uploads/$timestamp"
-rm -Rf ./uploads/$timestamp
+# # remove upload folder
+# echo "rm -Rf ./uploads/$timestamp"
+# rm -Rf ./uploads/$timestamp
 
 # create downloads directory
 echo "mkdir -p ./downloads/$timestamp"
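The second patch stops decompressing the downloaded references in run.sh and, per its subject line, passes the compressed files directly to reform. The sketch below shows one way reform.py could read a reference that may or may not be gzip-compressed, consistent with the ref_gff.endswith('.gz') cleanup in the reform.py hunk above; the helper names and approach are illustrative assumptions, not code from the patch.

import gzip
import shutil

def open_maybe_gzipped(path):
    """Open a reference file for reading, transparently handling .gz input."""
    if path.endswith('.gz'):
        return gzip.open(path, 'rt')
    return open(path, 'r')

def decompress_to_temp(gz_path):
    """Decompress a *.gz file to a plain copy that can be removed after use,
    mirroring the os.remove(ref_gff_path) cleanup above."""
    plain_path = gz_path[:-3]   # strip the ".gz" suffix
    with gzip.open(gz_path, 'rb') as src, open(plain_path, 'wb') as dst:
        shutil.copyfileobj(src, dst)
    return plain_path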