diff --git a/pipeline/seq_retrieval/.coveragerc b/pipeline/seq_retrieval/.coveragerc index fedc6eb3..58ef2035 100644 --- a/pipeline/seq_retrieval/.coveragerc +++ b/pipeline/seq_retrieval/.coveragerc @@ -6,6 +6,8 @@ omit = src/main.py # Exclude logging code (no need to be tested) src/log_mgmt/* + # Exclude comparison analysis code + src/analysis/* [report] exclude_lines = @@ -15,3 +17,6 @@ exclude_lines = # Exclude main method calling (not testable through unit testing) if __name__ == .__main__.: + + # Exclude all functions labeled with "pragma: no cover" + pragma: no cover diff --git a/pipeline/seq_retrieval/src/analysis/.gitignore b/pipeline/seq_retrieval/src/analysis/.gitignore new file mode 100644 index 00000000..c917bff4 --- /dev/null +++ b/pipeline/seq_retrieval/src/analysis/.gitignore @@ -0,0 +1,10 @@ +# Analyis input and DB files +*.db +*.gff +*.gff3 +*.gz +# Analysis output files +*.tsv +*.out +*.err +*.debug diff --git a/pipeline/seq_retrieval/src/analysis/__init__.py b/pipeline/seq_retrieval/src/analysis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pipeline/seq_retrieval/src/analysis/load_gff_db.py b/pipeline/seq_retrieval/src/analysis/load_gff_db.py new file mode 100644 index 00000000..2a5ec1ff --- /dev/null +++ b/pipeline/seq_retrieval/src/analysis/load_gff_db.py @@ -0,0 +1,13 @@ +import gffutils # type: ignore +import click + + +@click.command(context_settings={'show_default': True}) +@click.option("--infile", type=click.STRING, required=True, help="Example: 'GFF_HUMAN_0.gff'") +@click.option("--outfile", type=click.STRING, required=True, help="Example: 'GFF-HUMAN.db'") +def main(infile: str, outfile: str) -> None: + gffutils.create_db(infile, outfile, force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True) + + +if __name__ == '__main__': + main() diff --git a/pipeline/seq_retrieval/src/analysis/orf_comparison.py b/pipeline/seq_retrieval/src/analysis/orf_comparison.py new file mode 100644 index 00000000..fc1d7287 --- /dev/null +++ b/pipeline/seq_retrieval/src/analysis/orf_comparison.py @@ -0,0 +1,167 @@ +from gffutils import FeatureDB # type: ignore +from Bio.Data import CodonTable + +from typing import List +import click + +import sys +sys.path.append('../') +from seq_region import SeqRegion, MultiPartSeqRegion # noqa: E402 +from seq_region.multipart_seq_region import find_orfs # noqa: E402 + +import logging # noqa: E402 +from log_mgmt import get_logger, set_log_level # noqa: E402 +logger = get_logger(name=__name__) + + +# db = gffutils.create_db("GFF_HUMAN_0.gff", 'GFF-HUMAN.db', force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True) + +@click.command(context_settings={'show_default': True}) +@click.option("--mod", type=click.STRING, required=True, help="Example: 'XBXL'") +@click.option("--fasta_file", type=click.STRING, required=True, help="Example: '/home/mlp/AGR/fastas/XENLA_9.2_genome.fa.gz'") +def main(mod: str, fasta_file: str) -> None: # noqa: C901 + # MOD = 'XBXL' + # FASTA_FILE = 'file:///home/mlp/AGR/fastas/XENLA_9.2_genome.fa.gz' + + db = FeatureDB(f'GFF-{mod}.db', keep_order=True) + CDS_OUTPUT_FILENAME = f'{mod}-CDS.tsv' + ORF_OUTPUT_FILENAME = f'{mod}-ORF.tsv' + + DEBUG = False + + if DEBUG: + set_log_level(logging.DEBUG) + CDS_OUTPUT_FILENAME += '.debug' + ORF_OUTPUT_FILENAME += '.debug' + + cds_outfile = open(CDS_OUTPUT_FILENAME, mode='w') + orf_outfile = open(ORF_OUTPUT_FILENAME, mode='w') + + for transcript in db.features_of_type('mRNA'): + # if transcript.id != 'rna23588': + # continue + + # Skip Mitochondrial seqs + if transcript.seqid in ['MtDNA', 'MT', 'M', 'mitochondrion_genome', 'chrmt']: + continue + + exons = db.children(transcript, level=1, featuretype='exon') + + seq_region_objs: List[SeqRegion] = [] + for exon in exons: + + seq_region_objs.append(SeqRegion(seq_id=exon.seqid, start=exon.start, end=exon.end, strand=exon.strand, + fasta_file_url='file://' + fasta_file)) + + # Skip transcripts with no exons in GFF + if len(seq_region_objs) == 0: + continue + + # Skip if error occurs during MultiParSeqRegion definition + transcript_region: MultiPartSeqRegion + try: + transcript_region = MultiPartSeqRegion(seq_regions=seq_region_objs) + except Exception as err: + logger.warning(f'Skipping transcript {transcript.id} because exception during MultiPartSeqRegion creation: {err}') + continue + + # print(f'Transcript: {transcript.start}-{transcript.end}') + + # Skip if Error occured during sequence retrieval + try: + transcript_region.fetch_seq(recursive_fetch=True) + dna_sequence = transcript_region.get_sequence(unmasked=False) + except Exception as err: + logger.warning(f'Skipping transcript {transcript.id} because exception during sequence(s) retrieval: {err}') + continue + # print(dna_sequence) + + codon_table: CodonTable.CodonTable = CodonTable.unambiguous_dna_by_name["Standard"] + + # Find the best open reading frame + orfs = find_orfs(dna_sequence, codon_table, return_type='longest') + + # print(f'ORF: {orfs[0]['seq_start']}-{orfs[0]['seq_end']}') + if len(orfs) > 0: + orf_outfile.write(f"{transcript.id}\t{orfs[0]['seq_start']}\t{orfs[0]['seq_end']}\n") + else: + orf_outfile.write(f"{transcript.id}\t-\t-\n") + + cds_parts_iterator = db.children(transcript, level=1, featuretype='CDS') + cds_parts = [] + for cds_part in cds_parts_iterator: + cds_parts.append(cds_part) + + cds_start: int | None = None + cds_end: int | None = None + + abs_cds_start: int | None = None + abs_cds_end: int | None = None + cds_error: str | None = None + cds_regions: List[SeqRegion] = [] + + if len(cds_parts) > 0: + + for cds_part in cds_parts: + if cds_part.start < transcript.start or transcript.end < cds_part.end: + cds_error = 'out-of-bounds' + logger.warning(f'Out-of-bounds CDS regions detected, cancelling CDS calculations for {transcript.id}.') + break + + cds_regions.append(SeqRegion(seq_id=cds_part.seqid, start=cds_part.start, end=cds_part.end, strand=cds_part.strand, + fasta_file_url='file://' + fasta_file)) + + for i in range(0, len(cds_regions) - 1): + for j in range(i + 1, len(cds_regions)): + logger.info(f'Overlap analysis for {cds_regions[i]} and {cds_regions[j]}') + if cds_regions[i].overlaps(cds_regions[j]): + cds_error = 'overlapping' + logger.warning(f'Overlapping CDS regions detected, cancelling CDS calculations for {transcript.id}.' + + f" Conflicting CDS parts: {cds_regions[i]} and {cds_regions[j]}\n") + break + + if cds_error is None: + abs_cds_start = min(map(lambda region: region.start, cds_regions)) + abs_cds_end = max(map(lambda region: region.end, cds_regions)) + + if transcript_region.strand == '-': + cds_start = transcript_region.seq_to_rel_pos(abs_cds_end) + else: + cds_start = transcript_region.seq_to_rel_pos(abs_cds_start) + + cds_end = sum(map(lambda region: region.seq_length, cds_regions)) + cds_start - 1 + + if cds_start is not None: + cds_outfile.write(f"{transcript.id}\t{cds_start}\t{cds_end}\n") + else: + cds_outfile.write(f"{transcript.id}\t-\t-\n") + + # Define and print comparison result + result = [] + if len(orfs) == 0 and len(cds_parts) == 0: + result.append('equal') + elif len(orfs) == 0: + result.append('undetected-ORF') + elif len(cds_parts) == 0: + result.append('no-CDS') + else: + orf = orfs.pop() + if cds_start is None or cds_end is None: + result.append(f'{cds_error}-CDS') + elif orf['seq_start'] == cds_start and orf['seq_end'] == cds_end: + result.append('equal') + elif orf['seq_end'] == cds_end and orf['seq_start'] < cds_start: + result.append('shortened-start') + elif orf['seq_end'] - orf['seq_start'] > cds_end - cds_start: + result.append('shorter-orf') + else: + result.append('unknown') + + print(f"{transcript.id}\t{','.join(result)}") + + orf_outfile.close() + cds_outfile.close() + + +if __name__ == '__main__': + main() diff --git a/pipeline/seq_retrieval/src/analysis/requirements.txt b/pipeline/seq_retrieval/src/analysis/requirements.txt new file mode 100644 index 00000000..819945c1 --- /dev/null +++ b/pipeline/seq_retrieval/src/analysis/requirements.txt @@ -0,0 +1 @@ +gffutils==0.13 diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 61ea7b37..5f956b8c 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -134,14 +134,10 @@ def main(seq_id: str, seq_strand: str, seq_regions: List[Dict[str, int]], fasta_ seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand, fasta_file_url=fasta_file_url)) - for seq_region in seq_region_objs: - # Retrieve sequence for region - seq_region.fetch_seq() - # Concatenate all regions into single sequence fullRegion = MultiPartSeqRegion(seq_regions=seq_region_objs) - fullRegion.fetch_seq() + fullRegion.fetch_seq(recursive_fetch=True) seq_concat = fullRegion.get_sequence(unmasked=unmasked) logger.debug(f"full region: {fullRegion.seq_id}:{fullRegion.start}-{fullRegion.end}:{fullRegion.strand}") diff --git a/pipeline/seq_retrieval/src/seq_region/multipart_seq_region.py b/pipeline/seq_retrieval/src/seq_region/multipart_seq_region.py index 4d8cad15..84cf195a 100644 --- a/pipeline/seq_retrieval/src/seq_region/multipart_seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/multipart_seq_region.py @@ -34,16 +34,17 @@ def __init__(self, seq_regions: List[SeqRegion]): Args: seq_regions: list of SeqRegion objects that constitute this multi-part sequence region.\ All SeqRegions must have identical seq_id, strand and fasta_file_path properties \ - to form a valid MultipartSeqRegion. + to form a valid MultipartSeqRegion. Regions cannot overlap. Raises: - ValueError: if `seq_regions` have distinct `seq_id`, `strand` or `fasta_file_path` properties. + ValueError: if `seq_regions` have distinct `seq_id`, `strand` or `fasta_file_path` properties, or if regions overlap. """ self.seq_length: int = sum(map(lambda seq_region: seq_region.seq_length, seq_regions)) self.start: int = min(map(lambda seq_region: seq_region.start, seq_regions)) self.end: int = max(map(lambda seq_region: seq_region.end, seq_regions)) + # Ensure one strand strands: Set[str] = set(map(lambda seq_region: seq_region.strand, seq_regions)) if len(strands) > 1: raise ValueError(f"Multiple strands defined accross seq regions ({strands})." @@ -51,6 +52,7 @@ def __init__(self, seq_regions: List[SeqRegion]): else: self.strand = strands.pop() + # Ensure one seq_id seq_ids: Set[str] = set(map(lambda seq_region: seq_region.seq_id, seq_regions)) if len(seq_ids) > 1: raise ValueError(f"Multiple seq_ids defined accross seq regions ({seq_ids})." @@ -58,6 +60,7 @@ def __init__(self, seq_regions: List[SeqRegion]): else: self.seq_id = seq_ids.pop() + # Ensure one fasta_file_path fasta_file_paths: Set[str] = set(map(lambda seq_region: seq_region.fasta_file_path, seq_regions)) if len(fasta_file_paths) > 1: raise ValueError(f"Multiple fasta_file_paths defined accross seq regions ({fasta_file_paths})." @@ -65,6 +68,14 @@ def __init__(self, seq_regions: List[SeqRegion]): else: self.fasta_file_path = fasta_file_paths.pop() + # Ensure no overlap between seq_regions + for i in range(0, len(seq_regions) - 1): + for j in range(i + 1, len(seq_regions)): + if seq_regions[i].overlaps(seq_regions[j]): + raise ValueError(f"Overlapping seq regions found ({seq_regions[i]} and {seq_regions[j]})." + + " a MultiPartSeqRegion cannot consist of overlapping parts.") + + # Sort seq_regions before storing sort_args: Dict[str, Any] = dict(key=lambda region: region.start, reverse=False) if self.strand == '-': @@ -76,18 +87,30 @@ def __init__(self, seq_regions: List[SeqRegion]): self.ordered_seqRegions = ordered_seq_regions @override - def fetch_seq(self) -> None: + def __str__(self) -> str: # pragma: no cover + return self.ordered_seqRegions.__str__() + + @override + def fetch_seq(self, recursive_fetch: bool = False) -> None: """ Fetch genetic (DNA) sequence for MultiPartSeqRegion by chaining \ consisting SeqRegions' sequenes together into one continuous sequence. Chains seqRegions in the order defined in the `ordered_seqRegions` attribute. + Args: + recursive_fetch: if True, fetch sequence for each SeqRegion part of the MultiPartSeqRegion first, before chaining the results. + Returns: Stores resulting sequence in `sequence` attribute. """ - self.set_sequence(''.join(map(lambda region: region.get_sequence(), self.ordered_seqRegions))) + def get_fetch_sequence(region: SeqRegion) -> str: + if recursive_fetch: + region.fetch_seq() + return region.get_sequence() + + self.set_sequence(''.join(map(lambda region: get_fetch_sequence(region), self.ordered_seqRegions))) @override def set_sequence(self, sequence: str) -> None: @@ -154,6 +177,37 @@ def translate(self) -> str | None: logger.warning('No open reading frames found, so no translation made.') return None + def seq_to_rel_pos(self, seq_position: int) -> int: + """ + Convert absolute sequence position to relative position within the MultipartSeqRegion + + Returns: + Relative position on the complete MultipartSeqRegion sequence (1-based) + + Raises: + ValueError: when abs_position falls between SeqRegion parts + """ + rel_position = 0 + for region in self.ordered_seqRegions: + if self.strand == '+': + if region.end < seq_position: + rel_position += region.end - region.start + 1 + elif region.start <= seq_position: + rel_position += seq_position - region.start + 1 + break + else: + raise ValueError(f'Seq position {seq_position} located between SeqRegion parts defining the MultipartSeqRegion {self}.') + else: + if seq_position < region.start: + rel_position += region.end - region.start + 1 + elif seq_position <= region.end: + rel_position += region.end - seq_position + 1 + break + else: + raise ValueError(f'Seq position {seq_position} located between SeqRegion parts defining the MultipartSeqRegion {self}.') + + return rel_position + def find_orfs(dna_sequence: str, codon_table: CodonTable.CodonTable, return_type: str = 'all') -> List[Dict[str, Any]]: """ @@ -175,12 +229,15 @@ def find_orfs(dna_sequence: str, codon_table: CodonTable.CodonTable, return_type ValueError: if `return_type` does not have a valid value. """ + # Remove any softmasking + unmasked_dna_sequence = dna_sequence.upper() + # Split the DNA sequence in codons (3-base blocks). # Frameshift the sequence by 0, 1 or 2 (skip first N bases) to obtain all possible codons. CODON_SIZE = 3 codons: Dict[int, List[str]] = dict() for frameshift in range(0, CODON_SIZE): - codons[frameshift] = [dna_sequence[i:i + CODON_SIZE] for i in range(frameshift, len(dna_sequence), CODON_SIZE)] + codons[frameshift] = [unmasked_dna_sequence[i:i + CODON_SIZE] for i in range(frameshift, len(unmasked_dna_sequence), CODON_SIZE)] # Read through all codons accross all frameshifts and determine the ORFs orfs: List[Dict[str, Any]] = [] diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index b2c33c3c..fe2982ea 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -1,7 +1,7 @@ """ Module containing the SeqRegion class and related functions. """ -from typing import Optional +from typing import Optional, override from Bio import Seq # Bio.Seq biopython submodule import pysam @@ -80,6 +80,14 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur if seq is not None: self.sequence = seq + @override + def __str__(self) -> str: # pragma: no cover + return f'{self.seq_id}:{self.start}-{self.end}:{self.strand}' + + @override + def __repr__(self) -> str: # pragma: no cover + return self.__str__() + def fetch_seq(self) -> None: """ Fetch sequence found at `seq_id`:`start`-`end`:`strand` @@ -138,6 +146,25 @@ def get_sequence(self, unmasked: bool = False) -> str: return seq + def overlaps(self, seq_region_2: "SeqRegion") -> bool: + """ + Compare two SeqRegion instances and check for overlap. + + Args: + seq_region_2: SeqRegion instance to check for overlap with self + + Returns: + True if SeqRegion overlaps with another SeqRegion instance, False otherwise. + """ + if self.fasta_file_path != seq_region_2.fasta_file_path or \ + self.seq_id != seq_region_2.seq_id or self.strand != seq_region_2.strand: + return False + + if max(self.start, seq_region_2.start) <= min(self.end, seq_region_2.end): + return True + else: + return False + def fetch_faidx_files(fasta_file_url: str) -> str: """ diff --git a/pipeline/seq_retrieval/tests/unit/seq_region/test_multipart_seq_region.py b/pipeline/seq_retrieval/tests/unit/seq_region/test_multipart_seq_region.py index 69a4a0ca..67d9a035 100644 --- a/pipeline/seq_retrieval/tests/unit/seq_region/test_multipart_seq_region.py +++ b/pipeline/seq_retrieval/tests/unit/seq_region/test_multipart_seq_region.py @@ -2,7 +2,10 @@ Unit testing for MultiPartSeqRegion class and related functions """ +from Bio.Data import CodonTable + from seq_region import SeqRegion, MultiPartSeqRegion +from seq_region.multipart_seq_region import find_orfs FASTA_FILE_URL = 'file://tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz' @@ -33,13 +36,10 @@ def test_multipart_seq_region_class(): seq_region_list = [exon_1, exon_2, exon_3, exon_4] - for seq_region in seq_region_list: - seq_region.fetch_seq() - multipart_seq_region = MultiPartSeqRegion(seq_region_list) ## Test fetch_seq method - multipart_seq_region.fetch_seq() + multipart_seq_region.fetch_seq(recursive_fetch=True) chained_seq: str = multipart_seq_region.get_sequence() @@ -76,3 +76,24 @@ def test_incomplete_multipart_seq_region(): # Assert failed translation assert incomplete_translation is None + + +def test_orf_detection(): + + # Test detection of ORF in softmasked sequence + # Y48G1C.9b cds + DNA_SEQUENCE = 'ATGATCTCGAAAAAGCACGTGGAATCGATGCACGCGTTGCCGGACCCtaaagaaactgaaatttga' + + codon_table: CodonTable.CodonTable = CodonTable.unambiguous_dna_by_name["Standard"] + + # Find the best open reading frame + orfs = find_orfs(DNA_SEQUENCE, codon_table, return_type='longest') + + assert isinstance(orfs, list) + assert len(orfs) == 1 + + orf = orfs.pop() + assert 'seq_start' in orf.keys() and 'seq_end' in orf.keys() + + assert orf['seq_start'] == 1 + assert orf['seq_end'] == 66 diff --git a/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py b/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py index 7f4ced22..7bc54c26 100644 --- a/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py +++ b/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py @@ -36,3 +36,34 @@ def test_seq_region_class_pos_strand(): assert isinstance(exon_1_seq, str) assert exon_1_seq == 'aacCATGTCGATGTATGGCAAAGACAAGGCGTATATCGAGAATGAGACAAAGTTTCGAGCAGACAGAGATTACTTGAGCCAGCCTGTCTATCAACAAACTGTCTATCGAGAAGGCCCAATTTTGAAACCAGATGTAGAG' + + +def test_seq_region_overlap(): + + # WBGene00016599 Transcript:C42D8.1.1 Exon 1 (mRNA start) + exon_1: SeqRegion = SeqRegion(seq_id='X', start=5109506, end=5109644, strand='+', + fasta_file_url=FASTA_FILE_URL) + + # gk787530 - point mutation (overlap) + gk803418: SeqRegion = SeqRegion(seq_id='X', start=5109543, end=5109543, strand='+', + fasta_file_url=FASTA_FILE_URL) + + assert exon_1.overlaps(gk803418) is True + + # gk320952 - splice region variant (no overlap) + gk320952: SeqRegion = SeqRegion(seq_id='X', start=5110758, end=5110758, strand='+', + fasta_file_url=FASTA_FILE_URL) + + assert exon_1.overlaps(gk320952) is False + + # WBGene00016599 Transcript:C42D8.1.1 Exon 8 (mRNA end) + exon_8: SeqRegion = SeqRegion(seq_id='X', start=5112256, end=5112426, strand='+', + fasta_file_url=FASTA_FILE_URL) + # Transcript:C42D8.8a.1 Exon 11 (overlap on opposite strand) + opp_strand_exon: SeqRegion = SeqRegion(seq_id='X', start=5112422, end=5113420, strand='-', + fasta_file_url=FASTA_FILE_URL) + same_strand_exon: SeqRegion = SeqRegion(seq_id='X', start=5112422, end=5113420, strand='+', + fasta_file_url=FASTA_FILE_URL) + + assert exon_8.overlaps(opp_strand_exon) is False + assert exon_8.overlaps(same_strand_exon) is True