Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

annotated CDS vs longest orf comparison (KANBAN-551) #22

Merged
merged 7 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pipeline/seq_retrieval/.coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ omit =
src/main.py
# Exclude logging code (no need to be tested)
src/log_mgmt/*
# Exclude comparison analysis code
src/analysis/*

[report]
exclude_lines =
Expand All @@ -15,3 +17,6 @@ exclude_lines =

# Exclude main method calling (not testable through unit testing)
if __name__ == .__main__.:

# Exclude all functions labeled with "pragma: no cover"
pragma: no cover
10 changes: 10 additions & 0 deletions pipeline/seq_retrieval/src/analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Analysis input and DB files
*.db
*.gff
*.gff3
*.gz
# Analysis output files
*.tsv
*.out
*.err
*.debug
Empty file.
13 changes: 13 additions & 0 deletions pipeline/seq_retrieval/src/analysis/load_gff_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import gffutils # type: ignore
import click


@click.command(context_settings={'show_default': True})
@click.option("--infile", type=click.STRING, required=True, help="Example: 'GFF_HUMAN_0.gff'")
@click.option("--outfile", type=click.STRING, required=True, help="Example: 'GFF-HUMAN.db'")
def main(infile: str, outfile: str) -> None:
    """Build a gffutils feature DB file (`outfile`) from a GFF annotation file (`infile`)."""
    # Options mirror the DB-build settings used across this analysis pipeline.
    db_build_options = dict(
        force=True,
        keep_order=True,
        merge_strategy='merge',
        sort_attribute_values=True,
    )
    gffutils.create_db(infile, outfile, **db_build_options)


if __name__ == '__main__':
    main()
167 changes: 167 additions & 0 deletions pipeline/seq_retrieval/src/analysis/orf_comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from gffutils import FeatureDB # type: ignore
from Bio.Data import CodonTable

from typing import List
import click

import sys
sys.path.append('../')
from seq_region import SeqRegion, MultiPartSeqRegion # noqa: E402
from seq_region.multipart_seq_region import find_orfs # noqa: E402

import logging # noqa: E402
from log_mgmt import get_logger, set_log_level # noqa: E402
logger = get_logger(name=__name__)


# db = gffutils.create_db("GFF_HUMAN_0.gff", 'GFF-HUMAN.db', force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)

@click.command(context_settings={'show_default': True})
@click.option("--mod", type=click.STRING, required=True, help="Example: 'XBXL'")
@click.option("--fasta_file", type=click.STRING, required=True, help="Example: '/home/mlp/AGR/fastas/XENLA_9.2_genome.fa.gz'")
def main(mod: str, fasta_file: str) -> None:  # noqa: C901
    """
    Compare the annotated CDS of every mRNA transcript in a gffutils DB
    against the longest open reading frame (ORF) found in the transcript's
    exon-derived sequence.

    Reads `GFF-{mod}.db` (pre-built, see load_gff_db.py) and writes:
      * `{mod}-ORF.tsv`: transcript ID + relative start/end of the longest detected ORF
      * `{mod}-CDS.tsv`: transcript ID + relative start/end of the annotated CDS
    and prints a per-transcript comparison verdict to stdout.

    Args:
        mod: MOD abbreviation used to locate the DB and to name the output files
        fasta_file: local path to the genome FASTA file (gzipped paths appear supported by callers)
    """
    # MOD = 'XBXL'
    # FASTA_FILE = 'file:///home/mlp/AGR/fastas/XENLA_9.2_genome.fa.gz'

    db = FeatureDB(f'GFF-{mod}.db', keep_order=True)
    CDS_OUTPUT_FILENAME = f'{mod}-CDS.tsv'
    ORF_OUTPUT_FILENAME = f'{mod}-ORF.tsv'

    # Flip to True to enable debug logging and write to separate '.debug' files
    DEBUG = False

    if DEBUG:
        set_log_level(logging.DEBUG)
        CDS_OUTPUT_FILENAME += '.debug'
        ORF_OUTPUT_FILENAME += '.debug'

    cds_outfile = open(CDS_OUTPUT_FILENAME, mode='w')
    orf_outfile = open(ORF_OUTPUT_FILENAME, mode='w')

    for transcript in db.features_of_type('mRNA'):
        # Debugging aid: uncomment to restrict the run to one transcript
        # if transcript.id != 'rna23588':
        #     continue

        # Skip Mitochondrial seqs
        # (presumably because the "Standard" codon table used below does not apply — TODO confirm)
        if transcript.seqid in ['MtDNA', 'MT', 'M', 'mitochondrion_genome', 'chrmt']:
            continue

        exons = db.children(transcript, level=1, featuretype='exon')

        # One SeqRegion per exon; together they define the transcript sequence
        seq_region_objs: List[SeqRegion] = []
        for exon in exons:

            seq_region_objs.append(SeqRegion(seq_id=exon.seqid, start=exon.start, end=exon.end, strand=exon.strand,
                                             fasta_file_url='file://' + fasta_file))

        # Skip transcripts with no exons in GFF
        if len(seq_region_objs) == 0:
            continue

        # Skip if error occurs during MultiPartSeqRegion definition
        # (e.g. overlapping exons or mixed strands/seq_ids raise ValueError)
        transcript_region: MultiPartSeqRegion
        try:
            transcript_region = MultiPartSeqRegion(seq_regions=seq_region_objs)
        except Exception as err:
            logger.warning(f'Skipping transcript {transcript.id} because exception during MultiPartSeqRegion creation: {err}')
            continue

        # print(f'Transcript: {transcript.start}-{transcript.end}')

        # Skip if Error occurred during sequence retrieval
        try:
            transcript_region.fetch_seq(recursive_fetch=True)
            dna_sequence = transcript_region.get_sequence(unmasked=False)
        except Exception as err:
            logger.warning(f'Skipping transcript {transcript.id} because exception during sequence(s) retrieval: {err}')
            continue
        # print(dna_sequence)

        codon_table: CodonTable.CodonTable = CodonTable.unambiguous_dna_by_name["Standard"]

        # Find the best open reading frame
        # ('longest' return_type; positions reported relative to the transcript sequence)
        orfs = find_orfs(dna_sequence, codon_table, return_type='longest')

        # print(f'ORF: {orfs[0]['seq_start']}-{orfs[0]['seq_end']}')
        if len(orfs) > 0:
            orf_outfile.write(f"{transcript.id}\t{orfs[0]['seq_start']}\t{orfs[0]['seq_end']}\n")
        else:
            # '-' columns mark transcripts where no ORF was detected
            orf_outfile.write(f"{transcript.id}\t-\t-\n")

        # Materialise the CDS parts so they can be counted and iterated repeatedly
        cds_parts_iterator = db.children(transcript, level=1, featuretype='CDS')
        cds_parts = []
        for cds_part in cds_parts_iterator:
            cds_parts.append(cds_part)

        # CDS start/end relative to the transcript sequence (as produced below)
        cds_start: int | None = None
        cds_end: int | None = None

        # CDS boundaries in absolute (genomic) coordinates
        abs_cds_start: int | None = None
        abs_cds_end: int | None = None
        # Set to a short error label when CDS calculations are cancelled
        cds_error: str | None = None
        cds_regions: List[SeqRegion] = []

        if len(cds_parts) > 0:

            # Reject CDS parts falling outside the transcript boundaries
            for cds_part in cds_parts:
                if cds_part.start < transcript.start or transcript.end < cds_part.end:
                    cds_error = 'out-of-bounds'
                    logger.warning(f'Out-of-bounds CDS regions detected, cancelling CDS calculations for {transcript.id}.')
                    break

                cds_regions.append(SeqRegion(seq_id=cds_part.seqid, start=cds_part.start, end=cds_part.end, strand=cds_part.strand,
                                             fasta_file_url='file://' + fasta_file))

            # Pairwise overlap check between all accepted CDS parts
            for i in range(0, len(cds_regions) - 1):
                for j in range(i + 1, len(cds_regions)):
                    logger.info(f'Overlap analysis for {cds_regions[i]} and {cds_regions[j]}')
                    if cds_regions[i].overlaps(cds_regions[j]):
                        cds_error = 'overlapping'
                        logger.warning(f'Overlapping CDS regions detected, cancelling CDS calculations for {transcript.id}.'
                                       + f" Conflicting CDS parts: {cds_regions[i]} and {cds_regions[j]}\n")
                        break

            if cds_error is None:
                abs_cds_start = min(map(lambda region: region.start, cds_regions))
                abs_cds_end = max(map(lambda region: region.end, cds_regions))

                # On the negative strand the transcript sequence runs from the
                # highest genomic coordinate, so the CDS starts at abs_cds_end
                if transcript_region.strand == '-':
                    cds_start = transcript_region.seq_to_rel_pos(abs_cds_end)
                else:
                    cds_start = transcript_region.seq_to_rel_pos(abs_cds_start)

                # CDS end = summed length of all CDS parts, offset from the relative start
                cds_end = sum(map(lambda region: region.seq_length, cds_regions)) + cds_start - 1

        if cds_start is not None:
            cds_outfile.write(f"{transcript.id}\t{cds_start}\t{cds_end}\n")
        else:
            # '-' columns mark transcripts with no (usable) CDS annotation
            cds_outfile.write(f"{transcript.id}\t-\t-\n")

        # Define and print comparison result
        result = []
        if len(orfs) == 0 and len(cds_parts) == 0:
            result.append('equal')
        elif len(orfs) == 0:
            result.append('undetected-ORF')
        elif len(cds_parts) == 0:
            result.append('no-CDS')
        else:
            orf = orfs.pop()
            if cds_start is None or cds_end is None:
                # e.g. 'out-of-bounds-CDS' or 'overlapping-CDS'
                result.append(f'{cds_error}-CDS')
            elif orf['seq_start'] == cds_start and orf['seq_end'] == cds_end:
                result.append('equal')
            elif orf['seq_end'] == cds_end and orf['seq_start'] < cds_start:
                # Same stop position but annotated CDS starts further downstream
                result.append('shortened-start')
            elif orf['seq_end'] - orf['seq_start'] > cds_end - cds_start:
                result.append('shorter-orf')
            else:
                result.append('unknown')

        print(f"{transcript.id}\t{','.join(result)}")

    orf_outfile.close()
    cds_outfile.close()


if __name__ == '__main__':
    main()
1 change: 1 addition & 0 deletions pipeline/seq_retrieval/src/analysis/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gffutils==0.13
6 changes: 1 addition & 5 deletions pipeline/seq_retrieval/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,10 @@ def main(seq_id: str, seq_strand: str, seq_regions: List[Dict[str, int]], fasta_
seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
fasta_file_url=fasta_file_url))

for seq_region in seq_region_objs:
# Retrieve sequence for region
seq_region.fetch_seq()

# Concatenate all regions into single sequence
fullRegion = MultiPartSeqRegion(seq_regions=seq_region_objs)

fullRegion.fetch_seq()
fullRegion.fetch_seq(recursive_fetch=True)
seq_concat = fullRegion.get_sequence(unmasked=unmasked)

logger.debug(f"full region: {fullRegion.seq_id}:{fullRegion.start}-{fullRegion.end}:{fullRegion.strand}")
Expand Down
67 changes: 62 additions & 5 deletions pipeline/seq_retrieval/src/seq_region/multipart_seq_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,37 +34,48 @@ def __init__(self, seq_regions: List[SeqRegion]):
Args:
seq_regions: list of SeqRegion objects that constitute this multi-part sequence region.\
All SeqRegions must have identical seq_id, strand and fasta_file_path properties \
to form a valid MultipartSeqRegion.
to form a valid MultipartSeqRegion. Regions cannot overlap.

Raises:
ValueError: if `seq_regions` have distinct `seq_id`, `strand` or `fasta_file_path` properties.
ValueError: if `seq_regions` have distinct `seq_id`, `strand` or `fasta_file_path` properties, or if regions overlap.
"""

self.seq_length: int = sum(map(lambda seq_region: seq_region.seq_length, seq_regions))
self.start: int = min(map(lambda seq_region: seq_region.start, seq_regions))
self.end: int = max(map(lambda seq_region: seq_region.end, seq_regions))

# Ensure one strand
strands: Set[str] = set(map(lambda seq_region: seq_region.strand, seq_regions))
if len(strands) > 1:
raise ValueError(f"Multiple strands defined accross seq regions ({strands})."
+ " All seqRegions in multiPartSeqRegion must have equal value for strand attribute.")
else:
self.strand = strands.pop()

# Ensure one seq_id
seq_ids: Set[str] = set(map(lambda seq_region: seq_region.seq_id, seq_regions))
if len(seq_ids) > 1:
raise ValueError(f"Multiple seq_ids defined accross seq regions ({seq_ids})."
+ " All seqRegions in multiPartSeqRegion must have equal value for seq_id attribute.")
else:
self.seq_id = seq_ids.pop()

# Ensure one fasta_file_path
fasta_file_paths: Set[str] = set(map(lambda seq_region: seq_region.fasta_file_path, seq_regions))
if len(fasta_file_paths) > 1:
raise ValueError(f"Multiple fasta_file_paths defined accross seq regions ({fasta_file_paths})."
+ " All seqRegions in multiPartSeqRegion must have equal value for fasta_file_path attribute.")
else:
self.fasta_file_path = fasta_file_paths.pop()

# Ensure no overlap between seq_regions
for i in range(0, len(seq_regions) - 1):
for j in range(i + 1, len(seq_regions)):
if seq_regions[i].overlaps(seq_regions[j]):
raise ValueError(f"Overlapping seq regions found ({seq_regions[i]} and {seq_regions[j]})."
+ " a MultiPartSeqRegion cannot consist of overlapping parts.")

# Sort seq_regions before storing
sort_args: Dict[str, Any] = dict(key=lambda region: region.start, reverse=False)

if self.strand == '-':
Expand All @@ -76,18 +87,30 @@ def __init__(self, seq_regions: List[SeqRegion]):
self.ordered_seqRegions = ordered_seq_regions

@override
def fetch_seq(self) -> None:
def __str__(self) -> str: # pragma: no cover
return self.ordered_seqRegions.__str__()

@override
def fetch_seq(self, recursive_fetch: bool = False) -> None:
"""
Fetch genetic (DNA) sequence for MultiPartSeqRegion by chaining \
consisting SeqRegions' sequenes together into one continuous sequence.

Chains seqRegions in the order defined in the `ordered_seqRegions` attribute.

Args:
recursive_fetch: if True, fetch sequence for each SeqRegion part of the MultiPartSeqRegion first, before chaining the results.

Returns:
Stores resulting sequence in `sequence` attribute.
"""

self.set_sequence(''.join(map(lambda region: region.get_sequence(), self.ordered_seqRegions)))
def get_fetch_sequence(region: SeqRegion) -> str:
if recursive_fetch:
region.fetch_seq()
return region.get_sequence()

self.set_sequence(''.join(map(lambda region: get_fetch_sequence(region), self.ordered_seqRegions)))

@override
def set_sequence(self, sequence: str) -> None:
Expand Down Expand Up @@ -154,6 +177,37 @@ def translate(self) -> str | None:
logger.warning('No open reading frames found, so no translation made.')
return None

def seq_to_rel_pos(self, seq_position: int) -> int:
"""
Convert absolute sequence position to relative position within the MultipartSeqRegion

Returns:
Relative position on the complete MultipartSeqRegion sequence (1-based)

Raises:
ValueError: when abs_position falls between SeqRegion parts
"""
rel_position = 0
for region in self.ordered_seqRegions:
if self.strand == '+':
if region.end < seq_position:
rel_position += region.end - region.start + 1
elif region.start <= seq_position:
rel_position += seq_position - region.start + 1
break
else:
raise ValueError(f'Seq position {seq_position} located between SeqRegion parts defining the MultipartSeqRegion {self}.')
else:
if seq_position < region.start:
rel_position += region.end - region.start + 1
elif seq_position <= region.end:
rel_position += region.end - seq_position + 1
break
else:
raise ValueError(f'Seq position {seq_position} located between SeqRegion parts defining the MultipartSeqRegion {self}.')

return rel_position


def find_orfs(dna_sequence: str, codon_table: CodonTable.CodonTable, return_type: str = 'all') -> List[Dict[str, Any]]:
"""
Expand All @@ -175,12 +229,15 @@ def find_orfs(dna_sequence: str, codon_table: CodonTable.CodonTable, return_type
ValueError: if `return_type` does not have a valid value.
"""

# Remove any softmasking
unmasked_dna_sequence = dna_sequence.upper()

# Split the DNA sequence in codons (3-base blocks).
# Frameshift the sequence by 0, 1 or 2 (skip first N bases) to obtain all possible codons.
CODON_SIZE = 3
codons: Dict[int, List[str]] = dict()
for frameshift in range(0, CODON_SIZE):
codons[frameshift] = [dna_sequence[i:i + CODON_SIZE] for i in range(frameshift, len(dna_sequence), CODON_SIZE)]
codons[frameshift] = [unmasked_dna_sequence[i:i + CODON_SIZE] for i in range(frameshift, len(unmasked_dna_sequence), CODON_SIZE)]

# Read through all codons accross all frameshifts and determine the ORFs
orfs: List[Dict[str, Any]] = []
Expand Down
Loading