Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

annotated CDS vs longest orf comparison (KANBAN-551) #22

Merged
merged 7 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pipeline/seq_retrieval/.coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ omit =
src/main.py
# Exclude logging code (no need to be tested)
src/log_mgmt/*
# Exclude comparison analysis code
src/analysis/*

[report]
exclude_lines =
Expand All @@ -15,3 +17,6 @@ exclude_lines =

# Exclude main method calling (not testable through unit testing)
if __name__ == .__main__.:

# Exclude all functions labeled with "pragma: no cover"
pragma: no cover
10 changes: 10 additions & 0 deletions pipeline/seq_retrieval/src/analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Analysis input and DB files
*.db
*.gff
*.gff3
*.gz
# Analysis output files
*.tsv
*.out
*.err
*.debug
Empty file.
13 changes: 13 additions & 0 deletions pipeline/seq_retrieval/src/analysis/load_gff_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import gffutils # type: ignore
import click


@click.command(context_settings={'show_default': True})
@click.option("--infile", type=click.STRING, required=True, help="Example: 'GFF_HUMAN_0.gff'")
@click.option("--outfile", type=click.STRING, required=True, help="Example: 'GFF-HUMAN.db'")
def main(infile: str, outfile: str) -> None:
    """Build a gffutils feature DB file (`outfile`) from a GFF annotation file (`infile`)."""
    # Options mirror the DB-build settings used across this analysis pipeline.
    db_build_options = dict(
        force=True,
        keep_order=True,
        merge_strategy='merge',
        sort_attribute_values=True,
    )
    gffutils.create_db(infile, outfile, **db_build_options)


if __name__ == '__main__':
    main()
167 changes: 167 additions & 0 deletions pipeline/seq_retrieval/src/analysis/orf_comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from gffutils import FeatureDB # type: ignore
from Bio.Data import CodonTable

from typing import List
import click

import sys
sys.path.append('../')
from seq_region import SeqRegion, MultiPartSeqRegion # noqa: E402
from seq_region.multipart_seq_region import find_orfs # noqa: E402

import logging # noqa: E402
from log_mgmt import get_logger, set_log_level # noqa: E402
logger = get_logger(name=__name__)


# db = gffutils.create_db("GFF_HUMAN_0.gff", 'GFF-HUMAN.db', force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)

@click.command(context_settings={'show_default': True})
@click.option("--mod", type=click.STRING, required=True, help="Example: 'XBXL'")
@click.option("--fasta_file", type=click.STRING, required=True, help="Example: '/home/mlp/AGR/fastas/XENLA_9.2_genome.fa.gz'")
def main(mod: str, fasta_file: str) -> None:  # noqa: C901
    """
    Compare the annotated CDS of every mRNA transcript in a gffutils DB
    against the longest open reading frame (ORF) found in the transcript's
    exon-derived sequence.

    Reads `GFF-{mod}.db` (pre-built, see load_gff_db.py) and writes:
      * `{mod}-ORF.tsv`: transcript ID + relative start/end of the longest detected ORF
      * `{mod}-CDS.tsv`: transcript ID + relative start/end of the annotated CDS
    and prints a per-transcript comparison verdict to stdout.

    Args:
        mod: MOD abbreviation used to locate the DB and to name the output files
        fasta_file: local path to the genome FASTA file (gzipped paths appear supported by callers)
    """
    # MOD = 'XBXL'
    # FASTA_FILE = 'file:///home/mlp/AGR/fastas/XENLA_9.2_genome.fa.gz'

    db = FeatureDB(f'GFF-{mod}.db', keep_order=True)
    CDS_OUTPUT_FILENAME = f'{mod}-CDS.tsv'
    ORF_OUTPUT_FILENAME = f'{mod}-ORF.tsv'

    # Flip to True to enable debug logging and write to separate '.debug' files
    DEBUG = False

    if DEBUG:
        set_log_level(logging.DEBUG)
        CDS_OUTPUT_FILENAME += '.debug'
        ORF_OUTPUT_FILENAME += '.debug'

    cds_outfile = open(CDS_OUTPUT_FILENAME, mode='w')
    orf_outfile = open(ORF_OUTPUT_FILENAME, mode='w')

    for transcript in db.features_of_type('mRNA'):
        # Debugging aid: uncomment to restrict the run to one transcript
        # if transcript.id != 'rna23588':
        #     continue

        # Skip Mitochondrial seqs
        # (presumably because the "Standard" codon table used below does not apply — TODO confirm)
        if transcript.seqid in ['MtDNA', 'MT', 'M', 'mitochondrion_genome', 'chrmt']:
            continue

        exons = db.children(transcript, level=1, featuretype='exon')

        # One SeqRegion per exon; together they define the transcript sequence
        seq_region_objs: List[SeqRegion] = []
        for exon in exons:

            seq_region_objs.append(SeqRegion(seq_id=exon.seqid, start=exon.start, end=exon.end, strand=exon.strand,
                                             fasta_file_url='file://' + fasta_file))

        # Skip transcripts with no exons in GFF
        if len(seq_region_objs) == 0:
            continue

        # Skip if error occurs during MultiPartSeqRegion definition
        # (e.g. overlapping exons or mixed strands/seq_ids raise ValueError)
        transcript_region: MultiPartSeqRegion
        try:
            transcript_region = MultiPartSeqRegion(seq_regions=seq_region_objs)
        except Exception as err:
            logger.warning(f'Skipping transcript {transcript.id} because exception during MultiPartSeqRegion creation: {err}')
            continue

        # print(f'Transcript: {transcript.start}-{transcript.end}')

        # Skip if Error occurred during sequence retrieval
        try:
            transcript_region.fetch_seq(recursive_fetch=True)
            dna_sequence = transcript_region.get_sequence(unmasked=False)
        except Exception as err:
            logger.warning(f'Skipping transcript {transcript.id} because exception during sequence(s) retrieval: {err}')
            continue
        # print(dna_sequence)

        codon_table: CodonTable.CodonTable = CodonTable.unambiguous_dna_by_name["Standard"]

        # Find the best open reading frame
        # ('longest' return_type; positions reported relative to the transcript sequence)
        orfs = find_orfs(dna_sequence, codon_table, return_type='longest')

        # print(f'ORF: {orfs[0]['seq_start']}-{orfs[0]['seq_end']}')
        if len(orfs) > 0:
            orf_outfile.write(f"{transcript.id}\t{orfs[0]['seq_start']}\t{orfs[0]['seq_end']}\n")
        else:
            # '-' columns mark transcripts where no ORF was detected
            orf_outfile.write(f"{transcript.id}\t-\t-\n")

        # Materialise the CDS parts so they can be counted and iterated repeatedly
        cds_parts_iterator = db.children(transcript, level=1, featuretype='CDS')
        cds_parts = []
        for cds_part in cds_parts_iterator:
            cds_parts.append(cds_part)

        # CDS start/end relative to the transcript sequence (as produced below)
        cds_start: int | None = None
        cds_end: int | None = None

        # CDS boundaries in absolute (genomic) coordinates
        abs_cds_start: int | None = None
        abs_cds_end: int | None = None
        # Set to a short error label when CDS calculations are cancelled
        cds_error: str | None = None
        cds_regions: List[SeqRegion] = []

        if len(cds_parts) > 0:

            # Reject CDS parts falling outside the transcript boundaries
            for cds_part in cds_parts:
                if cds_part.start < transcript.start or transcript.end < cds_part.end:
                    cds_error = 'out-of-bounds'
                    logger.warning(f'Out-of-bounds CDS regions detected, cancelling CDS calculations for {transcript.id}.')
                    break

                cds_regions.append(SeqRegion(seq_id=cds_part.seqid, start=cds_part.start, end=cds_part.end, strand=cds_part.strand,
                                             fasta_file_url='file://' + fasta_file))

            # Pairwise overlap check between all accepted CDS parts
            for i in range(0, len(cds_regions) - 1):
                for j in range(i + 1, len(cds_regions)):
                    logger.info(f'Overlap analysis for {cds_regions[i]} and {cds_regions[j]}')
                    if cds_regions[i].overlaps(cds_regions[j]):
                        cds_error = 'overlapping'
                        logger.warning(f'Overlapping CDS regions detected, cancelling CDS calculations for {transcript.id}.'
                                       + f" Conflicting CDS parts: {cds_regions[i]} and {cds_regions[j]}\n")
                        break

            if cds_error is None:
                abs_cds_start = min(map(lambda region: region.start, cds_regions))
                abs_cds_end = max(map(lambda region: region.end, cds_regions))

                # On the negative strand the transcript sequence runs from the
                # highest genomic coordinate, so the CDS starts at abs_cds_end
                if transcript_region.strand == '-':
                    cds_start = transcript_region.seq_to_rel_pos(abs_cds_end)
                else:
                    cds_start = transcript_region.seq_to_rel_pos(abs_cds_start)

                # CDS end = summed length of all CDS parts, offset from the relative start
                cds_end = sum(map(lambda region: region.seq_length, cds_regions)) + cds_start - 1

        if cds_start is not None:
            cds_outfile.write(f"{transcript.id}\t{cds_start}\t{cds_end}\n")
        else:
            # '-' columns mark transcripts with no (usable) CDS annotation
            cds_outfile.write(f"{transcript.id}\t-\t-\n")

        # Define and print comparison result
        result = []
        if len(orfs) == 0 and len(cds_parts) == 0:
            result.append('equal')
        elif len(orfs) == 0:
            result.append('undetected-ORF')
        elif len(cds_parts) == 0:
            result.append('no-CDS')
        else:
            orf = orfs.pop()
            if cds_start is None or cds_end is None:
                # e.g. 'out-of-bounds-CDS' or 'overlapping-CDS'
                result.append(f'{cds_error}-CDS')
            elif orf['seq_start'] == cds_start and orf['seq_end'] == cds_end:
                result.append('equal')
            elif orf['seq_end'] == cds_end and orf['seq_start'] < cds_start:
                # Same stop position but annotated CDS starts further downstream
                result.append('shortened-start')
            elif orf['seq_end'] - orf['seq_start'] > cds_end - cds_start:
                result.append('shorter-orf')
            else:
                result.append('unknown')

        print(f"{transcript.id}\t{','.join(result)}")

    orf_outfile.close()
    cds_outfile.close()


if __name__ == '__main__':
    main()
1 change: 1 addition & 0 deletions pipeline/seq_retrieval/src/analysis/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gffutils==0.13
6 changes: 1 addition & 5 deletions pipeline/seq_retrieval/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,10 @@ def main(seq_id: str, seq_strand: str, seq_regions: List[Dict[str, int]], fasta_
seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
fasta_file_url=fasta_file_url))

for seq_region in seq_region_objs:
# Retrieve sequence for region
seq_region.fetch_seq()

# Concatenate all regions into single sequence
fullRegion = MultiPartSeqRegion(seq_regions=seq_region_objs)

fullRegion.fetch_seq()
fullRegion.fetch_seq(recursive_fetch=True)
seq_concat = fullRegion.get_sequence(unmasked=unmasked)

logger.debug(f"full region: {fullRegion.seq_id}:{fullRegion.start}-{fullRegion.end}:{fullRegion.strand}")
Expand Down
67 changes: 62 additions & 5 deletions pipeline/seq_retrieval/src/seq_region/multipart_seq_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,37 +34,48 @@ def __init__(self, seq_regions: List[SeqRegion]):
Args:
seq_regions: list of SeqRegion objects that constitute this multi-part sequence region.\
All SeqRegions must have identical seq_id, strand and fasta_file_path properties \
to form a valid MultipartSeqRegion.
to form a valid MultipartSeqRegion. Regions cannot overlap.

Raises:
ValueError: if `seq_regions` have distinct `seq_id`, `strand` or `fasta_file_path` properties.
ValueError: if `seq_regions` have distinct `seq_id`, `strand` or `fasta_file_path` properties, or if regions overlap.
"""

self.seq_length: int = sum(map(lambda seq_region: seq_region.seq_length, seq_regions))
self.start: int = min(map(lambda seq_region: seq_region.start, seq_regions))
self.end: int = max(map(lambda seq_region: seq_region.end, seq_regions))

# Ensure one strand
strands: Set[str] = set(map(lambda seq_region: seq_region.strand, seq_regions))
if len(strands) > 1:
raise ValueError(f"Multiple strands defined accross seq regions ({strands})."
+ " All seqRegions in multiPartSeqRegion must have equal value for strand attribute.")
else:
self.strand = strands.pop()

# Ensure one seq_id
seq_ids: Set[str] = set(map(lambda seq_region: seq_region.seq_id, seq_regions))
if len(seq_ids) > 1:
raise ValueError(f"Multiple seq_ids defined accross seq regions ({seq_ids})."
+ " All seqRegions in multiPartSeqRegion must have equal value for seq_id attribute.")
else:
self.seq_id = seq_ids.pop()

# Ensure one fasta_file_path
fasta_file_paths: Set[str] = set(map(lambda seq_region: seq_region.fasta_file_path, seq_regions))
if len(fasta_file_paths) > 1:
raise ValueError(f"Multiple fasta_file_paths defined accross seq regions ({fasta_file_paths})."
+ " All seqRegions in multiPartSeqRegion must have equal value for fasta_file_path attribute.")
else:
self.fasta_file_path = fasta_file_paths.pop()

# Ensure no overlap between seq_regions
for i in range(0, len(seq_regions) - 1):
for j in range(i + 1, len(seq_regions)):
if seq_regions[i].overlaps(seq_regions[j]):
raise ValueError(f"Overlapping seq regions found ({seq_regions[i]} and {seq_regions[j]})."
+ " a MultiPartSeqRegion cannot consist of overlapping parts.")

# Sort seq_regions before storing
sort_args: Dict[str, Any] = dict(key=lambda region: region.start, reverse=False)

if self.strand == '-':
Expand All @@ -76,18 +87,30 @@ def __init__(self, seq_regions: List[SeqRegion]):
self.ordered_seqRegions = ordered_seq_regions

@override
def fetch_seq(self) -> None:
def __str__(self) -> str: # pragma: no cover
return self.ordered_seqRegions.__str__()

@override
def fetch_seq(self, recursive_fetch: bool = False) -> None:
"""
Fetch genetic (DNA) sequence for MultiPartSeqRegion by chaining \
consisting SeqRegions' sequenes together into one continuous sequence.

Chains seqRegions in the order defined in the `ordered_seqRegions` attribute.

Args:
recursive_fetch: if True, fetch sequence for each SeqRegion part of the MultiPartSeqRegion first, before chaining the results.

Returns:
Stores resulting sequence in `sequence` attribute.
"""

self.set_sequence(''.join(map(lambda region: region.get_sequence(), self.ordered_seqRegions)))
def get_fetch_sequence(region: SeqRegion) -> str:
if recursive_fetch:
region.fetch_seq()
return region.get_sequence()

self.set_sequence(''.join(map(lambda region: get_fetch_sequence(region), self.ordered_seqRegions)))

@override
def set_sequence(self, sequence: str) -> None:
Expand Down Expand Up @@ -154,6 +177,37 @@ def translate(self) -> str | None:
logger.warning('No open reading frames found, so no translation made.')
return None

def seq_to_rel_pos(self, seq_position: int) -> int:
"""
Convert absolute sequence position to relative position within the MultipartSeqRegion

Returns:
Relative position on the complete MultipartSeqRegion sequence (1-based)

Raises:
ValueError: when abs_position falls between SeqRegion parts
"""
rel_position = 0
for region in self.ordered_seqRegions:
if self.strand == '+':
if region.end < seq_position:
rel_position += region.end - region.start + 1
elif region.start <= seq_position:
rel_position += seq_position - region.start + 1
break
else:
raise ValueError(f'Seq position {seq_position} located between SeqRegion parts defining the MultipartSeqRegion {self}.')
else:
if seq_position < region.start:
rel_position += region.end - region.start + 1
elif seq_position <= region.end:
rel_position += region.end - seq_position + 1
break
else:
raise ValueError(f'Seq position {seq_position} located between SeqRegion parts defining the MultipartSeqRegion {self}.')

return rel_position


def find_orfs(dna_sequence: str, codon_table: CodonTable.CodonTable, return_type: str = 'all') -> List[Dict[str, Any]]:
"""
Expand All @@ -175,12 +229,15 @@ def find_orfs(dna_sequence: str, codon_table: CodonTable.CodonTable, return_type
ValueError: if `return_type` does not have a valid value.
"""

# Remove any softmasking
unmasked_dna_sequence = dna_sequence.upper()

# Split the DNA sequence in codons (3-base blocks).
# Frameshift the sequence by 0, 1 or 2 (skip first N bases) to obtain all possible codons.
CODON_SIZE = 3
codons: Dict[int, List[str]] = dict()
for frameshift in range(0, CODON_SIZE):
codons[frameshift] = [dna_sequence[i:i + CODON_SIZE] for i in range(frameshift, len(dna_sequence), CODON_SIZE)]
codons[frameshift] = [unmasked_dna_sequence[i:i + CODON_SIZE] for i in range(frameshift, len(unmasked_dna_sequence), CODON_SIZE)]

# Read through all codons accross all frameshifts and determine the ORFs
orfs: List[Dict[str, Any]] = []
Expand Down
Loading