Skip to content

Commit

Permalink
refactor get_library_source
Browse files Browse the repository at this point in the history
  • Loading branch information
balajtimate committed Nov 14, 2023
1 parent bbb3180 commit 652923d
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 33 deletions.
6 changes: 6 additions & 0 deletions htsinfer/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,9 @@ class TranscriptsFastaProblem(Exception):

class CutadaptProblem(Exception):
    """Exception raised when running cutadapt commands."""


class UnsupportedSampleSourceException(Exception):
    """Exception raised when a taxonomy ID is not in the organism list.

    Signals that the requested taxonomy ID has no entry in the source
    organism list, i.e. the sample source is not supported.
    """
47 changes: 14 additions & 33 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from pathlib import Path
import subprocess as sp
import tempfile
from typing import Optional

from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
Expand All @@ -14,6 +13,7 @@
FileProblem,
KallistoProblem,
TranscriptsFastaProblem,
UnsupportedSampleSourceException,
)
from htsinfer.models import (
ResultsSource,
Expand Down Expand Up @@ -82,34 +82,11 @@ def evaluate(self) -> ResultsSource:
self.tax_id,
self.transcripts_file
)
source.file_1.short_name = src_name

if src_name is not None:
source.file_1.short_name = src_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
LOGGER.warning(
f"Taxon ID '{self.tax_id}' not found in "
"organism dictionary, inferring source organism..."
)
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id
if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
index = self.create_kallisto_index()
Expand Down Expand Up @@ -333,7 +310,7 @@ def get_source_expression(
def get_source_name(
taxon_id: int,
transcripts_file: Path,
) -> Optional[str]:
) -> str:
"""Return name of the source organism, based on tax ID.
Args:
Expand All @@ -344,10 +321,11 @@ def get_source_name(
Short name of the organism belonging to the given tax ID.
Raises:
Could not process input FASTA file.
FileProblem: Could not process input FASTA file.
UnsupportedSampleSourceException: Taxon ID is not supported.
"""
src_dict = {}
# Construct dictionary of taxonomy IDs and short names

try:
for record in list(SeqIO.parse(
handle=transcripts_file,
Expand All @@ -363,7 +341,10 @@ def get_source_name(
f"Could not process file '{transcripts_file}'"
) from exc

if taxon_id in src_dict:
try:
return src_dict[taxon_id]

return None
except KeyError as exc:
raise UnsupportedSampleSourceException(
f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
) from exc

0 comments on commit 652923d

Please sign in to comment.