Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tax_id parameter #147

Merged
merged 20 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions htsinfer/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,9 @@ class TranscriptsFastaProblem(Exception):

class CutadaptProblem(Exception):
"""Exception raised when running cutadapt commands."""


class UnsupportedSampleSourceException(Exception):
"""Exception raised when taxonomy ID is not found in the source
organism list.
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"""
47 changes: 14 additions & 33 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from pathlib import Path
import subprocess as sp
import tempfile
from typing import Optional

from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
Expand All @@ -14,6 +13,7 @@
FileProblem,
KallistoProblem,
TranscriptsFastaProblem,
UnsupportedSampleSourceException,
)
from htsinfer.models import (
ResultsSource,
Expand Down Expand Up @@ -82,34 +82,11 @@ def evaluate(self) -> ResultsSource:
self.tax_id,
self.transcripts_file
)
source.file_1.short_name = src_name

if src_name is not None:
source.file_1.short_name = src_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
LOGGER.warning(
f"Taxon ID '{self.tax_id}' not found in "
"organism dictionary, inferring source organism..."
)
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id
if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
index = self.create_kallisto_index()
Expand Down Expand Up @@ -333,7 +310,7 @@ def get_source_expression(
def get_source_name(
taxon_id: int,
transcripts_file: Path,
) -> Optional[str]:
) -> str:
"""Return name of the source organism, based on tax ID.

Args:
Expand All @@ -344,10 +321,11 @@ def get_source_name(
Short name of the organism belonging to the given tax ID.

Raises:
Could not process input FASTA file.
FileProblem: Could not process input FASTA file.
UnsupportedSampleSourceException: Taxon ID is not supported.
"""
src_dict = {}
# Construct dictionary of taxonomy IDs and short names

try:
for record in list(SeqIO.parse(
handle=transcripts_file,
Expand All @@ -363,7 +341,10 @@ def get_source_name(
f"Could not process file '{transcripts_file}'"
) from exc

if taxon_id in src_dict:
try:
return src_dict[taxon_id]

return None
except KeyError as exc:
raise UnsupportedSampleSourceException(
f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
) from exc
61 changes: 13 additions & 48 deletions tests/test_get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from htsinfer.exceptions import (
FileProblem,
KallistoProblem,
TranscriptsFastaProblem
TranscriptsFastaProblem,
UnsupportedSampleSourceException,
)
from htsinfer.get_library_source import GetLibSource
from htsinfer.models import (
Expand Down Expand Up @@ -323,36 +324,36 @@ def test_create_kallisto_index_problem(self, tmpdir):
with pytest.raises(KallistoProblem):
test_instance.create_kallisto_index()

def test_get_organism_name_found(self):
def test_get_source_name_found(self):
"""Test the function when the taxon_id
is found in the organism dictionary."""
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
test_instance = GetLibSource(config=CONFIG)
taxon_id = 7227
result = test_instance.get_organism_name(
result = test_instance.get_source_name(
taxon_id, CONFIG.args.t_file_processed
)
assert result == "dmelanogaster"

def test_get_organism_name_not_found(self):
def test_get_source_name_not_found(self):
"""Test the function when the taxon_id
is not found in the organism dictionary."""
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
test_instance = GetLibSource(config=CONFIG)
taxon_id = 12345 # A tax ID that doesn't exist in transcripts
result = test_instance.get_organism_name(
taxon_id, CONFIG.args.t_file_processed
)
assert result is None
with pytest.raises(UnsupportedSampleSourceException):
test_instance.get_source_name(
taxon_id, CONFIG.args.t_file_processed
)

def test_get_organism_name_file_problem(self):
def test_get_source_name_file_problem(self):
"""Test the function when there's a
file problem while processing the FASTA file."""
CONFIG.args.t_file_processed = FILE_DUMMY
test_instance = GetLibSource(config=CONFIG)
taxon_id = 7227
with pytest.raises(FileProblem):
test_instance.get_organism_name(
test_instance.get_source_name(
taxon_id, CONFIG.args.t_file_processed
)

Expand Down Expand Up @@ -386,42 +387,6 @@ def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir):
assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name

def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir):
"""Test when self.tax_id is not None but src_name is not found."""
CONFIG.args.tax_id = 7227
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
CONFIG.args.tmp_dir = tmpdir
CONFIG.args.out_dir = tmpdir
test_instance = GetLibSource(config=CONFIG)

# Mock the get_organism_name method to return None
monkeypatch.setattr(
'htsinfer.get_library_source.GetLibSource.get_organism_name',
lambda *args, **kwargs: None,
)

# Mock the create_kallisto_index method to return a specific result
monkeypatch.setattr(
'htsinfer.get_library_source.GetLibSource.create_kallisto_index',
lambda *args, **kwargs: tmpdir / "kallisto.idx",
)

# Mock the get_source method to return a specific result
monkeypatch.setattr(
'htsinfer.get_library_source.GetLibSource.get_source',
lambda *args, **kwargs: SOURCE_FRUIT_FLY,
)

result = test_instance.evaluate()

assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id
assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name

assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name

def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
"""Test when self.tax_id is not None and src_name is found."""
CONFIG.args.tax_id = 7227
Expand All @@ -432,9 +397,9 @@ def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
CONFIG.args.out_dir = tmpdir
test_instance = GetLibSource(config=CONFIG)

# Mock the get_organism_name method to return a specific result
# Mock the get_source_name method to return a specific result
monkeypatch.setattr(
'htsinfer.get_library_source.GetLibSource.get_organism_name',
'htsinfer.get_library_source.GetLibSource.get_source_name',
lambda *args, **kwargs: "dmelanogaster",
)

Expand Down
Loading