Skip to content

Commit

Permalink
refactor: replace org with tax-id
Browse files Browse the repository at this point in the history
  • Loading branch information
balajtimate committed Nov 14, 2023
1 parent b2f06e4 commit 8c87935
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 52 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ htsinfer [--output-directory PATH]
[--library-type-mates-cutoff FLOAT]
[--read-orientation-min-mapped-reads INT]
[--read-orientation-min-fraction FLOAT]
[--org-id INT]
[--tax-id INT]
[--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
[-h] [--version]
PATH [PATH]
Expand Down
8 changes: 4 additions & 4 deletions htsinfer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,14 +263,14 @@ def __call__(
)
)
parser.add_argument(
'--org-id',
dest="org_id",
'--tax-id',
dest="tax_id",
metavar="INT",
type=int,
default=None,
help=(
"source organism of the sequencing library; if provided, will not "
"not be inferred by the application"
"NCBI taxonomic identifier of source organism of the library; "
"if provided, will not be inferred by the application"
)
)
parser.add_argument(
Expand Down
51 changes: 24 additions & 27 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from pathlib import Path
import subprocess as sp
import tempfile

from typing import Optional

from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
from pandas import DataFrame # type: ignore
Expand Down Expand Up @@ -52,7 +52,7 @@ class GetLibSource:
min_freq_ratio: Minimum frequency ratio between the first and second
most frequent source in order for the former to be considered the
library's source.
org_id: Taxonomy ID of the organism.
tax_id: Taxonomy ID of the organism.
"""
def __init__( # pylint: disable=E1101
self,
Expand All @@ -66,7 +66,7 @@ def __init__( # pylint: disable=E1101
self.tmp_dir = config.args.tmp_dir
self.min_match_pct = config.args.lib_source_min_match_pct
self.min_freq_ratio = config.args.lib_source_min_freq_ratio
self.org_id = config.args.org_id
self.tax_id = config.args.tax_id

def evaluate(self) -> ResultsSource:
"""Infer read source.
Expand All @@ -76,23 +76,23 @@ def evaluate(self) -> ResultsSource:
"""
source = ResultsSource()
# Check if a taxonomy ID is provided; otherwise infer the library source
if self.org_id is not None:
source.file_1.taxon_id = self.org_id
org_name = self.get_organism_name(
self.org_id,
if self.tax_id is not None:
source.file_1.taxon_id = self.tax_id
src_name = self.get_source_name(
self.tax_id,
self.transcripts_file
)

if org_name is not None:
source.file_1.short_name = org_name
if src_name is not None:
source.file_1.short_name = src_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.org_id
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
LOGGER.warning(
f"Taxon ID '{self.org_id}' not found in "
f"Taxon ID '{self.tax_id}' not found in "
"organism dictionary, inferring source organism..."
)
index = self.create_kallisto_index()
Expand Down Expand Up @@ -330,43 +330,40 @@ def get_source_expression(
return dat_agg.sort_values(["tpm"], ascending=False)

@staticmethod
def get_organism_name(
def get_source_name(
taxon_id: int,
transcripts_file: Path,
) -> Optional[str]:
"""Return name of the organism, based on tax ID.
"""Return name of the source organism, based on tax ID.
Args:
taxon_id: Taxonomy ID of a given organism (int).
transcripts_file: Path to fasta file containing transcripts.
taxon_id: Taxonomy ID of a given organism.
transcripts_file: Path to FASTA file containing transcripts.
Returns:
Short name of the organism belonging to the given tax ID.
Raises:
FileProblem: Could not process input FASTA file.
"""
org_dict = {}
# Construct dictionary of organism ID's and names
src_dict = {}
# Construct dictionary of taxonomy IDs and short names
try:
for record in SeqIO.parse(
for record in list(SeqIO.parse(
handle=transcripts_file,
format='fasta',
):
org_id = int(record.description.split("|")[4])
org_name = record.description.split("|")[3]
)):
tax_id = int(record.description.split("|")[4])
src_name = record.description.split("|")[3]

if org_id not in org_dict:
org_dict[org_id] = org_name
else:
org_dict[org_id] = org_name
src_dict[tax_id] = src_name

except OSError as exc:
raise FileProblem(
f"Could not process file '{transcripts_file}'"
) from exc

if taxon_id in org_dict:
return org_dict[taxon_id]
if taxon_id in src_dict:
return src_dict[taxon_id]

return None
4 changes: 2 additions & 2 deletions htsinfer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ class Args(BaseModel):
records: Number of input file records to process; set to `0` to
process all records.
threads: Number of threads to run STAR with.
org_id: Organism ID.
tax_id: NCBI taxonomy ID of the source organism.
transcripts_file: File path to transcripts FASTA file.
read_layout_adapter_file: Path to text file containing 3' adapter
sequences to scan for (one sequence per line).
Expand Down Expand Up @@ -430,7 +430,7 @@ class Args(BaseModel):
CleanupRegimes.DEFAULT
records: int = 1000000
threads: int = 1
org_id: Optional[int] = None
tax_id: Optional[int] = None
transcripts_file: Path = Path()
read_layout_adapter_file: Path = Path()
read_layout_min_match_pct: float = 0.1
Expand Down
36 changes: 18 additions & 18 deletions tests/test_get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,19 +264,19 @@ def test_evaluate_min_freq_ratio(self, tmpdir):
file_2=Source()
)

def test_evaluate_org_id_not_none(self):
"""Test when self.org_id is not None."""
CONFIG.args.org_id = 7227 # An example taxon ID
def test_evaluate_tax_id_not_none(self):
"""Test when self.tax_id is not None."""
CONFIG.args.tax_id = 7227 # An example taxon ID
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
test_instance = GetLibSource(config=CONFIG)
result = test_instance.evaluate()

assert result.file_1.taxon_id == 7227
assert result.file_1.short_name == "dmelanogaster"

def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
"""Test when self.org_id is None and self.paths[1] is not None."""
CONFIG.args.org_id = None
def test_evaluate_tax_id_none_with_path_2(self, tmpdir, monkeypatch):
"""Test when self.tax_id is None and self.paths[1] is not None."""
CONFIG.args.tax_id = None
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
Expand All @@ -295,9 +295,9 @@ def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
assert result.file_2.taxon_id == SOURCE_HUMAN.taxon_id
assert result.file_2.short_name == SOURCE_HUMAN.short_name

def test_evaluate_org_id_not_none_with_path_2(self, tmpdir):
"""Test when self.org_id is not None and self.paths[1] is not None."""
CONFIG.args.org_id = 7227
def test_evaluate_tax_id_not_none_with_path_2(self, tmpdir):
"""Test when self.tax_id is not None and self.paths[1] is not None."""
CONFIG.args.tax_id = 7227
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
Expand Down Expand Up @@ -356,9 +356,9 @@ def test_get_organism_name_file_problem(self):
taxon_id, CONFIG.args.t_file_processed
)

def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
"""Test when self.org_id is None."""
CONFIG.args.org_id = None
def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir):
"""Test when self.tax_id is None."""
CONFIG.args.tax_id = None
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
Expand Down Expand Up @@ -386,9 +386,9 @@ def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name

def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
"""Test when self.org_id is not None but org_name is not found."""
CONFIG.args.org_id = 7227
def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir):
"""Test when self.tax_id is not None but src_name is not found."""
CONFIG.args.tax_id = 7227
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
Expand Down Expand Up @@ -422,9 +422,9 @@ def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name

def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir):
"""Test when self.org_id is not None and org_name is found."""
CONFIG.args.org_id = 7227
def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
"""Test when self.tax_id is not None and src_name is found."""
CONFIG.args.tax_id = 7227
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
Expand Down

0 comments on commit 8c87935

Please sign in to comment.