Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tax_id parameter #147

Merged
merged 20 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,4 @@ tests/.DS_Store
results_htsinfer
.snakemake
tests/cluster_tests/results_sra_downloads
*.out
uniqueg marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions README.md
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ htsinfer [--output-directory PATH]
[--library-type-mates-cutoff FLOAT]
[--read-orientation-min-mapped-reads INT]
[--read-orientation-min-fraction FLOAT]
[--tax-id INT]
[--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
[-h] [--version]
PATH [PATH]
Expand Down
11 changes: 11 additions & 0 deletions htsinfer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,17 @@ def __call__(
"be reported. Must be above 0.5"
)
)
parser.add_argument(
'--tax-id',
dest="tax_id",
metavar="INT",
type=int,
default=None,
help=(
"NCBI taxonomic identifier of source organism of the library; "
"if provided, will not be inferred by the application"
)
)
parser.add_argument(
"--verbosity",
choices=[e.name for e in LogLevels],
Expand Down
6 changes: 6 additions & 0 deletions htsinfer/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,9 @@ class TranscriptsFastaProblem(Exception):

class CutadaptProblem(Exception):
"""Exception raised when running cutadapt commands."""


class UnsupportedSampleSourceException(Exception):
"""Exception raised when taxonomy ID is not found in the source
organism list.
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"""
83 changes: 75 additions & 8 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
import subprocess as sp
import tempfile

from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
from pandas import DataFrame # type: ignore

from htsinfer.exceptions import (
FileProblem,
KallistoProblem,
TranscriptsFastaProblem,
UnsupportedSampleSourceException,
)
from htsinfer.models import (
ResultsSource,
Expand Down Expand Up @@ -50,6 +52,7 @@ class GetLibSource:
min_freq_ratio: Minimum frequency ratio between the first and second
most frequent source in order for the former to be considered the
library's source.
tax_id: Taxonomy ID of the organism.
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"""
def __init__( # pylint: disable=E1101
self,
Expand All @@ -63,6 +66,7 @@ def __init__( # pylint: disable=E1101
self.tmp_dir = config.args.tmp_dir
self.min_match_pct = config.args.lib_source_min_match_pct
self.min_freq_ratio = config.args.lib_source_min_freq_ratio
self.tax_id = config.args.tax_id

def evaluate(self) -> ResultsSource:
"""Infer read source.
Expand All @@ -71,16 +75,36 @@ def evaluate(self) -> ResultsSource:
Source results object.
"""
source = ResultsSource()
index = self.create_kallisto_index()
source.file_1 = self.get_source(
fastq=self.paths[0],
index=index,
)
if self.paths[1] is not None:
source.file_2 = self.get_source(
fastq=self.paths[1],
# Check if library_source is provided, otherwise infer it
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
if self.tax_id is not None:
source.file_1.taxon_id = self.tax_id
src_name = self.get_source_name(
uniqueg marked this conversation as resolved.
Show resolved Hide resolved
self.tax_id,
self.transcripts_file
)
source.file_1.short_name = src_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id

return source

def create_kallisto_index(self) -> Path:
Expand Down Expand Up @@ -281,3 +305,46 @@ def get_source_expression(

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)

@staticmethod
def get_source_name(
taxon_id: int,
transcripts_file: Path,
) -> str:
"""Return name of the source organism, based on tax ID.

Args:
taxon_id: Taxonomy ID of a given organism.
transcripts_file: Path to FASTA file containing transcripts.

Returns:
Short name of the organism belonging to the given tax ID.

Raises:
FileProblem: Could not process input FASTA file.
UnsupportedSampleSourceException: Taxon ID is not supported.
"""
src_dict = {}

try:
for record in list(SeqIO.parse(
handle=transcripts_file,
format='fasta',
)):
tax_id = int(record.description.split("|")[4])
src_name = record.description.split("|")[3]

src_dict[tax_id] = src_name

except OSError as exc:
raise FileProblem(
f"Could not process file '{transcripts_file}'"
) from exc

try:
return src_dict[taxon_id]

except KeyError as exc:
raise UnsupportedSampleSourceException(
f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
) from exc
2 changes: 1 addition & 1 deletion htsinfer/get_read_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def evaluate(self) -> None:
try:
with open(self.path, encoding="utf-8") as _f: # type: ignore

LOGGER.debug("Procecssing Reads")
LOGGER.debug("Processing Reads")
try:
for record in FastqGeneralIterator(source=_f):
read = record[1]
Expand Down
12 changes: 6 additions & 6 deletions htsinfer/htsinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,15 @@ def evaluate(self):
self.get_library_stats()
LOGGER.info(
"Library stats determined: "
f"{self.config.results.library_stats.json()}"
f"{self.config.results.library_stats.model_dump_json()}"
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
)

# determine library source
LOGGER.info("Determining library source...")
self.config.results.library_source = self.get_library_source()
LOGGER.info(
"Library source determined: "
f"{self.config.results.library_source.json()}"
f"{self.config.results.library_source.model_dump_json()}"
)

# determine library type
Expand All @@ -106,7 +106,7 @@ def evaluate(self):
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Library type determined: "
f"{self.config.results.library_type.json()}"
f"{self.config.results.library_type.model_dump_json()}"
)

# determine read orientation
Expand All @@ -119,7 +119,7 @@ def evaluate(self):
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read orientation determined: "
f"{self.config.results.read_orientation.json()}"
f"{self.config.results.read_orientation.model_dump_json()}"
)

# determine read layout
Expand All @@ -132,7 +132,7 @@ def evaluate(self):
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read layout determined: "
f"{self.config.results.read_layout.json()}"
f"{self.config.results.read_layout.model_dump_json()}"
)

except FileProblem as exc:
Expand All @@ -148,7 +148,7 @@ def evaluate(self):
LOGGER.error(f"{type(exc).__name__}: {str(exc)}")

# log results
LOGGER.info(f"Results: {self.config.results.json()}")
LOGGER.info(f"Results: {self.config.results.model_dump_json()}")

def prepare_env(self):
"""Set up work environment."""
Expand Down
2 changes: 2 additions & 0 deletions htsinfer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ class Args(BaseModel):
records: Number of input file records to process; set to `0` to
process all records.
threads: Number of threads to run STAR with.
tax_id: Taxonomy ID of the source organism.
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
transcripts_file: File path to transcripts FASTA file.
read_layout_adapter_file: Path to text file containing 3' adapter
sequences to scan for (one sequence per line).
Expand Down Expand Up @@ -429,6 +430,7 @@ class Args(BaseModel):
CleanupRegimes.DEFAULT
records: int = 1000000
threads: int = 1
tax_id: Optional[int] = None
transcripts_file: Path = Path()
read_layout_adapter_file: Path = Path()
read_layout_min_match_pct: float = 0.1
Expand Down
4 changes: 2 additions & 2 deletions pylint.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[MESSAGES CONTROL]
disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-white-list=pysam,ahocorasick
disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-whitelist=pysam,ahocorasick
ignored-classes=pysam
Loading
Loading