refactor: replace org with tax-id

zavolanlab · Nov 14, 2023 · 8c87935 · 8c87935
1 parent b2f06e4
commit 8c87935
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -109,7 +109,7 @@ htsinfer [--output-directory PATH]
          [--library-type-mates-cutoff FLOAT]
          [--read-orientation-min-mapped-reads INT]
          [--read-orientation-min-fraction FLOAT]
-         [--org-id INT]
+         [--tax-id INT]
          [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
          [-h] [--version]
          PATH [PATH]

diff --git a/htsinfer/cli.py b/htsinfer/cli.py
@@ -263,14 +263,14 @@ def __call__(
         )
     )
     parser.add_argument(
-        '--org-id',
-        dest="org_id",
+        '--tax-id',
+        dest="tax_id",
         metavar="INT",
         type=int,
         default=None,
         help=(
-            "source organism of the sequencing library; if provided, will not "
-            "not be inferred by the application"
+            "NCBI taxonomic identifier of source organism of the library; "
+            "if provided, will not be inferred by the application"
         )
     )
     parser.add_argument(

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
@@ -4,8 +4,8 @@
 from pathlib import Path
 import subprocess as sp
 import tempfile
-
 from typing import Optional
+
 from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
 from pandas import DataFrame  # type: ignore
@@ -52,7 +52,7 @@ class GetLibSource:
         min_freq_ratio: Minimum frequency ratio between the first and second
             most frequent source in order for the former to be considered the
             library's source.
-        org_id: Taxonomy ID of the organism.
+        tax_id: Taxonomy ID of the organism.
     """
     def __init__(  # pylint: disable=E1101
         self,
@@ -66,7 +66,7 @@ def __init__(  # pylint: disable=E1101
         self.tmp_dir = config.args.tmp_dir
         self.min_match_pct = config.args.lib_source_min_match_pct
         self.min_freq_ratio = config.args.lib_source_min_freq_ratio
-        self.org_id = config.args.org_id
+        self.tax_id = config.args.tax_id
 
     def evaluate(self) -> ResultsSource:
         """Infer read source.
@@ -76,23 +76,23 @@ def evaluate(self) -> ResultsSource:
         """
         source = ResultsSource()
         # Check if library_source is provided, otherwise infer it
-        if self.org_id is not None:
-            source.file_1.taxon_id = self.org_id
-            org_name = self.get_organism_name(
-                self.org_id,
+        if self.tax_id is not None:
+            source.file_1.taxon_id = self.tax_id
+            src_name = self.get_source_name(
+                self.tax_id,
                 self.transcripts_file
             )
 
-            if org_name is not None:
-                source.file_1.short_name = org_name
+            if src_name is not None:
+                source.file_1.short_name = src_name
 
                 if self.paths[1] is not None:
-                    source.file_2.taxon_id = self.org_id
+                    source.file_2.taxon_id = self.tax_id
                     source.file_2.short_name = source.file_1.short_name
 
             else:
                 LOGGER.warning(
-                    f"Taxon ID '{self.org_id}' not found in "
+                    f"Taxon ID '{self.tax_id}' not found in "
                     "organism dictionary, inferring source organism..."
                 )
                 index = self.create_kallisto_index()
@@ -330,43 +330,40 @@ def get_source_expression(
         return dat_agg.sort_values(["tpm"], ascending=False)
 
     @staticmethod
-    def get_organism_name(
+    def get_source_name(
         taxon_id: int,
         transcripts_file: Path,
     ) -> Optional[str]:
-        """Return name of the organism, based on tax ID.
+        """Return name of the source organism, based on tax ID.
 
         Args:
-            taxon_id: Taxonomy ID of a given organism (int).
-            transcripts_file: Path to fasta file containing transcripts.
+            taxon_id: Taxonomy ID of a given organism.
+            transcripts_file: Path to FASTA file containing transcripts.
 
         Returns:
             Short name of the organism belonging to the given tax ID.
 
         Raises:
             Could not process input FASTA file.
         """
-        org_dict = {}
-        # Construct dictionary of organism ID's and names
+        src_dict = {}
+        # Construct dictionary of taxonomy IDs and short names
         try:
-            for record in SeqIO.parse(
+            for record in list(SeqIO.parse(
                     handle=transcripts_file,
                     format='fasta',
-            ):
-                org_id = int(record.description.split("|")[4])
-                org_name = record.description.split("|")[3]
+            )):
+                tax_id = int(record.description.split("|")[4])
+                src_name = record.description.split("|")[3]
 
-                if org_id not in org_dict:
-                    org_dict[org_id] = org_name
-                else:
-                    org_dict[org_id] = org_name
+                src_dict[tax_id] = src_name
 
         except OSError as exc:
             raise FileProblem(
                 f"Could not process file '{transcripts_file}'"
             ) from exc
 
-        if taxon_id in org_dict:
-            return org_dict[taxon_id]
+        if taxon_id in src_dict:
+            return src_dict[taxon_id]
 
         return None
diff --git a/htsinfer/models.py b/htsinfer/models.py
@@ -356,7 +356,7 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
-        org_id: Organism ID.
+        tax_id: NCBI taxonomy ID of the source organism.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
             sequences to scan for (one sequence per line).
@@ -430,7 +430,7 @@ class Args(BaseModel):
         CleanupRegimes.DEFAULT
     records: int = 1000000
     threads: int = 1
-    org_id: Optional[int] = None
+    tax_id: Optional[int] = None
     transcripts_file: Path = Path()
     read_layout_adapter_file: Path = Path()
     read_layout_min_match_pct: float = 0.1

diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py
@@ -264,19 +264,19 @@ def test_evaluate_min_freq_ratio(self, tmpdir):
             file_2=Source()
         )
 
-    def test_evaluate_org_id_not_none(self):
-        """Test when self.org_id is not None."""
-        CONFIG.args.org_id = 7227  # An example taxon ID
+    def test_evaluate_tax_id_not_none(self):
+        """Test when self.tax_id is not None."""
+        CONFIG.args.tax_id = 7227  # An example taxon ID
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
         test_instance = GetLibSource(config=CONFIG)
         result = test_instance.evaluate()
 
         assert result.file_1.taxon_id == 7227
         assert result.file_1.short_name == "dmelanogaster"
 
-    def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
-        """Test when self.org_id is None and self.paths[1] is not None."""
-        CONFIG.args.org_id = None
+    def test_evaluate_tax_id_none_with_path_2(self, tmpdir, monkeypatch):
+        """Test when self.tax_id is None and self.paths[1] is not None."""
+        CONFIG.args.tax_id = None
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -295,9 +295,9 @@ def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
         assert result.file_2.taxon_id == SOURCE_HUMAN.taxon_id
         assert result.file_2.short_name == SOURCE_HUMAN.short_name
 
-    def test_evaluate_org_id_not_none_with_path_2(self, tmpdir):
-        """Test when self.org_id is not None and self.paths[1] is not None."""
-        CONFIG.args.org_id = 7227
+    def test_evaluate_tax_id_not_none_with_path_2(self, tmpdir):
+        """Test when self.tax_id is not None and self.paths[1] is not None."""
+        CONFIG.args.tax_id = 7227
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -356,9 +356,9 @@ def test_get_organism_name_file_problem(self):
                 taxon_id, CONFIG.args.t_file_processed
             )
 
-    def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
-        """Test when self.org_id is None."""
-        CONFIG.args.org_id = None
+    def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir):
+        """Test when self.tax_id is None."""
+        CONFIG.args.tax_id = None
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -386,9 +386,9 @@ def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
         assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
         assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
 
-    def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
-        """Test when self.org_id is not None but org_name is not found."""
-        CONFIG.args.org_id = 7227
+    def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir):
+        """Test when self.tax_id is not None but src_name is not found."""
+        CONFIG.args.tax_id = 7227
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -422,9 +422,9 @@ def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
         assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
         assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
 
-    def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir):
-        """Test when self.org_id is not None and org_name is found."""
-        CONFIG.args.org_id = 7227
+    def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
+        """Test when self.tax_id is not None and src_name is found."""
+        CONFIG.args.tax_id = 7227
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS