From b67c2510e96ad0fd0bc93f3db2405ad072572a97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Fri, 8 Sep 2023 12:28:09 +0200
Subject: [PATCH 01/17] feat: add org param

---
 htsinfer/cli.py                | 22 +++++++++++++++++++++
 htsinfer/get_library_source.py | 36 ++++++++++++++++++++++++++--------
 htsinfer/models.py             |  6 +++++-
 3 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/htsinfer/cli.py b/htsinfer/cli.py
index 3fea80d4..c8d5c553 100644
--- a/htsinfer/cli.py
+++ b/htsinfer/cli.py
@@ -261,6 +261,28 @@ def __call__(
             "be reported. Must be above 0.5"
         )
     )
+    parser.add_argument(
+        '--org-id',
+        dest="org_id",
+        metavar="INT",
+        type=int,
+        default=None,
+        help=(
+            "taxon ID of the source organism of the sequencing library; only "
+            "used if '--org-name' is also provided"
+        )
+    )
+    parser.add_argument(
+        '--org-name',
+        dest="org_name",
+        metavar="STR",
+        type=str,
+        default=None,
+        help=(
+            "source organism of the sequencing library; if provided, will "
+            "not be inferred by the application"
+        )
+    )
     parser.add_argument(
         "--verbosity",
         choices=[e.name for e in LogLevels],
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 9e88ebe9..a07ecbc4 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -2,6 +2,7 @@
 
 import logging
 from pathlib import Path
+from typing import Optional
 import subprocess as sp
 import tempfile
 
@@ -63,6 +64,8 @@ def __init__(  # pylint: disable=E1101
         self.tmp_dir = config.args.tmp_dir
         self.min_match_pct = config.args.lib_source_min_match_pct
         self.min_freq_ratio = config.args.lib_source_min_freq_ratio
+        self.org_name = config.args.org_name
+        self.org_id = config.args.org_id
 
     def evaluate(self) -> ResultsSource:
         """Infer read source.
@@ -71,16 +74,33 @@ def evaluate(self) -> ResultsSource:
             Source results object.
         """
         source = ResultsSource()
-        index = self.create_kallisto_index()
-        source.file_1 = self.get_source(
-            fastq=self.paths[0],
-            index=index,
-        )
-        if self.paths[1] is not None:
-            source.file_2 = self.get_source(
-                fastq=self.paths[1],
+        # Check if library_source is provided, otherwise infer it
+        if self.org_name is not None:
+            source.file_1.short_name = self.org_name
+            source.file_1.taxon_id = self.org_id
+        else:
+            # Infer library source of the first file and store it in `source`
+            index = self.create_kallisto_index()
+            library_source = self.get_source(
+                fastq=self.paths[0],
                 index=index,
             )
+            source.file_1.short_name = library_source.short_name
+            source.file_1.taxon_id = library_source.taxon_id
+
+        if self.paths[1] is not None:
+            # Check if library_source is provided for file_2, otherwise infer it
+            if self.org_name is not None:
+                source.file_2.short_name = self.org_name
+                source.file_2.taxon_id = self.org_id
+            else:
+                library_source = self.get_source(
+                    fastq=self.paths[1],
+                    index=index,
+                )
+                source.file_2.short_name = library_source.short_name
+                source.file_2.taxon_id = library_source.taxon_id
+
         return source
 
     def create_kallisto_index(self) -> Path:
diff --git a/htsinfer/models.py b/htsinfer/models.py
index 84c5cc42..391d856b 100644
--- a/htsinfer/models.py
+++ b/htsinfer/models.py
@@ -6,7 +6,7 @@
 )
 import logging
 import re
-from typing import Optional
+from typing import Optional, Union
 from pathlib import Path
 import tempfile
 
@@ -356,6 +356,8 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
+        org_name: Organism name.
+        org_id: Organism ID.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
             sequences to scan for (one sequence per line).
@@ -429,6 +431,8 @@ class Args(BaseModel):
         CleanupRegimes.DEFAULT
     records: int = 0
     threads: int = 1
+    org_name: Optional[str] = None
+    org_id: Optional[int] = None
     transcripts_file: Path = Path()
     read_layout_adapter_file: Path = Path()
     read_layout_min_match_pct: float = 0.1

From dc8af234304d7d140facadf9af8f5b7591a339e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Boris=20Juri=C4=8D?=
 <74237898+BorisYourich@users.noreply.github.com>
Date: Thu, 14 Sep 2023 10:22:58 +0200
Subject: [PATCH 02/17] refactor: avoid duplicate mappings (#131)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Boris Jurič <499542@mail.muni.cz>
Co-authored-by: Alex Kanitz <alexander.kanitz@alumni.ethz.ch>
---
 .github/workflows/ci.yml           |  35 +--
 README.md                          |  10 +-
 environment-dev.yml                |  25 +-
 environment.yml                    |  24 +-
 htsinfer/cli.py                    |   1 +
 htsinfer/get_library_type.py       |  31 +--
 htsinfer/get_read_orientation.py   | 345 +-------------------------
 htsinfer/htsinfer.py               |   4 +
 htsinfer/mapping.py                | 378 +++++++++++++++++++++++++++++
 setup.py                           |   2 +-
 tests/test_get_library_type.py     |  49 ++--
 tests/test_get_read_orientation.py | 291 ++++------------------
 tests/test_mapping.py              | 259 ++++++++++++++++++++
 tests/utils.py                     |   3 +
 14 files changed, 801 insertions(+), 656 deletions(-)
 create mode 100644 htsinfer/mapping.py
 create mode 100644 tests/test_mapping.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22c59e57..3993e32a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,21 +12,18 @@ jobs:
     steps:
 
       - name: check out repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: set up miniconda and env
         uses: conda-incubator/setup-miniconda@v2
         with:
-          auto-update-conda: true
+          python-version: "3.9"
           mamba-version: "*"
-          channels: conda-forge,defaults
-          environment-file: environment.yml
+          auto-update-conda: true
           activate-environment: htsinfer
+          environment-file: environment-dev.yml
           auto-activate-base: false
 
-      - name: update env with dev packages
-        run: mamba env update --file environment-dev.yml
-
       - name: display env info
         run: |
           conda info -a
@@ -50,7 +47,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ '3.7', '3.8', '3.9' ]
+        python-version: [ '3.8', '3.9', '3.10' ]
 
     name: unit-testing-Python-${{ matrix.python-version }}
 
@@ -63,16 +60,12 @@ jobs:
         uses: conda-incubator/setup-miniconda@v2
         with:
           python-version: ${{ matrix.python-version }}
-          auto-update-conda: true
           mamba-version: "*"
-          channels: conda-forge,defaults
-          environment-file: environment.yml
+          auto-update-conda: true
           activate-environment: htsinfer
+          environment-file: environment-dev.yml
           auto-activate-base: false
 
-      - name: update env with dev packages
-        run: mamba env update --file environment-dev.yml
-
       - name: display env info
         run: |
           conda info -a
@@ -100,29 +93,25 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ '3.7', '3.8', '3.9' ]
+        python-version: [ '3.8', '3.9', '3.10' ]
 
     name: integration-testing-Python-${{ matrix.python-version }}
 
     steps:
 
       - name: check out repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: set up miniconda and env
         uses: conda-incubator/setup-miniconda@v2
         with:
           python-version: ${{ matrix.python-version }}
-          auto-update-conda: true
           mamba-version: "*"
-          channels: conda-forge,defaults
-          environment-file: environment.yml
+          auto-update-conda: true
           activate-environment: htsinfer
+          environment-file: environment-dev.yml
           auto-activate-base: false
 
-      - name: update env with dev packages
-        run: mamba env update --file environment-dev.yml
-
       - name: display env info
         run: |
           conda info -a
@@ -171,4 +160,4 @@ jobs:
         run: |
           echo "Push indicator: ${{ steps.docker.outputs.push-indicator }}"
           echo "# Set to 'true' if image was pushed, empty string otherwise"
-          test "${{ steps.docker.outputs.push-indicator }}" == "true"
\ No newline at end of file
+          test "${{ steps.docker.outputs.push-indicator }}" == "true"
diff --git a/README.md b/README.md
index 35f49184..6149a708 100644
--- a/README.md
+++ b/README.md
@@ -123,12 +123,14 @@ dependencies via [Conda][conda]:
 git clone https://github.com/zavolanlab/htsinfer
 cd htsinfer
 conda env create --file environment.yml
-conda env update --file environment-dev.yml  # optional: install development/testing dependencies
+# Alternatively, to install with development dependencies,
+# run the following instead
+conda env create --file environment-dev.yml
 ```
 
-Note that creating the environment takes non-trivial time and it is strongly
-recommended that you install [Mamba][mamba] and replace `conda` with `mamba`
-in the previous commands.
+> Note that creating the environment takes non-trivial time and it is strongly
+> recommended that you install [Mamba][mamba] and replace `conda` with `mamba`
+> in the previous command.
 
 Then, activate the `htsinfer` Conda environment with:
 
diff --git a/environment-dev.yml b/environment-dev.yml
index 8b243f0f..40f4f5f4 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -1,11 +1,24 @@
 name: htsinfer
 channels:
-  - defaults
+  - conda-forge
+  - bioconda
 dependencies:
-  - coverage>=5.3
-  - flake8>=3.8.4
-  - mypy>=0.782
-  - pylint>=2.4.4
-  - pytest>=6.1.0
+  - biopython >=1.78
+  - coverage >=5.3
+  - cutadapt >=3.5, <=4.2
+  - flake8 >=3.8.4
+  - kallisto >=0.46.1, <= 0.48.0
+  - mypy >=0.782
+  - numpy >=1.22, <1.25
+  - pandas >=1.3.5, <1.4.0
+  - pip >=20.2.3
+  - pyahocorasick >=1.4.0
+  - pydantic >=1.8.1, <2
+  - pylint >=2.4.4
+  - pysam >=0.16.0
+  - pytest >=6.1.0
+  - python >=3.8, <=3.10
+  - star >=2.7.6
   - pip:
     - python-semantic-release>=7.15.0
+    - -e .
diff --git a/environment.yml b/environment.yml
index 54c4b8a8..08e4fd28 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,18 +1,18 @@
 name: htsinfer
 channels:
-  - defaults
-  - bioconda
   - conda-forge
+  - bioconda
 dependencies:
-  - biopython>=1.78
-  - kallisto>=0.46.1, <= 0.48.0
-  - pandas>=1.0.5
-  - pip>=20.2.3
-  - pyahocorasick>=1.4.0
-  - pydantic>=1.8.1, <2
-  - pysam>=0.16.0
-  - python>3.6, <3.10
-  - star>=2.7.6
-  - cutadapt>=3.5, <=4.2
+  - biopython >=1.78
+  - cutadapt >=3.5, <=4.2
+  - kallisto >=0.46.1, <= 0.48.0
+  - numpy >=1.22, <1.25
+  - pandas >=1.3.5, <1.4.0
+  - pip >=20.2.3
+  - pyahocorasick >=1.4.0
+  - pydantic >=1.8.1, <2
+  - pysam >=0.16.0
+  - python >=3.8, <=3.10
+  - star >=2.7.6
   - pip:
     - -e .
diff --git a/htsinfer/cli.py b/htsinfer/cli.py
index c8d5c553..256aeaa8 100644
--- a/htsinfer/cli.py
+++ b/htsinfer/cli.py
@@ -50,6 +50,7 @@ def __call__(
             values,
             option_string=None,
         ) -> None:
+            assert isinstance(values, list)
             if len(values) > 2:
                 parser.print_usage(file=sys.stderr)
                 sys.stderr.write(
diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py
index 2edfd095..a2c03a02 100644
--- a/htsinfer/get_library_type.py
+++ b/htsinfer/get_library_type.py
@@ -21,9 +21,7 @@
     SeqIdFormats,
     Config,
 )
-from htsinfer.get_read_orientation import (
-    GetOrientation,
-)
+from htsinfer.mapping import Mapping
 
 LOGGER = logging.getLogger(__name__)
 
@@ -62,6 +60,7 @@ class GetLibType:
     def __init__(
         self,
         config: Config,
+        mapping: Mapping,
     ):
         """Class constructor."""
         self.path_1: Path = config.args.path_1_processed
@@ -69,8 +68,7 @@ def __init__(
         self.library_source = config.results.library_source
         self.results: ResultsType = ResultsType()
         self.tmp_dir = config.args.tmp_dir
-        self.get_read_orientation: \
-            GetOrientation = GetOrientation(config=config)
+        self.mapping = mapping
         self.max_distance = config.args.lib_type_max_distance
         self.cutoff = config.args.lib_type_mates_cutoff
 
@@ -126,23 +124,24 @@ def _evaluate_mate_relationship(
                 self.results.relationship = (
                     StatesTypeRelationship.split_mates
                 )
+                self.mapping.library_type.relationship = (
+                    StatesTypeRelationship.split_mates
+                )
         else:
-            self.get_read_orientation.library_type.relationship \
+            self.mapping.library_type.relationship \
                 = StatesTypeRelationship.not_available
-            self.get_read_orientation.library_source = self.library_source
-            _ = self.get_read_orientation.evaluate()
+            self.mapping.library_source = self.library_source
+            self.mapping.evaluate()
             self._align_mates()
 
     def _align_mates(self):
         """Decide mate relationship by alignment."""
 
-        alignment_1 = Path(self.tmp_dir) \
-            / "alignments" / "file_1" / "Aligned.out.sam"
-        alignment_2 = Path(self.tmp_dir) \
-            / "alignments" / "file_2" / "Aligned.out.sam"
+        alignment_1 = self.mapping.star_dirs[0] / 'Aligned.out.sam'
+        alignment_2 = self.mapping.star_dirs[1] / 'Aligned.out.sam'
 
-        samfile1 = pysam.AlignmentFile(alignment_1, 'r')
-        samfile2 = pysam.AlignmentFile(alignment_2, 'r')
+        samfile1 = pysam.AlignmentFile(str(alignment_1), 'r')
+        samfile2 = pysam.AlignmentFile(str(alignment_2), 'r')
 
         previous_seq_id1 = None
         previous_seq_id2 = None
@@ -184,6 +183,10 @@ def _align_mates(self):
             self.results.relationship = (
                 StatesTypeRelationship.split_mates
             )
+            self.mapping.library_type.relationship \
+                = StatesTypeRelationship.split_mates
+            self.mapping.mapped = False
+            self.mapping.star_dirs = []
         else:
             self.results.relationship = (
                 StatesTypeRelationship.not_mates
diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py
index b923234a..d76b0922 100644
--- a/htsinfer/get_read_orientation.py
+++ b/htsinfer/get_read_orientation.py
@@ -2,26 +2,21 @@
 
 from collections import defaultdict
 import logging
-import math
 from pathlib import Path
-import subprocess as sp
 from typing import (Any, DefaultDict, Dict, List)
 
-from Bio import SeqIO  # type: ignore
 import pysam  # type: ignore
 
 from htsinfer.exceptions import (
     FileProblem,
-    SamFileProblem,
-    StarProblem,
 )
 from htsinfer.models import (
     ResultsOrientation,
     StatesOrientation,
     StatesOrientationRelationship,
-    StatesTypeRelationship,
     Config,
 )
+from htsinfer.mapping import Mapping
 
 LOGGER = logging.getLogger(__name__)
 
@@ -54,6 +49,7 @@ class GetOrientation:
     def __init__(
         self,
         config: Config,
+        mapping: Mapping,
     ):
         """Class contructor."""
         self.paths = (config.args.path_1_processed,
@@ -62,9 +58,9 @@ def __init__(
         self.library_source = config.results.library_source
         self.transcripts_file = config.args.t_file_processed
         self.tmp_dir = config.args.tmp_dir
-        self.threads_star = config.args.threads
         self.min_mapped_reads = config.args.read_orientation_min_mapped_reads
         self.min_fraction = config.args.read_orientation_min_fraction
+        self.mapping = mapping
 
     def evaluate(self) -> ResultsOrientation:
         """Infer read orientation.
@@ -73,327 +69,16 @@ def evaluate(self) -> ResultsOrientation:
             Orientation results object.
         """
 
-        # get transcripts for current organims
-        transcripts = self.subset_transcripts_by_organism()
-        ref_size = self.get_fasta_size(fasta=transcripts)
-        index_string_size = self.get_star_index_string_size(ref_size=ref_size)
-        chr_bin_bits = self.get_star_chr_bin_bits(ref_size=ref_size,
-                                                  transcripts=transcripts)
-
-        # generate STAR alignments
-        index_dir = self.create_star_index(
-            fasta=transcripts,
-            index_string_size=index_string_size,
-            chr_bin_bits=chr_bin_bits,
-        )
-        star_cmds = self.prepare_star_alignment_commands(index_dir=index_dir)
-        self.generate_star_alignments(commands=star_cmds)
-
-        # process alignments
-        star_dirs = [e for e in star_cmds if e is not None]
-
-        return self.process_alignments(star_dirs=star_dirs)
-
-    def subset_transcripts_by_organism(self) -> Path:
-        """Filter FASTA file of transcripts by current sources.
-
-        The filtered file contains records from the indicated sources.
-            Typically, this is one source. However, for if two input files
-            were supplied that are originating from different sources (i.e.,
-            not from a valid paired-ended library), it may be from two
-            different sources. If no source is supplied (because it could
-            not be inferred), no filtering is done.
-
-        Returns:
-            Path to filtered FASTA file.
-
-        Raises:
-            FileProblem: Could not open input/output FASTA file for
-                reading/writing.
-        """
-        LOGGER.debug(f"Subsetting transcripts for: {self.library_source}")
-
-        outfile = self.tmp_dir / f"{self.library_source}.fasta"
-
-        def yield_filtered_seqs():
-            """Generator yielding sequence records for specified sources.
-
-            Yields:
-                Next FASTA sequence record of the specified sources.
-
-            Raises: Could not process input FASTA file.
-            """
-            sources = []
-            if self.library_source.file_1.short_name is not None:
-                sources.append(self.library_source.file_1.short_name)
-            if self.library_source.file_2.short_name is not None:
-                sources.append(self.library_source.file_2.short_name)
-            try:
-                for record in SeqIO.parse(
-                    handle=self.transcripts_file,
-                    format='fasta',
-                ):
-                    try:
-                        org_name = record.description.split("|")[3]
-                    except (ValueError, IndexError):
-                        continue
-                    if org_name in sources or len(sources) == 0:
-                        yield record
-
-            except OSError as exc:
-                raise FileProblem(
-                    f"Could not process file '{self.transcripts_file}'"
-                ) from exc
-
-        try:
-            SeqIO.write(
-                sequences=yield_filtered_seqs(),
-                handle=outfile,
-                format='fasta',
-            )
-        except OSError as exc:
-            raise FileProblem(
-                f"Failed to write to FASTA file '{outfile}'"
-            ) from exc
-
-        LOGGER.debug(f"Filtered transcripts file: {outfile}")
-        return outfile
-
-    @staticmethod
-    def get_fasta_size(fasta: Path) -> int:
-        """Get size of FASTA file in total nucleotides.
-
-        Args:
-            fasta: Path to FASTA file.
-
-        Returns:
-            Total number of nucleotides of all records.
-
-        Raises:
-            FileProblem: Could not open FASTA file for reading.
-        """
-        nucleotides: int = 0
-
-        try:
-            for record in SeqIO.parse(
-                handle=fasta,
-                format='fasta',
-            ):
-                nucleotides += len(record.seq)
-
-        except OSError as exc:
-            raise FileProblem(
-                f"Could not process file: {fasta}"
-            ) from exc
-
-        LOGGER.debug(f"Size of reference: {nucleotides}")
-        return nucleotides
-
-    @staticmethod
-    def get_star_index_string_size(ref_size: int) -> int:
-        """Get length of STAR SA pre-indexing string.
-
-        Cf.
-        https://github.com/alexdobin/STAR/blob/51b64d4fafb7586459b8a61303e40beceeead8c0/doc/STARmanual.pdf
-
-        Args:
-            ref_size: Size of genome/transcriptome reference in nucleotides.
-
-        Returns:
-            Size (in nucleotides) of SA pre-indexing string.
-        """
-        index_string_size = min(
-            14,
-            int(math.floor(math.log2(ref_size) / 2 - 1))
-        )
-        LOGGER.debug(f"STAR SA pre-indexing string size: {index_string_size}")
-        return index_string_size
-
-    @staticmethod
-    def get_star_chr_bin_bits(ref_size: int, transcripts: Path) -> int:
-        """Get size of bins for STAR genome storage.
+        self.mapping.paths = self.paths
+        self.mapping.library_type = self.library_type
+        self.mapping.library_source = self.library_source
+        self.mapping.transcripts_file = self.transcripts_file
+        self.mapping.tmp_dir = self.tmp_dir
 
-        Args:
-            ref_size: Size of genome/transcriptome reference in nucleotides.
-            transcripts: Path to filtered FASTA transcripts file.
-
-        Returns:
-            Number of bins for genome storage.
-        """
-        n_ref: int = 0
-
-        for _ in SeqIO.parse(
-            handle=transcripts,
-            format='fasta',
-        ):
-            n_ref += 1
-
-        chr_bin_bits = min(
-            18,
-            int(round(math.log2(ref_size / n_ref)))
-        )
-        LOGGER.debug("STAR size of bins for genome storage: %s", chr_bin_bits)
-        return chr_bin_bits
-
-    def create_star_index(
-        self,
-        fasta: Path,
-        chr_bin_bits: int = 18,
-        index_string_size: int = 5,
-    ) -> Path:
-        """Prepare STAR index.
-
-        Args:
-            fasta: Path to FASTA file of sequence records to create index from.
-            index_string_size: Size of SA pre-indexing string, in nucleotides.
-
-        Returns:
-            Path to directory containing STAR index.
-
-        Raises:
-            StarProblem: STAR index could not be created.
-        """
-        LOGGER.debug(f"Creating STAR index for: {fasta}")
-
-        index_dir: Path = Path(self.tmp_dir) / "index"
-
-        # solves the macOS issue with STAR
-        index_dir.mkdir(parents=True, exist_ok=True)
-
-        cmd = [
-            "STAR",
-            "--runMode", "genomeGenerate",
-            "--genomeSAindexNbases", f"{str(index_string_size)}",
-            "--genomeChrBinNbits", f"{str(chr_bin_bits)}",
-            "--runThreadN", f"{str(self.threads_star)}",
-            "--genomeDir", f"{str(index_dir)}",
-            "--genomeFastaFiles", f"{str(fasta)}",
-        ]
-
-        result = sp.run(
-            cmd,
-            capture_output=True,
-            text=True,
-        )
-        if result.returncode != 0:
-            LOGGER.error(result.stderr)
-            raise StarProblem("Failed to create STAR index")
+        if not self.mapping.mapped:
+            self.mapping.evaluate()
 
-        LOGGER.debug(f"STAR index created: {index_dir}")
-        return index_dir
-
-    def prepare_star_alignment_commands(
-        self,
-        index_dir: Path,
-    ) -> Dict[Path, List[str]]:
-        """Prepare STAR alignment commands.
-
-        Args:
-            index_dir: Path to directory containing STAR index.
-
-        Returns:
-            Dictionary of output paths and corresponding STAR commands.
-        """
-        LOGGER.debug("Preparing STAR commands...")
-
-        # helper function for compiling individual command
-        def build_star_command(
-            read_files: List[str],
-            out_dir: str,
-        ) -> List[str]:
-            """Compile an individual STAR alignment command.
-
-            Args:
-                read_files: List of read file paths.
-                out_dir: STAR output directory.
-
-            Returns:
-                STAR command list.
-            """
-            cmd_base: List[str] = [
-                "STAR",
-                "--alignIntronMax", "1",
-                "--alignEndsType", "Local",
-                "--runThreadN", f"{str(self.threads_star)}",
-                "--genomeDir", f"{str(index_dir)}",
-                "--outFilterMultimapNmax", "50",
-                "--outSAMunmapped", "Within", "KeepPairs",
-            ]
-            cmd: List[str] = cmd_base[:]
-            cmd.append("--readFilesIn")
-            cmd.extend(read_files)
-            cmd.append("--outFileNamePrefix")
-            cmd.append(out_dir)
-
-            # solves the macOS issue with STAR
-            Path(out_dir).mkdir(parents=True, exist_ok=True)
-
-            return cmd
-
-        out_dir_base: Path = Path(self.tmp_dir) / "alignments"
-        commands: Dict = {}
-
-        # create command for pairend-ended libraries
-        if (
-            self.library_type.relationship
-            == StatesTypeRelationship.split_mates
-        ):
-            out_dir = out_dir_base / "paired"
-            commands[out_dir] = build_star_command(
-                read_files=[str(path) for path in self.paths],
-                out_dir=f"{str(out_dir)}/",
-            )
-        # create commands for single-ended libraries
-        else:
-            out_dir = out_dir_base / "file_1"
-            commands[out_dir] = build_star_command(
-                read_files=[str(self.paths[0])],
-                out_dir=f"{str(out_dir)}/",
-            )
-            # run two commands in case there is a second file provided that is
-            # not a mate of the first one
-            out_dir = out_dir_base / "file_2"
-            if self.paths[1] is not None:
-                commands[out_dir] = build_star_command(
-                    read_files=[str(self.paths[1])],
-                    out_dir=f"{str(out_dir)}/",
-                )
-
-        return commands
-
-    @staticmethod
-    def generate_star_alignments(commands: Dict[Path, List[str]]) -> None:
-        """Align reads to index with STAR.
-
-        Args:
-            commands: Dictionary of output paths and corresponding STAR
-                commands.
-
-        Raises:
-            StarProblem: Generating alignments failed.
-        """
-        LOGGER.debug("Aligning reads with STAR...")
-
-        # execute commands
-        for out_dir, cmd in commands.items():
-            try:
-                result = sp.run(
-                    cmd,
-                    capture_output=True,
-                    text=True,
-                    check=True,
-                )
-                if result.returncode != 0:
-                    LOGGER.error(result.stderr)
-                    raise StarProblem(
-                        "Failed to generate STAR alignments for command: "
-                        f"{cmd}"
-                    )
-            except sp.CalledProcessError as exc:
-                raise StarProblem(
-                    f"Failed to generate STAR alignments for command: {cmd}"
-                ) from exc
-            LOGGER.debug(f"Written STAR output to directory: {out_dir}")
+        return self.process_alignments(star_dirs=self.mapping.star_dirs)
 
     def process_alignments(
         self,
@@ -459,17 +144,11 @@ def process_single(
                             states[record.query_name].append(
                                 StatesOrientation.stranded_forward
                             )
-
-        except OSError as exc:
+        except (OSError, ValueError) as exc:
             raise FileProblem(
                 f"Failed to open SAM file: '{sam}'"
             ) from exc
 
-        except ValueError as exc:
-            raise SamFileProblem(
-                f"Not a valid SAM file: '{sam}'"
-            ) from exc
-
         LOGGER.debug("Deciding read orientation...")
         reads = len(states)
         fractions = [
diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py
index 28dd3bf8..9f2f6602 100755
--- a/htsinfer/htsinfer.py
+++ b/htsinfer/htsinfer.py
@@ -27,6 +27,7 @@
     Config,
 )
 from htsinfer.subset_fastq import SubsetFastq
+from htsinfer.mapping import Mapping
 
 LOGGER = logging.getLogger(__name__)
 
@@ -64,6 +65,7 @@ def __init__(
             else config.args.tmp_dir / config.args.transcripts_file.name
         )
         self.state: RunStates = RunStates.OKAY
+        self.mapping: Mapping = Mapping(config=self.config)
 
     def evaluate(self):
         """Determine library metadata."""
@@ -247,6 +249,7 @@ def get_library_type(self):
         """Determine library type."""
         get_lib_type = GetLibType(
             config=self.config,
+            mapping=self.mapping,
         )
         get_lib_type.evaluate()
         self.config.results.library_type = get_lib_type.results
@@ -255,6 +258,7 @@ def get_read_orientation(self):
         """Determine read orientation."""
         get_read_orientation = GetOrientation(
             config=self.config,
+            mapping=self.mapping,
         )
         self.config.results.read_orientation = get_read_orientation.evaluate()
 
diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py
new file mode 100644
index 00000000..cb53f7a5
--- /dev/null
+++ b/htsinfer/mapping.py
@@ -0,0 +1,378 @@
+"""Mapping FASTQs and managing the outputs of STAR."""
+
+import logging
+import math
+from pathlib import Path
+import subprocess as sp
+from typing import (Dict, List)
+
+from Bio import SeqIO  # type: ignore
+
+from htsinfer.exceptions import (
+    FileProblem,
+    StarProblem,
+)
+from htsinfer.models import (
+    Config,
+    StatesTypeRelationship,
+)
+
+LOGGER = logging.getLogger(__name__)
+
+
+class Mapping:
+    """Map FASTQ file/s and manage outputs.
+
+    Args:
+        config: Configuration object holding arguments and results.
+
+    Attributes:
+        paths: Processed paths of the first and (optional) second file.
+        star_dirs: STAR output directories; filled in by `evaluate()`.
+
+    Raise:
+        FileProblem: The input file could not be parsed or the output file
+            could not be written.
+    """
+
+    def __init__(
+            self,
+            config: Config,
+    ):
+        """Class constructor."""
+        self.paths = (config.args.path_1_processed,
+                      config.args.path_2_processed)
+        self.library_type = config.results.library_type
+        self.library_source = config.results.library_source
+        self.transcripts_file = config.args.t_file_processed
+        self.tmp_dir = config.args.tmp_dir
+        self.threads_star = config.args.threads
+        self.mapped = False
+        self.star_dirs: List[Path] = []
+
+    def evaluate(self):
+        """Map reads against source-specific transcripts with STAR.
+
+        Stores the STAR output directories in `star_dirs` and sets `mapped`
+        to `True`; nothing is returned.
+        """
+
+        # get transcripts for current organisms
+        transcripts = self.subset_transcripts_by_organism()
+        ref_size = self.get_fasta_size(fasta=transcripts)
+        index_string_size = self.get_star_index_string_size(ref_size=ref_size)
+        chr_bin_bits = self.get_star_chr_bin_bits(ref_size=ref_size,
+                                                  transcripts=transcripts)
+
+        # generate STAR alignments
+        index_dir = self.create_star_index(
+            fasta=transcripts,
+            index_string_size=index_string_size,
+            chr_bin_bits=chr_bin_bits,
+        )
+        star_cmds = self.prepare_star_alignment_commands(index_dir=index_dir)
+        self.generate_star_alignments(commands=star_cmds)
+        # process alignments
+        self.star_dirs = [e for e in star_cmds if e is not None]
+        self.mapped = True
+
+    def subset_transcripts_by_organism(self) -> Path:
+        """Filter FASTA file of transcripts by current sources.
+
+        The filtered file contains records from the indicated sources.
+            Typically, this is one source. However, if two input files
+            were supplied that are originating from different sources (i.e.,
+            not from a valid paired-ended library), it may be from two
+            different sources. If no source is supplied (because it could
+            not be inferred), no filtering is done.
+
+        Returns:
+            Path to filtered FASTA file.
+
+        Raises:
+            FileProblem: Could not open input/output FASTA file for
+                reading/writing.
+        """
+        LOGGER.debug(f"Subsetting transcripts for: {self.library_source}")
+
+        outfile = self.tmp_dir / "transcripts_subset.fasta"
+
+        def yield_filtered_seqs():
+            """Generator yielding sequence records for specified sources.
+
+            Yields:
+                Next FASTA sequence record of the specified sources.
+
+            Raises: FileProblem if the input FASTA file cannot be processed.
+            """
+            sources = []
+            if self.library_source.file_1.short_name is not None:
+                sources.append(self.library_source.file_1.short_name)
+            if self.library_source.file_2.short_name is not None:
+                sources.append(self.library_source.file_2.short_name)
+            try:
+                for record in SeqIO.parse(
+                        handle=self.transcripts_file,
+                        format='fasta',
+                ):
+                    try:
+                        org_name = record.description.split("|")[3]
+                    except (ValueError, IndexError):
+                        continue
+                    if org_name in sources or len(sources) == 0:
+                        yield record
+
+            except OSError as exc:
+                raise FileProblem(
+                    f"Could not process file '{self.transcripts_file}'"
+                ) from exc
+
+        try:
+            SeqIO.write(
+                sequences=yield_filtered_seqs(),
+                handle=outfile,
+                format='fasta',
+            )
+        except OSError as exc:
+            raise FileProblem(
+                f"Failed to write to FASTA file '{outfile}'"
+            ) from exc
+
+        LOGGER.debug(f"Filtered transcripts file: {outfile}")
+        return outfile
+
+    @staticmethod
+    def get_fasta_size(fasta: Path) -> int:
+        """Get size of FASTA file in total nucleotides.
+
+        Args:
+            fasta: Path to FASTA file.
+
+        Returns:
+            Total number of nucleotides of all records.
+
+        Raises:
+            FileProblem: Could not open FASTA file for reading.
+        """
+        nucleotides: int = 0
+
+        try:
+            for record in SeqIO.parse(
+                    handle=fasta,
+                    format='fasta',
+            ):
+                nucleotides += len(record.seq)
+
+        except OSError as exc:
+            raise FileProblem(
+                f"Could not process file: {fasta}"
+            ) from exc
+
+        LOGGER.debug(f"Size of reference: {nucleotides}")
+        return nucleotides
+
+    @staticmethod
+    def get_star_index_string_size(ref_size: int) -> int:
+        """Get length of STAR SA pre-indexing string.
+
+        Cf.
+        https://github.com/alexdobin/STAR/blob/51b64d4fafb7586459b8a61303e40beceeead8c0/doc/STARmanual.pdf
+
+        Args:
+            ref_size: Size of genome/transcriptome reference in nucleotides.
+
+        Returns:
+            Size (in nucleotides) of SA pre-indexing string.
+        """
+        index_string_size = min(
+            14,
+            int(math.floor(math.log2(ref_size) / 2 - 1))
+        )
+        LOGGER.debug(f"STAR SA pre-indexing string size: {index_string_size}")
+        return index_string_size
+
+    @staticmethod
+    def get_star_chr_bin_bits(ref_size: int, transcripts: Path) -> int:
+        """Get size of bins for STAR genome storage.
+
+        Args:
+            ref_size: Size of genome/transcriptome reference in nucleotides.
+            transcripts: Path to filtered FASTA transcripts file.
+
+        Returns:
+            Value for STAR's `--genomeChrBinNbits` (log2 of the bin size).
+        """
+        n_ref: int = 0
+
+        for _ in SeqIO.parse(
+                handle=transcripts,
+                format='fasta',
+        ):
+            n_ref += 1
+
+        chr_bin_bits = min(
+            18,
+            int(round(math.log2(ref_size / n_ref)))
+        )
+        LOGGER.debug("STAR size of bins for genome storage: %s", chr_bin_bits)
+        return chr_bin_bits
+
+    def create_star_index(
+            self,
+            fasta: Path,
+            chr_bin_bits: int = 18,
+            index_string_size: int = 5,
+    ) -> Path:
+        """Prepare STAR index.
+
+        Args:
+            fasta: Path to FASTA file of sequence records to create index from.
+            index_string_size: Size of SA pre-indexing string, in nucleotides.
+
+        Returns:
+            Path to directory containing STAR index.
+
+        Raises:
+            StarProblem: STAR index could not be created.
+        """
+        LOGGER.debug(f"Creating STAR index for: {fasta}")
+
+        index_dir: Path = Path(self.tmp_dir) / "index"
+
+        # solves the macOS issue with STAR
+        index_dir.mkdir(parents=True, exist_ok=True)
+
+        cmd = [
+            "STAR",
+            "--runMode", "genomeGenerate",
+            "--genomeSAindexNbases", f"{str(index_string_size)}",
+            "--genomeChrBinNbits", f"{str(chr_bin_bits)}",
+            "--runThreadN", f"{str(self.threads_star)}",
+            "--genomeDir", f"{str(index_dir)}",
+            "--genomeFastaFiles", f"{str(fasta)}",
+        ]
+
+        result = sp.run(
+            cmd,
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            LOGGER.error(result.stderr)
+            raise StarProblem("Failed to create STAR index")
+
+        LOGGER.debug(f"STAR index created: {index_dir}")
+        return index_dir
+
+    def prepare_star_alignment_commands(
+            self,
+            index_dir: Path,
+    ) -> Dict[Path, List[str]]:
+        """Prepare STAR alignment commands.
+
+        Args:
+            index_dir: Path to directory containing STAR index.
+
+        Returns:
+            Dictionary of output paths and corresponding STAR commands.
+        """
+        LOGGER.debug("Preparing STAR commands...")
+
+        # helper function for compiling individual command
+        def build_star_command(
+                read_files: List[str],
+                out_dir: str,
+        ) -> List[str]:
+            """Compile an individual STAR alignment command.
+
+            Args:
+                read_files: List of read file paths.
+                out_dir: STAR output directory.
+
+            Returns:
+                STAR command list.
+            """
+            cmd_base: List[str] = [
+                "STAR",
+                "--alignIntronMax", "1",
+                "--alignEndsType", "Local",
+                "--runThreadN", f"{str(self.threads_star)}",
+                "--genomeDir", f"{str(index_dir)}",
+                "--outFilterMultimapNmax", "50",
+                "--outSAMunmapped", "Within", "KeepPairs",
+            ]
+            cmd: List[str] = cmd_base[:]
+            cmd.append("--readFilesIn")
+            cmd.extend(read_files)
+            cmd.append("--outFileNamePrefix")
+            cmd.append(out_dir)
+
+            # solves the macOS issue with STAR
+            Path(out_dir).mkdir(parents=True, exist_ok=True)
+
+            return cmd
+
+        out_dir_base: Path = Path(self.tmp_dir) / "alignments"
+        commands: Dict = {}
+
+        # create command for paired-ended libraries
+        if (
+                self.library_type.relationship
+                == StatesTypeRelationship.split_mates
+        ):
+            out_dir = out_dir_base / "paired"
+            commands[out_dir] = build_star_command(
+                read_files=[str(path) for path in self.paths],
+                out_dir=f"{str(out_dir)}/",
+            )
+        # create commands for single-ended libraries
+        else:
+            out_dir = out_dir_base / "file_1"
+            commands[out_dir] = build_star_command(
+                read_files=[str(self.paths[0])],
+                out_dir=f"{str(out_dir)}/",
+            )
+            # run two commands in case there is a second file provided that is
+            # not a mate of the first one
+            out_dir = out_dir_base / "file_2"
+            if self.paths[1] is not None:
+                commands[out_dir] = build_star_command(
+                    read_files=[str(self.paths[1])],
+                    out_dir=f"{str(out_dir)}/",
+                )
+
+        return commands
+
+    @staticmethod
+    def generate_star_alignments(commands: Dict[Path, List[str]]) -> None:
+        """Align reads to index with STAR.
+
+        Args:
+            commands: Dictionary of output paths and corresponding STAR
+                commands.
+
+        Raises:
+            StarProblem: Generating alignments failed.
+        """
+        LOGGER.debug("Aligning reads with STAR...")
+
+        # execute commands
+        for out_dir, cmd in commands.items():
+            try:
+                result = sp.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+                if result.returncode != 0:
+                    LOGGER.error(result.stderr)
+                    raise StarProblem(
+                        "Failed to generate STAR alignments for command: "
+                        f"{cmd}"
+                    )
+            except sp.CalledProcessError as exc:
+                raise StarProblem(
+                    f"Failed to generate STAR alignments for command: {cmd}"
+                ) from exc
+            LOGGER.debug(f"Written STAR output to directory: {out_dir}")
diff --git a/setup.py b/setup.py
index 6b037f1e..ed2144a3 100644
--- a/setup.py
+++ b/setup.py
@@ -26,9 +26,9 @@
         "Intended Audience :: Science/Research",
         "License :: OSI Approved :: Apache Software License",
         "Natural Language :: English",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
         "Topic :: Scientific/Engineering :: Bio-Informatics",
         "Topic :: Utilities",
     ],
diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py
index c738d8fd..dd52dfaa 100644
--- a/tests/test_get_library_type.py
+++ b/tests/test_get_library_type.py
@@ -38,6 +38,7 @@
     SEQ_ID_MATE_2,
     SEQ_ID_SINGLE,
     CONFIG,
+    MAPPING,
 )
 
 
@@ -47,20 +48,23 @@ class TestGetLibType:
     def test_init_required(self):
         """Create instance with required parameters."""
         CONFIG.args.path_2_processed = None
-        test_instance = GetLibType(config=CONFIG)
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         assert test_instance.path_1 == FILE_MATE_1
 
     def test_init_all(self):
         """Create instance with all available parameters."""
         CONFIG.args.path_2_processed = FILE_MATE_2
-        test_instance = GetLibType(config=CONFIG)
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         assert test_instance.path_1 == FILE_MATE_1
         assert test_instance.path_2 == FILE_MATE_2
 
     def test_evaluate_one_file(self):
         """Get library type for a single file."""
         CONFIG.args.path_2_processed = None
-        test_instance = GetLibType(config=CONFIG)
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         test_instance.evaluate()
         assert test_instance.results == ResultsType(
             file_1=StatesType.first_mate,
@@ -71,7 +75,8 @@ def test_evaluate_one_file(self):
     def test_evaluate_two_files(self):
         """Get library type for two files."""
         CONFIG.args.path_2_processed = FILE_MATE_2
-        test_instance = GetLibType(config=CONFIG)
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         test_instance.evaluate()
         assert test_instance.results == ResultsType(
             file_1=StatesType.first_mate,
@@ -83,7 +88,8 @@ def test_evaluate_mate_relationship_split_mates(self):
         """Test mate relationship evaluation logic with input files being
         mates of a paired-end library.
         """
-        test_instance = GetLibType(config=CONFIG)
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         test_instance.results.file_1 = StatesType.first_mate
         test_instance.results.file_2 = StatesType.second_mate
         test_instance._evaluate_mate_relationship(
@@ -105,43 +111,38 @@ def test_evaluate_mate_relationship_split_mates(self):
             StatesTypeRelationship.split_mates
         )
 
-    def test_evaluate_mate_relationship_not_mates(self):
+    def test_evaluate_mate_relationship_not_mates(self, tmpdir):
         """Test mate relationship evaluation logic with input files that are
         not mates from a paired-end library.
         """
         CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        test_instance = GetLibType(config=CONFIG)
+        CONFIG.args.tmp_dir = tmpdir
+        MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2)
+        MAPPING.transcripts_file = FILE_TRANSCRIPTS
+        MAPPING.tmp_dir = tmpdir
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         test_instance.results.file_1 = StatesType.first_mate
         test_instance.results.file_2 = StatesType.second_mate
-        test_instance._evaluate_mate_relationship(
-            ids_1=["A", "B", "C"],
-            ids_2=["C", "B", "A"],
-        )
-        assert (
-            test_instance.results.relationship ==
-            StatesTypeRelationship.not_mates
-        )
-        test_instance.results.file_1 = StatesType.single
-        test_instance.results.file_2 = StatesType.first_mate
-        test_instance._evaluate_mate_relationship(
-            ids_1=["A", "B", "C"],
-            ids_2=["A", "B", "C"],
-        )
+        test_instance.evaluate()
         assert (
             test_instance.results.relationship ==
             StatesTypeRelationship.not_mates
         )
 
-    def test_evaluate_split_mates_not_matching_ids(self):
+    def test_evaluate_split_mates_not_matching_ids(self, tmpdir):
         """Test mate relationship evaluation logic with input files that are
         not mates from a paired-end library.
         """
         CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
         CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2
-        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        test_instance = GetLibType(config=CONFIG)
+        CONFIG.args.tmp_dir = tmpdir
+        MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2)
+        MAPPING.tmp_dir = tmpdir
+        test_instance = GetLibType(config=CONFIG,
+                                   mapping=MAPPING)
         test_instance.evaluate()
         assert (
                 test_instance.results.relationship ==
diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py
index 75a2a25d..10894334 100644
--- a/tests/test_get_read_orientation.py
+++ b/tests/test_get_read_orientation.py
@@ -4,8 +4,6 @@
 
 from htsinfer.exceptions import (
     FileProblem,
-    SamFileProblem,
-    StarProblem,
 )
 from htsinfer.get_read_orientation import GetOrientation
 from htsinfer.models import (
@@ -18,11 +16,8 @@
     StatesTypeRelationship,
 )
 from tests.utils import (
-    FILE_2000_RECORDS,
-    FILE_DUMMY,
     FILE_EMPTY_ALIGNED_SAM,
     FILE_BAD_ALIGNED_SAM,
-    FILE_INVALID_TRANSCRIPTS,
     FILE_MATE_1,
     FILE_MATE_2,
     FILE_ORIENTATION_ISF_1,
@@ -38,11 +33,9 @@
     FILE_UNMAPPED_PAIRED_1,
     FILE_UNMAPPED_PAIRED_2,
     FILE_UNMAPPED_SINGLE,
+    FILE_IDS_NOT_MATCH_2,
     CONFIG,
-    RaiseError,
-    SubprocessError,
-    SOURCE_HUMAN,
-    SOURCE_FRUIT_FLY,
+    MAPPING,
 )
 
 
@@ -54,7 +47,10 @@ def test_init_required(self):
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = None
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        test_instance = GetOrientation(config=CONFIG)
+        CONFIG.results.library_type = ResultsType()
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         assert test_instance.paths[0] == FILE_MATE_1
         assert test_instance.library_type == ResultsType()
         assert test_instance.transcripts_file == FILE_TRANSCRIPTS
@@ -64,7 +60,8 @@ def test_init_required_paired(self):
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        test_instance = GetOrientation(config=CONFIG)
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         assert test_instance.paths[0] == FILE_MATE_1
         assert test_instance.paths[1] == FILE_MATE_2
         assert test_instance.library_type == ResultsType()
@@ -76,14 +73,14 @@ def test_init_all(self, tmpdir):
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         assert test_instance.paths[0] == FILE_MATE_1
         assert test_instance.paths[1] == FILE_MATE_2
         assert test_instance.library_type == ResultsType()
         assert test_instance.library_source == ResultsSource()
         assert test_instance.transcripts_file == FILE_TRANSCRIPTS
         assert test_instance.tmp_dir == tmpdir
-        assert test_instance.threads_star == 1
         assert test_instance.min_mapped_reads == 18
         assert test_instance.min_fraction == 0.75
 
@@ -94,7 +91,9 @@ def test_evaluate_single_unmapped(self, tmpdir):
         CONFIG.args.path_1_processed = FILE_UNMAPPED_SINGLE
         CONFIG.args.path_2_processed = None
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.not_available,
@@ -110,7 +109,9 @@ def test_evaluate_single_sf(self, tmpdir):
                 file_2=Source(),
             )
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.stranded_forward,
@@ -122,7 +123,9 @@ def test_evaluate_single_sr(self, tmpdir):
         """Get read orientation for a single-end stranded reverse library."""
         CONFIG.args.path_1_processed = FILE_ORIENTATION_SR
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.stranded_reverse,
@@ -134,7 +137,9 @@ def test_evaluate_single_u(self, tmpdir):
         """Get read orientation for a single-end unstranded library."""
         CONFIG.args.path_1_processed = FILE_ORIENTATION_U
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.unstranded,
@@ -153,7 +158,9 @@ def test_evaluate_paired_unmapped(self, tmpdir):
         CONFIG.results.library_type = ResultsType(
             relationship=StatesTypeRelationship.split_mates,
         )
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.not_available,
@@ -174,7 +181,9 @@ def test_evaluate_paired_isf(self, tmpdir):
         )
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.stranded_forward,
@@ -187,7 +196,9 @@ def test_evaluate_paired_isr(self, tmpdir):
         CONFIG.args.path_1_processed = FILE_ORIENTATION_ISR_1
         CONFIG.args.path_2_processed = FILE_ORIENTATION_ISR_2
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.stranded_reverse,
@@ -200,7 +211,9 @@ def test_evaluate_paired_iu(self, tmpdir):
         CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1
         CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.unstranded,
@@ -208,183 +221,21 @@ def test_evaluate_paired_iu(self, tmpdir):
             relationship=StatesOrientationRelationship.inward_unstranded,
         )
 
-    def test_subset_transcripts_by_organism(self, tmpdir):
-        """Get filtered orgainsm transcripts for different organisms."""
-        CONFIG.results.library_type = ResultsType(
-                relationship=StatesTypeRelationship.split_mates,
-            )
-        CONFIG.results.library_source = ResultsSource(
-                file_1=SOURCE_HUMAN,
-                file_2=SOURCE_FRUIT_FLY
-            )
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        results = test_instance.subset_transcripts_by_organism()
-        filtered_organisms_transcripts = \
-            tmpdir / f"{CONFIG.results.library_source}.fasta"
-        assert results == filtered_organisms_transcripts
-
-    def test_subset_transcripts_by_organism_file_problem(self, tmpdir):
-        """Pass dummy file as transcripts.fasta file to simulate
-        file problem."""
-        CONFIG.args.path_2_processed = None
-        CONFIG.results.library_type = ResultsType()
-        CONFIG.results.library_source = ResultsSource()
-        CONFIG.args.t_file_processed = FILE_DUMMY
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-
-        with pytest.raises(FileProblem):
-            test_instance.subset_transcripts_by_organism()
-
-    def test_subset_transcripts_by_organism_invalid_fasta(self, tmpdir):
-        """Pass invalid transcripts.fasta file to simulate index error."""
-        CONFIG.results.library_source = ResultsSource(
-                file_1=SOURCE_HUMAN,
-                file_2=SOURCE_FRUIT_FLY
-        )
-        CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        results = test_instance.subset_transcripts_by_organism()
-        filtered_organisms_transcripts = \
-            tmpdir / f"{CONFIG.results.library_source}.fasta"
-        assert results == filtered_organisms_transcripts
-
-    def test_get_fasta_size(self, tmpdir):
-        """Get nucleotide statistics for filtererd transcripts
-        with different organisms."""
-        CONFIG.results.library_source = ResultsSource(
-                file_1=SOURCE_HUMAN,
-                file_2=SOURCE_FRUIT_FLY,
-            )
-        CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2
-        CONFIG.results.library_type = ResultsType(
-            relationship=StatesTypeRelationship.split_mates,
-        )
-        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        filtered_organisms_transcripts = \
-            test_instance.subset_transcripts_by_organism()
-        results = test_instance.get_fasta_size(filtered_organisms_transcripts)
-        assert results == 249986
-
-    def test_get_fasta_size_file_problem(self, tmpdir):
-        """Pass dummy file as filtered_organisms_transcripts
-        to simulate file problem."""
-        CONFIG.args.path_2_processed = None
-        CONFIG.results.library_type = ResultsType()
-        CONFIG.results.library_source = ResultsSource()
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        with pytest.raises(FileProblem):
-            test_instance.get_fasta_size(FILE_DUMMY)
-
-    def test_get_star_index_string_size(self, tmpdir):
-        """Get length of STAR SA pre-indexing string."""
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        results = test_instance.get_star_index_string_size(249986)
-        assert results == 7
-
-    def test_evaluate_star_index_problem(self, monkeypatch, tmpdir):
-        """Force raising exception to stimulate a star problem."""
-        CONFIG.results.library_source = ResultsSource(
-                file_1=SOURCE_HUMAN,
-                file_2=SOURCE_FRUIT_FLY,
-            )
-        CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2
-        CONFIG.results.library_type = ResultsType(
-                relationship=StatesTypeRelationship.split_mates,
-            )
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        monkeypatch.setattr(
-            'htsinfer.get_read_orientation.GetOrientation.create_star_index',
-            lambda *args, **kwargs: StarProblem
-        )
-        with pytest.raises(StarProblem):
-            test_instance.evaluate()
-
-    def test_prepare_star_alignment_commands(self, tmpdir):
-        """Get star alignment command."""
-        CONFIG.args.path_1_processed = FILE_2000_RECORDS
-        CONFIG.args.path_2_processed = None
-        CONFIG.results.library_type = ResultsType(
-            relationship=StatesTypeRelationship.not_mates,
-        )
-        CONFIG.results.library_source = ResultsSource(
-                file_1=SOURCE_HUMAN,
-                file_2=Source(),
-            )
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        index_dir = tmpdir / 'index'
-        file1_alignment_path = tmpdir / 'alignments/file_1'
-        cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \
-            + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \
-            + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \
-            + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \
-            + str(file1_alignment_path) + "/"
-        results = test_instance.prepare_star_alignment_commands(
-            index_dir=index_dir
-            )
-        assert ' '.join(list(results.values())[0]) == cmd
-
-    def test_generate_star_alignments_problem(self, monkeypatch, tmpdir):
-        """Force raising exception to simulate problem."""
-        CONFIG.results.library_source = ResultsSource(
-                file_1=SOURCE_HUMAN,
-                file_2=SOURCE_FRUIT_FLY,
-            )
-        CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1
-        CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2
-        CONFIG.results.library_type = ResultsType(
-            relationship=StatesTypeRelationship.not_mates,
-        )
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        sub_method_name = 'htsinfer.get_read_orientation.' + \
-            'GetOrientation.generate_star_alignments'
-        monkeypatch.setattr(
-            sub_method_name,
-            lambda *args, **kwargs: StarProblem,
-        )
-        with pytest.raises(FileProblem):
-            test_instance.evaluate()
-
-    def test_generate_star_alignments_dummy_cmd(self, tmpdir):
-        """Pass dummy cmd to force simulate star problem."""
-        CONFIG.args.path_2_processed = None
-        CONFIG.results.library_type = ResultsType()
-        CONFIG.results.library_source = ResultsSource()
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        index_dir = tmpdir / 'index'
-        file1_alignment_path = tmpdir / 'alignments/file_1'
-        dummy_cmd = [
-            'STAR', '--alignIntrnMax', '1',
-            '--alignEndsType', 'Local', '--runThreadN', '1',
-            "--genomeDir", f"{str(index_dir)}",
-            ]
-        cmds = {file1_alignment_path: dummy_cmd}
-        with pytest.raises(StarProblem):
-            test_instance.generate_star_alignments(cmds)
-
-    def test_process_single_dummy_sam_file(self, tmpdir):
+    def test_process_single_dummy_sam_file_file_problem(self, tmpdir):
         """Pass dummy aligned.out.sam file to simulate file
         problem."""
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        with pytest.raises(SamFileProblem):
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
+        with pytest.raises(FileProblem):
             test_instance.process_single(FILE_EMPTY_ALIGNED_SAM)
 
     def test_process_paired_dummy_sam_file(self, tmpdir):
         """Pass dummy aligned.out.sam file to simulate file
         problem."""
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         with pytest.raises(FileProblem):
             test_instance.process_paired(FILE_EMPTY_ALIGNED_SAM)
 
@@ -392,69 +243,31 @@ def test_process_paired_wrong_sam_file(self, tmpdir):
         """Pass bad_aligned.out.sam file to ensure correct
         paired file behaviour."""
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         assert test_instance.process_paired(FILE_BAD_ALIGNED_SAM) \
                == ResultsOrientation()
 
-    def test_create_star_index_star_problem(self, tmpdir):
-        """Pass invalid transcripts path to simulate star problem."""
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        transcripts_path = tmpdir / 'invalid'
-        with pytest.raises(StarProblem):
-            test_instance.create_star_index(transcripts_path)
-
     def test_evaluate_paired_not_mates_unmapped(self, tmpdir):
         """Get read orientation for a paired-end library with no mappable
         reads.
         """
         CONFIG.args.path_1_processed = FILE_UNMAPPED_PAIRED_1
-        CONFIG.args.path_2_processed = FILE_UNMAPPED_PAIRED_2
+        CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2
         CONFIG.results.library_type = ResultsType(
-                relationship=StatesTypeRelationship.not_mates,
+                relationship=StatesTypeRelationship.not_available,
             )
+        CONFIG.results.library_source = ResultsSource(
+            file_1=Source(),
+            file_2=Source(),
+        )
         CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
+        MAPPING.mapped = False
+        test_instance = GetOrientation(config=CONFIG,
+                                       mapping=MAPPING)
         results = test_instance.evaluate()
         assert results == ResultsOrientation(
             file_1=StatesOrientation.not_available,
-            file_2=StatesOrientation.not_available,
+            file_2=StatesOrientation.stranded_reverse,
             relationship=StatesOrientationRelationship.not_available,
         )
-
-    def test_subset_transcripts_by_organism_cannot_write_file(
-        self, monkeypatch, tmpdir
-    ):
-        """Force raising of ``OSError`` to simulate file problem."""
-        CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1
-        CONFIG.args.path_2_processed = None
-        CONFIG.results.library_source = ResultsSource()
-        CONFIG.results.library_type = ResultsType()
-        CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        monkeypatch.setattr(
-            'Bio.SeqIO.write',
-            RaiseError(exc=OSError),
-        )
-        with pytest.raises(FileProblem):
-            test_instance.subset_transcripts_by_organism()
-
-    def test_generate_star_alignments_star_problem(self, monkeypatch, tmpdir):
-        """Force raising of ``SubprocessError`` to simulate star probelm."""
-        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        CONFIG.args.tmp_dir = tmpdir
-        test_instance = GetOrientation(config=CONFIG)
-        file1_alignment_path = tmpdir / 'alignments/file_1'
-        dummy_cmd = [
-            'STAR', '--alignIntrnMax', '1',
-            '--alignEndsType', 'Local', '--runThreadN', '1',
-            "--genomeDir",
-            ]
-        cmds = {file1_alignment_path: dummy_cmd}
-        monkeypatch.setattr(
-            'subprocess.run',
-            lambda *args, **kwargs: SubprocessError(),
-        )
-        with pytest.raises(StarProblem):
-            test_instance.generate_star_alignments(cmds)
diff --git a/tests/test_mapping.py b/tests/test_mapping.py
new file mode 100644
index 00000000..33798bed
--- /dev/null
+++ b/tests/test_mapping.py
@@ -0,0 +1,259 @@
+"""Unit tests for module ``mapping.py``."""
+
+import pytest
+
+from htsinfer.exceptions import (
+    FileProblem,
+    StarProblem,
+)
+from htsinfer.mapping import Mapping
+from htsinfer.models import (
+    ResultsSource,
+    ResultsType,
+    Source,
+    StatesTypeRelationship,
+)
+from tests.utils import (
+    FILE_2000_RECORDS,
+    FILE_DUMMY,
+    FILE_INVALID_TRANSCRIPTS,
+    FILE_MATE_1,
+    FILE_MATE_2,
+    FILE_ORIENTATION_IU_1,
+    FILE_ORIENTATION_IU_2,
+    FILE_TRANSCRIPTS,
+    CONFIG,
+    MAPPING,
+    RaiseError,
+    SubprocessError,
+    SOURCE_HUMAN,
+    SOURCE_FRUIT_FLY,
+)
+
+
+class TestMapping:
+    """Test ``Mapping`` class."""
+
+    def test_init_required(self):
+        """Create instance with required parameters."""
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = None
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.results.library_type = ResultsType()
+        test_instance = Mapping(config=CONFIG)
+        assert test_instance.paths[0] == FILE_MATE_1
+        assert test_instance.library_type == ResultsType()
+        assert test_instance.transcripts_file == FILE_TRANSCRIPTS
+
+    def test_init_required_paired(self):
+        """Create instance with required parameters for paired-end library."""
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        test_instance = Mapping(config=CONFIG)
+        assert test_instance.paths[0] == FILE_MATE_1
+        assert test_instance.paths[1] == FILE_MATE_2
+        assert test_instance.library_type == ResultsType()
+        assert test_instance.transcripts_file == FILE_TRANSCRIPTS
+
+    def test_init_all(self, tmpdir):
+        """Create instance with all available parameters."""
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        assert test_instance.paths[0] == FILE_MATE_1
+        assert test_instance.paths[1] == FILE_MATE_2
+        assert test_instance.library_type == ResultsType()
+        assert test_instance.library_source == ResultsSource()
+        assert test_instance.transcripts_file == FILE_TRANSCRIPTS
+        assert test_instance.tmp_dir == tmpdir
+
+    def test_subset_transcripts_by_organism(self, tmpdir):
+        """Get filtered organism transcripts for different organisms."""
+        CONFIG.results.library_type = ResultsType(
+                relationship=StatesTypeRelationship.split_mates,
+            )
+        CONFIG.results.library_source = ResultsSource(
+                file_1=SOURCE_HUMAN,
+                file_2=SOURCE_FRUIT_FLY
+            )
+        CONFIG.args.tmp_dir = tmpdir
+        MAPPING.mapped = False
+        test_instance = Mapping(config=CONFIG)
+        results = test_instance.subset_transcripts_by_organism()
+        filtered_organisms_transcripts = \
+            tmpdir / "transcripts_subset.fasta"
+        assert results == filtered_organisms_transcripts
+
+    def test_subset_transcripts_by_organism_file_problem(self, tmpdir):
+        """Pass dummy file as transcripts.fasta file to simulate
+        file problem."""
+        CONFIG.args.path_2_processed = None
+        CONFIG.results.library_type = ResultsType()
+        CONFIG.results.library_source = ResultsSource()
+        CONFIG.args.t_file_processed = FILE_DUMMY
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+
+        with pytest.raises(FileProblem):
+            test_instance.subset_transcripts_by_organism()
+
+    def test_subset_transcripts_by_organism_invalid_fasta(self, tmpdir):
+        """Pass invalid transcripts.fasta file to simulate index error."""
+        CONFIG.results.library_source = ResultsSource(
+                file_1=SOURCE_HUMAN,
+                file_2=SOURCE_FRUIT_FLY
+        )
+        CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        results = test_instance.subset_transcripts_by_organism()
+        filtered_organisms_transcripts = \
+            tmpdir / "transcripts_subset.fasta"
+        assert results == filtered_organisms_transcripts
+
+    def test_get_fasta_size(self, tmpdir):
+        """Get nucleotide statistics for filtered transcripts
+        with different organisms."""
+        CONFIG.results.library_source = ResultsSource(
+                file_1=SOURCE_HUMAN,
+                file_2=SOURCE_FRUIT_FLY,
+            )
+        CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2
+        CONFIG.results.library_type = ResultsType(
+            relationship=StatesTypeRelationship.split_mates,
+        )
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        filtered_organisms_transcripts = \
+            test_instance.subset_transcripts_by_organism()
+        results = test_instance.get_fasta_size(filtered_organisms_transcripts)
+        assert results == 249986
+
+    def test_get_fasta_size_file_problem(self, tmpdir):
+        """Pass dummy file as filtered_organisms_transcripts
+        to simulate file problem."""
+        CONFIG.args.path_2_processed = None
+        CONFIG.results.library_type = ResultsType()
+        CONFIG.results.library_source = ResultsSource()
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        with pytest.raises(FileProblem):
+            test_instance.get_fasta_size(FILE_DUMMY)
+
+    def test_get_star_index_string_size(self, tmpdir):
+        """Get length of STAR SA pre-indexing string."""
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        results = test_instance.get_star_index_string_size(249986)
+        assert results == 7
+
+    def test_evaluate_star_index_problem(self, monkeypatch, tmpdir):
+        """Force raising exception to simulate a star problem."""
+        CONFIG.results.library_source = ResultsSource(
+                file_1=SOURCE_HUMAN,
+                file_2=SOURCE_FRUIT_FLY,
+            )
+        CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2
+        CONFIG.results.library_type = ResultsType(
+                relationship=StatesTypeRelationship.split_mates,
+            )
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        monkeypatch.setattr(
+            'htsinfer.mapping.Mapping.create_star_index',
+            lambda *args, **kwargs: StarProblem
+        )
+        with pytest.raises(StarProblem):
+            test_instance.evaluate()
+
+    def test_prepare_star_alignment_commands(self, tmpdir):
+        """Get star alignment command."""
+        CONFIG.args.path_1_processed = FILE_2000_RECORDS
+        CONFIG.args.path_2_processed = None
+        CONFIG.results.library_type = ResultsType(
+            relationship=StatesTypeRelationship.not_mates,
+        )
+        CONFIG.results.library_source = ResultsSource(
+                file_1=SOURCE_HUMAN,
+                file_2=Source(),
+            )
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        index_dir = tmpdir / 'index'
+        file1_alignment_path = tmpdir / 'alignments/file_1'
+        cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \
+            + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \
+            + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \
+            + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \
+            + str(file1_alignment_path) + "/"
+        results = test_instance.prepare_star_alignment_commands(
+            index_dir=index_dir
+            )
+        assert ' '.join(list(results.values())[0]) == cmd
+
+    def test_generate_star_alignments_dummy_cmd(self, tmpdir):
+        """Pass dummy cmd to force simulate star problem."""
+        CONFIG.args.path_2_processed = None
+        CONFIG.results.library_type = ResultsType()
+        CONFIG.results.library_source = ResultsSource()
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        index_dir = tmpdir / 'index'
+        file1_alignment_path = tmpdir / 'alignments/file_1'
+        dummy_cmd = [
+            'STAR', '--alignIntrnMax', '1',
+            '--alignEndsType', 'Local', '--runThreadN', '1',
+            "--genomeDir", f"{str(index_dir)}",
+            ]
+        cmds = {file1_alignment_path: dummy_cmd}
+        with pytest.raises(StarProblem):
+            test_instance.generate_star_alignments(cmds)
+
+    def test_create_star_index_star_problem(self, tmpdir):
+        """Pass invalid transcripts path to simulate star problem."""
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        transcripts_path = tmpdir / 'invalid'
+        with pytest.raises(StarProblem):
+            test_instance.create_star_index(transcripts_path)
+
+    def test_subset_transcripts_by_organism_cannot_write_file(
+        self, monkeypatch, tmpdir
+    ):
+        """Force raising of ``OSError`` to simulate file problem."""
+        CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1
+        CONFIG.args.path_2_processed = None
+        CONFIG.results.library_source = ResultsSource()
+        CONFIG.results.library_type = ResultsType()
+        CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        monkeypatch.setattr(
+            'Bio.SeqIO.write',
+            RaiseError(exc=OSError),
+        )
+        with pytest.raises(FileProblem):
+            test_instance.subset_transcripts_by_organism()
+
+    def test_generate_star_alignments_star_problem(self, monkeypatch, tmpdir):
+        """Force raising of ``SubprocessError`` to simulate star problem."""
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        test_instance = Mapping(config=CONFIG)
+        file1_alignment_path = tmpdir / 'alignments/file_1'
+        dummy_cmd = [
+            'STAR', '--alignIntrnMax', '1',
+            '--alignEndsType', 'Local', '--runThreadN', '1',
+            "--genomeDir",
+            ]
+        cmds = {file1_alignment_path: dummy_cmd}
+        monkeypatch.setattr(
+            'subprocess.run',
+            lambda *args, **kwargs: SubprocessError(),
+        )
+        with pytest.raises(StarProblem):
+            test_instance.generate_star_alignments(cmds)
diff --git a/tests/utils.py b/tests/utils.py
index 2c924d7a..dfa8b98c 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,6 +4,7 @@
 from typing import Type
 
 from htsinfer.models import (Source, Config, Args, Results)
+from htsinfer.mapping import Mapping
 
 # test files
 PACKAGE_DIR = Path(__file__).resolve().parents[1] / "htsinfer"
@@ -87,6 +88,8 @@
     results=Results(),
 )
 
+MAPPING = Mapping(config=CONFIG)
+
 
 # helper classes
 class SubprocessError:

From 022d37d298c24d3172b28c42552cdd6990b60e48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 31 Oct 2023 14:51:22 +0100
Subject: [PATCH 03/17] fix typo, update pylint config

---
 htsinfer/get_read_layout.py | 2 +-
 pylint.cfg                  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/htsinfer/get_read_layout.py b/htsinfer/get_read_layout.py
index 24529d3f..afd4c898 100644
--- a/htsinfer/get_read_layout.py
+++ b/htsinfer/get_read_layout.py
@@ -221,7 +221,7 @@ def evaluate(self) -> None:
         try:
             with open(self.path, encoding="utf-8") as _f:  # type: ignore
 
-                LOGGER.debug("Procecssing Reads")
+                LOGGER.debug("Processing Reads")
                 try:
                     for record in FastqGeneralIterator(source=_f):
                         read = record[1]
diff --git a/pylint.cfg b/pylint.cfg
index 96462c5d..ef923ddc 100644
--- a/pylint.cfg
+++ b/pylint.cfg
@@ -1,4 +1,4 @@
 [MESSAGES CONTROL]
-disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
-extension-pkg-white-list=pysam,ahocorasick
+disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
+extension-pkg-whitelist=pysam,ahocorasick
 ignored-classes=pysam

From 2bb2451f2b2f8c500a72e1f699e0154b24589050 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 31 Oct 2023 15:06:12 +0100
Subject: [PATCH 04/17] feat: add org_id param #108

---
 htsinfer/cli.py                | 11 ------
 htsinfer/get_library_source.py | 64 ++++++++++++++++++++++++++++++----
 htsinfer/models.py             |  4 +--
 3 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/htsinfer/cli.py b/htsinfer/cli.py
index 256aeaa8..3dee2364 100644
--- a/htsinfer/cli.py
+++ b/htsinfer/cli.py
@@ -273,17 +273,6 @@ def __call__(
             "not be inferred by the application"
         )
     )
-    parser.add_argument(
-        '--org-name',
-        dest="org_name",
-        metavar="STR",
-        type=str,
-        default=None,
-        help=(
-            "source organism of the sequencing library; if provided, will not "
-            "not be inferred by the application"
-        )
-    )
     parser.add_argument(
         "--verbosity",
         choices=[e.name for e in LogLevels],
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index a07ecbc4..6c39d47b 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -2,10 +2,11 @@
 
 import logging
 from pathlib import Path
-from typing import Optional
 import subprocess as sp
 import tempfile
 
+from typing import Optional
+from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
 from pandas import DataFrame  # type: ignore
 
@@ -51,6 +52,7 @@ class GetLibSource:
         min_freq_ratio: Minimum frequency ratio between the first and second
             most frequent source in order for the former to be considered the
             library's source.
+        org_id: Taxonomy ID of the organism.
     """
     def __init__(  # pylint: disable=E1101
         self,
@@ -64,7 +66,6 @@ def __init__(  # pylint: disable=E1101
         self.tmp_dir = config.args.tmp_dir
         self.min_match_pct = config.args.lib_source_min_match_pct
         self.min_freq_ratio = config.args.lib_source_min_freq_ratio
-        self.org_name = config.args.org_name
         self.org_id = config.args.org_id
 
     def evaluate(self) -> ResultsSource:
@@ -75,9 +76,12 @@ def evaluate(self) -> ResultsSource:
         """
         source = ResultsSource()
         # Check if library_source is provided, otherwise infer it
-        if self.org_name is not None:
-            source.file_1.short_name = self.org_name
+        if self.org_id is not None:
             source.file_1.taxon_id = self.org_id
+            source.file_1.short_name = self.get_organism_name(
+                self.org_id,
+                self.transcripts_file
+            )
         else:
             # Infer library source here and set it to source.library_source
             index = self.create_kallisto_index()
@@ -89,10 +93,11 @@ def evaluate(self) -> ResultsSource:
             source.file_1.taxon_id = library_source.taxon_id
 
         if self.paths[1] is not None:
-            # Check if library_source is provided for file_2, otherwise infer it
-            if self.org_name is not None:
-                source.file_2.short_name = self.org_name
+            # Check if library_source is provided for file_2,
+            # otherwise infer it
+            if self.org_id is not None:
                 source.file_2.taxon_id = self.org_id
+                source.file_2.short_name = source.file_1.short_name
             else:
                 library_source = self.get_source(
                     fastq=self.paths[1],
@@ -301,3 +306,48 @@ def get_source_expression(
 
         # return as dictionary
         return dat_agg.sort_values(["tpm"], ascending=False)
+
+    @staticmethod
+    def get_organism_name(
+        taxon_id: int,
+        transcripts_file: Path,
+    ) -> Optional[str]:
+        """Return name of the organism, based on tax ID.
+
+        Args:
+            taxon_id: Taxonomy ID of a given organism (int).
+            transcripts_file: Path to fasta file containing transcripts.
+
+        Returns:
+            Short name of the organism belonging to the given tax ID.
+
+        Raises:
+            FileProblem: Could not process input FASTA file.
+        """
+        org_dict = {}
+        # Construct dictionary of organism ID's and names
+        try:
+            for record in SeqIO.parse(
+                    handle=transcripts_file,
+                    format='fasta',
+            ):
+                org_id = int(record.description.split("|")[4])
+                org_name = record.description.split("|")[3]
+
+                if org_id not in org_dict:
+                    org_dict[org_id] = org_name
+                else:
+                    org_dict[org_id] = org_name
+
+        except OSError as exc:
+            raise FileProblem(
+                f"Could not process file '{transcripts_file}'"
+            ) from exc
+
+        if taxon_id in org_dict:
+            return org_dict[taxon_id]
+        LOGGER.warning(
+            f"Taxon ID '{taxon_id}' not found in organism dictionary, "
+            "using default 'None' for organism."
+        )
+        return None
diff --git a/htsinfer/models.py b/htsinfer/models.py
index 391d856b..a84ae35c 100644
--- a/htsinfer/models.py
+++ b/htsinfer/models.py
@@ -6,7 +6,7 @@
 )
 import logging
 import re
-from typing import Optional, Union
+from typing import Optional
 from pathlib import Path
 import tempfile
 
@@ -356,7 +356,6 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
-        org_name: Organism name.
         org_id: Organism ID.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
@@ -431,7 +430,6 @@ class Args(BaseModel):
         CleanupRegimes.DEFAULT
     records: int = 0
     threads: int = 1
-    org_name: Optional[str] = None
     org_id: Optional[int] = None
     transcripts_file: Path = Path()
     read_layout_adapter_file: Path = Path()

From 6656ff3c1bcffefe5a8a9b49b16c190a7d4e3b19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 31 Oct 2023 16:40:37 +0100
Subject: [PATCH 05/17] refactor: get_library_source.py #108

---
 htsinfer/get_library_source.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 6c39d47b..d42533a8 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -82,6 +82,9 @@ def evaluate(self) -> ResultsSource:
                 self.org_id,
                 self.transcripts_file
             )
+            if self.paths[1] is not None:
+                source.file_2.taxon_id = self.org_id
+                source.file_2.short_name = source.file_1.short_name
         else:
             # Infer library source here and set it to source.library_source
             index = self.create_kallisto_index()
@@ -92,13 +95,7 @@ def evaluate(self) -> ResultsSource:
             source.file_1.short_name = library_source.short_name
             source.file_1.taxon_id = library_source.taxon_id
 
-        if self.paths[1] is not None:
-            # Check if library_source is provided for file_2,
-            # otherwise infer it
-            if self.org_id is not None:
-                source.file_2.taxon_id = self.org_id
-                source.file_2.short_name = source.file_1.short_name
-            else:
+            if self.paths[1] is not None:
                 library_source = self.get_source(
                     fastq=self.paths[1],
                     index=index,

From 6b6bc589c9de13c1dd92942c8f0095e178a3bc4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Thu, 9 Nov 2023 13:39:18 +0100
Subject: [PATCH 06/17] test: add org param tests #108

---
 tests/test_get_library_source.py | 79 ++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py
index 5b27e8a6..e8e01498 100644
--- a/tests/test_get_library_source.py
+++ b/tests/test_get_library_source.py
@@ -264,6 +264,52 @@ def test_evaluate_min_freq_ratio(self, tmpdir):
             file_2=Source()
         )
 
+    def test_evaluate_org_id_not_none(self):
+        """Test when self.org_id is not None."""
+        CONFIG.args.org_id = 7227  # An example taxon ID
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        test_instance = GetLibSource(config=CONFIG)
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == 7227
+        assert result.file_1.short_name == "dmelanogaster"
+
+    def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
+        """Test when self.org_id is None and self.paths[1] is not None."""
+        CONFIG.args.org_id = None
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the get_source method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_source',
+            lambda *args, **kwargs: SOURCE_HUMAN,
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_2.taxon_id == SOURCE_HUMAN.taxon_id
+        assert result.file_2.short_name == SOURCE_HUMAN.short_name
+
+    def test_evaluate_org_id_not_none_with_path_2(self, tmpdir):
+        """Test when self.org_id is not None and self.paths[1] is not None."""
+        CONFIG.args.org_id = 7227
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        result = test_instance.evaluate()
+
+        assert result.file_2.taxon_id == 7227
+        assert result.file_2.short_name == "dmelanogaster"
+
     def test_create_kallisto_index_problem(self, tmpdir):
         """Pass invalid file as transcripts.fasta file
         to simulate KallistoProblem."""
@@ -276,3 +322,36 @@ def test_create_kallisto_index_problem(self, tmpdir):
         test_instance = GetLibSource(config=CONFIG)
         with pytest.raises(KallistoProblem):
             test_instance.create_kallisto_index()
+
+    def test_get_organism_name_found(self):
+        """Test the function when the taxon_id
+        is found in the organism dictionary."""
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        test_instance = GetLibSource(config=CONFIG)
+        taxon_id = 7227
+        result = test_instance.get_organism_name(
+            taxon_id, CONFIG.args.t_file_processed
+        )
+        assert result == "dmelanogaster"
+
+    def test_get_organism_name_not_found(self):
+        """Test the function when the taxon_id
+        is not found in the organism dictionary."""
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        test_instance = GetLibSource(config=CONFIG)
+        taxon_id = 12345  # A tax ID that doesn't exist in transcripts
+        result = test_instance.get_organism_name(
+            taxon_id, CONFIG.args.t_file_processed
+        )
+        assert result is None
+
+    def test_get_organism_name_file_problem(self):
+        """Test the function when there's a
+        file problem while processing the FASTA file."""
+        CONFIG.args.t_file_processed = FILE_DUMMY
+        test_instance = GetLibSource(config=CONFIG)
+        taxon_id = 7227
+        with pytest.raises(FileProblem):
+            test_instance.get_organism_name(
+                taxon_id, CONFIG.args.t_file_processed
+            )

From 6d359358f9288c59c35c7c117455c929662b32e2 Mon Sep 17 00:00:00 2001
From: balajtimate <51365402+balajtimate@users.noreply.github.com>
Date: Thu, 9 Nov 2023 15:12:15 +0100
Subject: [PATCH 07/17] fix: update Pydantic version (#146)

* fix pydantic issues

* fix: update pydantic version in envs

* fix: pin sphinx-rtd-theme into env

* fix: update readthedocs config
---
 .readthedocs.yaml      | 2 +-
 environment-dev.yml    | 5 +++--
 environment.yml        | 2 +-
 htsinfer/htsinfer.py   | 3 +--
 tests/test_htsinfer.py | 6 +++---
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 14c41f2e..0922ec04 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -13,4 +13,4 @@ formats:
    - pdf
    - epub
 conda:
-  environment: environment.yml
+  environment: environment-dev.yml
diff --git a/environment-dev.yml b/environment-dev.yml
index 40f4f5f4..c97c611a 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -13,12 +13,13 @@ dependencies:
   - pandas >=1.3.5, <1.4.0
   - pip >=20.2.3
   - pyahocorasick >=1.4.0
-  - pydantic >=1.8.1, <2
+  - pydantic >=2, <3
   - pylint >=2.4.4
   - pysam >=0.16.0
   - pytest >=6.1.0
   - python >=3.8, <=3.10
+  - python-semantic-release >=8
+  - sphinx-rtd-theme
   - star >=2.7.6
   - pip:
-    - python-semantic-release>=7.15.0
     - -e .
diff --git a/environment.yml b/environment.yml
index 08e4fd28..273edbed 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,7 +10,7 @@ dependencies:
   - pandas >=1.3.5, <1.4.0
   - pip >=20.2.3
   - pyahocorasick >=1.4.0
-  - pydantic >=1.8.1, <2
+  - pydantic >=2, <3
   - pysam >=0.16.0
   - python >=3.8, <=3.10
   - star >=2.7.6
diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py
index 9f2f6602..4967a4d9 100755
--- a/htsinfer/htsinfer.py
+++ b/htsinfer/htsinfer.py
@@ -315,8 +315,7 @@ def clean_up(self):
     def print(self):
         """Print results to STDOUT."""
         sys.stdout.write(
-            self.config.results.json(
+            self.config.results.model_dump_json(
                 indent=3,
-                sort_keys=False,
             ) + linesep
         )
diff --git a/tests/test_htsinfer.py b/tests/test_htsinfer.py
index 0489023b..ea6e65f7 100644
--- a/tests/test_htsinfer.py
+++ b/tests/test_htsinfer.py
@@ -305,7 +305,7 @@ def test_clean_up_keep_none(self, tmpdir):
     def test_clean_up_keep_results(self, tmpdir):
         """Remove temporary data."""
         arguments = Args(path_1=FILE_MATE_1,
-                         out_dir=tmpdir,
+                         out_dir=tmpdir.strpath,
                          tmpdir=tmpdir,
                          )
         results = Results()
@@ -323,8 +323,8 @@ def test_clean_up_keep_results(self, tmpdir):
     def test_clean_up_keep_all(self, tmpdir):
         """Remove no data."""
         arguments = Args(path_1=FILE_MATE_1,
-                         out_dir=tmpdir,
-                         tmpdir=tmpdir,
+                         out_dir=tmpdir.strpath,
+                         tmpdir=tmpdir.strpath,
                          )
         results = Results()
         configs = Config(

From 59ace859b5682e024c716432a7ea824259b9a2d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Thu, 9 Nov 2023 16:09:22 +0100
Subject: [PATCH 08/17] update readme, gitignore

---
 .gitignore | 1 +
 README.md  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index a1f96fd6..c53c6f18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,3 +117,4 @@ tests/.DS_Store
 results_htsinfer
 .snakemake
 tests/cluster_tests/results_sra_downloads
+*.out
diff --git a/README.md b/README.md
index 6149a708..2e0dcaab 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ htsinfer [--output-directory PATH]
          [--library-type-mates-cutoff FLOAT]
          [--read-orientation-min-mapped-reads INT]
          [--read-orientation-min-fraction FLOAT]
+         [--org-id INT]
          [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
          [-h] [--version]
          PATH [PATH]

From 7359932014dceac99eecdb322c5967b04ae94078 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Thu, 9 Nov 2023 16:09:43 +0100
Subject: [PATCH 09/17] feat: infer org source if id not in dict #108

---
 htsinfer/get_library_source.py | 43 ++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index d42533a8..4dd79c81 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -78,15 +78,37 @@ def evaluate(self) -> ResultsSource:
         # Check if library_source is provided, otherwise infer it
         if self.org_id is not None:
             source.file_1.taxon_id = self.org_id
-            source.file_1.short_name = self.get_organism_name(
-                self.org_id,
-                self.transcripts_file
-            )
-            if self.paths[1] is not None:
-                source.file_2.taxon_id = self.org_id
-                source.file_2.short_name = source.file_1.short_name
+            org_name = self.get_organism_name(self.org_id, self.transcripts_file)
+            
+            if org_name is not None:
+                source.file_1.short_name = org_name
+
+                if self.paths[1] is not None:
+                    source.file_2.taxon_id = self.org_id
+                    source.file_2.short_name = source.file_1.short_name
+
+            else:
+                LOGGER.warning(
+                    f"Taxon ID '{self.org_id}' not found in organism dictionary, "
+                    "inferring source organism..."
+                )
+                index = self.create_kallisto_index()
+                library_source = self.get_source(
+                    fastq=self.paths[0],
+                    index=index,
+                )
+                source.file_1.short_name = library_source.short_name
+                source.file_1.taxon_id = library_source.taxon_id
+
+                if self.paths[1] is not None:
+                    library_source = self.get_source(
+                        fastq=self.paths[1],
+                        index=index,
+                    )
+                    source.file_2.short_name = library_source.short_name
+                    source.file_2.taxon_id = library_source.taxon_id
+
         else:
-            # Infer library source here and set it to source.library_source
             index = self.create_kallisto_index()
             library_source = self.get_source(
                 fastq=self.paths[0],
@@ -343,8 +365,5 @@ def get_organism_name(
 
         if taxon_id in org_dict:
             return org_dict[taxon_id]
-        LOGGER.warning(
-            f"Taxon ID '{taxon_id}' not found in organism dictionary, "
-            "using default 'None' for organism."
-        )
+
         return None

From fa1f3fc638fe4e5dac8bcacf1ece4982147ce7c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Fri, 10 Nov 2023 11:38:58 +0100
Subject: [PATCH 10/17] replace json() with model_dump_json()

---
 htsinfer/htsinfer.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py
index 4967a4d9..d29c89ae 100755
--- a/htsinfer/htsinfer.py
+++ b/htsinfer/htsinfer.py
@@ -85,7 +85,7 @@ def evaluate(self):
                 self.get_library_stats()
                 LOGGER.info(
                     "Library stats determined: "
-                    f"{self.config.results.library_stats.json()}"
+                    f"{self.config.results.library_stats.model_dump_json()}"
                 )
 
                 # determine library source
@@ -93,7 +93,7 @@ def evaluate(self):
                 self.config.results.library_source = self.get_library_source()
                 LOGGER.info(
                     "Library source determined: "
-                    f"{self.config.results.library_source.json()}"
+                    f"{self.config.results.library_source.model_dump_json()}"
                 )
 
                 # determine library type
@@ -106,7 +106,7 @@ def evaluate(self):
                     LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
                 LOGGER.info(
                     "Library type determined: "
-                    f"{self.config.results.library_type.json()}"
+                    f"{self.config.results.library_type.model_dump_json()}"
                 )
 
                 # determine read orientation
@@ -119,7 +119,7 @@ def evaluate(self):
                     LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
                 LOGGER.info(
                     "Read orientation determined: "
-                    f"{self.config.results.read_orientation.json()}"
+                    f"{self.config.results.read_orientation.model_dump_json()}"
                 )
 
                 # determine read layout
@@ -132,7 +132,7 @@ def evaluate(self):
                     LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
                 LOGGER.info(
                     "Read layout determined: "
-                    f"{self.config.results.read_layout.json()}"
+                    f"{self.config.results.read_layout.model_dump_json()}"
                 )
 
             except FileProblem as exc:
@@ -148,7 +148,7 @@ def evaluate(self):
             LOGGER.error(f"{type(exc).__name__}: {str(exc)}")
 
         # log results
-        LOGGER.info(f"Results: {self.config.results.json()}")
+        LOGGER.info(f"Results: {self.config.results.model_dump_json()}")
 
     def prepare_env(self):
         """Set up work environment."""

From fbd91b0b3706435e30ab79046f8230acdae1c783 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Fri, 10 Nov 2023 13:24:52 +0100
Subject: [PATCH 11/17] feat: add org_id param #108

---
 htsinfer/get_library_source.py   | 11 ++--
 tests/test_get_library_source.py | 90 ++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 4dd79c81..b4d7b06a 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -78,8 +78,11 @@ def evaluate(self) -> ResultsSource:
         # Check if library_source is provided, otherwise infer it
         if self.org_id is not None:
             source.file_1.taxon_id = self.org_id
-            org_name = self.get_organism_name(self.org_id, self.transcripts_file)
-            
+            org_name = self.get_organism_name(
+                self.org_id,
+                self.transcripts_file
+            )
+
             if org_name is not None:
                 source.file_1.short_name = org_name
 
@@ -89,8 +92,8 @@ def evaluate(self) -> ResultsSource:
 
             else:
                 LOGGER.warning(
-                    f"Taxon ID '{self.org_id}' not found in organism dictionary, "
-                    "inferring source organism..."
+                    f"Taxon ID '{self.org_id}' not found in "
+                    "organism dictionary, inferring source organism..."
                 )
                 index = self.create_kallisto_index()
                 library_source = self.get_source(
diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py
index e8e01498..5cd2df54 100644
--- a/tests/test_get_library_source.py
+++ b/tests/test_get_library_source.py
@@ -355,3 +355,93 @@ def test_get_organism_name_file_problem(self):
             test_instance.get_organism_name(
                 taxon_id, CONFIG.args.t_file_processed
             )
+
+    def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
+        """Test when self.org_id is None."""
+        CONFIG.args.org_id = None
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the create_kallisto_index method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.create_kallisto_index',
+            lambda *args, **kwargs: tmpdir / "kallisto.idx",
+        )
+
+        # Mock the get_source method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_source',
+            lambda *args, **kwargs: SOURCE_FRUIT_FLY,
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name
+
+        assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
+
+    def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
+        """Test when self.org_id is not None but org_name is not found."""
+        CONFIG.args.org_id = 7227
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the get_organism_name method to return None
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_organism_name',
+            lambda *args, **kwargs: None,
+        )
+
+        # Mock the create_kallisto_index method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.create_kallisto_index',
+            lambda *args, **kwargs: tmpdir / "kallisto.idx",
+        )
+
+        # Mock the get_source method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_source',
+            lambda *args, **kwargs: SOURCE_FRUIT_FLY,
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name
+
+        assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
+
+    def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir):
+        """Test when self.org_id is not None and org_name is found."""
+        CONFIG.args.org_id = 7227
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the get_organism_name method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_organism_name',
+            lambda *args, **kwargs: "dmelanogaster",
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == 7227
+        assert result.file_1.short_name == "dmelanogaster"
+
+        assert result.file_2.taxon_id == 7227
+        assert result.file_2.short_name == "dmelanogaster"

From a6d4a56d53b824e6f78a3aed25fe41bb7c6c81fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Fri, 10 Nov 2023 13:24:52 +0100
Subject: [PATCH 12/17] feat: add org_id param #108

---
 htsinfer/get_library_source.py   | 11 ++--
 tests/test_get_library_source.py | 90 ++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 4dd79c81..b4d7b06a 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -78,8 +78,11 @@ def evaluate(self) -> ResultsSource:
         # Check if library_source is provided, otherwise infer it
         if self.org_id is not None:
             source.file_1.taxon_id = self.org_id
-            org_name = self.get_organism_name(self.org_id, self.transcripts_file)
-            
+            org_name = self.get_organism_name(
+                self.org_id,
+                self.transcripts_file
+            )
+
             if org_name is not None:
                 source.file_1.short_name = org_name
 
@@ -89,8 +92,8 @@ def evaluate(self) -> ResultsSource:
 
             else:
                 LOGGER.warning(
-                    f"Taxon ID '{self.org_id}' not found in organism dictionary, "
-                    "inferring source organism..."
+                    f"Taxon ID '{self.org_id}' not found in "
+                    "organism dictionary, inferring source organism..."
                 )
                 index = self.create_kallisto_index()
                 library_source = self.get_source(
diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py
index e8e01498..5cd2df54 100644
--- a/tests/test_get_library_source.py
+++ b/tests/test_get_library_source.py
@@ -355,3 +355,93 @@ def test_get_organism_name_file_problem(self):
             test_instance.get_organism_name(
                 taxon_id, CONFIG.args.t_file_processed
             )
+
+    def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
+        """Test when self.org_id is None."""
+        CONFIG.args.org_id = None
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the create_kallisto_index method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.create_kallisto_index',
+            lambda *args, **kwargs: tmpdir / "kallisto.idx",
+        )
+
+        # Mock the get_source method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_source',
+            lambda *args, **kwargs: SOURCE_FRUIT_FLY,
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name
+
+        assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
+
+    def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
+        """Test when self.org_id is not None but org_name is not found."""
+        CONFIG.args.org_id = 7227
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the get_organism_name method to return None
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_organism_name',
+            lambda *args, **kwargs: None,
+        )
+
+        # Mock the create_kallisto_index method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.create_kallisto_index',
+            lambda *args, **kwargs: tmpdir / "kallisto.idx",
+        )
+
+        # Mock the get_source method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_source',
+            lambda *args, **kwargs: SOURCE_FRUIT_FLY,
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name
+
+        assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
+        assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
+
+    def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir):
+        """Test when self.org_id is not None and org_name is found."""
+        CONFIG.args.org_id = 7227
+        CONFIG.args.path_1_processed = FILE_MATE_1
+        CONFIG.args.path_2_processed = FILE_MATE_2
+        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
+        CONFIG.args.tmp_dir = tmpdir
+        CONFIG.args.out_dir = tmpdir
+        test_instance = GetLibSource(config=CONFIG)
+
+        # Mock the get_organism_name method to return a specific result
+        monkeypatch.setattr(
+            'htsinfer.get_library_source.GetLibSource.get_organism_name',
+            lambda *args, **kwargs: "dmelanogaster",
+        )
+
+        result = test_instance.evaluate()
+
+        assert result.file_1.taxon_id == 7227
+        assert result.file_1.short_name == "dmelanogaster"
+
+        assert result.file_2.taxon_id == 7227
+        assert result.file_2.short_name == "dmelanogaster"

From 8c879356a7009d000c6e52aa2afbbb4e0da5cca7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 14 Nov 2023 14:38:37 +0100
Subject: [PATCH 13/17] refactor: replace org with tax-id

---
 README.md                        |  2 +-
 htsinfer/cli.py                  |  8 ++---
 htsinfer/get_library_source.py   | 51 +++++++++++++++-----------------
 htsinfer/models.py               |  4 +--
 tests/test_get_library_source.py | 36 +++++++++++-----------
 5 files changed, 49 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 2e0dcaab..53e7267d 100644
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ htsinfer [--output-directory PATH]
          [--library-type-mates-cutoff FLOAT]
          [--read-orientation-min-mapped-reads INT]
          [--read-orientation-min-fraction FLOAT]
-         [--org-id INT]
+         [--tax-id INT]
          [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
          [-h] [--version]
          PATH [PATH]
diff --git a/htsinfer/cli.py b/htsinfer/cli.py
index ff12a825..217d6ce8 100644
--- a/htsinfer/cli.py
+++ b/htsinfer/cli.py
@@ -263,14 +263,14 @@ def __call__(
         )
     )
     parser.add_argument(
-        '--org-id',
-        dest="org_id",
+        '--tax-id',
+        dest="tax_id",
         metavar="INT",
         type=int,
         default=None,
         help=(
-            "source organism of the sequencing library; if provided, will not "
-            "not be inferred by the application"
+            "NCBI taxonomic identifier of source organism of the library; "
+            "if provided, will not be inferred by the application"
         )
     )
     parser.add_argument(
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index b4d7b06a..3fb70f84 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -4,8 +4,8 @@
 from pathlib import Path
 import subprocess as sp
 import tempfile
-
 from typing import Optional
+
 from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
 from pandas import DataFrame  # type: ignore
@@ -52,7 +52,7 @@ class GetLibSource:
         min_freq_ratio: Minimum frequency ratio between the first and second
             most frequent source in order for the former to be considered the
             library's source.
-        org_id: Taxonomy ID of the organism.
+        tax_id: Taxonomy ID of the organism.
     """
     def __init__(  # pylint: disable=E1101
         self,
@@ -66,7 +66,7 @@ def __init__(  # pylint: disable=E1101
         self.tmp_dir = config.args.tmp_dir
         self.min_match_pct = config.args.lib_source_min_match_pct
         self.min_freq_ratio = config.args.lib_source_min_freq_ratio
-        self.org_id = config.args.org_id
+        self.tax_id = config.args.tax_id
 
     def evaluate(self) -> ResultsSource:
         """Infer read source.
@@ -76,23 +76,23 @@ def evaluate(self) -> ResultsSource:
         """
         source = ResultsSource()
         # Check if library_source is provided, otherwise infer it
-        if self.org_id is not None:
-            source.file_1.taxon_id = self.org_id
-            org_name = self.get_organism_name(
-                self.org_id,
+        if self.tax_id is not None:
+            source.file_1.taxon_id = self.tax_id
+            src_name = self.get_source_name(
+                self.tax_id,
                 self.transcripts_file
             )
 
-            if org_name is not None:
-                source.file_1.short_name = org_name
+            if src_name is not None:
+                source.file_1.short_name = src_name
 
                 if self.paths[1] is not None:
-                    source.file_2.taxon_id = self.org_id
+                    source.file_2.taxon_id = self.tax_id
                     source.file_2.short_name = source.file_1.short_name
 
             else:
                 LOGGER.warning(
-                    f"Taxon ID '{self.org_id}' not found in "
+                    f"Taxon ID '{self.tax_id}' not found in "
                     "organism dictionary, inferring source organism..."
                 )
                 index = self.create_kallisto_index()
@@ -330,15 +330,15 @@ def get_source_expression(
         return dat_agg.sort_values(["tpm"], ascending=False)
 
     @staticmethod
-    def get_organism_name(
+    def get_source_name(
         taxon_id: int,
         transcripts_file: Path,
     ) -> Optional[str]:
-        """Return name of the organism, based on tax ID.
+        """Return name of the source organism, based on tax ID.
 
         Args:
-            taxon_id: Taxonomy ID of a given organism (int).
-            transcripts_file: Path to fasta file containing transcripts.
+            taxon_id: Taxonomy ID of a given organism.
+            transcripts_file: Path to FASTA file containing transcripts.
 
         Returns:
             Short name of the organism belonging to the given tax ID.
@@ -346,27 +346,24 @@ def get_organism_name(
         Raises:
             Could not process input FASTA file.
         """
-        org_dict = {}
-        # Construct dictionary of organism ID's and names
+        src_dict = {}
+        # Construct dictionary of taxonomy ID's and short names
         try:
-            for record in SeqIO.parse(
+            for record in list(SeqIO.parse(
                     handle=transcripts_file,
                     format='fasta',
-            ):
-                org_id = int(record.description.split("|")[4])
-                org_name = record.description.split("|")[3]
+            )):
+                tax_id = int(record.description.split("|")[4])
+                src_name = record.description.split("|")[3]
 
-                if org_id not in org_dict:
-                    org_dict[org_id] = org_name
-                else:
-                    org_dict[org_id] = org_name
+                src_dict[tax_id] = src_name
 
         except OSError as exc:
             raise FileProblem(
                 f"Could not process file '{transcripts_file}'"
             ) from exc
 
-        if taxon_id in org_dict:
-            return org_dict[taxon_id]
+        if taxon_id in src_dict:
+            return src_dict[taxon_id]
 
         return None
diff --git a/htsinfer/models.py b/htsinfer/models.py
index 58ed952f..8174ded7 100644
--- a/htsinfer/models.py
+++ b/htsinfer/models.py
@@ -356,7 +356,7 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
-        org_id: Organism ID.
+        tax_id: Organism ID.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
             sequences to scan for (one sequence per line).
@@ -430,7 +430,7 @@ class Args(BaseModel):
         CleanupRegimes.DEFAULT
     records: int = 1000000
     threads: int = 1
-    org_id: Optional[int] = None
+    tax_id: Optional[int] = None
     transcripts_file: Path = Path()
     read_layout_adapter_file: Path = Path()
     read_layout_min_match_pct: float = 0.1
diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py
index 5cd2df54..6e4b58a6 100644
--- a/tests/test_get_library_source.py
+++ b/tests/test_get_library_source.py
@@ -264,9 +264,9 @@ def test_evaluate_min_freq_ratio(self, tmpdir):
             file_2=Source()
         )
 
-    def test_evaluate_org_id_not_none(self):
-        """Test when self.org_id is not None."""
-        CONFIG.args.org_id = 7227  # An example taxon ID
+    def test_evaluate_tax_id_not_none(self):
+        """Test when self.tax_id is not None."""
+        CONFIG.args.tax_id = 7227  # An example taxon ID
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
         test_instance = GetLibSource(config=CONFIG)
         result = test_instance.evaluate()
@@ -274,9 +274,9 @@ def test_evaluate_org_id_not_none(self):
         assert result.file_1.taxon_id == 7227
         assert result.file_1.short_name == "dmelanogaster"
 
-    def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
-        """Test when self.org_id is None and self.paths[1] is not None."""
-        CONFIG.args.org_id = None
+    def test_evaluate_tax_id_none_with_path_2(self, tmpdir, monkeypatch):
+        """Test when self.tax_id is None and self.paths[1] is not None."""
+        CONFIG.args.tax_id = None
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -295,9 +295,9 @@ def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch):
         assert result.file_2.taxon_id == SOURCE_HUMAN.taxon_id
         assert result.file_2.short_name == SOURCE_HUMAN.short_name
 
-    def test_evaluate_org_id_not_none_with_path_2(self, tmpdir):
-        """Test when self.org_id is not None and self.paths[1] is not None."""
-        CONFIG.args.org_id = 7227
+    def test_evaluate_tax_id_not_none_with_path_2(self, tmpdir):
+        """Test when self.tax_id is not None and self.paths[1] is not None."""
+        CONFIG.args.tax_id = 7227
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -356,9 +356,9 @@ def test_get_organism_name_file_problem(self):
                 taxon_id, CONFIG.args.t_file_processed
             )
 
-    def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
-        """Test when self.org_id is None."""
-        CONFIG.args.org_id = None
+    def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir):
+        """Test when self.tax_id is None."""
+        CONFIG.args.tax_id = None
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -386,9 +386,9 @@ def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir):
         assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
         assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
 
-    def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
-        """Test when self.org_id is not None but org_name is not found."""
-        CONFIG.args.org_id = 7227
+    def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir):
+        """Test when self.tax_id is not None but src_name is not found."""
+        CONFIG.args.tax_id = 7227
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
@@ -422,9 +422,9 @@ def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir):
         assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
         assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
 
-    def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir):
-        """Test when self.org_id is not None and org_name is found."""
-        CONFIG.args.org_id = 7227
+    def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
+        """Test when self.tax_id is not None and src_name is found."""
+        CONFIG.args.tax_id = 7227
         CONFIG.args.path_1_processed = FILE_MATE_1
         CONFIG.args.path_2_processed = FILE_MATE_2
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS

From 652923daf4346b59f4ebbe61649853c1e3f1619e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 14 Nov 2023 17:28:48 +0100
Subject: [PATCH 14/17] refactor get_library_source

---
 htsinfer/exceptions.py         |  6 +++++
 htsinfer/get_library_source.py | 47 ++++++++++------------------------
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/htsinfer/exceptions.py b/htsinfer/exceptions.py
index e40278c3..f526841a 100644
--- a/htsinfer/exceptions.py
+++ b/htsinfer/exceptions.py
@@ -44,3 +44,9 @@ class TranscriptsFastaProblem(Exception):
 
 class CutadaptProblem(Exception):
     """Exception raised when running cutadapt commands."""
+
+
+class UnsupportedSampleSourceException(Exception):
+    """Exception raised when taxonomy ID is not found in the source
+    organism list.
+    """
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 3fb70f84..25eecb51 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 import subprocess as sp
 import tempfile
-from typing import Optional
 
 from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
@@ -14,6 +13,7 @@
     FileProblem,
     KallistoProblem,
     TranscriptsFastaProblem,
+    UnsupportedSampleSourceException,
 )
 from htsinfer.models import (
     ResultsSource,
@@ -82,34 +82,11 @@ def evaluate(self) -> ResultsSource:
                 self.tax_id,
                 self.transcripts_file
             )
+            source.file_1.short_name = src_name
 
-            if src_name is not None:
-                source.file_1.short_name = src_name
-
-                if self.paths[1] is not None:
-                    source.file_2.taxon_id = self.tax_id
-                    source.file_2.short_name = source.file_1.short_name
-
-            else:
-                LOGGER.warning(
-                    f"Taxon ID '{self.tax_id}' not found in "
-                    "organism dictionary, inferring source organism..."
-                )
-                index = self.create_kallisto_index()
-                library_source = self.get_source(
-                    fastq=self.paths[0],
-                    index=index,
-                )
-                source.file_1.short_name = library_source.short_name
-                source.file_1.taxon_id = library_source.taxon_id
-
-                if self.paths[1] is not None:
-                    library_source = self.get_source(
-                        fastq=self.paths[1],
-                        index=index,
-                    )
-                    source.file_2.short_name = library_source.short_name
-                    source.file_2.taxon_id = library_source.taxon_id
+            if self.paths[1] is not None:
+                source.file_2.taxon_id = self.tax_id
+                source.file_2.short_name = source.file_1.short_name
 
         else:
             index = self.create_kallisto_index()
@@ -333,7 +310,7 @@ def get_source_expression(
     def get_source_name(
         taxon_id: int,
         transcripts_file: Path,
-    ) -> Optional[str]:
+    ) -> str:
         """Return name of the source organism, based on tax ID.
 
         Args:
@@ -344,10 +321,11 @@ def get_source_name(
             Short name of the organism belonging to the given tax ID.
 
         Raises:
-            Could not process input FASTA file.
+            FileProblem: Could not process input FASTA file.
+            UnsupportedSampleSourceException: Taxon ID is not supported.
         """
         src_dict = {}
-        # Construct dictionary of taxonomy ID's and short names
+
         try:
             for record in list(SeqIO.parse(
                     handle=transcripts_file,
@@ -363,7 +341,10 @@ def get_source_name(
                 f"Could not process file '{transcripts_file}'"
             ) from exc
 
-        if taxon_id in src_dict:
+        try:
             return src_dict[taxon_id]
 
-        return None
+        except KeyError as exc:
+            raise UnsupportedSampleSourceException(
+                f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
+            ) from exc

From fd29bc6b49bf812b296bec51a75a0ef97c1e5c2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 14 Nov 2023 17:28:56 +0100
Subject: [PATCH 15/17] test: refactor get_library_source tests

---
 tests/test_get_library_source.py | 61 +++++++-------------------------
 1 file changed, 13 insertions(+), 48 deletions(-)

diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py
index 6e4b58a6..6539d0f1 100644
--- a/tests/test_get_library_source.py
+++ b/tests/test_get_library_source.py
@@ -5,7 +5,8 @@
 from htsinfer.exceptions import (
     FileProblem,
     KallistoProblem,
-    TranscriptsFastaProblem
+    TranscriptsFastaProblem,
+    UnsupportedSampleSourceException,
 )
 from htsinfer.get_library_source import GetLibSource
 from htsinfer.models import (
@@ -323,36 +324,36 @@ def test_create_kallisto_index_problem(self, tmpdir):
         with pytest.raises(KallistoProblem):
             test_instance.create_kallisto_index()
 
-    def test_get_organism_name_found(self):
+    def test_get_source_name_found(self):
         """Test the function when the taxon_id
         is found in the organism dictionary."""
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
         test_instance = GetLibSource(config=CONFIG)
         taxon_id = 7227
-        result = test_instance.get_organism_name(
+        result = test_instance.get_source_name(
             taxon_id, CONFIG.args.t_file_processed
         )
         assert result == "dmelanogaster"
 
-    def test_get_organism_name_not_found(self):
+    def test_get_source_name_not_found(self):
         """Test the function when the taxon_id
         is not found in the organism dictionary."""
         CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
         test_instance = GetLibSource(config=CONFIG)
         taxon_id = 12345  # A tax ID that doesn't exist in transcripts
-        result = test_instance.get_organism_name(
-            taxon_id, CONFIG.args.t_file_processed
-        )
-        assert result is None
+        with pytest.raises(UnsupportedSampleSourceException):
+            test_instance.get_source_name(
+                taxon_id, CONFIG.args.t_file_processed
+            )
 
-    def test_get_organism_name_file_problem(self):
+    def test_get_source_name_file_problem(self):
         """Test the function when there's a
         file problem while processing the FASTA file."""
         CONFIG.args.t_file_processed = FILE_DUMMY
         test_instance = GetLibSource(config=CONFIG)
         taxon_id = 7227
         with pytest.raises(FileProblem):
-            test_instance.get_organism_name(
+            test_instance.get_source_name(
                 taxon_id, CONFIG.args.t_file_processed
             )
 
@@ -386,42 +387,6 @@ def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir):
         assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
         assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
 
-    def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir):
-        """Test when self.tax_id is not None but src_name is not found."""
-        CONFIG.args.tax_id = 7227
-        CONFIG.args.path_1_processed = FILE_MATE_1
-        CONFIG.args.path_2_processed = FILE_MATE_2
-        CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
-        CONFIG.args.tmp_dir = tmpdir
-        CONFIG.args.out_dir = tmpdir
-        test_instance = GetLibSource(config=CONFIG)
-
-        # Mock the get_organism_name method to return None
-        monkeypatch.setattr(
-            'htsinfer.get_library_source.GetLibSource.get_organism_name',
-            lambda *args, **kwargs: None,
-        )
-
-        # Mock the create_kallisto_index method to return a specific result
-        monkeypatch.setattr(
-            'htsinfer.get_library_source.GetLibSource.create_kallisto_index',
-            lambda *args, **kwargs: tmpdir / "kallisto.idx",
-        )
-
-        # Mock the get_source method to return a specific result
-        monkeypatch.setattr(
-            'htsinfer.get_library_source.GetLibSource.get_source',
-            lambda *args, **kwargs: SOURCE_FRUIT_FLY,
-        )
-
-        result = test_instance.evaluate()
-
-        assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id
-        assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name
-
-        assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id
-        assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name
-
     def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
         """Test when self.tax_id is not None and src_name is found."""
         CONFIG.args.tax_id = 7227
@@ -432,9 +397,9 @@ def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir):
         CONFIG.args.out_dir = tmpdir
         test_instance = GetLibSource(config=CONFIG)
 
-        # Mock the get_organism_name method to return a specific result
+        # Mock the get_source_name method to return a specific result
         monkeypatch.setattr(
-            'htsinfer.get_library_source.GetLibSource.get_organism_name',
+            'htsinfer.get_library_source.GetLibSource.get_source_name',
             lambda *args, **kwargs: "dmelanogaster",
         )
 

From a71d7573661dd082f63f922e571a50f10c6f2268 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 14 Nov 2023 18:00:41 +0100
Subject: [PATCH 16/17] refactor: update models.py

---
 htsinfer/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htsinfer/models.py b/htsinfer/models.py
index 8174ded7..239aa57e 100644
--- a/htsinfer/models.py
+++ b/htsinfer/models.py
@@ -356,7 +356,7 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
-        tax_id: Organism ID.
+        tax_id: Taxonomy ID of the source organism.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
             sequences to scan for (one sequence per line).

From c8a10e63f7b2172f35b6d5c076bf1cde61789c4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Wed, 15 Nov 2023 13:41:57 +0100
Subject: [PATCH 17/17] refactor: fix typos

---
 htsinfer/exceptions.py         | 4 +---
 htsinfer/get_library_source.py | 2 +-
 htsinfer/models.py             | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/htsinfer/exceptions.py b/htsinfer/exceptions.py
index f526841a..eade2d89 100644
--- a/htsinfer/exceptions.py
+++ b/htsinfer/exceptions.py
@@ -47,6 +47,4 @@ class CutadaptProblem(Exception):
 
 
 class UnsupportedSampleSourceException(Exception):
-    """Exception raised when taxonomy ID is not found in the source
-    organism list.
-    """
+    """Exception raised when taxonomy ID is not supported."""
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 25eecb51..15a8c834 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -52,7 +52,7 @@ class GetLibSource:
         min_freq_ratio: Minimum frequency ratio between the first and second
             most frequent source in order for the former to be considered the
             library's source.
-        tax_id: Taxonomy ID of the organism.
+        tax_id: Taxonomy ID of the sample source.
     """
     def __init__(  # pylint: disable=E1101
         self,
diff --git a/htsinfer/models.py b/htsinfer/models.py
index 239aa57e..8db42f6f 100644
--- a/htsinfer/models.py
+++ b/htsinfer/models.py
@@ -356,7 +356,7 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
-        tax_id: Taxonomy ID of the source organism.
+        tax_id: Taxonomy ID of the sample source.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
             sequences to scan for (one sequence per line).