From b67c2510e96ad0fd0bc93f3db2405ad072572a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 8 Sep 2023 12:28:09 +0200 Subject: [PATCH 01/17] feat: add org param --- htsinfer/cli.py | 22 +++++++++++++++++++++ htsinfer/get_library_source.py | 36 ++++++++++++++++++++++++++-------- htsinfer/models.py | 6 +++++- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/htsinfer/cli.py b/htsinfer/cli.py index 3fea80d4..c8d5c553 100644 --- a/htsinfer/cli.py +++ b/htsinfer/cli.py @@ -261,6 +261,28 @@ def __call__( "be reported. Must be above 0.5" ) ) + parser.add_argument( + '--org-id', + dest="org_id", + metavar="INT", + type=int, + default=None, + help=( + "source organism of the sequencing library; if provided, will not " + "not be inferred by the application" + ) + ) + parser.add_argument( + '--org-name', + dest="org_name", + metavar="STR", + type=str, + default=None, + help=( + "source organism of the sequencing library; if provided, will not " + "not be inferred by the application" + ) + ) parser.add_argument( "--verbosity", choices=[e.name for e in LogLevels], diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 9e88ebe9..a07ecbc4 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -2,6 +2,7 @@ import logging from pathlib import Path +from typing import Optional import subprocess as sp import tempfile @@ -63,6 +64,8 @@ def __init__( # pylint: disable=E1101 self.tmp_dir = config.args.tmp_dir self.min_match_pct = config.args.lib_source_min_match_pct self.min_freq_ratio = config.args.lib_source_min_freq_ratio + self.org_name = config.args.org_name + self.org_id = config.args.org_id def evaluate(self) -> ResultsSource: """Infer read source. @@ -71,16 +74,33 @@ def evaluate(self) -> ResultsSource: Source results object. """ source = ResultsSource() - index = self.create_kallisto_index() - source.file_1 = self.get_source( - fastq=self.paths[0], - index=index, - ) - if self.paths[1] is not None: - source.file_2 = self.get_source( - fastq=self.paths[1], + # Check if library_source is provided, otherwise infer it + if self.org_name is not None: + source.file_1.short_name = self.org_name + source.file_1.taxon_id = self.org_id + else: + # Infer library source here and set it to source.library_source + index = self.create_kallisto_index() + library_source = self.get_source( + fastq=self.paths[0], index=index, ) + source.file_1.short_name = library_source.short_name + source.file_1.taxon_id = library_source.taxon_id + + if self.paths[1] is not None: + # Check if library_source is provided for file_2, otherwise infer it + if self.org_name is not None: + source.file_2.short_name = self.org_name + source.file_2.taxon_id = self.org_id + else: + library_source = self.get_source( + fastq=self.paths[1], + index=index, + ) + source.file_2.short_name = library_source.short_name + source.file_2.taxon_id = library_source.taxon_id + return source def create_kallisto_index(self) -> Path: diff --git a/htsinfer/models.py b/htsinfer/models.py index 84c5cc42..391d856b 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -6,7 +6,7 @@ ) import logging import re -from typing import Optional +from typing import Optional, Union from pathlib import Path import tempfile @@ -356,6 +356,8 @@ class Args(BaseModel): records: Number of input file records to process; set to `0` to process all records. threads: Number of threads to run STAR with. + org_name: Organism name. + org_id: Organism ID. transcripts_file: File path to transcripts FASTA file. read_layout_adapter_file: Path to text file containing 3' adapter sequences to scan for (one sequence per line). @@ -429,6 +431,8 @@ class Args(BaseModel): CleanupRegimes.DEFAULT records: int = 0 threads: int = 1 + org_name: Optional[str] = None + org_id: Optional[int] = None transcripts_file: Path = Path() read_layout_adapter_file: Path = Path() read_layout_min_match_pct: float = 0.1 From dc8af234304d7d140facadf9af8f5b7591a339e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Juri=C4=8D?= <74237898+BorisYourich@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:22:58 +0200 Subject: [PATCH 02/17] refactor: avoid duplicate mappings (#131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Boris Jurič <499542@mail.muni.cz> Co-authored-by: Alex Kanitz --- .github/workflows/ci.yml | 35 +-- README.md | 10 +- environment-dev.yml | 25 +- environment.yml | 24 +- htsinfer/cli.py | 1 + htsinfer/get_library_type.py | 31 +-- htsinfer/get_read_orientation.py | 345 +------------------------- htsinfer/htsinfer.py | 4 + htsinfer/mapping.py | 378 +++++++++++++++++++++++++++++ setup.py | 2 +- tests/test_get_library_type.py | 49 ++-- tests/test_get_read_orientation.py | 291 ++++------------------ tests/test_mapping.py | 259 ++++++++++++++++++++ tests/utils.py | 3 + 14 files changed, 801 insertions(+), 656 deletions(-) create mode 100644 htsinfer/mapping.py create mode 100644 tests/test_mapping.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22c59e57..3993e32a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,21 +12,18 @@ jobs: steps: - name: check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: set up miniconda and env uses: conda-incubator/setup-miniconda@v2 with: - auto-update-conda: true + python-version: "3.9" mamba-version: "*" - channels: conda-forge,defaults - environment-file: environment.yml + auto-update-conda: true activate-environment: htsinfer + environment-file: environment-dev.yml auto-activate-base: false - - name: update env with dev packages - run: mamba env update --file environment-dev.yml - - name: display env info run: | conda info -a @@ -50,7 +47,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.7', '3.8', '3.9' ] + python-version: [ '3.8', '3.9', '3.10' ] name: unit-testing-Python-${{ matrix.python-version }} @@ -63,16 +60,12 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: python-version: ${{ matrix.python-version }} - auto-update-conda: true mamba-version: "*" - channels: conda-forge,defaults - environment-file: environment.yml + auto-update-conda: true activate-environment: htsinfer + environment-file: environment-dev.yml auto-activate-base: false - - name: update env with dev packages - run: mamba env update --file environment-dev.yml - - name: display env info run: | conda info -a @@ -100,29 +93,25 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.7', '3.8', '3.9' ] + python-version: [ '3.8', '3.9', '3.10' ] name: integration-testing-Python-${{ matrix.python-version }} steps: - name: check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: set up miniconda and env uses: conda-incubator/setup-miniconda@v2 with: python-version: ${{ matrix.python-version }} - auto-update-conda: true mamba-version: "*" - channels: conda-forge,defaults - environment-file: environment.yml + auto-update-conda: true activate-environment: htsinfer + environment-file: environment-dev.yml auto-activate-base: false - - name: update env with dev packages - run: mamba env update --file environment-dev.yml - - name: display env info run: | conda info -a @@ -171,4 +160,4 @@ jobs: run: | echo "Push indicator: ${{ steps.docker.outputs.push-indicator }}" echo "# Set to 'true' if image was pushed, empty string otherwise" - test "${{ steps.docker.outputs.push-indicator }}" == "true" \ No newline at end of file + test "${{ steps.docker.outputs.push-indicator }}" == "true" diff --git a/README.md b/README.md index 35f49184..6149a708 100644 --- a/README.md +++ b/README.md @@ -123,12 +123,14 @@ dependencies via [Conda][conda]: git clone https://github.com/zavolanlab/htsinfer cd htsinfer conda env create --file environment.yml -conda env update --file environment-dev.yml # optional: install development/testing dependencies +# Alternatively, to install with development dependencies, +# run the following instead +conda env create --file environment-dev.yml ``` -Note that creating the environment takes non-trivial time and it is strongly -recommended that you install [Mamba][mamba] and replace `conda` with `mamba` -in the previous commands. +> Note that creating the environment takes non-trivial time and it is strongly +> recommended that you install [Mamba][mamba] and replace `conda` with `mamba` +> in the previous command. Then, activate the `htsinfer` Conda environment with: diff --git a/environment-dev.yml b/environment-dev.yml index 8b243f0f..40f4f5f4 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -1,11 +1,24 @@ name: htsinfer channels: - - defaults + - conda-forge + - bioconda dependencies: - - coverage>=5.3 - - flake8>=3.8.4 - - mypy>=0.782 - - pylint>=2.4.4 - - pytest>=6.1.0 + - biopython >=1.78 + - coverage >=5.3 + - cutadapt >=3.5, <=4.2 + - flake8 >=3.8.4 + - kallisto >=0.46.1, <= 0.48.0 + - mypy >=0.782 + - numpy >=1.22, <1.25 + - pandas >=1.3.5, <1.4.0 + - pip >=20.2.3 + - pyahocorasick >=1.4.0 + - pydantic >=1.8.1, <2 + - pylint >=2.4.4 + - pysam >=0.16.0 + - pytest >=6.1.0 + - python >=3.8, <=3.10 + - star >=2.7.6 - pip: - python-semantic-release>=7.15.0 + - -e . diff --git a/environment.yml b/environment.yml index 54c4b8a8..08e4fd28 100644 --- a/environment.yml +++ b/environment.yml @@ -1,18 +1,18 @@ name: htsinfer channels: - - defaults - - bioconda - conda-forge + - bioconda dependencies: - - biopython>=1.78 - - kallisto>=0.46.1, <= 0.48.0 - - pandas>=1.0.5 - - pip>=20.2.3 - - pyahocorasick>=1.4.0 - - pydantic>=1.8.1, <2 - - pysam>=0.16.0 - - python>3.6, <3.10 - - star>=2.7.6 - - cutadapt>=3.5, <=4.2 + - biopython >=1.78 + - cutadapt >=3.5, <=4.2 + - kallisto >=0.46.1, <= 0.48.0 + - numpy >=1.22, <1.25 + - pandas >=1.3.5, <1.4.0 + - pip >=20.2.3 + - pyahocorasick >=1.4.0 + - pydantic >=1.8.1, <2 + - pysam >=0.16.0 + - python >=3.8, <=3.10 + - star >=2.7.6 - pip: - -e . diff --git a/htsinfer/cli.py b/htsinfer/cli.py index c8d5c553..256aeaa8 100644 --- a/htsinfer/cli.py +++ b/htsinfer/cli.py @@ -50,6 +50,7 @@ def __call__( values, option_string=None, ) -> None: + assert isinstance(values, list) if len(values) > 2: parser.print_usage(file=sys.stderr) sys.stderr.write( diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 2edfd095..a2c03a02 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -21,9 +21,7 @@ SeqIdFormats, Config, ) -from htsinfer.get_read_orientation import ( - GetOrientation, -) +from htsinfer.mapping import Mapping LOGGER = logging.getLogger(__name__) @@ -62,6 +60,7 @@ class GetLibType: def __init__( self, config: Config, + mapping: Mapping, ): """Class constructor.""" self.path_1: Path = config.args.path_1_processed @@ -69,8 +68,7 @@ def __init__( self.library_source = config.results.library_source self.results: ResultsType = ResultsType() self.tmp_dir = config.args.tmp_dir - self.get_read_orientation: \ - GetOrientation = GetOrientation(config=config) + self.mapping = mapping self.max_distance = config.args.lib_type_max_distance self.cutoff = config.args.lib_type_mates_cutoff @@ -126,23 +124,24 @@ def _evaluate_mate_relationship( self.results.relationship = ( StatesTypeRelationship.split_mates ) + self.mapping.library_type.relationship = ( + StatesTypeRelationship.split_mates + ) else: - self.get_read_orientation.library_type.relationship \ + self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available - self.get_read_orientation.library_source = self.library_source - _ = self.get_read_orientation.evaluate() + self.mapping.library_source = self.library_source + self.mapping.evaluate() self._align_mates() def _align_mates(self): """Decide mate relationship by alignment.""" - alignment_1 = Path(self.tmp_dir) \ - / "alignments" / "file_1" / "Aligned.out.sam" - alignment_2 = Path(self.tmp_dir) \ - / "alignments" / "file_2" / "Aligned.out.sam" + alignment_1 = self.mapping.star_dirs[0] / 'Aligned.out.sam' + alignment_2 = self.mapping.star_dirs[1] / 'Aligned.out.sam' - samfile1 = pysam.AlignmentFile(alignment_1, 'r') - samfile2 = pysam.AlignmentFile(alignment_2, 'r') + samfile1 = pysam.AlignmentFile(str(alignment_1), 'r') + samfile2 = pysam.AlignmentFile(str(alignment_2), 'r') previous_seq_id1 = None previous_seq_id2 = None @@ -184,6 +183,10 @@ def _align_mates(self): self.results.relationship = ( StatesTypeRelationship.split_mates ) + self.mapping.library_type.relationship \ + = StatesTypeRelationship.split_mates + self.mapping.mapped = False + self.mapping.star_dirs = [] else: self.results.relationship = ( StatesTypeRelationship.not_mates diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index b923234a..d76b0922 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -2,26 +2,21 @@ from collections import defaultdict import logging -import math from pathlib import Path -import subprocess as sp from typing import (Any, DefaultDict, Dict, List) -from Bio import SeqIO # type: ignore import pysam # type: ignore from htsinfer.exceptions import ( FileProblem, - SamFileProblem, - StarProblem, ) from htsinfer.models import ( ResultsOrientation, StatesOrientation, StatesOrientationRelationship, - StatesTypeRelationship, Config, ) +from htsinfer.mapping import Mapping LOGGER = logging.getLogger(__name__) @@ -54,6 +49,7 @@ class GetOrientation: def __init__( self, config: Config, + mapping: Mapping, ): """Class contructor.""" self.paths = (config.args.path_1_processed, @@ -62,9 +58,9 @@ def __init__( self.library_source = config.results.library_source self.transcripts_file = config.args.t_file_processed self.tmp_dir = config.args.tmp_dir - self.threads_star = config.args.threads self.min_mapped_reads = config.args.read_orientation_min_mapped_reads self.min_fraction = config.args.read_orientation_min_fraction + self.mapping = mapping def evaluate(self) -> ResultsOrientation: """Infer read orientation. @@ -73,327 +69,16 @@ def evaluate(self) -> ResultsOrientation: Orientation results object. """ - # get transcripts for current organims - transcripts = self.subset_transcripts_by_organism() - ref_size = self.get_fasta_size(fasta=transcripts) - index_string_size = self.get_star_index_string_size(ref_size=ref_size) - chr_bin_bits = self.get_star_chr_bin_bits(ref_size=ref_size, - transcripts=transcripts) - - # generate STAR alignments - index_dir = self.create_star_index( - fasta=transcripts, - index_string_size=index_string_size, - chr_bin_bits=chr_bin_bits, - ) - star_cmds = self.prepare_star_alignment_commands(index_dir=index_dir) - self.generate_star_alignments(commands=star_cmds) - - # process alignments - star_dirs = [e for e in star_cmds if e is not None] - - return self.process_alignments(star_dirs=star_dirs) - - def subset_transcripts_by_organism(self) -> Path: - """Filter FASTA file of transcripts by current sources. - - The filtered file contains records from the indicated sources. - Typically, this is one source. However, for if two input files - were supplied that are originating from different sources (i.e., - not from a valid paired-ended library), it may be from two - different sources. If no source is supplied (because it could - not be inferred), no filtering is done. - - Returns: - Path to filtered FASTA file. - - Raises: - FileProblem: Could not open input/output FASTA file for - reading/writing. - """ - LOGGER.debug(f"Subsetting transcripts for: {self.library_source}") - - outfile = self.tmp_dir / f"{self.library_source}.fasta" - - def yield_filtered_seqs(): - """Generator yielding sequence records for specified sources. - - Yields: - Next FASTA sequence record of the specified sources. - - Raises: Could not process input FASTA file. - """ - sources = [] - if self.library_source.file_1.short_name is not None: - sources.append(self.library_source.file_1.short_name) - if self.library_source.file_2.short_name is not None: - sources.append(self.library_source.file_2.short_name) - try: - for record in SeqIO.parse( - handle=self.transcripts_file, - format='fasta', - ): - try: - org_name = record.description.split("|")[3] - except (ValueError, IndexError): - continue - if org_name in sources or len(sources) == 0: - yield record - - except OSError as exc: - raise FileProblem( - f"Could not process file '{self.transcripts_file}'" - ) from exc - - try: - SeqIO.write( - sequences=yield_filtered_seqs(), - handle=outfile, - format='fasta', - ) - except OSError as exc: - raise FileProblem( - f"Failed to write to FASTA file '{outfile}'" - ) from exc - - LOGGER.debug(f"Filtered transcripts file: {outfile}") - return outfile - - @staticmethod - def get_fasta_size(fasta: Path) -> int: - """Get size of FASTA file in total nucleotides. - - Args: - fasta: Path to FASTA file. - - Returns: - Total number of nucleotides of all records. - - Raises: - FileProblem: Could not open FASTA file for reading. - """ - nucleotides: int = 0 - - try: - for record in SeqIO.parse( - handle=fasta, - format='fasta', - ): - nucleotides += len(record.seq) - - except OSError as exc: - raise FileProblem( - f"Could not process file: {fasta}" - ) from exc - - LOGGER.debug(f"Size of reference: {nucleotides}") - return nucleotides - - @staticmethod - def get_star_index_string_size(ref_size: int) -> int: - """Get length of STAR SA pre-indexing string. - - Cf. - https://github.com/alexdobin/STAR/blob/51b64d4fafb7586459b8a61303e40beceeead8c0/doc/STARmanual.pdf - - Args: - ref_size: Size of genome/transcriptome reference in nucleotides. - - Returns: - Size (in nucleotides) of SA pre-indexing string. - """ - index_string_size = min( - 14, - int(math.floor(math.log2(ref_size) / 2 - 1)) - ) - LOGGER.debug(f"STAR SA pre-indexing string size: {index_string_size}") - return index_string_size - - @staticmethod - def get_star_chr_bin_bits(ref_size: int, transcripts: Path) -> int: - """Get size of bins for STAR genome storage. + self.mapping.paths = self.paths + self.mapping.library_type = self.library_type + self.mapping.library_source = self.library_source + self.mapping.transcripts_file = self.transcripts_file + self.mapping.tmp_dir = self.tmp_dir - Args: - ref_size: Size of genome/transcriptome reference in nucleotides. - transcripts: Path to filtered FASTA transcripts file. - - Returns: - Number of bins for genome storage. - """ - n_ref: int = 0 - - for _ in SeqIO.parse( - handle=transcripts, - format='fasta', - ): - n_ref += 1 - - chr_bin_bits = min( - 18, - int(round(math.log2(ref_size / n_ref))) - ) - LOGGER.debug("STAR size of bins for genome storage: %s", chr_bin_bits) - return chr_bin_bits - - def create_star_index( - self, - fasta: Path, - chr_bin_bits: int = 18, - index_string_size: int = 5, - ) -> Path: - """Prepare STAR index. - - Args: - fasta: Path to FASTA file of sequence records to create index from. - index_string_size: Size of SA pre-indexing string, in nucleotides. - - Returns: - Path to directory containing STAR index. - - Raises: - StarProblem: STAR index could not be created. - """ - LOGGER.debug(f"Creating STAR index for: {fasta}") - - index_dir: Path = Path(self.tmp_dir) / "index" - - # solves the macOS issue with STAR - index_dir.mkdir(parents=True, exist_ok=True) - - cmd = [ - "STAR", - "--runMode", "genomeGenerate", - "--genomeSAindexNbases", f"{str(index_string_size)}", - "--genomeChrBinNbits", f"{str(chr_bin_bits)}", - "--runThreadN", f"{str(self.threads_star)}", - "--genomeDir", f"{str(index_dir)}", - "--genomeFastaFiles", f"{str(fasta)}", - ] - - result = sp.run( - cmd, - capture_output=True, - text=True, - ) - if result.returncode != 0: - LOGGER.error(result.stderr) - raise StarProblem("Failed to create STAR index") + if not self.mapping.mapped: + self.mapping.evaluate() - LOGGER.debug(f"STAR index created: {index_dir}") - return index_dir - - def prepare_star_alignment_commands( - self, - index_dir: Path, - ) -> Dict[Path, List[str]]: - """Prepare STAR alignment commands. - - Args: - index_dir: Path to directory containing STAR index. - - Returns: - Dictionary of output paths and corresponding STAR commands. - """ - LOGGER.debug("Preparing STAR commands...") - - # helper function for compiling individual command - def build_star_command( - read_files: List[str], - out_dir: str, - ) -> List[str]: - """Compile an individual STAR alignment command. - - Args: - read_files: List of read file paths. - out_dir: STAR output directory. - - Returns: - STAR command list. - """ - cmd_base: List[str] = [ - "STAR", - "--alignIntronMax", "1", - "--alignEndsType", "Local", - "--runThreadN", f"{str(self.threads_star)}", - "--genomeDir", f"{str(index_dir)}", - "--outFilterMultimapNmax", "50", - "--outSAMunmapped", "Within", "KeepPairs", - ] - cmd: List[str] = cmd_base[:] - cmd.append("--readFilesIn") - cmd.extend(read_files) - cmd.append("--outFileNamePrefix") - cmd.append(out_dir) - - # solves the macOS issue with STAR - Path(out_dir).mkdir(parents=True, exist_ok=True) - - return cmd - - out_dir_base: Path = Path(self.tmp_dir) / "alignments" - commands: Dict = {} - - # create command for pairend-ended libraries - if ( - self.library_type.relationship - == StatesTypeRelationship.split_mates - ): - out_dir = out_dir_base / "paired" - commands[out_dir] = build_star_command( - read_files=[str(path) for path in self.paths], - out_dir=f"{str(out_dir)}/", - ) - # create commands for single-ended libraries - else: - out_dir = out_dir_base / "file_1" - commands[out_dir] = build_star_command( - read_files=[str(self.paths[0])], - out_dir=f"{str(out_dir)}/", - ) - # run two commands in case there is a second file provided that is - # not a mate of the first one - out_dir = out_dir_base / "file_2" - if self.paths[1] is not None: - commands[out_dir] = build_star_command( - read_files=[str(self.paths[1])], - out_dir=f"{str(out_dir)}/", - ) - - return commands - - @staticmethod - def generate_star_alignments(commands: Dict[Path, List[str]]) -> None: - """Align reads to index with STAR. - - Args: - commands: Dictionary of output paths and corresponding STAR - commands. - - Raises: - StarProblem: Generating alignments failed. - """ - LOGGER.debug("Aligning reads with STAR...") - - # execute commands - for out_dir, cmd in commands.items(): - try: - result = sp.run( - cmd, - capture_output=True, - text=True, - check=True, - ) - if result.returncode != 0: - LOGGER.error(result.stderr) - raise StarProblem( - "Failed to generate STAR alignments for command: " - f"{cmd}" - ) - except sp.CalledProcessError as exc: - raise StarProblem( - f"Failed to generate STAR alignments for command: {cmd}" - ) from exc - LOGGER.debug(f"Written STAR output to directory: {out_dir}") + return self.process_alignments(star_dirs=self.mapping.star_dirs) def process_alignments( self, @@ -459,17 +144,11 @@ def process_single( states[record.query_name].append( StatesOrientation.stranded_forward ) - - except OSError as exc: + except (OSError, ValueError) as exc: raise FileProblem( f"Failed to open SAM file: '{sam}'" ) from exc - except ValueError as exc: - raise SamFileProblem( - f"Not a valid SAM file: '{sam}'" - ) from exc - LOGGER.debug("Deciding read orientation...") reads = len(states) fractions = [ diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py index 28dd3bf8..9f2f6602 100755 --- a/htsinfer/htsinfer.py +++ b/htsinfer/htsinfer.py @@ -27,6 +27,7 @@ Config, ) from htsinfer.subset_fastq import SubsetFastq +from htsinfer.mapping import Mapping LOGGER = logging.getLogger(__name__) @@ -64,6 +65,7 @@ def __init__( else config.args.tmp_dir / config.args.transcripts_file.name ) self.state: RunStates = RunStates.OKAY + self.mapping: Mapping = Mapping(config=self.config) def evaluate(self): """Determine library metadata.""" @@ -247,6 +249,7 @@ def get_library_type(self): """Determine library type.""" get_lib_type = GetLibType( config=self.config, + mapping=self.mapping, ) get_lib_type.evaluate() self.config.results.library_type = get_lib_type.results @@ -255,6 +258,7 @@ def get_read_orientation(self): """Determine read orientation.""" get_read_orientation = GetOrientation( config=self.config, + mapping=self.mapping, ) self.config.results.read_orientation = get_read_orientation.evaluate() diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py new file mode 100644 index 00000000..cb53f7a5 --- /dev/null +++ b/htsinfer/mapping.py @@ -0,0 +1,378 @@ +"""Mapping FASTQ's and managing the outputs of STAR.""" + +import logging +import math +from pathlib import Path +import subprocess as sp +from typing import (Dict, List) + +from Bio import SeqIO # type: ignore + +from htsinfer.exceptions import ( + FileProblem, + StarProblem, +) +from htsinfer.models import ( + Config, + StatesTypeRelationship, +) + +LOGGER = logging.getLogger(__name__) + + +class Mapping: + """Map FASTQ file/s and manage outputs. + + Args: + path: Path to FASTQ file. + + Attributes: + path_1: Path to single-end library or first mate file. + path_2: Path to second mate file. + + Raise: + FileProblem: The input file could not be parsed or the output file + could not be written. + """ + + def __init__( + self, + config: Config, + ): + """Class contructor.""" + self.paths = (config.args.path_1_processed, + config.args.path_2_processed) + self.library_type = config.results.library_type + self.library_source = config.results.library_source + self.transcripts_file = config.args.t_file_processed + self.tmp_dir = config.args.tmp_dir + self.threads_star = config.args.threads + self.mapped = False + self.star_dirs: List[Path] = [] + + def evaluate(self): + """Infer read orientation. + + Returns: + Orientation results object. + """ + + # get transcripts for current organims + transcripts = self.subset_transcripts_by_organism() + ref_size = self.get_fasta_size(fasta=transcripts) + index_string_size = self.get_star_index_string_size(ref_size=ref_size) + chr_bin_bits = self.get_star_chr_bin_bits(ref_size=ref_size, + transcripts=transcripts) + + # generate STAR alignments + index_dir = self.create_star_index( + fasta=transcripts, + index_string_size=index_string_size, + chr_bin_bits=chr_bin_bits, + ) + star_cmds = self.prepare_star_alignment_commands(index_dir=index_dir) + self.generate_star_alignments(commands=star_cmds) + # process alignments + self.star_dirs = [e for e in star_cmds if e is not None] + self.mapped = True + + def subset_transcripts_by_organism(self) -> Path: + """Filter FASTA file of transcripts by current sources. + + The filtered file contains records from the indicated sources. + Typically, this is one source. However, for if two input files + were supplied that are originating from different sources (i.e., + not from a valid paired-ended library), it may be from two + different sources. If no source is supplied (because it could + not be inferred), no filtering is done. + + Returns: + Path to filtered FASTA file. + + Raises: + FileProblem: Could not open input/output FASTA file for + reading/writing. + """ + LOGGER.debug(f"Subsetting transcripts for: {self.library_source}") + + outfile = self.tmp_dir / "transcripts_subset.fasta" + + def yield_filtered_seqs(): + """Generator yielding sequence records for specified sources. + + Yields: + Next FASTA sequence record of the specified sources. + + Raises: Could not process input FASTA file. + """ + sources = [] + if self.library_source.file_1.short_name is not None: + sources.append(self.library_source.file_1.short_name) + if self.library_source.file_2.short_name is not None: + sources.append(self.library_source.file_2.short_name) + try: + for record in SeqIO.parse( + handle=self.transcripts_file, + format='fasta', + ): + try: + org_name = record.description.split("|")[3] + except (ValueError, IndexError): + continue + if org_name in sources or len(sources) == 0: + yield record + + except OSError as exc: + raise FileProblem( + f"Could not process file '{self.transcripts_file}'" + ) from exc + + try: + SeqIO.write( + sequences=yield_filtered_seqs(), + handle=outfile, + format='fasta', + ) + except OSError as exc: + raise FileProblem( + f"Failed to write to FASTA file '{outfile}'" + ) from exc + + LOGGER.debug(f"Filtered transcripts file: {outfile}") + return outfile + + @staticmethod + def get_fasta_size(fasta: Path) -> int: + """Get size of FASTA file in total nucleotides. + + Args: + fasta: Path to FASTA file. + + Returns: + Total number of nucleotides of all records. + + Raises: + FileProblem: Could not open FASTA file for reading. + """ + nucleotides: int = 0 + + try: + for record in SeqIO.parse( + handle=fasta, + format='fasta', + ): + nucleotides += len(record.seq) + + except OSError as exc: + raise FileProblem( + f"Could not process file: {fasta}" + ) from exc + + LOGGER.debug(f"Size of reference: {nucleotides}") + return nucleotides + + @staticmethod + def get_star_index_string_size(ref_size: int) -> int: + """Get length of STAR SA pre-indexing string. + + Cf. + https://github.com/alexdobin/STAR/blob/51b64d4fafb7586459b8a61303e40beceeead8c0/doc/STARmanual.pdf + + Args: + ref_size: Size of genome/transcriptome reference in nucleotides. + + Returns: + Size (in nucleotides) of SA pre-indexing string. + """ + index_string_size = min( + 14, + int(math.floor(math.log2(ref_size) / 2 - 1)) + ) + LOGGER.debug(f"STAR SA pre-indexing string size: {index_string_size}") + return index_string_size + + @staticmethod + def get_star_chr_bin_bits(ref_size: int, transcripts: Path) -> int: + """Get size of bins for STAR genome storage. + + Args: + ref_size: Size of genome/transcriptome reference in nucleotides. + transcripts: Path to filtered FASTA transcripts file. + + Returns: + Number of bins for genome storage. + """ + n_ref: int = 0 + + for _ in SeqIO.parse( + handle=transcripts, + format='fasta', + ): + n_ref += 1 + + chr_bin_bits = min( + 18, + int(round(math.log2(ref_size / n_ref))) + ) + LOGGER.debug("STAR size of bins for genome storage: %s", chr_bin_bits) + return chr_bin_bits + + def create_star_index( + self, + fasta: Path, + chr_bin_bits: int = 18, + index_string_size: int = 5, + ) -> Path: + """Prepare STAR index. + + Args: + fasta: Path to FASTA file of sequence records to create index from. + index_string_size: Size of SA pre-indexing string, in nucleotides. + + Returns: + Path to directory containing STAR index. + + Raises: + StarProblem: STAR index could not be created. + """ + LOGGER.debug(f"Creating STAR index for: {fasta}") + + index_dir: Path = Path(self.tmp_dir) / "index" + + # solves the macOS issue with STAR + index_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "STAR", + "--runMode", "genomeGenerate", + "--genomeSAindexNbases", f"{str(index_string_size)}", + "--genomeChrBinNbits", f"{str(chr_bin_bits)}", + "--runThreadN", f"{str(self.threads_star)}", + "--genomeDir", f"{str(index_dir)}", + "--genomeFastaFiles", f"{str(fasta)}", + ] + + result = sp.run( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + LOGGER.error(result.stderr) + raise StarProblem("Failed to create STAR index") + + LOGGER.debug(f"STAR index created: {index_dir}") + return index_dir + + def prepare_star_alignment_commands( + self, + index_dir: Path, + ) -> Dict[Path, List[str]]: + """Prepare STAR alignment commands. + + Args: + index_dir: Path to directory containing STAR index. + + Returns: + Dictionary of output paths and corresponding STAR commands. + """ + LOGGER.debug("Preparing STAR commands...") + + # helper function for compiling individual command + def build_star_command( + read_files: List[str], + out_dir: str, + ) -> List[str]: + """Compile an individual STAR alignment command. + + Args: + read_files: List of read file paths. + out_dir: STAR output directory. + + Returns: + STAR command list. + """ + cmd_base: List[str] = [ + "STAR", + "--alignIntronMax", "1", + "--alignEndsType", "Local", + "--runThreadN", f"{str(self.threads_star)}", + "--genomeDir", f"{str(index_dir)}", + "--outFilterMultimapNmax", "50", + "--outSAMunmapped", "Within", "KeepPairs", + ] + cmd: List[str] = cmd_base[:] + cmd.append("--readFilesIn") + cmd.extend(read_files) + cmd.append("--outFileNamePrefix") + cmd.append(out_dir) + + # solves the macOS issue with STAR + Path(out_dir).mkdir(parents=True, exist_ok=True) + + return cmd + + out_dir_base: Path = Path(self.tmp_dir) / "alignments" + commands: Dict = {} + + # create command for pairend-ended libraries + if ( + self.library_type.relationship + == StatesTypeRelationship.split_mates + ): + out_dir = out_dir_base / "paired" + commands[out_dir] = build_star_command( + read_files=[str(path) for path in self.paths], + out_dir=f"{str(out_dir)}/", + ) + # create commands for single-ended libraries + else: + out_dir = out_dir_base / "file_1" + commands[out_dir] = build_star_command( + read_files=[str(self.paths[0])], + out_dir=f"{str(out_dir)}/", + ) + # run two commands in case there is a second file provided that is + # not a mate of the first one + out_dir = out_dir_base / "file_2" + if self.paths[1] is not None: + commands[out_dir] = build_star_command( + read_files=[str(self.paths[1])], + out_dir=f"{str(out_dir)}/", + ) + + return commands + + @staticmethod + def generate_star_alignments(commands: Dict[Path, List[str]]) -> None: + """Align reads to index with STAR. + + Args: + commands: Dictionary of output paths and corresponding STAR + commands. + + Raises: + StarProblem: Generating alignments failed. + """ + LOGGER.debug("Aligning reads with STAR...") + + # execute commands + for out_dir, cmd in commands.items(): + try: + result = sp.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + if result.returncode != 0: + LOGGER.error(result.stderr) + raise StarProblem( + "Failed to generate STAR alignments for command: " + f"{cmd}" + ) + except sp.CalledProcessError as exc: + raise StarProblem( + f"Failed to generate STAR alignments for command: {cmd}" + ) from exc + LOGGER.debug(f"Written STAR output to directory: {out_dir}") diff --git a/setup.py b/setup.py index 6b037f1e..ed2144a3 100644 --- a/setup.py +++ b/setup.py @@ -26,9 +26,9 @@ "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Natural Language :: English", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Utilities", ], diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index c738d8fd..dd52dfaa 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -38,6 +38,7 @@ SEQ_ID_MATE_2, SEQ_ID_SINGLE, CONFIG, + MAPPING, ) @@ -47,20 +48,23 @@ class TestGetLibType: def test_init_required(self): """Create instance with required parameters.""" CONFIG.args.path_2_processed = None - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) assert test_instance.path_1 == FILE_MATE_1 def test_init_all(self): """Create instance with all available parameters.""" CONFIG.args.path_2_processed = FILE_MATE_2 - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) assert test_instance.path_1 == FILE_MATE_1 assert test_instance.path_2 == FILE_MATE_2 def test_evaluate_one_file(self): """Get library type for a single file.""" CONFIG.args.path_2_processed = None - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.evaluate() assert test_instance.results == ResultsType( file_1=StatesType.first_mate, @@ -71,7 +75,8 @@ def test_evaluate_one_file(self): def test_evaluate_two_files(self): """Get library type for two files.""" CONFIG.args.path_2_processed = FILE_MATE_2 - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.evaluate() assert test_instance.results == ResultsType( file_1=StatesType.first_mate, @@ -83,7 +88,8 @@ def test_evaluate_mate_relationship_split_mates(self): """Test mate relationship evaluation logic with input files being mates of a paired-end library. """ - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.results.file_1 = StatesType.first_mate test_instance.results.file_2 = StatesType.second_mate test_instance._evaluate_mate_relationship( @@ -105,43 +111,38 @@ def test_evaluate_mate_relationship_split_mates(self): StatesTypeRelationship.split_mates ) - def test_evaluate_mate_relationship_not_mates(self): + def test_evaluate_mate_relationship_not_mates(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. """ CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetLibType(config=CONFIG) + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.results.file_1 = StatesType.first_mate test_instance.results.file_2 = StatesType.second_mate - test_instance._evaluate_mate_relationship( - ids_1=["A", "B", "C"], - ids_2=["C", "B", "A"], - ) - assert ( - test_instance.results.relationship == - StatesTypeRelationship.not_mates - ) - test_instance.results.file_1 = StatesType.single - test_instance.results.file_2 = StatesType.first_mate - test_instance._evaluate_mate_relationship( - ids_1=["A", "B", "C"], - ids_2=["A", "B", "C"], - ) + test_instance.evaluate() assert ( test_instance.results.relationship == StatesTypeRelationship.not_mates ) - def test_evaluate_split_mates_not_matching_ids(self): + def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. """ CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2 - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetLibType(config=CONFIG) + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2) + MAPPING.tmp_dir = tmpdir + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.evaluate() assert ( test_instance.results.relationship == diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py index 75a2a25d..10894334 100644 --- a/tests/test_get_read_orientation.py +++ b/tests/test_get_read_orientation.py @@ -4,8 +4,6 @@ from htsinfer.exceptions import ( FileProblem, - SamFileProblem, - StarProblem, ) from htsinfer.get_read_orientation import GetOrientation from htsinfer.models import ( @@ -18,11 +16,8 @@ StatesTypeRelationship, ) from tests.utils import ( - FILE_2000_RECORDS, - FILE_DUMMY, FILE_EMPTY_ALIGNED_SAM, FILE_BAD_ALIGNED_SAM, - FILE_INVALID_TRANSCRIPTS, FILE_MATE_1, FILE_MATE_2, FILE_ORIENTATION_ISF_1, @@ -38,11 +33,9 @@ FILE_UNMAPPED_PAIRED_1, FILE_UNMAPPED_PAIRED_2, FILE_UNMAPPED_SINGLE, + FILE_IDS_NOT_MATCH_2, CONFIG, - RaiseError, - SubprocessError, - SOURCE_HUMAN, - SOURCE_FRUIT_FLY, + MAPPING, ) @@ -54,7 +47,10 @@ def test_init_required(self): CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = None CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetOrientation(config=CONFIG) + CONFIG.results.library_type = ResultsType() + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.library_type == ResultsType() assert test_instance.transcripts_file == FILE_TRANSCRIPTS @@ -64,7 +60,8 @@ def test_init_required_paired(self): CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.paths[1] == FILE_MATE_2 assert test_instance.library_type == ResultsType() @@ -76,14 +73,14 @@ def test_init_all(self, tmpdir): CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.paths[1] == FILE_MATE_2 assert test_instance.library_type == ResultsType() assert test_instance.library_source == ResultsSource() assert test_instance.transcripts_file == FILE_TRANSCRIPTS assert test_instance.tmp_dir == tmpdir - assert test_instance.threads_star == 1 assert test_instance.min_mapped_reads == 18 assert test_instance.min_fraction == 0.75 @@ -94,7 +91,9 @@ def test_evaluate_single_unmapped(self, tmpdir): CONFIG.args.path_1_processed = FILE_UNMAPPED_SINGLE CONFIG.args.path_2_processed = None CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.not_available, @@ -110,7 +109,9 @@ def test_evaluate_single_sf(self, tmpdir): file_2=Source(), ) CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_forward, @@ -122,7 +123,9 @@ def test_evaluate_single_sr(self, tmpdir): """Get read orientation for a single-end stranded reverse library.""" CONFIG.args.path_1_processed = FILE_ORIENTATION_SR CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_reverse, @@ -134,7 +137,9 @@ def test_evaluate_single_u(self, tmpdir): """Get read orientation for a single-end unstranded library.""" CONFIG.args.path_1_processed = FILE_ORIENTATION_U CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.unstranded, @@ -153,7 +158,9 @@ def test_evaluate_paired_unmapped(self, tmpdir): CONFIG.results.library_type = ResultsType( relationship=StatesTypeRelationship.split_mates, ) - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.not_available, @@ -174,7 +181,9 @@ def test_evaluate_paired_isf(self, tmpdir): ) CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_forward, @@ -187,7 +196,9 @@ def test_evaluate_paired_isr(self, tmpdir): CONFIG.args.path_1_processed = FILE_ORIENTATION_ISR_1 CONFIG.args.path_2_processed = FILE_ORIENTATION_ISR_2 CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_reverse, @@ -200,7 +211,9 @@ def test_evaluate_paired_iu(self, tmpdir): CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.unstranded, @@ -208,183 +221,21 @@ def test_evaluate_paired_iu(self, tmpdir): relationship=StatesOrientationRelationship.inward_unstranded, ) - def test_subset_transcripts_by_organism(self, tmpdir): - """Get filtered orgainsm transcripts for different organisms.""" - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.split_mates, - ) - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - results = test_instance.subset_transcripts_by_organism() - filtered_organisms_transcripts = \ - tmpdir / f"{CONFIG.results.library_source}.fasta" - assert results == filtered_organisms_transcripts - - def test_subset_transcripts_by_organism_file_problem(self, tmpdir): - """Pass dummy file as transcripts.fasta file to simulate - file problem.""" - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType() - CONFIG.results.library_source = ResultsSource() - CONFIG.args.t_file_processed = FILE_DUMMY - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - - with pytest.raises(FileProblem): - test_instance.subset_transcripts_by_organism() - - def test_subset_transcripts_by_organism_invalid_fasta(self, tmpdir): - """Pass invalid transcripts.fasta file to simulate index error.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY - ) - CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - results = test_instance.subset_transcripts_by_organism() - filtered_organisms_transcripts = \ - tmpdir / f"{CONFIG.results.library_source}.fasta" - assert results == filtered_organisms_transcripts - - def test_get_fasta_size(self, tmpdir): - """Get nucleotide statistics for filtererd transcripts - with different organisms.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY, - ) - CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.split_mates, - ) - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - filtered_organisms_transcripts = \ - test_instance.subset_transcripts_by_organism() - results = test_instance.get_fasta_size(filtered_organisms_transcripts) - assert results == 249986 - - def test_get_fasta_size_file_problem(self, tmpdir): - """Pass dummy file as filtered_organisms_transcripts - to simulate file problem.""" - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType() - CONFIG.results.library_source = ResultsSource() - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - with pytest.raises(FileProblem): - test_instance.get_fasta_size(FILE_DUMMY) - - def test_get_star_index_string_size(self, tmpdir): - """Get length of STAR SA pre-indexing string.""" - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - results = test_instance.get_star_index_string_size(249986) - assert results == 7 - - def test_evaluate_star_index_problem(self, monkeypatch, tmpdir): - """Force raising exception to stimulate a star problem.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY, - ) - CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.split_mates, - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - monkeypatch.setattr( - 'htsinfer.get_read_orientation.GetOrientation.create_star_index', - lambda *args, **kwargs: StarProblem - ) - with pytest.raises(StarProblem): - test_instance.evaluate() - - def test_prepare_star_alignment_commands(self, tmpdir): - """Get star alignment command.""" - CONFIG.args.path_1_processed = FILE_2000_RECORDS - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.not_mates, - ) - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=Source(), - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - index_dir = tmpdir / 'index' - file1_alignment_path = tmpdir / 'alignments/file_1' - cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \ - + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \ - + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \ - + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \ - + str(file1_alignment_path) + "/" - results = test_instance.prepare_star_alignment_commands( - index_dir=index_dir - ) - assert ' '.join(list(results.values())[0]) == cmd - - def test_generate_star_alignments_problem(self, monkeypatch, tmpdir): - """Force raising exception to simulate problem.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY, - ) - CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 - CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.not_mates, - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - sub_method_name = 'htsinfer.get_read_orientation.' + \ - 'GetOrientation.generate_star_alignments' - monkeypatch.setattr( - sub_method_name, - lambda *args, **kwargs: StarProblem, - ) - with pytest.raises(FileProblem): - test_instance.evaluate() - - def test_generate_star_alignments_dummy_cmd(self, tmpdir): - """Pass dummy cmd to force simulate star problem.""" - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType() - CONFIG.results.library_source = ResultsSource() - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - index_dir = tmpdir / 'index' - file1_alignment_path = tmpdir / 'alignments/file_1' - dummy_cmd = [ - 'STAR', '--alignIntrnMax', '1', - '--alignEndsType', 'Local', '--runThreadN', '1', - "--genomeDir", f"{str(index_dir)}", - ] - cmds = {file1_alignment_path: dummy_cmd} - with pytest.raises(StarProblem): - test_instance.generate_star_alignments(cmds) - - def test_process_single_dummy_sam_file(self, tmpdir): + def test_process_single_dummy_sam_file_file_problem(self, tmpdir): """Pass dummy aligned.out.sam file to simulate file problem.""" CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - with pytest.raises(SamFileProblem): + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) + with pytest.raises(FileProblem): test_instance.process_single(FILE_EMPTY_ALIGNED_SAM) def test_process_paired_dummy_sam_file(self, tmpdir): """Pass dummy aligned.out.sam file to simulate file problem.""" CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) with pytest.raises(FileProblem): test_instance.process_paired(FILE_EMPTY_ALIGNED_SAM) @@ -392,69 +243,31 @@ def test_process_paired_wrong_sam_file(self, tmpdir): """Pass bad_aligned.out.sam file to ensure correct paired file behaviour.""" CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.process_paired(FILE_BAD_ALIGNED_SAM) \ == ResultsOrientation() - def test_create_star_index_star_problem(self, tmpdir): - """Pass invalid transcripts path to simulate star problem.""" - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - transcripts_path = tmpdir / 'invalid' - with pytest.raises(StarProblem): - test_instance.create_star_index(transcripts_path) - def test_evaluate_paired_not_mates_unmapped(self, tmpdir): """Get read orientation for a paired-end library with no mappable reads. """ CONFIG.args.path_1_processed = FILE_UNMAPPED_PAIRED_1 - CONFIG.args.path_2_processed = FILE_UNMAPPED_PAIRED_2 + CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2 CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.not_mates, + relationship=StatesTypeRelationship.not_available, ) + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source(), + ) CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.not_available, - file_2=StatesOrientation.not_available, + file_2=StatesOrientation.stranded_reverse, relationship=StatesOrientationRelationship.not_available, ) - - def test_subset_transcripts_by_organism_cannot_write_file( - self, monkeypatch, tmpdir - ): - """Force raising of ``OSError`` to simulate file problem.""" - CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 - CONFIG.args.path_2_processed = None - CONFIG.results.library_source = ResultsSource() - CONFIG.results.library_type = ResultsType() - CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - monkeypatch.setattr( - 'Bio.SeqIO.write', - RaiseError(exc=OSError), - ) - with pytest.raises(FileProblem): - test_instance.subset_transcripts_by_organism() - - def test_generate_star_alignments_star_problem(self, monkeypatch, tmpdir): - """Force raising of ``SubprocessError`` to simulate star probelm.""" - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - file1_alignment_path = tmpdir / 'alignments/file_1' - dummy_cmd = [ - 'STAR', '--alignIntrnMax', '1', - '--alignEndsType', 'Local', '--runThreadN', '1', - "--genomeDir", - ] - cmds = {file1_alignment_path: dummy_cmd} - monkeypatch.setattr( - 'subprocess.run', - lambda *args, **kwargs: SubprocessError(), - ) - with pytest.raises(StarProblem): - test_instance.generate_star_alignments(cmds) diff --git a/tests/test_mapping.py b/tests/test_mapping.py new file mode 100644 index 00000000..33798bed --- /dev/null +++ b/tests/test_mapping.py @@ -0,0 +1,259 @@ +"""Unit tests for module ``mapping.py``.""" + +import pytest + +from htsinfer.exceptions import ( + FileProblem, + StarProblem, +) +from htsinfer.mapping import Mapping +from htsinfer.models import ( + ResultsSource, + ResultsType, + Source, + StatesTypeRelationship, +) +from tests.utils import ( + FILE_2000_RECORDS, + FILE_DUMMY, + FILE_INVALID_TRANSCRIPTS, + FILE_MATE_1, + FILE_MATE_2, + FILE_ORIENTATION_IU_1, + FILE_ORIENTATION_IU_2, + FILE_TRANSCRIPTS, + CONFIG, + MAPPING, + RaiseError, + SubprocessError, + SOURCE_HUMAN, + SOURCE_FRUIT_FLY, +) + + +class TestMapping: + """Test ``Mapping`` class.""" + + def test_init_required(self): + """Create instance with required parameters.""" + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = None + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.results.library_type = ResultsType() + test_instance = Mapping(config=CONFIG) + assert test_instance.paths[0] == FILE_MATE_1 + assert test_instance.library_type == ResultsType() + assert test_instance.transcripts_file == FILE_TRANSCRIPTS + + def test_init_required_paired(self): + """Create instance with required parameters for paired-end library.""" + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + test_instance = Mapping(config=CONFIG) + assert test_instance.paths[0] == FILE_MATE_1 + assert test_instance.paths[1] == FILE_MATE_2 + assert test_instance.library_type == ResultsType() + assert test_instance.transcripts_file == FILE_TRANSCRIPTS + + def test_init_all(self, tmpdir): + """Create instance with all available parameters.""" + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + assert test_instance.paths[0] == FILE_MATE_1 + assert test_instance.paths[1] == FILE_MATE_2 + assert test_instance.library_type == ResultsType() + assert test_instance.library_source == ResultsSource() + assert test_instance.transcripts_file == FILE_TRANSCRIPTS + assert test_instance.tmp_dir == tmpdir + + def test_subset_transcripts_by_organism(self, tmpdir): + """Get filtered orgainsm transcripts for different organisms.""" + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.split_mates, + ) + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY + ) + CONFIG.args.tmp_dir = tmpdir + MAPPING.mapped = False + test_instance = Mapping(config=CONFIG) + results = test_instance.subset_transcripts_by_organism() + filtered_organisms_transcripts = \ + tmpdir / "transcripts_subset.fasta" + assert results == filtered_organisms_transcripts + + def test_subset_transcripts_by_organism_file_problem(self, tmpdir): + """Pass dummy file as transcripts.fasta file to simulate + file problem.""" + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType() + CONFIG.results.library_source = ResultsSource() + CONFIG.args.t_file_processed = FILE_DUMMY + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + + with pytest.raises(FileProblem): + test_instance.subset_transcripts_by_organism() + + def test_subset_transcripts_by_organism_invalid_fasta(self, tmpdir): + """Pass invalid transcripts.fasta file to simulate index error.""" + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY + ) + CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + results = test_instance.subset_transcripts_by_organism() + filtered_organisms_transcripts = \ + tmpdir / "transcripts_subset.fasta" + assert results == filtered_organisms_transcripts + + def test_get_fasta_size(self, tmpdir): + """Get nucleotide statistics for filtererd transcripts + with different organisms.""" + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY, + ) + CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.split_mates, + ) + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + filtered_organisms_transcripts = \ + test_instance.subset_transcripts_by_organism() + results = test_instance.get_fasta_size(filtered_organisms_transcripts) + assert results == 249986 + + def test_get_fasta_size_file_problem(self, tmpdir): + """Pass dummy file as filtered_organisms_transcripts + to simulate file problem.""" + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType() + CONFIG.results.library_source = ResultsSource() + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + with pytest.raises(FileProblem): + test_instance.get_fasta_size(FILE_DUMMY) + + def test_get_star_index_string_size(self, tmpdir): + """Get length of STAR SA pre-indexing string.""" + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + results = test_instance.get_star_index_string_size(249986) + assert results == 7 + + def test_evaluate_star_index_problem(self, monkeypatch, tmpdir): + """Force raising exception to stimulate a star problem.""" + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY, + ) + CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.split_mates, + ) + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + monkeypatch.setattr( + 'htsinfer.mapping.Mapping.create_star_index', + lambda *args, **kwargs: StarProblem + ) + with pytest.raises(StarProblem): + test_instance.evaluate() + + def test_prepare_star_alignment_commands(self, tmpdir): + """Get star alignment command.""" + CONFIG.args.path_1_processed = FILE_2000_RECORDS + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.not_mates, + ) + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=Source(), + ) + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + index_dir = tmpdir / 'index' + file1_alignment_path = tmpdir / 'alignments/file_1' + cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \ + + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \ + + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \ + + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \ + + str(file1_alignment_path) + "/" + results = test_instance.prepare_star_alignment_commands( + index_dir=index_dir + ) + assert ' '.join(list(results.values())[0]) == cmd + + def test_generate_star_alignments_dummy_cmd(self, tmpdir): + """Pass dummy cmd to force simulate star problem.""" + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType() + CONFIG.results.library_source = ResultsSource() + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + index_dir = tmpdir / 'index' + file1_alignment_path = tmpdir / 'alignments/file_1' + dummy_cmd = [ + 'STAR', '--alignIntrnMax', '1', + '--alignEndsType', 'Local', '--runThreadN', '1', + "--genomeDir", f"{str(index_dir)}", + ] + cmds = {file1_alignment_path: dummy_cmd} + with pytest.raises(StarProblem): + test_instance.generate_star_alignments(cmds) + + def test_create_star_index_star_problem(self, tmpdir): + """Pass invalid transcripts path to simulate star problem.""" + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + transcripts_path = tmpdir / 'invalid' + with pytest.raises(StarProblem): + test_instance.create_star_index(transcripts_path) + + def test_subset_transcripts_by_organism_cannot_write_file( + self, monkeypatch, tmpdir + ): + """Force raising of ``OSError`` to simulate file problem.""" + CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 + CONFIG.args.path_2_processed = None + CONFIG.results.library_source = ResultsSource() + CONFIG.results.library_type = ResultsType() + CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + monkeypatch.setattr( + 'Bio.SeqIO.write', + RaiseError(exc=OSError), + ) + with pytest.raises(FileProblem): + test_instance.subset_transcripts_by_organism() + + def test_generate_star_alignments_star_problem(self, monkeypatch, tmpdir): + """Force raising of ``SubprocessError`` to simulate star probelm.""" + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + file1_alignment_path = tmpdir / 'alignments/file_1' + dummy_cmd = [ + 'STAR', '--alignIntrnMax', '1', + '--alignEndsType', 'Local', '--runThreadN', '1', + "--genomeDir", + ] + cmds = {file1_alignment_path: dummy_cmd} + monkeypatch.setattr( + 'subprocess.run', + lambda *args, **kwargs: SubprocessError(), + ) + with pytest.raises(StarProblem): + test_instance.generate_star_alignments(cmds) diff --git a/tests/utils.py b/tests/utils.py index 2c924d7a..dfa8b98c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,6 +4,7 @@ from typing import Type from htsinfer.models import (Source, Config, Args, Results) +from htsinfer.mapping import Mapping # test files PACKAGE_DIR = Path(__file__).resolve().parents[1] / "htsinfer" @@ -87,6 +88,8 @@ results=Results(), ) +MAPPING = Mapping(config=CONFIG) + # helper classes class SubprocessError: From 022d37d298c24d3172b28c42552cdd6990b60e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 31 Oct 2023 14:51:22 +0100 Subject: [PATCH 03/17] fix typo, update pylint config --- htsinfer/get_read_layout.py | 2 +- pylint.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/htsinfer/get_read_layout.py b/htsinfer/get_read_layout.py index 24529d3f..afd4c898 100644 --- a/htsinfer/get_read_layout.py +++ b/htsinfer/get_read_layout.py @@ -221,7 +221,7 @@ def evaluate(self) -> None: try: with open(self.path, encoding="utf-8") as _f: # type: ignore - LOGGER.debug("Procecssing Reads") + LOGGER.debug("Processing Reads") try: for record in FastqGeneralIterator(source=_f): read = record[1] diff --git a/pylint.cfg b/pylint.cfg index 96462c5d..ef923ddc 100644 --- a/pylint.cfg +++ b/pylint.cfg @@ -1,4 +1,4 @@ [MESSAGES CONTROL] -disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510 -extension-pkg-white-list=pysam,ahocorasick +disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510 +extension-pkg-whitelist=pysam,ahocorasick ignored-classes=pysam From 2bb2451f2b2f8c500a72e1f699e0154b24589050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 31 Oct 2023 15:06:12 +0100 Subject: [PATCH 04/17] feat: add org_id param #108 --- htsinfer/cli.py | 11 ------ htsinfer/get_library_source.py | 64 ++++++++++++++++++++++++++++++---- htsinfer/models.py | 4 +-- 3 files changed, 58 insertions(+), 21 deletions(-) diff --git a/htsinfer/cli.py b/htsinfer/cli.py index 256aeaa8..3dee2364 100644 --- a/htsinfer/cli.py +++ b/htsinfer/cli.py @@ -273,17 +273,6 @@ def __call__( "not be inferred by the application" ) ) - parser.add_argument( - '--org-name', - dest="org_name", - metavar="STR", - type=str, - default=None, - help=( - "source organism of the sequencing library; if provided, will not " - "not be inferred by the application" - ) - ) parser.add_argument( "--verbosity", choices=[e.name for e in LogLevels], diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index a07ecbc4..6c39d47b 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -2,10 +2,11 @@ import logging from pathlib import Path -from typing import Optional import subprocess as sp import tempfile +from typing import Optional +from Bio import SeqIO # type: ignore import pandas as pd # type: ignore from pandas import DataFrame # type: ignore @@ -51,6 +52,7 @@ class GetLibSource: min_freq_ratio: Minimum frequency ratio between the first and second most frequent source in order for the former to be considered the library's source. + org_id: Taxonomy ID of the organism. """ def __init__( # pylint: disable=E1101 self, @@ -64,7 +66,6 @@ def __init__( # pylint: disable=E1101 self.tmp_dir = config.args.tmp_dir self.min_match_pct = config.args.lib_source_min_match_pct self.min_freq_ratio = config.args.lib_source_min_freq_ratio - self.org_name = config.args.org_name self.org_id = config.args.org_id def evaluate(self) -> ResultsSource: @@ -75,9 +76,12 @@ def evaluate(self) -> ResultsSource: """ source = ResultsSource() # Check if library_source is provided, otherwise infer it - if self.org_name is not None: - source.file_1.short_name = self.org_name + if self.org_id is not None: source.file_1.taxon_id = self.org_id + source.file_1.short_name = self.get_organism_name( + self.org_id, + self.transcripts_file + ) else: # Infer library source here and set it to source.library_source index = self.create_kallisto_index() @@ -89,10 +93,11 @@ def evaluate(self) -> ResultsSource: source.file_1.taxon_id = library_source.taxon_id if self.paths[1] is not None: - # Check if library_source is provided for file_2, otherwise infer it - if self.org_name is not None: - source.file_2.short_name = self.org_name + # Check if library_source is provided for file_2, + # otherwise infer it + if self.org_id is not None: source.file_2.taxon_id = self.org_id + source.file_2.short_name = source.file_1.short_name else: library_source = self.get_source( fastq=self.paths[1], @@ -301,3 +306,48 @@ def get_source_expression( # return as dictionary return dat_agg.sort_values(["tpm"], ascending=False) + + @staticmethod + def get_organism_name( + taxon_id: int, + transcripts_file: Path, + ) -> Optional[str]: + """Return name of the organism, based on tax ID. + + Args: + taxon_id: Taxonomy ID of a given organism (int). + transcripts_file: Path to fasta file containing transcripts. + + Returns: + Short name of the organism belonging to the given tax ID. + + Raises: + Could not process input FASTA file. + """ + org_dict = {} + # Construct dictionary of organism ID's and names + try: + for record in SeqIO.parse( + handle=transcripts_file, + format='fasta', + ): + org_id = int(record.description.split("|")[4]) + org_name = record.description.split("|")[3] + + if org_id not in org_dict: + org_dict[org_id] = org_name + else: + org_dict[org_id] = org_name + + except OSError as exc: + raise FileProblem( + f"Could not process file '{transcripts_file}'" + ) from exc + + if taxon_id in org_dict: + return org_dict[taxon_id] + LOGGER.warning( + f"Taxon ID '{taxon_id}' not found in organism dictionary, " + "using default 'None' for organism." + ) + return None diff --git a/htsinfer/models.py b/htsinfer/models.py index 391d856b..a84ae35c 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -6,7 +6,7 @@ ) import logging import re -from typing import Optional, Union +from typing import Optional from pathlib import Path import tempfile @@ -356,7 +356,6 @@ class Args(BaseModel): records: Number of input file records to process; set to `0` to process all records. threads: Number of threads to run STAR with. - org_name: Organism name. org_id: Organism ID. transcripts_file: File path to transcripts FASTA file. read_layout_adapter_file: Path to text file containing 3' adapter @@ -431,7 +430,6 @@ class Args(BaseModel): CleanupRegimes.DEFAULT records: int = 0 threads: int = 1 - org_name: Optional[str] = None org_id: Optional[int] = None transcripts_file: Path = Path() read_layout_adapter_file: Path = Path() From 6656ff3c1bcffefe5a8a9b49b16c190a7d4e3b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 31 Oct 2023 16:40:37 +0100 Subject: [PATCH 05/17] refactor: get_library_source.py #108 --- htsinfer/get_library_source.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 6c39d47b..d42533a8 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -82,6 +82,9 @@ def evaluate(self) -> ResultsSource: self.org_id, self.transcripts_file ) + if self.paths[1] is not None: + source.file_2.taxon_id = self.org_id + source.file_2.short_name = source.file_1.short_name else: # Infer library source here and set it to source.library_source index = self.create_kallisto_index() @@ -92,13 +95,7 @@ def evaluate(self) -> ResultsSource: source.file_1.short_name = library_source.short_name source.file_1.taxon_id = library_source.taxon_id - if self.paths[1] is not None: - # Check if library_source is provided for file_2, - # otherwise infer it - if self.org_id is not None: - source.file_2.taxon_id = self.org_id - source.file_2.short_name = source.file_1.short_name - else: + if self.paths[1] is not None: library_source = self.get_source( fastq=self.paths[1], index=index, From 6b6bc589c9de13c1dd92942c8f0095e178a3bc4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Thu, 9 Nov 2023 13:39:18 +0100 Subject: [PATCH 06/17] test: add org param tests #108 --- tests/test_get_library_source.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py index 5b27e8a6..e8e01498 100644 --- a/tests/test_get_library_source.py +++ b/tests/test_get_library_source.py @@ -264,6 +264,52 @@ def test_evaluate_min_freq_ratio(self, tmpdir): file_2=Source() ) + def test_evaluate_org_id_not_none(self): + """Test when self.org_id is not None.""" + CONFIG.args.org_id = 7227 # An example taxon ID + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + test_instance = GetLibSource(config=CONFIG) + result = test_instance.evaluate() + + assert result.file_1.taxon_id == 7227 + assert result.file_1.short_name == "dmelanogaster" + + def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch): + """Test when self.org_id is None and self.paths[1] is not None.""" + CONFIG.args.org_id = None + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the get_source method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_source', + lambda *args, **kwargs: SOURCE_HUMAN, + ) + + result = test_instance.evaluate() + + assert result.file_2.taxon_id == SOURCE_HUMAN.taxon_id + assert result.file_2.short_name == SOURCE_HUMAN.short_name + + def test_evaluate_org_id_not_none_with_path_2(self, tmpdir): + """Test when self.org_id is not None and self.paths[1] is not None.""" + CONFIG.args.org_id = 7227 + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + result = test_instance.evaluate() + + assert result.file_2.taxon_id == 7227 + assert result.file_2.short_name == "dmelanogaster" + def test_create_kallisto_index_problem(self, tmpdir): """Pass invalid file as transcripts.fasta file to simulate KallistoProblem.""" @@ -276,3 +322,36 @@ def test_create_kallisto_index_problem(self, tmpdir): test_instance = GetLibSource(config=CONFIG) with pytest.raises(KallistoProblem): test_instance.create_kallisto_index() + + def test_get_organism_name_found(self): + """Test the function when the taxon_id + is found in the organism dictionary.""" + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + test_instance = GetLibSource(config=CONFIG) + taxon_id = 7227 + result = test_instance.get_organism_name( + taxon_id, CONFIG.args.t_file_processed + ) + assert result == "dmelanogaster" + + def test_get_organism_name_not_found(self): + """Test the function when the taxon_id + is not found in the organism dictionary.""" + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + test_instance = GetLibSource(config=CONFIG) + taxon_id = 12345 # A tax ID that doesn't exist in transcripts + result = test_instance.get_organism_name( + taxon_id, CONFIG.args.t_file_processed + ) + assert result is None + + def test_get_organism_name_file_problem(self): + """Test the function when there's a + file problem while processing the FASTA file.""" + CONFIG.args.t_file_processed = FILE_DUMMY + test_instance = GetLibSource(config=CONFIG) + taxon_id = 7227 + with pytest.raises(FileProblem): + test_instance.get_organism_name( + taxon_id, CONFIG.args.t_file_processed + ) From 6d359358f9288c59c35c7c117455c929662b32e2 Mon Sep 17 00:00:00 2001 From: balajtimate <51365402+balajtimate@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:12:15 +0100 Subject: [PATCH 07/17] fix: update Pydantic version (#146) * fix pydantic issues * fix: update pydantic version in envs * fix: pin sphinx-rtd-theme into env * fix: update readthedocs config --- .readthedocs.yaml | 2 +- environment-dev.yml | 5 +++-- environment.yml | 2 +- htsinfer/htsinfer.py | 3 +-- tests/test_htsinfer.py | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 14c41f2e..0922ec04 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,4 +13,4 @@ formats: - pdf - epub conda: - environment: environment.yml + environment: environment-dev.yml diff --git a/environment-dev.yml b/environment-dev.yml index 40f4f5f4..c97c611a 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -13,12 +13,13 @@ dependencies: - pandas >=1.3.5, <1.4.0 - pip >=20.2.3 - pyahocorasick >=1.4.0 - - pydantic >=1.8.1, <2 + - pydantic >=2, <3 - pylint >=2.4.4 - pysam >=0.16.0 - pytest >=6.1.0 - python >=3.8, <=3.10 + - python-semantic-release >=8 + - sphinx-rtd-theme - star >=2.7.6 - pip: - - python-semantic-release>=7.15.0 - -e . diff --git a/environment.yml b/environment.yml index 08e4fd28..273edbed 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,7 @@ dependencies: - pandas >=1.3.5, <1.4.0 - pip >=20.2.3 - pyahocorasick >=1.4.0 - - pydantic >=1.8.1, <2 + - pydantic >=2, <3 - pysam >=0.16.0 - python >=3.8, <=3.10 - star >=2.7.6 diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py index 9f2f6602..4967a4d9 100755 --- a/htsinfer/htsinfer.py +++ b/htsinfer/htsinfer.py @@ -315,8 +315,7 @@ def clean_up(self): def print(self): """Print results to STDOUT.""" sys.stdout.write( - self.config.results.json( + self.config.results.model_dump_json( indent=3, - sort_keys=False, ) + linesep ) diff --git a/tests/test_htsinfer.py b/tests/test_htsinfer.py index 0489023b..ea6e65f7 100644 --- a/tests/test_htsinfer.py +++ b/tests/test_htsinfer.py @@ -305,7 +305,7 @@ def test_clean_up_keep_none(self, tmpdir): def test_clean_up_keep_results(self, tmpdir): """Remove temporary data.""" arguments = Args(path_1=FILE_MATE_1, - out_dir=tmpdir, + out_dir=tmpdir.strpath, tmpdir=tmpdir, ) results = Results() @@ -323,8 +323,8 @@ def test_clean_up_keep_results(self, tmpdir): def test_clean_up_keep_all(self, tmpdir): """Remove no data.""" arguments = Args(path_1=FILE_MATE_1, - out_dir=tmpdir, - tmpdir=tmpdir, + out_dir=tmpdir.strpath, + tmpdir=tmpdir.strpath, ) results = Results() configs = Config( From 59ace859b5682e024c716432a7ea824259b9a2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Thu, 9 Nov 2023 16:09:22 +0100 Subject: [PATCH 08/17] update readme, gitignore --- .gitignore | 1 + README.md | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index a1f96fd6..c53c6f18 100644 --- a/.gitignore +++ b/.gitignore @@ -117,3 +117,4 @@ tests/.DS_Store results_htsinfer .snakemake tests/cluster_tests/results_sra_downloads +*.out diff --git a/README.md b/README.md index 6149a708..2e0dcaab 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ htsinfer [--output-directory PATH] [--library-type-mates-cutoff FLOAT] [--read-orientation-min-mapped-reads INT] [--read-orientation-min-fraction FLOAT] + [--org-id INT] [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}] [-h] [--version] PATH [PATH] From 7359932014dceac99eecdb322c5967b04ae94078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Thu, 9 Nov 2023 16:09:43 +0100 Subject: [PATCH 09/17] feat: infer org source if id not in dict #108 --- htsinfer/get_library_source.py | 43 ++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index d42533a8..4dd79c81 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -78,15 +78,37 @@ def evaluate(self) -> ResultsSource: # Check if library_source is provided, otherwise infer it if self.org_id is not None: source.file_1.taxon_id = self.org_id - source.file_1.short_name = self.get_organism_name( - self.org_id, - self.transcripts_file - ) - if self.paths[1] is not None: - source.file_2.taxon_id = self.org_id - source.file_2.short_name = source.file_1.short_name + org_name = self.get_organism_name(self.org_id, self.transcripts_file) + + if org_name is not None: + source.file_1.short_name = org_name + + if self.paths[1] is not None: + source.file_2.taxon_id = self.org_id + source.file_2.short_name = source.file_1.short_name + + else: + LOGGER.warning( + f"Taxon ID '{self.org_id}' not found in organism dictionary, " + "inferring source organism..." + ) + index = self.create_kallisto_index() + library_source = self.get_source( + fastq=self.paths[0], + index=index, + ) + source.file_1.short_name = library_source.short_name + source.file_1.taxon_id = library_source.taxon_id + + if self.paths[1] is not None: + library_source = self.get_source( + fastq=self.paths[1], + index=index, + ) + source.file_2.short_name = library_source.short_name + source.file_2.taxon_id = library_source.taxon_id + else: - # Infer library source here and set it to source.library_source index = self.create_kallisto_index() library_source = self.get_source( fastq=self.paths[0], @@ -343,8 +365,5 @@ def get_organism_name( if taxon_id in org_dict: return org_dict[taxon_id] - LOGGER.warning( - f"Taxon ID '{taxon_id}' not found in organism dictionary, " - "using default 'None' for organism." - ) + return None From fa1f3fc638fe4e5dac8bcacf1ece4982147ce7c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 10 Nov 2023 11:38:58 +0100 Subject: [PATCH 10/17] replace json with model_dump --- htsinfer/htsinfer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py index 4967a4d9..d29c89ae 100755 --- a/htsinfer/htsinfer.py +++ b/htsinfer/htsinfer.py @@ -85,7 +85,7 @@ def evaluate(self): self.get_library_stats() LOGGER.info( "Library stats determined: " - f"{self.config.results.library_stats.json()}" + f"{self.config.results.library_stats.model_dump_json()}" ) # determine library source @@ -93,7 +93,7 @@ def evaluate(self): self.config.results.library_source = self.get_library_source() LOGGER.info( "Library source determined: " - f"{self.config.results.library_source.json()}" + f"{self.config.results.library_source.model_dump_json()}" ) # determine library type @@ -106,7 +106,7 @@ def evaluate(self): LOGGER.warning(f"{type(exc).__name__}: {str(exc)}") LOGGER.info( "Library type determined: " - f"{self.config.results.library_type.json()}" + f"{self.config.results.library_type.model_dump_json()}" ) # determine read orientation @@ -119,7 +119,7 @@ def evaluate(self): LOGGER.warning(f"{type(exc).__name__}: {str(exc)}") LOGGER.info( "Read orientation determined: " - f"{self.config.results.read_orientation.json()}" + f"{self.config.results.read_orientation.model_dump_json()}" ) # determine read layout @@ -132,7 +132,7 @@ def evaluate(self): LOGGER.warning(f"{type(exc).__name__}: {str(exc)}") LOGGER.info( "Read layout determined: " - f"{self.config.results.read_layout.json()}" + f"{self.config.results.read_layout.model_dump_json()}" ) except FileProblem as exc: @@ -148,7 +148,7 @@ def evaluate(self): LOGGER.error(f"{type(exc).__name__}: {str(exc)}") # log results - LOGGER.info(f"Results: {self.config.results.json()}") + LOGGER.info(f"Results: {self.config.results.model_dump_json()}") def prepare_env(self): """Set up work environment.""" From fbd91b0b3706435e30ab79046f8230acdae1c783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 10 Nov 2023 13:24:52 +0100 Subject: [PATCH 11/17] feat: add org_id param #108 --- htsinfer/get_library_source.py | 11 ++-- tests/test_get_library_source.py | 90 ++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 4 deletions(-) diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 4dd79c81..b4d7b06a 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -78,8 +78,11 @@ def evaluate(self) -> ResultsSource: # Check if library_source is provided, otherwise infer it if self.org_id is not None: source.file_1.taxon_id = self.org_id - org_name = self.get_organism_name(self.org_id, self.transcripts_file) - + org_name = self.get_organism_name( + self.org_id, + self.transcripts_file + ) + if org_name is not None: source.file_1.short_name = org_name @@ -89,8 +92,8 @@ def evaluate(self) -> ResultsSource: else: LOGGER.warning( - f"Taxon ID '{self.org_id}' not found in organism dictionary, " - "inferring source organism..." + f"Taxon ID '{self.org_id}' not found in " + "organism dictionary, inferring source organism..." ) index = self.create_kallisto_index() library_source = self.get_source( diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py index e8e01498..5cd2df54 100644 --- a/tests/test_get_library_source.py +++ b/tests/test_get_library_source.py @@ -355,3 +355,93 @@ def test_get_organism_name_file_problem(self): test_instance.get_organism_name( taxon_id, CONFIG.args.t_file_processed ) + + def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir): + """Test when self.org_id is None.""" + CONFIG.args.org_id = None + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the create_kallisto_index method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.create_kallisto_index', + lambda *args, **kwargs: tmpdir / "kallisto.idx", + ) + + # Mock the get_source method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_source', + lambda *args, **kwargs: SOURCE_FRUIT_FLY, + ) + + result = test_instance.evaluate() + + assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name + + assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name + + def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir): + """Test when self.org_id is not None but org_name is not found.""" + CONFIG.args.org_id = 7227 + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the get_organism_name method to return None + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_organism_name', + lambda *args, **kwargs: None, + ) + + # Mock the create_kallisto_index method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.create_kallisto_index', + lambda *args, **kwargs: tmpdir / "kallisto.idx", + ) + + # Mock the get_source method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_source', + lambda *args, **kwargs: SOURCE_FRUIT_FLY, + ) + + result = test_instance.evaluate() + + assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name + + assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name + + def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir): + """Test when self.org_id is not None and org_name is found.""" + CONFIG.args.org_id = 7227 + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the get_organism_name method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_organism_name', + lambda *args, **kwargs: "dmelanogaster", + ) + + result = test_instance.evaluate() + + assert result.file_1.taxon_id == 7227 + assert result.file_1.short_name == "dmelanogaster" + + assert result.file_2.taxon_id == 7227 + assert result.file_2.short_name == "dmelanogaster" From a6d4a56d53b824e6f78a3aed25fe41bb7c6c81fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 10 Nov 2023 13:24:52 +0100 Subject: [PATCH 12/17] feat: add org_id param #108 --- htsinfer/get_library_source.py | 11 ++-- tests/test_get_library_source.py | 90 ++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 4 deletions(-) diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 4dd79c81..b4d7b06a 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -78,8 +78,11 @@ def evaluate(self) -> ResultsSource: # Check if library_source is provided, otherwise infer it if self.org_id is not None: source.file_1.taxon_id = self.org_id - org_name = self.get_organism_name(self.org_id, self.transcripts_file) - + org_name = self.get_organism_name( + self.org_id, + self.transcripts_file + ) + if org_name is not None: source.file_1.short_name = org_name @@ -89,8 +92,8 @@ def evaluate(self) -> ResultsSource: else: LOGGER.warning( - f"Taxon ID '{self.org_id}' not found in organism dictionary, " - "inferring source organism..." + f"Taxon ID '{self.org_id}' not found in " + "organism dictionary, inferring source organism..." ) index = self.create_kallisto_index() library_source = self.get_source( diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py index e8e01498..5cd2df54 100644 --- a/tests/test_get_library_source.py +++ b/tests/test_get_library_source.py @@ -355,3 +355,93 @@ def test_get_organism_name_file_problem(self): test_instance.get_organism_name( taxon_id, CONFIG.args.t_file_processed ) + + def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir): + """Test when self.org_id is None.""" + CONFIG.args.org_id = None + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the create_kallisto_index method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.create_kallisto_index', + lambda *args, **kwargs: tmpdir / "kallisto.idx", + ) + + # Mock the get_source method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_source', + lambda *args, **kwargs: SOURCE_FRUIT_FLY, + ) + + result = test_instance.evaluate() + + assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name + + assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name + + def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir): + """Test when self.org_id is not None but org_name is not found.""" + CONFIG.args.org_id = 7227 + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the get_organism_name method to return None + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_organism_name', + lambda *args, **kwargs: None, + ) + + # Mock the create_kallisto_index method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.create_kallisto_index', + lambda *args, **kwargs: tmpdir / "kallisto.idx", + ) + + # Mock the get_source method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_source', + lambda *args, **kwargs: SOURCE_FRUIT_FLY, + ) + + result = test_instance.evaluate() + + assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name + + assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id + assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name + + def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir): + """Test when self.org_id is not None and org_name is found.""" + CONFIG.args.org_id = 7227 + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.args.out_dir = tmpdir + test_instance = GetLibSource(config=CONFIG) + + # Mock the get_organism_name method to return a specific result + monkeypatch.setattr( + 'htsinfer.get_library_source.GetLibSource.get_organism_name', + lambda *args, **kwargs: "dmelanogaster", + ) + + result = test_instance.evaluate() + + assert result.file_1.taxon_id == 7227 + assert result.file_1.short_name == "dmelanogaster" + + assert result.file_2.taxon_id == 7227 + assert result.file_2.short_name == "dmelanogaster" From 8c879356a7009d000c6e52aa2afbbb4e0da5cca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 14 Nov 2023 14:38:37 +0100 Subject: [PATCH 13/17] refactor: replace org with tax-id --- README.md | 2 +- htsinfer/cli.py | 8 ++--- htsinfer/get_library_source.py | 51 +++++++++++++++----------------- htsinfer/models.py | 4 +-- tests/test_get_library_source.py | 36 +++++++++++----------- 5 files changed, 49 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 2e0dcaab..53e7267d 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ htsinfer [--output-directory PATH] [--library-type-mates-cutoff FLOAT] [--read-orientation-min-mapped-reads INT] [--read-orientation-min-fraction FLOAT] - [--org-id INT] + [--tax-id INT] [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}] [-h] [--version] PATH [PATH] diff --git a/htsinfer/cli.py b/htsinfer/cli.py index ff12a825..217d6ce8 100644 --- a/htsinfer/cli.py +++ b/htsinfer/cli.py @@ -263,14 +263,14 @@ def __call__( ) ) parser.add_argument( - '--org-id', - dest="org_id", + '--tax-id', + dest="tax_id", metavar="INT", type=int, default=None, help=( - "source organism of the sequencing library; if provided, will not " - "not be inferred by the application" + "NCBI taxonomic identifier of source organism of the library; " + "if provided, will not be inferred by the application" ) ) parser.add_argument( diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index b4d7b06a..3fb70f84 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -4,8 +4,8 @@ from pathlib import Path import subprocess as sp import tempfile - from typing import Optional + from Bio import SeqIO # type: ignore import pandas as pd # type: ignore from pandas import DataFrame # type: ignore @@ -52,7 +52,7 @@ class GetLibSource: min_freq_ratio: Minimum frequency ratio between the first and second most frequent source in order for the former to be considered the library's source. - org_id: Taxonomy ID of the organism. + tax_id: Taxonomy ID of the organism. """ def __init__( # pylint: disable=E1101 self, @@ -66,7 +66,7 @@ def __init__( # pylint: disable=E1101 self.tmp_dir = config.args.tmp_dir self.min_match_pct = config.args.lib_source_min_match_pct self.min_freq_ratio = config.args.lib_source_min_freq_ratio - self.org_id = config.args.org_id + self.tax_id = config.args.tax_id def evaluate(self) -> ResultsSource: """Infer read source. @@ -76,23 +76,23 @@ def evaluate(self) -> ResultsSource: """ source = ResultsSource() # Check if library_source is provided, otherwise infer it - if self.org_id is not None: - source.file_1.taxon_id = self.org_id - org_name = self.get_organism_name( - self.org_id, + if self.tax_id is not None: + source.file_1.taxon_id = self.tax_id + src_name = self.get_source_name( + self.tax_id, self.transcripts_file ) - if org_name is not None: - source.file_1.short_name = org_name + if src_name is not None: + source.file_1.short_name = src_name if self.paths[1] is not None: - source.file_2.taxon_id = self.org_id + source.file_2.taxon_id = self.tax_id source.file_2.short_name = source.file_1.short_name else: LOGGER.warning( - f"Taxon ID '{self.org_id}' not found in " + f"Taxon ID '{self.tax_id}' not found in " "organism dictionary, inferring source organism..." ) index = self.create_kallisto_index() @@ -330,15 +330,15 @@ def get_source_expression( return dat_agg.sort_values(["tpm"], ascending=False) @staticmethod - def get_organism_name( + def get_source_name( taxon_id: int, transcripts_file: Path, ) -> Optional[str]: - """Return name of the organism, based on tax ID. + """Return name of the source organism, based on tax ID. Args: - taxon_id: Taxonomy ID of a given organism (int). - transcripts_file: Path to fasta file containing transcripts. + taxon_id: Taxonomy ID of a given organism. + transcripts_file: Path to FASTA file containing transcripts. Returns: Short name of the organism belonging to the given tax ID. @@ -346,27 +346,24 @@ def get_organism_name( Raises: Could not process input FASTA file. """ - org_dict = {} - # Construct dictionary of organism ID's and names + src_dict = {} + # Construct dictionary of taxonomy ID's and short names try: - for record in SeqIO.parse( + for record in list(SeqIO.parse( handle=transcripts_file, format='fasta', - ): - org_id = int(record.description.split("|")[4]) - org_name = record.description.split("|")[3] + )): + tax_id = int(record.description.split("|")[4]) + src_name = record.description.split("|")[3] - if org_id not in org_dict: - org_dict[org_id] = org_name - else: - org_dict[org_id] = org_name + src_dict[tax_id] = src_name except OSError as exc: raise FileProblem( f"Could not process file '{transcripts_file}'" ) from exc - if taxon_id in org_dict: - return org_dict[taxon_id] + if taxon_id in src_dict: + return src_dict[taxon_id] return None diff --git a/htsinfer/models.py b/htsinfer/models.py index 58ed952f..8174ded7 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -356,7 +356,7 @@ class Args(BaseModel): records: Number of input file records to process; set to `0` to process all records. threads: Number of threads to run STAR with. - org_id: Organism ID. + tax_id: Organism ID. transcripts_file: File path to transcripts FASTA file. read_layout_adapter_file: Path to text file containing 3' adapter sequences to scan for (one sequence per line). @@ -430,7 +430,7 @@ class Args(BaseModel): CleanupRegimes.DEFAULT records: int = 1000000 threads: int = 1 - org_id: Optional[int] = None + tax_id: Optional[int] = None transcripts_file: Path = Path() read_layout_adapter_file: Path = Path() read_layout_min_match_pct: float = 0.1 diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py index 5cd2df54..6e4b58a6 100644 --- a/tests/test_get_library_source.py +++ b/tests/test_get_library_source.py @@ -264,9 +264,9 @@ def test_evaluate_min_freq_ratio(self, tmpdir): file_2=Source() ) - def test_evaluate_org_id_not_none(self): - """Test when self.org_id is not None.""" - CONFIG.args.org_id = 7227 # An example taxon ID + def test_evaluate_tax_id_not_none(self): + """Test when self.tax_id is not None.""" + CONFIG.args.tax_id = 7227 # An example taxon ID CONFIG.args.t_file_processed = FILE_TRANSCRIPTS test_instance = GetLibSource(config=CONFIG) result = test_instance.evaluate() @@ -274,9 +274,9 @@ def test_evaluate_org_id_not_none(self): assert result.file_1.taxon_id == 7227 assert result.file_1.short_name == "dmelanogaster" - def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch): - """Test when self.org_id is None and self.paths[1] is not None.""" - CONFIG.args.org_id = None + def test_evaluate_tax_id_none_with_path_2(self, tmpdir, monkeypatch): + """Test when self.tax_id is None and self.paths[1] is not None.""" + CONFIG.args.tax_id = None CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS @@ -295,9 +295,9 @@ def test_evaluate_org_id_none_with_path_2(self, tmpdir, monkeypatch): assert result.file_2.taxon_id == SOURCE_HUMAN.taxon_id assert result.file_2.short_name == SOURCE_HUMAN.short_name - def test_evaluate_org_id_not_none_with_path_2(self, tmpdir): - """Test when self.org_id is not None and self.paths[1] is not None.""" - CONFIG.args.org_id = 7227 + def test_evaluate_tax_id_not_none_with_path_2(self, tmpdir): + """Test when self.tax_id is not None and self.paths[1] is not None.""" + CONFIG.args.tax_id = 7227 CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS @@ -356,9 +356,9 @@ def test_get_organism_name_file_problem(self): taxon_id, CONFIG.args.t_file_processed ) - def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir): - """Test when self.org_id is None.""" - CONFIG.args.org_id = None + def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir): + """Test when self.tax_id is None.""" + CONFIG.args.tax_id = None CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS @@ -386,9 +386,9 @@ def test_evaluate_org_id_is_none(self, monkeypatch, tmpdir): assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name - def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir): - """Test when self.org_id is not None but org_name is not found.""" - CONFIG.args.org_id = 7227 + def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir): + """Test when self.tax_id is not None but src_name is not found.""" + CONFIG.args.tax_id = 7227 CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS @@ -422,9 +422,9 @@ def test_evaluate_org_id_not_none_no_org_name(self, monkeypatch, tmpdir): assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name - def test_evaluate_org_id_not_none_name_found(self, monkeypatch, tmpdir): - """Test when self.org_id is not None and org_name is found.""" - CONFIG.args.org_id = 7227 + def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir): + """Test when self.tax_id is not None and src_name is found.""" + CONFIG.args.tax_id = 7227 CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS From 652923daf4346b59f4ebbe61649853c1e3f1619e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 14 Nov 2023 17:28:48 +0100 Subject: [PATCH 14/17] refactor get_library_source --- htsinfer/exceptions.py | 6 +++++ htsinfer/get_library_source.py | 47 ++++++++++------------------------ 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/htsinfer/exceptions.py b/htsinfer/exceptions.py index e40278c3..f526841a 100644 --- a/htsinfer/exceptions.py +++ b/htsinfer/exceptions.py @@ -44,3 +44,9 @@ class TranscriptsFastaProblem(Exception): class CutadaptProblem(Exception): """Exception raised when running cutadapt commands.""" + + +class UnsupportedSampleSourceException(Exception): + """Exception raised when taxonomy ID is not found in the source + organism list. + """ diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 3fb70f84..25eecb51 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -4,7 +4,6 @@ from pathlib import Path import subprocess as sp import tempfile -from typing import Optional from Bio import SeqIO # type: ignore import pandas as pd # type: ignore @@ -14,6 +13,7 @@ FileProblem, KallistoProblem, TranscriptsFastaProblem, + UnsupportedSampleSourceException, ) from htsinfer.models import ( ResultsSource, @@ -82,34 +82,11 @@ def evaluate(self) -> ResultsSource: self.tax_id, self.transcripts_file ) + source.file_1.short_name = src_name - if src_name is not None: - source.file_1.short_name = src_name - - if self.paths[1] is not None: - source.file_2.taxon_id = self.tax_id - source.file_2.short_name = source.file_1.short_name - - else: - LOGGER.warning( - f"Taxon ID '{self.tax_id}' not found in " - "organism dictionary, inferring source organism..." - ) - index = self.create_kallisto_index() - library_source = self.get_source( - fastq=self.paths[0], - index=index, - ) - source.file_1.short_name = library_source.short_name - source.file_1.taxon_id = library_source.taxon_id - - if self.paths[1] is not None: - library_source = self.get_source( - fastq=self.paths[1], - index=index, - ) - source.file_2.short_name = library_source.short_name - source.file_2.taxon_id = library_source.taxon_id + if self.paths[1] is not None: + source.file_2.taxon_id = self.tax_id + source.file_2.short_name = source.file_1.short_name else: index = self.create_kallisto_index() @@ -333,7 +310,7 @@ def get_source_expression( def get_source_name( taxon_id: int, transcripts_file: Path, - ) -> Optional[str]: + ) -> str: """Return name of the source organism, based on tax ID. Args: @@ -344,10 +321,11 @@ def get_source_name( Short name of the organism belonging to the given tax ID. Raises: - Could not process input FASTA file. + FileProblem: Could not process input FASTA file. + UnsupportedSampleSourceException: Taxon ID is not supported. """ src_dict = {} - # Construct dictionary of taxonomy ID's and short names + try: for record in list(SeqIO.parse( handle=transcripts_file, @@ -363,7 +341,10 @@ def get_source_name( f"Could not process file '{transcripts_file}'" ) from exc - if taxon_id in src_dict: + try: return src_dict[taxon_id] - return None + except KeyError as exc: + raise UnsupportedSampleSourceException( + f'Taxon ID "{taxon_id}" is not supported by HTSinfer.' + ) from exc From fd29bc6b49bf812b296bec51a75a0ef97c1e5c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 14 Nov 2023 17:28:56 +0100 Subject: [PATCH 15/17] refactor get_library_source tests --- tests/test_get_library_source.py | 61 +++++++------------------------- 1 file changed, 13 insertions(+), 48 deletions(-) diff --git a/tests/test_get_library_source.py b/tests/test_get_library_source.py index 6e4b58a6..6539d0f1 100644 --- a/tests/test_get_library_source.py +++ b/tests/test_get_library_source.py @@ -5,7 +5,8 @@ from htsinfer.exceptions import ( FileProblem, KallistoProblem, - TranscriptsFastaProblem + TranscriptsFastaProblem, + UnsupportedSampleSourceException, ) from htsinfer.get_library_source import GetLibSource from htsinfer.models import ( @@ -323,36 +324,36 @@ def test_create_kallisto_index_problem(self, tmpdir): with pytest.raises(KallistoProblem): test_instance.create_kallisto_index() - def test_get_organism_name_found(self): + def test_get_source_name_found(self): """Test the function when the taxon_id is found in the organism dictionary.""" CONFIG.args.t_file_processed = FILE_TRANSCRIPTS test_instance = GetLibSource(config=CONFIG) taxon_id = 7227 - result = test_instance.get_organism_name( + result = test_instance.get_source_name( taxon_id, CONFIG.args.t_file_processed ) assert result == "dmelanogaster" - def test_get_organism_name_not_found(self): + def test_get_source_name_not_found(self): """Test the function when the taxon_id is not found in the organism dictionary.""" CONFIG.args.t_file_processed = FILE_TRANSCRIPTS test_instance = GetLibSource(config=CONFIG) taxon_id = 12345 # A tax ID that doesn't exist in transcripts - result = test_instance.get_organism_name( - taxon_id, CONFIG.args.t_file_processed - ) - assert result is None + with pytest.raises(UnsupportedSampleSourceException): + test_instance.get_source_name( + taxon_id, CONFIG.args.t_file_processed + ) - def test_get_organism_name_file_problem(self): + def test_get_source_name_file_problem(self): """Test the function when there's a file problem while processing the FASTA file.""" CONFIG.args.t_file_processed = FILE_DUMMY test_instance = GetLibSource(config=CONFIG) taxon_id = 7227 with pytest.raises(FileProblem): - test_instance.get_organism_name( + test_instance.get_source_name( taxon_id, CONFIG.args.t_file_processed ) @@ -386,42 +387,6 @@ def test_evaluate_tax_id_is_none(self, monkeypatch, tmpdir): assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name - def test_evaluate_tax_id_not_none_no_src_name(self, monkeypatch, tmpdir): - """Test when self.tax_id is not None but src_name is not found.""" - CONFIG.args.tax_id = 7227 - CONFIG.args.path_1_processed = FILE_MATE_1 - CONFIG.args.path_2_processed = FILE_MATE_2 - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - CONFIG.args.out_dir = tmpdir - test_instance = GetLibSource(config=CONFIG) - - # Mock the get_organism_name method to return None - monkeypatch.setattr( - 'htsinfer.get_library_source.GetLibSource.get_organism_name', - lambda *args, **kwargs: None, - ) - - # Mock the create_kallisto_index method to return a specific result - monkeypatch.setattr( - 'htsinfer.get_library_source.GetLibSource.create_kallisto_index', - lambda *args, **kwargs: tmpdir / "kallisto.idx", - ) - - # Mock the get_source method to return a specific result - monkeypatch.setattr( - 'htsinfer.get_library_source.GetLibSource.get_source', - lambda *args, **kwargs: SOURCE_FRUIT_FLY, - ) - - result = test_instance.evaluate() - - assert result.file_1.taxon_id == SOURCE_FRUIT_FLY.taxon_id - assert result.file_1.short_name == SOURCE_FRUIT_FLY.short_name - - assert result.file_2.taxon_id == SOURCE_FRUIT_FLY.taxon_id - assert result.file_2.short_name == SOURCE_FRUIT_FLY.short_name - def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir): """Test when self.tax_id is not None and src_name is found.""" CONFIG.args.tax_id = 7227 @@ -432,9 +397,9 @@ def test_evaluate_tax_id_not_none_name_found(self, monkeypatch, tmpdir): CONFIG.args.out_dir = tmpdir test_instance = GetLibSource(config=CONFIG) - # Mock the get_organism_name method to return a specific result + # Mock the get_source_name method to return a specific result monkeypatch.setattr( - 'htsinfer.get_library_source.GetLibSource.get_organism_name', + 'htsinfer.get_library_source.GetLibSource.get_source_name', lambda *args, **kwargs: "dmelanogaster", ) From a71d7573661dd082f63f922e571a50f10c6f2268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Tue, 14 Nov 2023 18:00:41 +0100 Subject: [PATCH 16/17] refactor: update models.py --- htsinfer/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htsinfer/models.py b/htsinfer/models.py index 8174ded7..239aa57e 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -356,7 +356,7 @@ class Args(BaseModel): records: Number of input file records to process; set to `0` to process all records. threads: Number of threads to run STAR with. - tax_id: Organism ID. + tax_id: Taxonomy ID of the source organism. transcripts_file: File path to transcripts FASTA file. read_layout_adapter_file: Path to text file containing 3' adapter sequences to scan for (one sequence per line). From c8a10e63f7b2172f35b6d5c076bf1cde61789c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 15 Nov 2023 13:41:57 +0100 Subject: [PATCH 17/17] refactor: fix typos --- htsinfer/exceptions.py | 4 +--- htsinfer/get_library_source.py | 2 +- htsinfer/models.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/htsinfer/exceptions.py b/htsinfer/exceptions.py index f526841a..eade2d89 100644 --- a/htsinfer/exceptions.py +++ b/htsinfer/exceptions.py @@ -47,6 +47,4 @@ class CutadaptProblem(Exception): class UnsupportedSampleSourceException(Exception): - """Exception raised when taxonomy ID is not found in the source - organism list. - """ + """Exception raised when taxonomy ID is not supported.""" diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 25eecb51..15a8c834 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -52,7 +52,7 @@ class GetLibSource: min_freq_ratio: Minimum frequency ratio between the first and second most frequent source in order for the former to be considered the library's source. - tax_id: Taxonomy ID of the organism. + tax_id: Taxonomy ID of the sample source. """ def __init__( # pylint: disable=E1101 self, diff --git a/htsinfer/models.py b/htsinfer/models.py index 239aa57e..8db42f6f 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -356,7 +356,7 @@ class Args(BaseModel): records: Number of input file records to process; set to `0` to process all records. threads: Number of threads to run STAR with. - tax_id: Taxonomy ID of the source organism. + tax_id: Taxonomy ID of the sample source. transcripts_file: File path to transcripts FASTA file. read_layout_adapter_file: Path to text file containing 3' adapter sequences to scan for (one sequence per line).