From 67bb5063eb9fc8d7e0841d40c73db5425dd3dbec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Juri=C4=8D?= <74237898+BorisYourich@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:22:58 +0200 Subject: [PATCH 1/2] refactor: avoid duplicate mappings (#131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Boris Jurič <499542@mail.muni.cz> Co-authored-by: Alex Kanitz --- .github/workflows/ci.yml | 35 +-- README.md | 10 +- environment-dev.yml | 25 +- environment.yml | 24 +- htsinfer/cli.py | 1 + htsinfer/get_library_type.py | 31 +-- htsinfer/get_read_orientation.py | 345 +------------------------- htsinfer/htsinfer.py | 4 + htsinfer/mapping.py | 378 +++++++++++++++++++++++++++++ setup.py | 2 +- tests/test_get_library_type.py | 49 ++-- tests/test_get_read_orientation.py | 291 ++++------------------ tests/test_mapping.py | 259 ++++++++++++++++++++ tests/utils.py | 3 + 14 files changed, 801 insertions(+), 656 deletions(-) create mode 100644 htsinfer/mapping.py create mode 100644 tests/test_mapping.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22c59e57..3993e32a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,21 +12,18 @@ jobs: steps: - name: check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: set up miniconda and env uses: conda-incubator/setup-miniconda@v2 with: - auto-update-conda: true + python-version: "3.9" mamba-version: "*" - channels: conda-forge,defaults - environment-file: environment.yml + auto-update-conda: true activate-environment: htsinfer + environment-file: environment-dev.yml auto-activate-base: false - - name: update env with dev packages - run: mamba env update --file environment-dev.yml - - name: display env info run: | conda info -a @@ -50,7 +47,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.7', '3.8', '3.9' ] + python-version: [ '3.8', '3.9', '3.10' ] name: unit-testing-Python-${{ matrix.python-version }} @@ -63,16 +60,12 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: python-version: ${{ matrix.python-version }} - auto-update-conda: true mamba-version: "*" - channels: conda-forge,defaults - environment-file: environment.yml + auto-update-conda: true activate-environment: htsinfer + environment-file: environment-dev.yml auto-activate-base: false - - name: update env with dev packages - run: mamba env update --file environment-dev.yml - - name: display env info run: | conda info -a @@ -100,29 +93,25 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.7', '3.8', '3.9' ] + python-version: [ '3.8', '3.9', '3.10' ] name: integration-testing-Python-${{ matrix.python-version }} steps: - name: check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: set up miniconda and env uses: conda-incubator/setup-miniconda@v2 with: python-version: ${{ matrix.python-version }} - auto-update-conda: true mamba-version: "*" - channels: conda-forge,defaults - environment-file: environment.yml + auto-update-conda: true activate-environment: htsinfer + environment-file: environment-dev.yml auto-activate-base: false - - name: update env with dev packages - run: mamba env update --file environment-dev.yml - - name: display env info run: | conda info -a @@ -171,4 +160,4 @@ jobs: run: | echo "Push indicator: ${{ steps.docker.outputs.push-indicator }}" echo "# Set to 'true' if image was pushed, empty string otherwise" - test "${{ steps.docker.outputs.push-indicator }}" == "true" \ No newline at end of file + test "${{ steps.docker.outputs.push-indicator }}" == "true" diff --git a/README.md b/README.md index 35f49184..6149a708 100644 --- a/README.md +++ b/README.md @@ -123,12 +123,14 @@ dependencies via [Conda][conda]: git clone https://github.com/zavolanlab/htsinfer cd htsinfer conda env create --file environment.yml -conda env update --file environment-dev.yml # optional: install development/testing dependencies +# Alternatively, to install with development dependencies, +# run the following instead +conda env create --file environment-dev.yml ``` -Note that creating the environment takes non-trivial time and it is strongly -recommended that you install [Mamba][mamba] and replace `conda` with `mamba` -in the previous commands. +> Note that creating the environment takes non-trivial time and it is strongly +> recommended that you install [Mamba][mamba] and replace `conda` with `mamba` +> in the previous command. Then, activate the `htsinfer` Conda environment with: diff --git a/environment-dev.yml b/environment-dev.yml index 8b243f0f..40f4f5f4 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -1,11 +1,24 @@ name: htsinfer channels: - - defaults + - conda-forge + - bioconda dependencies: - - coverage>=5.3 - - flake8>=3.8.4 - - mypy>=0.782 - - pylint>=2.4.4 - - pytest>=6.1.0 + - biopython >=1.78 + - coverage >=5.3 + - cutadapt >=3.5, <=4.2 + - flake8 >=3.8.4 + - kallisto >=0.46.1, <= 0.48.0 + - mypy >=0.782 + - numpy >=1.22, <1.25 + - pandas >=1.3.5, <1.4.0 + - pip >=20.2.3 + - pyahocorasick >=1.4.0 + - pydantic >=1.8.1, <2 + - pylint >=2.4.4 + - pysam >=0.16.0 + - pytest >=6.1.0 + - python >=3.8, <=3.10 + - star >=2.7.6 - pip: - python-semantic-release>=7.15.0 + - -e . diff --git a/environment.yml b/environment.yml index 54c4b8a8..08e4fd28 100644 --- a/environment.yml +++ b/environment.yml @@ -1,18 +1,18 @@ name: htsinfer channels: - - defaults - - bioconda - conda-forge + - bioconda dependencies: - - biopython>=1.78 - - kallisto>=0.46.1, <= 0.48.0 - - pandas>=1.0.5 - - pip>=20.2.3 - - pyahocorasick>=1.4.0 - - pydantic>=1.8.1, <2 - - pysam>=0.16.0 - - python>3.6, <3.10 - - star>=2.7.6 - - cutadapt>=3.5, <=4.2 + - biopython >=1.78 + - cutadapt >=3.5, <=4.2 + - kallisto >=0.46.1, <= 0.48.0 + - numpy >=1.22, <1.25 + - pandas >=1.3.5, <1.4.0 + - pip >=20.2.3 + - pyahocorasick >=1.4.0 + - pydantic >=1.8.1, <2 + - pysam >=0.16.0 + - python >=3.8, <=3.10 + - star >=2.7.6 - pip: - -e . diff --git a/htsinfer/cli.py b/htsinfer/cli.py index 3fea80d4..409e1438 100644 --- a/htsinfer/cli.py +++ b/htsinfer/cli.py @@ -50,6 +50,7 @@ def __call__( values, option_string=None, ) -> None: + assert isinstance(values, list) if len(values) > 2: parser.print_usage(file=sys.stderr) sys.stderr.write( diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 2edfd095..a2c03a02 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -21,9 +21,7 @@ SeqIdFormats, Config, ) -from htsinfer.get_read_orientation import ( - GetOrientation, -) +from htsinfer.mapping import Mapping LOGGER = logging.getLogger(__name__) @@ -62,6 +60,7 @@ class GetLibType: def __init__( self, config: Config, + mapping: Mapping, ): """Class constructor.""" self.path_1: Path = config.args.path_1_processed @@ -69,8 +68,7 @@ def __init__( self.library_source = config.results.library_source self.results: ResultsType = ResultsType() self.tmp_dir = config.args.tmp_dir - self.get_read_orientation: \ - GetOrientation = GetOrientation(config=config) + self.mapping = mapping self.max_distance = config.args.lib_type_max_distance self.cutoff = config.args.lib_type_mates_cutoff @@ -126,23 +124,24 @@ def _evaluate_mate_relationship( self.results.relationship = ( StatesTypeRelationship.split_mates ) + self.mapping.library_type.relationship = ( + StatesTypeRelationship.split_mates + ) else: - self.get_read_orientation.library_type.relationship \ + self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available - self.get_read_orientation.library_source = self.library_source - _ = self.get_read_orientation.evaluate() + self.mapping.library_source = self.library_source + self.mapping.evaluate() self._align_mates() def _align_mates(self): """Decide mate relationship by alignment.""" - alignment_1 = Path(self.tmp_dir) \ - / "alignments" / "file_1" / "Aligned.out.sam" - alignment_2 = Path(self.tmp_dir) \ - / "alignments" / "file_2" / "Aligned.out.sam" + alignment_1 = self.mapping.star_dirs[0] / 'Aligned.out.sam' + alignment_2 = self.mapping.star_dirs[1] / 'Aligned.out.sam' - samfile1 = pysam.AlignmentFile(alignment_1, 'r') - samfile2 = pysam.AlignmentFile(alignment_2, 'r') + samfile1 = pysam.AlignmentFile(str(alignment_1), 'r') + samfile2 = pysam.AlignmentFile(str(alignment_2), 'r') previous_seq_id1 = None previous_seq_id2 = None @@ -184,6 +183,10 @@ def _align_mates(self): self.results.relationship = ( StatesTypeRelationship.split_mates ) + self.mapping.library_type.relationship \ + = StatesTypeRelationship.split_mates + self.mapping.mapped = False + self.mapping.star_dirs = [] else: self.results.relationship = ( StatesTypeRelationship.not_mates diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index b923234a..d76b0922 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -2,26 +2,21 @@ from collections import defaultdict import logging -import math from pathlib import Path -import subprocess as sp from typing import (Any, DefaultDict, Dict, List) -from Bio import SeqIO # type: ignore import pysam # type: ignore from htsinfer.exceptions import ( FileProblem, - SamFileProblem, - StarProblem, ) from htsinfer.models import ( ResultsOrientation, StatesOrientation, StatesOrientationRelationship, - StatesTypeRelationship, Config, ) +from htsinfer.mapping import Mapping LOGGER = logging.getLogger(__name__) @@ -54,6 +49,7 @@ class GetOrientation: def __init__( self, config: Config, + mapping: Mapping, ): """Class contructor.""" self.paths = (config.args.path_1_processed, @@ -62,9 +58,9 @@ def __init__( self.library_source = config.results.library_source self.transcripts_file = config.args.t_file_processed self.tmp_dir = config.args.tmp_dir - self.threads_star = config.args.threads self.min_mapped_reads = config.args.read_orientation_min_mapped_reads self.min_fraction = config.args.read_orientation_min_fraction + self.mapping = mapping def evaluate(self) -> ResultsOrientation: """Infer read orientation. @@ -73,327 +69,16 @@ def evaluate(self) -> ResultsOrientation: Orientation results object. """ - # get transcripts for current organims - transcripts = self.subset_transcripts_by_organism() - ref_size = self.get_fasta_size(fasta=transcripts) - index_string_size = self.get_star_index_string_size(ref_size=ref_size) - chr_bin_bits = self.get_star_chr_bin_bits(ref_size=ref_size, - transcripts=transcripts) - - # generate STAR alignments - index_dir = self.create_star_index( - fasta=transcripts, - index_string_size=index_string_size, - chr_bin_bits=chr_bin_bits, - ) - star_cmds = self.prepare_star_alignment_commands(index_dir=index_dir) - self.generate_star_alignments(commands=star_cmds) - - # process alignments - star_dirs = [e for e in star_cmds if e is not None] - - return self.process_alignments(star_dirs=star_dirs) - - def subset_transcripts_by_organism(self) -> Path: - """Filter FASTA file of transcripts by current sources. - - The filtered file contains records from the indicated sources. - Typically, this is one source. However, for if two input files - were supplied that are originating from different sources (i.e., - not from a valid paired-ended library), it may be from two - different sources. If no source is supplied (because it could - not be inferred), no filtering is done. - - Returns: - Path to filtered FASTA file. - - Raises: - FileProblem: Could not open input/output FASTA file for - reading/writing. - """ - LOGGER.debug(f"Subsetting transcripts for: {self.library_source}") - - outfile = self.tmp_dir / f"{self.library_source}.fasta" - - def yield_filtered_seqs(): - """Generator yielding sequence records for specified sources. - - Yields: - Next FASTA sequence record of the specified sources. - - Raises: Could not process input FASTA file. - """ - sources = [] - if self.library_source.file_1.short_name is not None: - sources.append(self.library_source.file_1.short_name) - if self.library_source.file_2.short_name is not None: - sources.append(self.library_source.file_2.short_name) - try: - for record in SeqIO.parse( - handle=self.transcripts_file, - format='fasta', - ): - try: - org_name = record.description.split("|")[3] - except (ValueError, IndexError): - continue - if org_name in sources or len(sources) == 0: - yield record - - except OSError as exc: - raise FileProblem( - f"Could not process file '{self.transcripts_file}'" - ) from exc - - try: - SeqIO.write( - sequences=yield_filtered_seqs(), - handle=outfile, - format='fasta', - ) - except OSError as exc: - raise FileProblem( - f"Failed to write to FASTA file '{outfile}'" - ) from exc - - LOGGER.debug(f"Filtered transcripts file: {outfile}") - return outfile - - @staticmethod - def get_fasta_size(fasta: Path) -> int: - """Get size of FASTA file in total nucleotides. - - Args: - fasta: Path to FASTA file. - - Returns: - Total number of nucleotides of all records. - - Raises: - FileProblem: Could not open FASTA file for reading. - """ - nucleotides: int = 0 - - try: - for record in SeqIO.parse( - handle=fasta, - format='fasta', - ): - nucleotides += len(record.seq) - - except OSError as exc: - raise FileProblem( - f"Could not process file: {fasta}" - ) from exc - - LOGGER.debug(f"Size of reference: {nucleotides}") - return nucleotides - - @staticmethod - def get_star_index_string_size(ref_size: int) -> int: - """Get length of STAR SA pre-indexing string. - - Cf. - https://github.com/alexdobin/STAR/blob/51b64d4fafb7586459b8a61303e40beceeead8c0/doc/STARmanual.pdf - - Args: - ref_size: Size of genome/transcriptome reference in nucleotides. - - Returns: - Size (in nucleotides) of SA pre-indexing string. - """ - index_string_size = min( - 14, - int(math.floor(math.log2(ref_size) / 2 - 1)) - ) - LOGGER.debug(f"STAR SA pre-indexing string size: {index_string_size}") - return index_string_size - - @staticmethod - def get_star_chr_bin_bits(ref_size: int, transcripts: Path) -> int: - """Get size of bins for STAR genome storage. + self.mapping.paths = self.paths + self.mapping.library_type = self.library_type + self.mapping.library_source = self.library_source + self.mapping.transcripts_file = self.transcripts_file + self.mapping.tmp_dir = self.tmp_dir - Args: - ref_size: Size of genome/transcriptome reference in nucleotides. - transcripts: Path to filtered FASTA transcripts file. - - Returns: - Number of bins for genome storage. - """ - n_ref: int = 0 - - for _ in SeqIO.parse( - handle=transcripts, - format='fasta', - ): - n_ref += 1 - - chr_bin_bits = min( - 18, - int(round(math.log2(ref_size / n_ref))) - ) - LOGGER.debug("STAR size of bins for genome storage: %s", chr_bin_bits) - return chr_bin_bits - - def create_star_index( - self, - fasta: Path, - chr_bin_bits: int = 18, - index_string_size: int = 5, - ) -> Path: - """Prepare STAR index. - - Args: - fasta: Path to FASTA file of sequence records to create index from. - index_string_size: Size of SA pre-indexing string, in nucleotides. - - Returns: - Path to directory containing STAR index. - - Raises: - StarProblem: STAR index could not be created. - """ - LOGGER.debug(f"Creating STAR index for: {fasta}") - - index_dir: Path = Path(self.tmp_dir) / "index" - - # solves the macOS issue with STAR - index_dir.mkdir(parents=True, exist_ok=True) - - cmd = [ - "STAR", - "--runMode", "genomeGenerate", - "--genomeSAindexNbases", f"{str(index_string_size)}", - "--genomeChrBinNbits", f"{str(chr_bin_bits)}", - "--runThreadN", f"{str(self.threads_star)}", - "--genomeDir", f"{str(index_dir)}", - "--genomeFastaFiles", f"{str(fasta)}", - ] - - result = sp.run( - cmd, - capture_output=True, - text=True, - ) - if result.returncode != 0: - LOGGER.error(result.stderr) - raise StarProblem("Failed to create STAR index") + if not self.mapping.mapped: + self.mapping.evaluate() - LOGGER.debug(f"STAR index created: {index_dir}") - return index_dir - - def prepare_star_alignment_commands( - self, - index_dir: Path, - ) -> Dict[Path, List[str]]: - """Prepare STAR alignment commands. - - Args: - index_dir: Path to directory containing STAR index. - - Returns: - Dictionary of output paths and corresponding STAR commands. - """ - LOGGER.debug("Preparing STAR commands...") - - # helper function for compiling individual command - def build_star_command( - read_files: List[str], - out_dir: str, - ) -> List[str]: - """Compile an individual STAR alignment command. - - Args: - read_files: List of read file paths. - out_dir: STAR output directory. - - Returns: - STAR command list. - """ - cmd_base: List[str] = [ - "STAR", - "--alignIntronMax", "1", - "--alignEndsType", "Local", - "--runThreadN", f"{str(self.threads_star)}", - "--genomeDir", f"{str(index_dir)}", - "--outFilterMultimapNmax", "50", - "--outSAMunmapped", "Within", "KeepPairs", - ] - cmd: List[str] = cmd_base[:] - cmd.append("--readFilesIn") - cmd.extend(read_files) - cmd.append("--outFileNamePrefix") - cmd.append(out_dir) - - # solves the macOS issue with STAR - Path(out_dir).mkdir(parents=True, exist_ok=True) - - return cmd - - out_dir_base: Path = Path(self.tmp_dir) / "alignments" - commands: Dict = {} - - # create command for pairend-ended libraries - if ( - self.library_type.relationship - == StatesTypeRelationship.split_mates - ): - out_dir = out_dir_base / "paired" - commands[out_dir] = build_star_command( - read_files=[str(path) for path in self.paths], - out_dir=f"{str(out_dir)}/", - ) - # create commands for single-ended libraries - else: - out_dir = out_dir_base / "file_1" - commands[out_dir] = build_star_command( - read_files=[str(self.paths[0])], - out_dir=f"{str(out_dir)}/", - ) - # run two commands in case there is a second file provided that is - # not a mate of the first one - out_dir = out_dir_base / "file_2" - if self.paths[1] is not None: - commands[out_dir] = build_star_command( - read_files=[str(self.paths[1])], - out_dir=f"{str(out_dir)}/", - ) - - return commands - - @staticmethod - def generate_star_alignments(commands: Dict[Path, List[str]]) -> None: - """Align reads to index with STAR. - - Args: - commands: Dictionary of output paths and corresponding STAR - commands. - - Raises: - StarProblem: Generating alignments failed. - """ - LOGGER.debug("Aligning reads with STAR...") - - # execute commands - for out_dir, cmd in commands.items(): - try: - result = sp.run( - cmd, - capture_output=True, - text=True, - check=True, - ) - if result.returncode != 0: - LOGGER.error(result.stderr) - raise StarProblem( - "Failed to generate STAR alignments for command: " - f"{cmd}" - ) - except sp.CalledProcessError as exc: - raise StarProblem( - f"Failed to generate STAR alignments for command: {cmd}" - ) from exc - LOGGER.debug(f"Written STAR output to directory: {out_dir}") + return self.process_alignments(star_dirs=self.mapping.star_dirs) def process_alignments( self, @@ -459,17 +144,11 @@ def process_single( states[record.query_name].append( StatesOrientation.stranded_forward ) - - except OSError as exc: + except (OSError, ValueError) as exc: raise FileProblem( f"Failed to open SAM file: '{sam}'" ) from exc - except ValueError as exc: - raise SamFileProblem( - f"Not a valid SAM file: '{sam}'" - ) from exc - LOGGER.debug("Deciding read orientation...") reads = len(states) fractions = [ diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py index 28dd3bf8..9f2f6602 100755 --- a/htsinfer/htsinfer.py +++ b/htsinfer/htsinfer.py @@ -27,6 +27,7 @@ Config, ) from htsinfer.subset_fastq import SubsetFastq +from htsinfer.mapping import Mapping LOGGER = logging.getLogger(__name__) @@ -64,6 +65,7 @@ def __init__( else config.args.tmp_dir / config.args.transcripts_file.name ) self.state: RunStates = RunStates.OKAY + self.mapping: Mapping = Mapping(config=self.config) def evaluate(self): """Determine library metadata.""" @@ -247,6 +249,7 @@ def get_library_type(self): """Determine library type.""" get_lib_type = GetLibType( config=self.config, + mapping=self.mapping, ) get_lib_type.evaluate() self.config.results.library_type = get_lib_type.results @@ -255,6 +258,7 @@ def get_read_orientation(self): """Determine read orientation.""" get_read_orientation = GetOrientation( config=self.config, + mapping=self.mapping, ) self.config.results.read_orientation = get_read_orientation.evaluate() diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py new file mode 100644 index 00000000..cb53f7a5 --- /dev/null +++ b/htsinfer/mapping.py @@ -0,0 +1,378 @@ +"""Mapping FASTQ's and managing the outputs of STAR.""" + +import logging +import math +from pathlib import Path +import subprocess as sp +from typing import (Dict, List) + +from Bio import SeqIO # type: ignore + +from htsinfer.exceptions import ( + FileProblem, + StarProblem, +) +from htsinfer.models import ( + Config, + StatesTypeRelationship, +) + +LOGGER = logging.getLogger(__name__) + + +class Mapping: + """Map FASTQ file/s and manage outputs. + + Args: + path: Path to FASTQ file. + + Attributes: + path_1: Path to single-end library or first mate file. + path_2: Path to second mate file. + + Raise: + FileProblem: The input file could not be parsed or the output file + could not be written. + """ + + def __init__( + self, + config: Config, + ): + """Class contructor.""" + self.paths = (config.args.path_1_processed, + config.args.path_2_processed) + self.library_type = config.results.library_type + self.library_source = config.results.library_source + self.transcripts_file = config.args.t_file_processed + self.tmp_dir = config.args.tmp_dir + self.threads_star = config.args.threads + self.mapped = False + self.star_dirs: List[Path] = [] + + def evaluate(self): + """Infer read orientation. + + Returns: + Orientation results object. + """ + + # get transcripts for current organims + transcripts = self.subset_transcripts_by_organism() + ref_size = self.get_fasta_size(fasta=transcripts) + index_string_size = self.get_star_index_string_size(ref_size=ref_size) + chr_bin_bits = self.get_star_chr_bin_bits(ref_size=ref_size, + transcripts=transcripts) + + # generate STAR alignments + index_dir = self.create_star_index( + fasta=transcripts, + index_string_size=index_string_size, + chr_bin_bits=chr_bin_bits, + ) + star_cmds = self.prepare_star_alignment_commands(index_dir=index_dir) + self.generate_star_alignments(commands=star_cmds) + # process alignments + self.star_dirs = [e for e in star_cmds if e is not None] + self.mapped = True + + def subset_transcripts_by_organism(self) -> Path: + """Filter FASTA file of transcripts by current sources. + + The filtered file contains records from the indicated sources. + Typically, this is one source. However, for if two input files + were supplied that are originating from different sources (i.e., + not from a valid paired-ended library), it may be from two + different sources. If no source is supplied (because it could + not be inferred), no filtering is done. + + Returns: + Path to filtered FASTA file. + + Raises: + FileProblem: Could not open input/output FASTA file for + reading/writing. + """ + LOGGER.debug(f"Subsetting transcripts for: {self.library_source}") + + outfile = self.tmp_dir / "transcripts_subset.fasta" + + def yield_filtered_seqs(): + """Generator yielding sequence records for specified sources. + + Yields: + Next FASTA sequence record of the specified sources. + + Raises: Could not process input FASTA file. + """ + sources = [] + if self.library_source.file_1.short_name is not None: + sources.append(self.library_source.file_1.short_name) + if self.library_source.file_2.short_name is not None: + sources.append(self.library_source.file_2.short_name) + try: + for record in SeqIO.parse( + handle=self.transcripts_file, + format='fasta', + ): + try: + org_name = record.description.split("|")[3] + except (ValueError, IndexError): + continue + if org_name in sources or len(sources) == 0: + yield record + + except OSError as exc: + raise FileProblem( + f"Could not process file '{self.transcripts_file}'" + ) from exc + + try: + SeqIO.write( + sequences=yield_filtered_seqs(), + handle=outfile, + format='fasta', + ) + except OSError as exc: + raise FileProblem( + f"Failed to write to FASTA file '{outfile}'" + ) from exc + + LOGGER.debug(f"Filtered transcripts file: {outfile}") + return outfile + + @staticmethod + def get_fasta_size(fasta: Path) -> int: + """Get size of FASTA file in total nucleotides. + + Args: + fasta: Path to FASTA file. + + Returns: + Total number of nucleotides of all records. + + Raises: + FileProblem: Could not open FASTA file for reading. + """ + nucleotides: int = 0 + + try: + for record in SeqIO.parse( + handle=fasta, + format='fasta', + ): + nucleotides += len(record.seq) + + except OSError as exc: + raise FileProblem( + f"Could not process file: {fasta}" + ) from exc + + LOGGER.debug(f"Size of reference: {nucleotides}") + return nucleotides + + @staticmethod + def get_star_index_string_size(ref_size: int) -> int: + """Get length of STAR SA pre-indexing string. + + Cf. + https://github.com/alexdobin/STAR/blob/51b64d4fafb7586459b8a61303e40beceeead8c0/doc/STARmanual.pdf + + Args: + ref_size: Size of genome/transcriptome reference in nucleotides. + + Returns: + Size (in nucleotides) of SA pre-indexing string. + """ + index_string_size = min( + 14, + int(math.floor(math.log2(ref_size) / 2 - 1)) + ) + LOGGER.debug(f"STAR SA pre-indexing string size: {index_string_size}") + return index_string_size + + @staticmethod + def get_star_chr_bin_bits(ref_size: int, transcripts: Path) -> int: + """Get size of bins for STAR genome storage. + + Args: + ref_size: Size of genome/transcriptome reference in nucleotides. + transcripts: Path to filtered FASTA transcripts file. + + Returns: + Number of bins for genome storage. + """ + n_ref: int = 0 + + for _ in SeqIO.parse( + handle=transcripts, + format='fasta', + ): + n_ref += 1 + + chr_bin_bits = min( + 18, + int(round(math.log2(ref_size / n_ref))) + ) + LOGGER.debug("STAR size of bins for genome storage: %s", chr_bin_bits) + return chr_bin_bits + + def create_star_index( + self, + fasta: Path, + chr_bin_bits: int = 18, + index_string_size: int = 5, + ) -> Path: + """Prepare STAR index. + + Args: + fasta: Path to FASTA file of sequence records to create index from. + index_string_size: Size of SA pre-indexing string, in nucleotides. + + Returns: + Path to directory containing STAR index. + + Raises: + StarProblem: STAR index could not be created. + """ + LOGGER.debug(f"Creating STAR index for: {fasta}") + + index_dir: Path = Path(self.tmp_dir) / "index" + + # solves the macOS issue with STAR + index_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "STAR", + "--runMode", "genomeGenerate", + "--genomeSAindexNbases", f"{str(index_string_size)}", + "--genomeChrBinNbits", f"{str(chr_bin_bits)}", + "--runThreadN", f"{str(self.threads_star)}", + "--genomeDir", f"{str(index_dir)}", + "--genomeFastaFiles", f"{str(fasta)}", + ] + + result = sp.run( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + LOGGER.error(result.stderr) + raise StarProblem("Failed to create STAR index") + + LOGGER.debug(f"STAR index created: {index_dir}") + return index_dir + + def prepare_star_alignment_commands( + self, + index_dir: Path, + ) -> Dict[Path, List[str]]: + """Prepare STAR alignment commands. + + Args: + index_dir: Path to directory containing STAR index. + + Returns: + Dictionary of output paths and corresponding STAR commands. + """ + LOGGER.debug("Preparing STAR commands...") + + # helper function for compiling individual command + def build_star_command( + read_files: List[str], + out_dir: str, + ) -> List[str]: + """Compile an individual STAR alignment command. + + Args: + read_files: List of read file paths. + out_dir: STAR output directory. + + Returns: + STAR command list. + """ + cmd_base: List[str] = [ + "STAR", + "--alignIntronMax", "1", + "--alignEndsType", "Local", + "--runThreadN", f"{str(self.threads_star)}", + "--genomeDir", f"{str(index_dir)}", + "--outFilterMultimapNmax", "50", + "--outSAMunmapped", "Within", "KeepPairs", + ] + cmd: List[str] = cmd_base[:] + cmd.append("--readFilesIn") + cmd.extend(read_files) + cmd.append("--outFileNamePrefix") + cmd.append(out_dir) + + # solves the macOS issue with STAR + Path(out_dir).mkdir(parents=True, exist_ok=True) + + return cmd + + out_dir_base: Path = Path(self.tmp_dir) / "alignments" + commands: Dict = {} + + # create command for pairend-ended libraries + if ( + self.library_type.relationship + == StatesTypeRelationship.split_mates + ): + out_dir = out_dir_base / "paired" + commands[out_dir] = build_star_command( + read_files=[str(path) for path in self.paths], + out_dir=f"{str(out_dir)}/", + ) + # create commands for single-ended libraries + else: + out_dir = out_dir_base / "file_1" + commands[out_dir] = build_star_command( + read_files=[str(self.paths[0])], + out_dir=f"{str(out_dir)}/", + ) + # run two commands in case there is a second file provided that is + # not a mate of the first one + out_dir = out_dir_base / "file_2" + if self.paths[1] is not None: + commands[out_dir] = build_star_command( + read_files=[str(self.paths[1])], + out_dir=f"{str(out_dir)}/", + ) + + return commands + + @staticmethod + def generate_star_alignments(commands: Dict[Path, List[str]]) -> None: + """Align reads to index with STAR. + + Args: + commands: Dictionary of output paths and corresponding STAR + commands. + + Raises: + StarProblem: Generating alignments failed. + """ + LOGGER.debug("Aligning reads with STAR...") + + # execute commands + for out_dir, cmd in commands.items(): + try: + result = sp.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + if result.returncode != 0: + LOGGER.error(result.stderr) + raise StarProblem( + "Failed to generate STAR alignments for command: " + f"{cmd}" + ) + except sp.CalledProcessError as exc: + raise StarProblem( + f"Failed to generate STAR alignments for command: {cmd}" + ) from exc + LOGGER.debug(f"Written STAR output to directory: {out_dir}") diff --git a/setup.py b/setup.py index 6b037f1e..ed2144a3 100644 --- a/setup.py +++ b/setup.py @@ -26,9 +26,9 @@ "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Natural Language :: English", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Utilities", ], diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index c738d8fd..dd52dfaa 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -38,6 +38,7 @@ SEQ_ID_MATE_2, SEQ_ID_SINGLE, CONFIG, + MAPPING, ) @@ -47,20 +48,23 @@ class TestGetLibType: def test_init_required(self): """Create instance with required parameters.""" CONFIG.args.path_2_processed = None - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) assert test_instance.path_1 == FILE_MATE_1 def test_init_all(self): """Create instance with all available parameters.""" CONFIG.args.path_2_processed = FILE_MATE_2 - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) assert test_instance.path_1 == FILE_MATE_1 assert test_instance.path_2 == FILE_MATE_2 def test_evaluate_one_file(self): """Get library type for a single file.""" CONFIG.args.path_2_processed = None - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.evaluate() assert test_instance.results == ResultsType( file_1=StatesType.first_mate, @@ -71,7 +75,8 @@ def test_evaluate_one_file(self): def test_evaluate_two_files(self): """Get library type for two files.""" CONFIG.args.path_2_processed = FILE_MATE_2 - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.evaluate() assert test_instance.results == ResultsType( file_1=StatesType.first_mate, @@ -83,7 +88,8 @@ def test_evaluate_mate_relationship_split_mates(self): """Test mate relationship evaluation logic with input files being mates of a paired-end library. """ - test_instance = GetLibType(config=CONFIG) + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.results.file_1 = StatesType.first_mate test_instance.results.file_2 = StatesType.second_mate test_instance._evaluate_mate_relationship( @@ -105,43 +111,38 @@ def test_evaluate_mate_relationship_split_mates(self): StatesTypeRelationship.split_mates ) - def test_evaluate_mate_relationship_not_mates(self): + def test_evaluate_mate_relationship_not_mates(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. """ CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetLibType(config=CONFIG) + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.results.file_1 = StatesType.first_mate test_instance.results.file_2 = StatesType.second_mate - test_instance._evaluate_mate_relationship( - ids_1=["A", "B", "C"], - ids_2=["C", "B", "A"], - ) - assert ( - test_instance.results.relationship == - StatesTypeRelationship.not_mates - ) - test_instance.results.file_1 = StatesType.single - test_instance.results.file_2 = StatesType.first_mate - test_instance._evaluate_mate_relationship( - ids_1=["A", "B", "C"], - ids_2=["A", "B", "C"], - ) + test_instance.evaluate() assert ( test_instance.results.relationship == StatesTypeRelationship.not_mates ) - def test_evaluate_split_mates_not_matching_ids(self): + def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. """ CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2 - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetLibType(config=CONFIG) + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2) + MAPPING.tmp_dir = tmpdir + test_instance = GetLibType(config=CONFIG, + mapping=MAPPING) test_instance.evaluate() assert ( test_instance.results.relationship == diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py index 75a2a25d..10894334 100644 --- a/tests/test_get_read_orientation.py +++ b/tests/test_get_read_orientation.py @@ -4,8 +4,6 @@ from htsinfer.exceptions import ( FileProblem, - SamFileProblem, - StarProblem, ) from htsinfer.get_read_orientation import GetOrientation from htsinfer.models import ( @@ -18,11 +16,8 @@ StatesTypeRelationship, ) from tests.utils import ( - FILE_2000_RECORDS, - FILE_DUMMY, FILE_EMPTY_ALIGNED_SAM, FILE_BAD_ALIGNED_SAM, - FILE_INVALID_TRANSCRIPTS, FILE_MATE_1, FILE_MATE_2, FILE_ORIENTATION_ISF_1, @@ -38,11 +33,9 @@ FILE_UNMAPPED_PAIRED_1, FILE_UNMAPPED_PAIRED_2, FILE_UNMAPPED_SINGLE, + FILE_IDS_NOT_MATCH_2, CONFIG, - RaiseError, - SubprocessError, - SOURCE_HUMAN, - SOURCE_FRUIT_FLY, + MAPPING, ) @@ -54,7 +47,10 @@ def test_init_required(self): CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = None CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetOrientation(config=CONFIG) + CONFIG.results.library_type = ResultsType() + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.library_type == ResultsType() assert test_instance.transcripts_file == FILE_TRANSCRIPTS @@ -64,7 +60,8 @@ def test_init_required_paired(self): CONFIG.args.path_1_processed = FILE_MATE_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.paths[1] == FILE_MATE_2 assert test_instance.library_type == ResultsType() @@ -76,14 +73,14 @@ def test_init_all(self, tmpdir): CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.paths[1] == FILE_MATE_2 assert test_instance.library_type == ResultsType() assert test_instance.library_source == ResultsSource() assert test_instance.transcripts_file == FILE_TRANSCRIPTS assert test_instance.tmp_dir == tmpdir - assert test_instance.threads_star == 1 assert test_instance.min_mapped_reads == 18 assert test_instance.min_fraction == 0.75 @@ -94,7 +91,9 @@ def test_evaluate_single_unmapped(self, tmpdir): CONFIG.args.path_1_processed = FILE_UNMAPPED_SINGLE CONFIG.args.path_2_processed = None CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.not_available, @@ -110,7 +109,9 @@ def test_evaluate_single_sf(self, tmpdir): file_2=Source(), ) CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_forward, @@ -122,7 +123,9 @@ def test_evaluate_single_sr(self, tmpdir): """Get read orientation for a single-end stranded reverse library.""" CONFIG.args.path_1_processed = FILE_ORIENTATION_SR CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_reverse, @@ -134,7 +137,9 @@ def test_evaluate_single_u(self, tmpdir): """Get read orientation for a single-end unstranded library.""" CONFIG.args.path_1_processed = FILE_ORIENTATION_U CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.unstranded, @@ -153,7 +158,9 @@ def test_evaluate_paired_unmapped(self, tmpdir): CONFIG.results.library_type = ResultsType( relationship=StatesTypeRelationship.split_mates, ) - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.not_available, @@ -174,7 +181,9 @@ def test_evaluate_paired_isf(self, tmpdir): ) CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_forward, @@ -187,7 +196,9 @@ def test_evaluate_paired_isr(self, tmpdir): CONFIG.args.path_1_processed = FILE_ORIENTATION_ISR_1 CONFIG.args.path_2_processed = FILE_ORIENTATION_ISR_2 CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.stranded_reverse, @@ -200,7 +211,9 @@ def test_evaluate_paired_iu(self, tmpdir): CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.unstranded, @@ -208,183 +221,21 @@ def test_evaluate_paired_iu(self, tmpdir): relationship=StatesOrientationRelationship.inward_unstranded, ) - def test_subset_transcripts_by_organism(self, tmpdir): - """Get filtered orgainsm transcripts for different organisms.""" - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.split_mates, - ) - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - results = test_instance.subset_transcripts_by_organism() - filtered_organisms_transcripts = \ - tmpdir / f"{CONFIG.results.library_source}.fasta" - assert results == filtered_organisms_transcripts - - def test_subset_transcripts_by_organism_file_problem(self, tmpdir): - """Pass dummy file as transcripts.fasta file to simulate - file problem.""" - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType() - CONFIG.results.library_source = ResultsSource() - CONFIG.args.t_file_processed = FILE_DUMMY - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - - with pytest.raises(FileProblem): - test_instance.subset_transcripts_by_organism() - - def test_subset_transcripts_by_organism_invalid_fasta(self, tmpdir): - """Pass invalid transcripts.fasta file to simulate index error.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY - ) - CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - results = test_instance.subset_transcripts_by_organism() - filtered_organisms_transcripts = \ - tmpdir / f"{CONFIG.results.library_source}.fasta" - assert results == filtered_organisms_transcripts - - def test_get_fasta_size(self, tmpdir): - """Get nucleotide statistics for filtererd transcripts - with different organisms.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY, - ) - CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.split_mates, - ) - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - filtered_organisms_transcripts = \ - test_instance.subset_transcripts_by_organism() - results = test_instance.get_fasta_size(filtered_organisms_transcripts) - assert results == 249986 - - def test_get_fasta_size_file_problem(self, tmpdir): - """Pass dummy file as filtered_organisms_transcripts - to simulate file problem.""" - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType() - CONFIG.results.library_source = ResultsSource() - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - with pytest.raises(FileProblem): - test_instance.get_fasta_size(FILE_DUMMY) - - def test_get_star_index_string_size(self, tmpdir): - """Get length of STAR SA pre-indexing string.""" - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - results = test_instance.get_star_index_string_size(249986) - assert results == 7 - - def test_evaluate_star_index_problem(self, monkeypatch, tmpdir): - """Force raising exception to stimulate a star problem.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY, - ) - CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.split_mates, - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - monkeypatch.setattr( - 'htsinfer.get_read_orientation.GetOrientation.create_star_index', - lambda *args, **kwargs: StarProblem - ) - with pytest.raises(StarProblem): - test_instance.evaluate() - - def test_prepare_star_alignment_commands(self, tmpdir): - """Get star alignment command.""" - CONFIG.args.path_1_processed = FILE_2000_RECORDS - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.not_mates, - ) - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=Source(), - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - index_dir = tmpdir / 'index' - file1_alignment_path = tmpdir / 'alignments/file_1' - cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \ - + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \ - + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \ - + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \ - + str(file1_alignment_path) + "/" - results = test_instance.prepare_star_alignment_commands( - index_dir=index_dir - ) - assert ' '.join(list(results.values())[0]) == cmd - - def test_generate_star_alignments_problem(self, monkeypatch, tmpdir): - """Force raising exception to simulate problem.""" - CONFIG.results.library_source = ResultsSource( - file_1=SOURCE_HUMAN, - file_2=SOURCE_FRUIT_FLY, - ) - CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 - CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 - CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.not_mates, - ) - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - sub_method_name = 'htsinfer.get_read_orientation.' + \ - 'GetOrientation.generate_star_alignments' - monkeypatch.setattr( - sub_method_name, - lambda *args, **kwargs: StarProblem, - ) - with pytest.raises(FileProblem): - test_instance.evaluate() - - def test_generate_star_alignments_dummy_cmd(self, tmpdir): - """Pass dummy cmd to force simulate star problem.""" - CONFIG.args.path_2_processed = None - CONFIG.results.library_type = ResultsType() - CONFIG.results.library_source = ResultsSource() - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - index_dir = tmpdir / 'index' - file1_alignment_path = tmpdir / 'alignments/file_1' - dummy_cmd = [ - 'STAR', '--alignIntrnMax', '1', - '--alignEndsType', 'Local', '--runThreadN', '1', - "--genomeDir", f"{str(index_dir)}", - ] - cmds = {file1_alignment_path: dummy_cmd} - with pytest.raises(StarProblem): - test_instance.generate_star_alignments(cmds) - - def test_process_single_dummy_sam_file(self, tmpdir): + def test_process_single_dummy_sam_file_file_problem(self, tmpdir): """Pass dummy aligned.out.sam file to simulate file problem.""" CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - with pytest.raises(SamFileProblem): + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) + with pytest.raises(FileProblem): test_instance.process_single(FILE_EMPTY_ALIGNED_SAM) def test_process_paired_dummy_sam_file(self, tmpdir): """Pass dummy aligned.out.sam file to simulate file problem.""" CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) with pytest.raises(FileProblem): test_instance.process_paired(FILE_EMPTY_ALIGNED_SAM) @@ -392,69 +243,31 @@ def test_process_paired_wrong_sam_file(self, tmpdir): """Pass bad_aligned.out.sam file to ensure correct paired file behaviour.""" CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) assert test_instance.process_paired(FILE_BAD_ALIGNED_SAM) \ == ResultsOrientation() - def test_create_star_index_star_problem(self, tmpdir): - """Pass invalid transcripts path to simulate star problem.""" - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - transcripts_path = tmpdir / 'invalid' - with pytest.raises(StarProblem): - test_instance.create_star_index(transcripts_path) - def test_evaluate_paired_not_mates_unmapped(self, tmpdir): """Get read orientation for a paired-end library with no mappable reads. """ CONFIG.args.path_1_processed = FILE_UNMAPPED_PAIRED_1 - CONFIG.args.path_2_processed = FILE_UNMAPPED_PAIRED_2 + CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2 CONFIG.results.library_type = ResultsType( - relationship=StatesTypeRelationship.not_mates, + relationship=StatesTypeRelationship.not_available, ) + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source(), + ) CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) + MAPPING.mapped = False + test_instance = GetOrientation(config=CONFIG, + mapping=MAPPING) results = test_instance.evaluate() assert results == ResultsOrientation( file_1=StatesOrientation.not_available, - file_2=StatesOrientation.not_available, + file_2=StatesOrientation.stranded_reverse, relationship=StatesOrientationRelationship.not_available, ) - - def test_subset_transcripts_by_organism_cannot_write_file( - self, monkeypatch, tmpdir - ): - """Force raising of ``OSError`` to simulate file problem.""" - CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 - CONFIG.args.path_2_processed = None - CONFIG.results.library_source = ResultsSource() - CONFIG.results.library_type = ResultsType() - CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - monkeypatch.setattr( - 'Bio.SeqIO.write', - RaiseError(exc=OSError), - ) - with pytest.raises(FileProblem): - test_instance.subset_transcripts_by_organism() - - def test_generate_star_alignments_star_problem(self, monkeypatch, tmpdir): - """Force raising of ``SubprocessError`` to simulate star probelm.""" - CONFIG.args.t_file_processed = FILE_TRANSCRIPTS - CONFIG.args.tmp_dir = tmpdir - test_instance = GetOrientation(config=CONFIG) - file1_alignment_path = tmpdir / 'alignments/file_1' - dummy_cmd = [ - 'STAR', '--alignIntrnMax', '1', - '--alignEndsType', 'Local', '--runThreadN', '1', - "--genomeDir", - ] - cmds = {file1_alignment_path: dummy_cmd} - monkeypatch.setattr( - 'subprocess.run', - lambda *args, **kwargs: SubprocessError(), - ) - with pytest.raises(StarProblem): - test_instance.generate_star_alignments(cmds) diff --git a/tests/test_mapping.py b/tests/test_mapping.py new file mode 100644 index 00000000..33798bed --- /dev/null +++ b/tests/test_mapping.py @@ -0,0 +1,259 @@ +"""Unit tests for module ``mapping.py``.""" + +import pytest + +from htsinfer.exceptions import ( + FileProblem, + StarProblem, +) +from htsinfer.mapping import Mapping +from htsinfer.models import ( + ResultsSource, + ResultsType, + Source, + StatesTypeRelationship, +) +from tests.utils import ( + FILE_2000_RECORDS, + FILE_DUMMY, + FILE_INVALID_TRANSCRIPTS, + FILE_MATE_1, + FILE_MATE_2, + FILE_ORIENTATION_IU_1, + FILE_ORIENTATION_IU_2, + FILE_TRANSCRIPTS, + CONFIG, + MAPPING, + RaiseError, + SubprocessError, + SOURCE_HUMAN, + SOURCE_FRUIT_FLY, +) + + +class TestMapping: + """Test ``Mapping`` class.""" + + def test_init_required(self): + """Create instance with required parameters.""" + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = None + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.results.library_type = ResultsType() + test_instance = Mapping(config=CONFIG) + assert test_instance.paths[0] == FILE_MATE_1 + assert test_instance.library_type == ResultsType() + assert test_instance.transcripts_file == FILE_TRANSCRIPTS + + def test_init_required_paired(self): + """Create instance with required parameters for paired-end library.""" + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + test_instance = Mapping(config=CONFIG) + assert test_instance.paths[0] == FILE_MATE_1 + assert test_instance.paths[1] == FILE_MATE_2 + assert test_instance.library_type == ResultsType() + assert test_instance.transcripts_file == FILE_TRANSCRIPTS + + def test_init_all(self, tmpdir): + """Create instance with all available parameters.""" + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + assert test_instance.paths[0] == FILE_MATE_1 + assert test_instance.paths[1] == FILE_MATE_2 + assert test_instance.library_type == ResultsType() + assert test_instance.library_source == ResultsSource() + assert test_instance.transcripts_file == FILE_TRANSCRIPTS + assert test_instance.tmp_dir == tmpdir + + def test_subset_transcripts_by_organism(self, tmpdir): + """Get filtered orgainsm transcripts for different organisms.""" + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.split_mates, + ) + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY + ) + CONFIG.args.tmp_dir = tmpdir + MAPPING.mapped = False + test_instance = Mapping(config=CONFIG) + results = test_instance.subset_transcripts_by_organism() + filtered_organisms_transcripts = \ + tmpdir / "transcripts_subset.fasta" + assert results == filtered_organisms_transcripts + + def test_subset_transcripts_by_organism_file_problem(self, tmpdir): + """Pass dummy file as transcripts.fasta file to simulate + file problem.""" + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType() + CONFIG.results.library_source = ResultsSource() + CONFIG.args.t_file_processed = FILE_DUMMY + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + + with pytest.raises(FileProblem): + test_instance.subset_transcripts_by_organism() + + def test_subset_transcripts_by_organism_invalid_fasta(self, tmpdir): + """Pass invalid transcripts.fasta file to simulate index error.""" + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY + ) + CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + results = test_instance.subset_transcripts_by_organism() + filtered_organisms_transcripts = \ + tmpdir / "transcripts_subset.fasta" + assert results == filtered_organisms_transcripts + + def test_get_fasta_size(self, tmpdir): + """Get nucleotide statistics for filtererd transcripts + with different organisms.""" + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY, + ) + CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.split_mates, + ) + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + filtered_organisms_transcripts = \ + test_instance.subset_transcripts_by_organism() + results = test_instance.get_fasta_size(filtered_organisms_transcripts) + assert results == 249986 + + def test_get_fasta_size_file_problem(self, tmpdir): + """Pass dummy file as filtered_organisms_transcripts + to simulate file problem.""" + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType() + CONFIG.results.library_source = ResultsSource() + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + with pytest.raises(FileProblem): + test_instance.get_fasta_size(FILE_DUMMY) + + def test_get_star_index_string_size(self, tmpdir): + """Get length of STAR SA pre-indexing string.""" + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + results = test_instance.get_star_index_string_size(249986) + assert results == 7 + + def test_evaluate_star_index_problem(self, monkeypatch, tmpdir): + """Force raising exception to stimulate a star problem.""" + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=SOURCE_FRUIT_FLY, + ) + CONFIG.args.path_2_processed = FILE_ORIENTATION_IU_2 + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.split_mates, + ) + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + monkeypatch.setattr( + 'htsinfer.mapping.Mapping.create_star_index', + lambda *args, **kwargs: StarProblem + ) + with pytest.raises(StarProblem): + test_instance.evaluate() + + def test_prepare_star_alignment_commands(self, tmpdir): + """Get star alignment command.""" + CONFIG.args.path_1_processed = FILE_2000_RECORDS + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType( + relationship=StatesTypeRelationship.not_mates, + ) + CONFIG.results.library_source = ResultsSource( + file_1=SOURCE_HUMAN, + file_2=Source(), + ) + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + index_dir = tmpdir / 'index' + file1_alignment_path = tmpdir / 'alignments/file_1' + cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \ + + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \ + + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \ + + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \ + + str(file1_alignment_path) + "/" + results = test_instance.prepare_star_alignment_commands( + index_dir=index_dir + ) + assert ' '.join(list(results.values())[0]) == cmd + + def test_generate_star_alignments_dummy_cmd(self, tmpdir): + """Pass dummy cmd to force simulate star problem.""" + CONFIG.args.path_2_processed = None + CONFIG.results.library_type = ResultsType() + CONFIG.results.library_source = ResultsSource() + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + index_dir = tmpdir / 'index' + file1_alignment_path = tmpdir / 'alignments/file_1' + dummy_cmd = [ + 'STAR', '--alignIntrnMax', '1', + '--alignEndsType', 'Local', '--runThreadN', '1', + "--genomeDir", f"{str(index_dir)}", + ] + cmds = {file1_alignment_path: dummy_cmd} + with pytest.raises(StarProblem): + test_instance.generate_star_alignments(cmds) + + def test_create_star_index_star_problem(self, tmpdir): + """Pass invalid transcripts path to simulate star problem.""" + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + transcripts_path = tmpdir / 'invalid' + with pytest.raises(StarProblem): + test_instance.create_star_index(transcripts_path) + + def test_subset_transcripts_by_organism_cannot_write_file( + self, monkeypatch, tmpdir + ): + """Force raising of ``OSError`` to simulate file problem.""" + CONFIG.args.path_1_processed = FILE_ORIENTATION_IU_1 + CONFIG.args.path_2_processed = None + CONFIG.results.library_source = ResultsSource() + CONFIG.results.library_type = ResultsType() + CONFIG.args.t_file_processed = FILE_INVALID_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + monkeypatch.setattr( + 'Bio.SeqIO.write', + RaiseError(exc=OSError), + ) + with pytest.raises(FileProblem): + test_instance.subset_transcripts_by_organism() + + def test_generate_star_alignments_star_problem(self, monkeypatch, tmpdir): + """Force raising of ``SubprocessError`` to simulate star probelm.""" + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + test_instance = Mapping(config=CONFIG) + file1_alignment_path = tmpdir / 'alignments/file_1' + dummy_cmd = [ + 'STAR', '--alignIntrnMax', '1', + '--alignEndsType', 'Local', '--runThreadN', '1', + "--genomeDir", + ] + cmds = {file1_alignment_path: dummy_cmd} + monkeypatch.setattr( + 'subprocess.run', + lambda *args, **kwargs: SubprocessError(), + ) + with pytest.raises(StarProblem): + test_instance.generate_star_alignments(cmds) diff --git a/tests/utils.py b/tests/utils.py index 2c924d7a..dfa8b98c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,6 +4,7 @@ from typing import Type from htsinfer.models import (Source, Config, Args, Results) +from htsinfer.mapping import Mapping # test files PACKAGE_DIR = Path(__file__).resolve().parents[1] / "htsinfer" @@ -87,6 +88,8 @@ results=Results(), ) +MAPPING = Mapping(config=CONFIG) + # helper classes class SubprocessError: From 172eb99f404dda69b5648f492ac4ea02ecb824d0 Mon Sep 17 00:00:00 2001 From: balajtimate <51365402+balajtimate@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:12:15 +0100 Subject: [PATCH 2/2] fix: update Pydantic version (#146) * fix pydantic issues * fix: update pydantic version in envs * fix: pin sphinx-rtd-theme into env * fix: update readthedocs config --- .readthedocs.yaml | 2 +- environment-dev.yml | 5 +++-- environment.yml | 2 +- htsinfer/htsinfer.py | 3 +-- tests/test_htsinfer.py | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 14c41f2e..0922ec04 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,4 +13,4 @@ formats: - pdf - epub conda: - environment: environment.yml + environment: environment-dev.yml diff --git a/environment-dev.yml b/environment-dev.yml index 40f4f5f4..c97c611a 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -13,12 +13,13 @@ dependencies: - pandas >=1.3.5, <1.4.0 - pip >=20.2.3 - pyahocorasick >=1.4.0 - - pydantic >=1.8.1, <2 + - pydantic >=2, <3 - pylint >=2.4.4 - pysam >=0.16.0 - pytest >=6.1.0 - python >=3.8, <=3.10 + - python-semantic-release >=8 + - sphinx-rtd-theme - star >=2.7.6 - pip: - - python-semantic-release>=7.15.0 - -e . diff --git a/environment.yml b/environment.yml index 08e4fd28..273edbed 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,7 @@ dependencies: - pandas >=1.3.5, <1.4.0 - pip >=20.2.3 - pyahocorasick >=1.4.0 - - pydantic >=1.8.1, <2 + - pydantic >=2, <3 - pysam >=0.16.0 - python >=3.8, <=3.10 - star >=2.7.6 diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py index 9f2f6602..4967a4d9 100755 --- a/htsinfer/htsinfer.py +++ b/htsinfer/htsinfer.py @@ -315,8 +315,7 @@ def clean_up(self): def print(self): """Print results to STDOUT.""" sys.stdout.write( - self.config.results.json( + self.config.results.model_dump_json( indent=3, - sort_keys=False, ) + linesep ) diff --git a/tests/test_htsinfer.py b/tests/test_htsinfer.py index 0489023b..ea6e65f7 100644 --- a/tests/test_htsinfer.py +++ b/tests/test_htsinfer.py @@ -305,7 +305,7 @@ def test_clean_up_keep_none(self, tmpdir): def test_clean_up_keep_results(self, tmpdir): """Remove temporary data.""" arguments = Args(path_1=FILE_MATE_1, - out_dir=tmpdir, + out_dir=tmpdir.strpath, tmpdir=tmpdir, ) results = Results() @@ -323,8 +323,8 @@ def test_clean_up_keep_results(self, tmpdir): def test_clean_up_keep_all(self, tmpdir): """Remove no data.""" arguments = Args(path_1=FILE_MATE_1, - out_dir=tmpdir, - tmpdir=tmpdir, + out_dir=tmpdir.strpath, + tmpdir=tmpdir.strpath, ) results = Results() configs = Config(