diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a82adc..d4c4a92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added classifier support for KMCP profiles (#129). +- Added a command-line option `--add-rank-lineage` to the `standardise` and + `merge` commands, which inserts a new column `rank_lineage` to results that + contains semi-colon-separated strings with the ranks (#130). ## [0.4.1] - (2023-07-13) diff --git a/Makefile b/Makefile index acd5e95..5d2ee91 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,19 @@ docs/tutorials/2612_se-ERR5766180-db_mOTU.out: docs/tutorials/2612_pe-ERR5766176-db1.kraken2.report.txt: cd docs/tutorials && curl -O https://raw.githubusercontent.com/taxprofiler/taxpasta/dev/tests/data/kraken2/2612_pe-ERR5766176-db1.kraken2.report.txt +taxonomy_directory := tests/data/taxonomy +## Generate test files +test-setup: $(taxonomy_directory) + +# Running this command assumes that the taxonkit +# (https://bioinf.shenwei.me/taxonkit/) has been installed beforehand and the +# NCBI taxonomy was downloaded. +$(taxonomy_directory): + taxonkit list --ids '160488,511145,889517' --indent "" \ + | taxonkit reformat --taxid-field 1 --output-ambiguous-result --format "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}" \ + | cut --fields=2-8 \ + | taxonkit create-taxdump --out-dir "$(taxonomy_directory)" --force --rank-names "superkingdom,phylum,class,order,family,genus,species" + ################################################################################ # Self Documenting Commands # ################################################################################ diff --git a/src/taxpasta/domain/service/taxonomy_service.py b/src/taxpasta/domain/service/taxonomy_service.py index f8e84b6..982cec3 100644 --- a/src/taxpasta/domain/service/taxonomy_service.py +++ b/src/taxpasta/domain/service/taxonomy_service.py @@ -55,6 +55,10 @@ def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]: def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]: """Return the lineage of a given taxonomy identifier as identifiers.""" + @abstractmethod + def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]: + """Return the lineage of a given taxonomy identifier as ranks.""" + @abstractmethod def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: """Add a column for the taxon name to the given table.""" @@ -73,6 +77,10 @@ def add_identifier_lineage( ) -> DataFrame[ResultTable]: """Add a column for the taxon lineage as identifiers to the given table.""" + @abstractmethod + def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: + """Add a column for the taxon lineage as ranks to the given table.""" + @abstractmethod def summarise_at( self, profile: DataFrame[StandardProfile], rank: str diff --git a/src/taxpasta/infrastructure/cli/merge.py b/src/taxpasta/infrastructure/cli/merge.py index 79245f2..8319a89 100644 --- a/src/taxpasta/infrastructure/cli/merge.py +++ b/src/taxpasta/infrastructure/cli/merge.py @@ -302,6 +302,12 @@ def merge( help="Add the taxon's entire lineage to the output. These are taxon " "identifiers separated by semi-colons.", ), + rank_lineage: bool = typer.Option( # noqa: B008 + False, + "--add-rank-lineage", + help="Add the taxon's entire rank lineage to the output. These are taxon " + "ranks separated by semi-colons.", + ), ) -> None: """Standardise and merge two or more taxonomic profiles.""" # Perform input validation. @@ -381,6 +387,14 @@ def merge( ) raise typer.Exit(code=2) + if rank_lineage: + if taxonomy is None: + logger.critical( + "The '--add-rank-lineage' option requires a taxonomy. Please " + "provide one using the option '--taxonomy'." + ) + raise typer.Exit(code=2) + # Ensure that we can write to the output directory. try: output.parent.mkdir(parents=True, exist_ok=True) @@ -433,6 +447,9 @@ def merge( # The order of the following conditions is chosen specifically to yield a pleasant # output format. + if rank_lineage and valid_output_format is not WideObservationTableFileFormat.BIOM: + assert taxonomy_service is not None # nosec assert_used + result = taxonomy_service.add_rank_lineage(result) if id_lineage and valid_output_format is not WideObservationTableFileFormat.BIOM: assert taxonomy_service is not None # nosec assert_used diff --git a/src/taxpasta/infrastructure/cli/standardise.py b/src/taxpasta/infrastructure/cli/standardise.py index 7c7c9a2..2a2112d 100644 --- a/src/taxpasta/infrastructure/cli/standardise.py +++ b/src/taxpasta/infrastructure/cli/standardise.py @@ -158,6 +158,12 @@ def standardise( help="Add the taxon's entire lineage to the output. These are taxon " "identifiers separated by semi-colons.", ), + add_rank_lineage: bool = typer.Option( # noqa: B008 + False, + "--add-rank-lineage", + help="Add the taxon's entire rank lineage to the output. These are taxon " + "ranks separated by semi-colons.", + ), ) -> None: """Standardise a taxonomic profile.""" # Perform input validation. @@ -213,6 +219,14 @@ def standardise( ) raise typer.Exit(code=2) + if add_rank_lineage: + if taxonomy is None: + logger.critical( + "The '--add-rank-lineage' option requires a taxonomy. Please " + "provide one using the option '--taxonomy'." + ) + raise typer.Exit(code=2) + # Ensure that we can write to the output directory. try: output.parent.mkdir(parents=True, exist_ok=True) @@ -240,6 +254,12 @@ def standardise( # The order of the following conditions is chosen specifically to yield a pleasant # output format. + if add_rank_lineage: + assert taxonomy_service is not None # nosec assert_used + result = Sample( + name=result.name, + profile=taxonomy_service.add_rank_lineage(result.profile), + ) if add_id_lineage: assert taxonomy_service is not None # nosec assert_used diff --git a/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py b/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py index 2e6f57e..9635340 100644 --- a/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py +++ b/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py @@ -83,6 +83,14 @@ def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]: return None return taxon.taxid_lineage + def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]: + """Return the lineage of a given taxonomy identifier as ranks.""" + try: + taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db) + except TaxidError: + return None + return list(taxon.rank_name_dictionary.keys()) + def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: """Add a column for the taxon name to the given table.""" result = table.copy() @@ -141,6 +149,24 @@ def _taxid_lineage_as_str(self, taxonomy_id: int) -> Optional[str]: return None return ";".join([str(tax_id) for tax_id in taxon.taxid_lineage]) + def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: + """Add a column for the taxon lineage as ranks to the given table.""" + result = table.copy() + result.insert( + 1, + "rank_lineage", + table.taxonomy_id.map(self._rank_lineage_as_str), + ) + return result + + def _rank_lineage_as_str(self, taxonomy_id: int) -> Optional[str]: + """Return the rank lineage of a taxon as concatenated identifiers.""" + try: + taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db) + except TaxidError: + return None + return ";".join(taxon.rank_name_dictionary.keys()) + def summarise_at( self, profile: DataFrame[StandardProfile], rank: str ) -> DataFrame[StandardProfile]: diff --git a/tests/conftest.py b/tests/conftest.py index c5e8821..2757ce9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -101,3 +101,9 @@ def ganon_data_dir(data_dir: Path) -> Path: def kmcp_data_dir(data_dir: Path) -> Path: """Provide the path to the KMCP data directory.""" return data_dir / "kmcp" + + +@pytest.fixture(scope="session") +def taxonomy_data_dir(data_dir: Path) -> Path: + """Provide the path to the taxonomy data directory.""" + return data_dir / "taxonomy" diff --git a/tests/data/taxonomy/delnodes.dmp b/tests/data/taxonomy/delnodes.dmp new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/taxonomy/merged.dmp b/tests/data/taxonomy/merged.dmp new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/taxonomy/names.dmp b/tests/data/taxonomy/names.dmp new file mode 100644 index 0000000..aee5467 --- /dev/null +++ b/tests/data/taxonomy/names.dmp @@ -0,0 +1,19 @@ +1 | root | | scientific name | +86398254 | Pseudomonadales | | scientific name | +87250111 | Enterobacteriaceae | | scientific name | +329474883 | Gammaproteobacteria | | scientific name | +432158898 | Ascomycota | | scientific name | +476817098 | Eukaryota | | scientific name | +492356122 | Saccharomyces cerevisiae | | scientific name | +536329594 | Saccharomycetales | | scientific name | +609216830 | Bacteria | | scientific name | +615773024 | Saccharomycetaceae | | scientific name | +933264868 | Saccharomyces | | scientific name | +1012954932 | Enterobacterales | | scientific name | +1187493883 | Escherichia | | scientific name | +1199096325 | Saccharomycetes | | scientific name | +1478401337 | Pseudomonadaceae | | scientific name | +1616653803 | Pseudomonas | | scientific name | +1641076285 | Proteobacteria | | scientific name | +1887621118 | Pseudomonas putida | | scientific name | +1945799576 | Escherichia coli | | scientific name | diff --git a/tests/data/taxonomy/nodes.dmp b/tests/data/taxonomy/nodes.dmp new file mode 100644 index 0000000..aba6e16 --- /dev/null +++ b/tests/data/taxonomy/nodes.dmp @@ -0,0 +1,19 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +86398254 | 329474883 | order | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +87250111 | 1012954932 | family | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +329474883 | 1641076285 | class | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +432158898 | 476817098 | phylum | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +476817098 | 1 | superkingdom | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +492356122 | 933264868 | species | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +536329594 | 1199096325 | order | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +609216830 | 1 | superkingdom | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +615773024 | 536329594 | family | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +933264868 | 615773024 | genus | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1012954932 | 329474883 | order | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1187493883 | 87250111 | genus | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1199096325 | 432158898 | class | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1478401337 | 86398254 | family | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1616653803 | 1478401337 | genus | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1641076285 | 609216830 | phylum | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1887621118 | 1616653803 | species | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1945799576 | 1187493883 | species | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py new file mode 100644 index 0000000..5109384 --- /dev/null +++ b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py @@ -0,0 +1,225 @@ +# Copyright (c) 2023 Moritz E. Beber +# Copyright (c) 2023 Maxime Borry +# Copyright (c) 2023 James A. Fellows Yates +# Copyright (c) 2023 Sofia Stamouli. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Test that the taxopy taxonomy service works as expected.""" + + +from collections import OrderedDict +from pathlib import Path +from typing import List + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from taxpasta.infrastructure.domain.service.taxopy_taxonomy_service import ( + TaxopyTaxonomyService, +) + + +@pytest.fixture(scope="module") +def tax_service(taxonomy_data_dir: Path) -> TaxopyTaxonomyService: + """Provide an instance of the taxopy taxonomy service.""" + return TaxopyTaxonomyService.from_taxdump(taxonomy_data_dir) + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, "root"), + (42, None), + (86398254, "Pseudomonadales"), + (432158898, "Ascomycota"), + (492356122, "Saccharomyces cerevisiae"), + (1945799576, "Escherichia coli"), + (1887621118, "Pseudomonas putida"), + ], +) +def test_get_taxon_name(tax_service: TaxopyTaxonomyService, tax_id: int, expected: str): + """Expect that we can retrieve the correct taxon name.""" + assert tax_service.get_taxon_name(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, "no rank"), + (42, None), + (476817098, "superkingdom"), + (432158898, "phylum"), + (329474883, "class"), + (86398254, "order"), + (87250111, "family"), + (933264868, "genus"), + (1887621118, "species"), + ], +) +def test_get_taxon_rank(tax_service: TaxopyTaxonomyService, tax_id: int, expected: str): + """Expect that we can retrieve the correct taxon rank.""" + assert tax_service.get_taxon_rank(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, ["root"]), + (42, None), + ( + 86398254, + [ + "Pseudomonadales", + "Gammaproteobacteria", + "Proteobacteria", + "Bacteria", + "root", + ], + ), + (1199096325, ["Saccharomycetes", "Ascomycota", "Eukaryota", "root"]), + ], +) +def test_get_taxon_name_lineage( + tax_service: TaxopyTaxonomyService, tax_id: int, expected: List[str] +): + """Expect that we can retrieve the correct taxon name lineage.""" + assert tax_service.get_taxon_name_lineage(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, [1]), + (42, None), + (86398254, [86398254, 329474883, 1641076285, 609216830, 1]), + (1199096325, [1199096325, 432158898, 476817098, 1]), + ], +) +def test_get_taxon_identifier_lineage( + tax_service: TaxopyTaxonomyService, tax_id: int, expected: List[int] +): + """Expect that we can retrieve the correct taxon identifier lineage.""" + assert tax_service.get_taxon_identifier_lineage(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, []), + (42, None), + (86398254, ["order", "class", "phylum", "superkingdom"]), + (1199096325, ["class", "phylum", "superkingdom"]), + ], +) +def test_get_taxon_rank_lineage( + tax_service: TaxopyTaxonomyService, tax_id: int, expected: List[str] +): + """Expect that we can retrieve the correct taxon rank lineage.""" + assert tax_service.get_taxon_rank_lineage(tax_id) == expected + + +@pytest.mark.parametrize( + ("result", "expected"), + [ + ( + pd.DataFrame(OrderedDict([("taxonomy_id", [1, 42, 86398254, 1199096325])])), + pd.DataFrame( + OrderedDict( + [ + ("taxonomy_id", [1, 42, 86398254, 1199096325]), + ( + "lineage", + [ + "root", + None, + "Pseudomonadales;Gammaproteobacteria;Proteobacteria;" + "Bacteria;root", + "Saccharomycetes;Ascomycota;Eukaryota;root", + ], + ), + ] + ) + ), + ), + ], +) +def test_add_name_lineage( + tax_service: TaxopyTaxonomyService, result: pd.DataFrame, expected: pd.DataFrame +): + """Expect that we can add name lineages to a result table.""" + assert_frame_equal(tax_service.add_name_lineage(result), expected) + + +@pytest.mark.parametrize( + ("result", "expected"), + [ + ( + pd.DataFrame(OrderedDict([("taxonomy_id", [1, 42, 86398254, 1199096325])])), + pd.DataFrame( + OrderedDict( + [ + ("taxonomy_id", [1, 42, 86398254, 1199096325]), + ( + "id_lineage", + [ + "1", + None, + "86398254;329474883;1641076285;609216830;1", + "1199096325;432158898;476817098;1", + ], + ), + ] + ) + ), + ), + ], +) +def test_add_identifier_lineage( + tax_service: TaxopyTaxonomyService, result: pd.DataFrame, expected: pd.DataFrame +): + """Expect that we can add identifier lineages to a result table.""" + assert_frame_equal(tax_service.add_identifier_lineage(result), expected) + + +@pytest.mark.parametrize( + ("result", "expected"), + [ + ( + pd.DataFrame(OrderedDict([("taxonomy_id", [1, 42, 86398254, 1199096325])])), + pd.DataFrame( + OrderedDict( + [ + ("taxonomy_id", [1, 42, 86398254, 1199096325]), + ( + "rank_lineage", + [ + "", + None, + "order;class;phylum;superkingdom", + "class;phylum;superkingdom", + ], + ), + ] + ) + ), + ), + ], +) +def test_add_rank_lineage( + tax_service: TaxopyTaxonomyService, result: pd.DataFrame, expected: pd.DataFrame +): + """Expect that we can add rank lineages to a result table.""" + assert_frame_equal(tax_service.add_rank_lineage(result), expected)