From 59267f7ffaedd0473d3873e7fa9f971da7aa3877 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 17:10:40 +0200 Subject: [PATCH 1/8] feat: add command to generate taxonomy test data --- Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Makefile b/Makefile index acd5e95..5d2ee91 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,19 @@ docs/tutorials/2612_se-ERR5766180-db_mOTU.out: docs/tutorials/2612_pe-ERR5766176-db1.kraken2.report.txt: cd docs/tutorials && curl -O https://raw.githubusercontent.com/taxprofiler/taxpasta/dev/tests/data/kraken2/2612_pe-ERR5766176-db1.kraken2.report.txt +taxonomy_directory := tests/data/taxonomy +## Generate test files +test-setup: $(taxonomy_directory) + +# Running this command assumes that the taxonkit +# (https://bioinf.shenwei.me/taxonkit/) has been installed beforehand and the +# NCBI taxonomy was downloaded. +$(taxonomy_directory): + taxonkit list --ids '160488,511145,889517' --indent "" \ + | taxonkit reformat --taxid-field 1 --output-ambiguous-result --format "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}" \ + | cut --fields=2-8 \ + | taxonkit create-taxdump --out-dir "$(taxonomy_directory)" --force --rank-names "superkingdom,phylum,class,order,family,genus,species" + ################################################################################ # Self Documenting Commands # ################################################################################ From 4ef3c4d74775864020cb7c07e8243cd10494e730 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 17:11:07 +0200 Subject: [PATCH 2/8] chore: generate taxonomy test files --- tests/data/taxonomy/delnodes.dmp | 0 tests/data/taxonomy/merged.dmp | 0 tests/data/taxonomy/names.dmp | 19 +++++++++++++++++++ tests/data/taxonomy/nodes.dmp | 19 +++++++++++++++++++ 4 files changed, 38 insertions(+) create mode 100644 tests/data/taxonomy/delnodes.dmp create mode 100644 tests/data/taxonomy/merged.dmp create mode 100644 tests/data/taxonomy/names.dmp create mode 100644 tests/data/taxonomy/nodes.dmp diff --git a/tests/data/taxonomy/delnodes.dmp b/tests/data/taxonomy/delnodes.dmp new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/taxonomy/merged.dmp b/tests/data/taxonomy/merged.dmp new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/taxonomy/names.dmp b/tests/data/taxonomy/names.dmp new file mode 100644 index 0000000..aee5467 --- /dev/null +++ b/tests/data/taxonomy/names.dmp @@ -0,0 +1,19 @@ +1 | root | | scientific name | +86398254 | Pseudomonadales | | scientific name | +87250111 | Enterobacteriaceae | | scientific name | +329474883 | Gammaproteobacteria | | scientific name | +432158898 | Ascomycota | | scientific name | +476817098 | Eukaryota | | scientific name | +492356122 | Saccharomyces cerevisiae | | scientific name | +536329594 | Saccharomycetales | | scientific name | +609216830 | Bacteria | | scientific name | +615773024 | Saccharomycetaceae | | scientific name | +933264868 | Saccharomyces | | scientific name | +1012954932 | Enterobacterales | | scientific name | +1187493883 | Escherichia | | scientific name | +1199096325 | Saccharomycetes | | scientific name | +1478401337 | Pseudomonadaceae | | scientific name | +1616653803 | Pseudomonas | | scientific name | +1641076285 | Proteobacteria | | scientific name | +1887621118 | Pseudomonas putida | | scientific name | +1945799576 | Escherichia coli | | scientific name | diff --git a/tests/data/taxonomy/nodes.dmp b/tests/data/taxonomy/nodes.dmp new file mode 100644 index 0000000..aba6e16 --- /dev/null +++ b/tests/data/taxonomy/nodes.dmp @@ -0,0 +1,19 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +86398254 | 329474883 | order | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +87250111 | 1012954932 | family | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +329474883 | 1641076285 | class | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +432158898 | 476817098 | phylum | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +476817098 | 1 | superkingdom | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +492356122 | 933264868 | species | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +536329594 | 1199096325 | order | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +609216830 | 1 | superkingdom | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +615773024 | 536329594 | family | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +933264868 | 615773024 | genus | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1012954932 | 329474883 | order | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1187493883 | 87250111 | genus | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1199096325 | 432158898 | class | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1478401337 | 86398254 | family | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1616653803 | 1478401337 | genus | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1641076285 | 609216830 | phylum | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1887621118 | 1616653803 | species | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +1945799576 | 1187493883 | species | XX | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | From 9242f3a2f6eca8514b5d3ac6b57c86670f4dea2e Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 18:24:18 +0200 Subject: [PATCH 3/8] feat: add method to retrieve rank lineage and add to a table --- .../domain/service/taxopy_taxonomy_service.py | 26 ++ tests/conftest.py | 6 + .../service/test_taxopy_taxonomy_service.py | 223 ++++++++++++++++++ 3 files changed, 255 insertions(+) create mode 100644 tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py diff --git a/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py b/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py index 2e6f57e..e8f8ae0 100644 --- a/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py +++ b/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py @@ -83,6 +83,14 @@ def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]: return None return taxon.taxid_lineage + def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]: + """Return the lineage of ranks of a given taxonomy identifier.""" + try: + taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db) + except TaxidError: + return None + return list(taxon.rank_name_dictionary.keys()) + def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: """Add a column for the taxon name to the given table.""" result = table.copy() @@ -141,6 +149,24 @@ def _taxid_lineage_as_str(self, taxonomy_id: int) -> Optional[str]: return None return ";".join([str(tax_id) for tax_id in taxon.taxid_lineage]) + def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: + """Add a column for the taxon rank lineage to the given table.""" + result = table.copy() + result.insert( + 1, + "rank_lineage", + table.taxonomy_id.map(self._rank_lineage_as_str), + ) + return result + + def _rank_lineage_as_str(self, taxonomy_id: int) -> Optional[str]: + """Return the rank lineage of a taxon as concatenated identifiers.""" + try: + taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db) + except TaxidError: + return None + return ";".join(taxon.rank_name_dictionary.keys()) + def summarise_at( self, profile: DataFrame[StandardProfile], rank: str ) -> DataFrame[StandardProfile]: diff --git a/tests/conftest.py b/tests/conftest.py index c5e8821..2757ce9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -101,3 +101,9 @@ def ganon_data_dir(data_dir: Path) -> Path: def kmcp_data_dir(data_dir: Path) -> Path: """Provide the path to the KMCP data directory.""" return data_dir / "kmcp" + + +@pytest.fixture(scope="session") +def taxonomy_data_dir(data_dir: Path) -> Path: + """Provide the path to the taxonomy data directory.""" + return data_dir / "taxonomy" diff --git a/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py new file mode 100644 index 0000000..ec73c42 --- /dev/null +++ b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py @@ -0,0 +1,223 @@ +# Copyright (c) 2023 Moritz E. Beber +# Copyright (c) 2023 Maxime Borry +# Copyright (c) 2023 James A. Fellows Yates +# Copyright (c) 2023 Sofia Stamouli. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Test that the taxopy taxonomy service works as expected.""" + + +from collections import OrderedDict +from pathlib import Path + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from taxpasta.infrastructure.domain.service.taxopy_taxonomy_service import ( + TaxopyTaxonomyService, +) + + +@pytest.fixture(scope="module") +def tax_service(taxonomy_data_dir: Path) -> TaxopyTaxonomyService: + """Provide an instance of the taxopy taxonomy service.""" + return TaxopyTaxonomyService.from_taxdump(taxonomy_data_dir) + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, "root"), + (42, None), + (86398254, "Pseudomonadales"), + (432158898, "Ascomycota"), + (492356122, "Saccharomyces cerevisiae"), + (1945799576, "Escherichia coli"), + (1887621118, "Pseudomonas putida"), + ], +) +def test_get_taxon_name(tax_service: TaxopyTaxonomyService, tax_id: int, expected: str): + """Expect that we can retrieve the correct taxon name.""" + assert tax_service.get_taxon_name(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, "no rank"), + (42, None), + (476817098, "superkingdom"), + (432158898, "phylum"), + (329474883, "class"), + (86398254, "order"), + (87250111, "family"), + (933264868, "genus"), + (1887621118, "species"), + ], +) +def test_get_taxon_rank(tax_service: TaxopyTaxonomyService, tax_id: int, expected: str): + """Expect that we can retrieve the correct taxon rank.""" + assert tax_service.get_taxon_rank(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, ["root"]), + (42, None), + ( + 86398254, + [ + "Pseudomonadales", + "Gammaproteobacteria", + "Proteobacteria", + "Bacteria", + "root", + ], + ), + (1199096325, ["Saccharomycetes", "Ascomycota", "Eukaryota", "root"]), + ], +) +def test_get_taxon_name_lineage( + tax_service: TaxopyTaxonomyService, tax_id: int, expected: list[str] +): + """Expect that we can retrieve the correct taxon name lineage.""" + assert tax_service.get_taxon_name_lineage(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, [1]), + (42, None), + (86398254, [86398254, 329474883, 1641076285, 609216830, 1]), + (1199096325, [1199096325, 432158898, 476817098, 1]), + ], +) +def test_get_taxon_identifier_lineage( + tax_service: TaxopyTaxonomyService, tax_id: int, expected: list[int] +): + """Expect that we can retrieve the correct taxon identifier lineage.""" + assert tax_service.get_taxon_identifier_lineage(tax_id) == expected + + +@pytest.mark.parametrize( + ("tax_id", "expected"), + [ + (1, []), + (42, None), + (86398254, ["order", "class", "phylum", "superkingdom"]), + (1199096325, ["class", "phylum", "superkingdom"]), + ], +) +def test_get_taxon_rank_lineage( + tax_service: TaxopyTaxonomyService, tax_id: int, expected: list[str] +): + """Expect that we can retrieve the correct taxon rank lineage.""" + assert tax_service.get_taxon_rank_lineage(tax_id) == expected + + +@pytest.mark.parametrize( + ("result", "expected"), + [ + ( + pd.DataFrame(OrderedDict([("taxonomy_id", [1, 42, 86398254, 1199096325])])), + pd.DataFrame( + OrderedDict( + [ + ("taxonomy_id", [1, 42, 86398254, 1199096325]), + ( + "lineage", + [ + "root", + None, + "Pseudomonadales;Gammaproteobacteria;Proteobacteria;Bacteria;root", + "Saccharomycetes;Ascomycota;Eukaryota;root", + ], + ), + ] + ) + ), + ), + ], +) +def test_add_name_lineage( + tax_service: TaxopyTaxonomyService, result: pd.DataFrame, expected: pd.DataFrame +): + """Expect that we can add name lineages to a result table.""" + assert_frame_equal(tax_service.add_name_lineage(result), expected) + + +@pytest.mark.parametrize( + ("result", "expected"), + [ + ( + pd.DataFrame(OrderedDict([("taxonomy_id", [1, 42, 86398254, 1199096325])])), + pd.DataFrame( + OrderedDict( + [ + ("taxonomy_id", [1, 42, 86398254, 1199096325]), + ( + "id_lineage", + [ + "1", + None, + "86398254;329474883;1641076285;609216830;1", + "1199096325;432158898;476817098;1", + ], + ), + ] + ) + ), + ), + ], +) +def test_add_identifier_lineage( + tax_service: TaxopyTaxonomyService, result: pd.DataFrame, expected: pd.DataFrame +): + """Expect that we can add identifier lineages to a result table.""" + assert_frame_equal(tax_service.add_identifier_lineage(result), expected) + + +@pytest.mark.parametrize( + ("result", "expected"), + [ + ( + pd.DataFrame(OrderedDict([("taxonomy_id", [1, 42, 86398254, 1199096325])])), + pd.DataFrame( + OrderedDict( + [ + ("taxonomy_id", [1, 42, 86398254, 1199096325]), + ( + "rank_lineage", + [ + "", + None, + "order;class;phylum;superkingdom", + "class;phylum;superkingdom", + ], + ), + ] + ) + ), + ), + ], +) +def test_add_rank_lineage( + tax_service: TaxopyTaxonomyService, result: pd.DataFrame, expected: pd.DataFrame +): + """Expect that we can add rank lineages to a result table.""" + assert_frame_equal(tax_service.add_rank_lineage(result), expected) From f64047e7359020aebf274dae90fea4c118744156 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 18:31:59 +0200 Subject: [PATCH 4/8] style: break long string --- .../domain/service/test_taxopy_taxonomy_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py index ec73c42..a1403b5 100644 --- a/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py +++ b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py @@ -144,7 +144,8 @@ def test_get_taxon_rank_lineage( [ "root", None, - "Pseudomonadales;Gammaproteobacteria;Proteobacteria;Bacteria;root", + "Pseudomonadales;Gammaproteobacteria;Proteobacteria;" + "Bacteria;root", "Saccharomycetes;Ascomycota;Eukaryota;root", ], ), From ef263198f2d9ef157db79986cccb7e5541a1ceff Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 18:34:22 +0200 Subject: [PATCH 5/8] fix: use 3.8 compatible type annotation --- .../domain/service/test_taxopy_taxonomy_service.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py index a1403b5..5109384 100644 --- a/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py +++ b/tests/unit/infrastructure/domain/service/test_taxopy_taxonomy_service.py @@ -21,6 +21,7 @@ from collections import OrderedDict from pathlib import Path +from typing import List import pandas as pd import pytest @@ -92,7 +93,7 @@ def test_get_taxon_rank(tax_service: TaxopyTaxonomyService, tax_id: int, expecte ], ) def test_get_taxon_name_lineage( - tax_service: TaxopyTaxonomyService, tax_id: int, expected: list[str] + tax_service: TaxopyTaxonomyService, tax_id: int, expected: List[str] ): """Expect that we can retrieve the correct taxon name lineage.""" assert tax_service.get_taxon_name_lineage(tax_id) == expected @@ -108,7 +109,7 @@ def test_get_taxon_name_lineage( ], ) def test_get_taxon_identifier_lineage( - tax_service: TaxopyTaxonomyService, tax_id: int, expected: list[int] + tax_service: TaxopyTaxonomyService, tax_id: int, expected: List[int] ): """Expect that we can retrieve the correct taxon identifier lineage.""" assert tax_service.get_taxon_identifier_lineage(tax_id) == expected @@ -124,7 +125,7 @@ def test_get_taxon_identifier_lineage( ], ) def test_get_taxon_rank_lineage( - tax_service: TaxopyTaxonomyService, tax_id: int, expected: list[str] + tax_service: TaxopyTaxonomyService, tax_id: int, expected: List[str] ): """Expect that we can retrieve the correct taxon rank lineage.""" assert tax_service.get_taxon_rank_lineage(tax_id) == expected From fee157cf0d46db90642a0a8605a52efa44552374 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 18:43:27 +0200 Subject: [PATCH 6/8] refactor: normalize abstract interface and concrete service --- src/taxpasta/domain/service/taxonomy_service.py | 8 ++++++++ .../domain/service/taxopy_taxonomy_service.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/taxpasta/domain/service/taxonomy_service.py b/src/taxpasta/domain/service/taxonomy_service.py index f8e84b6..982cec3 100644 --- a/src/taxpasta/domain/service/taxonomy_service.py +++ b/src/taxpasta/domain/service/taxonomy_service.py @@ -55,6 +55,10 @@ def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]: def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]: """Return the lineage of a given taxonomy identifier as identifiers.""" + @abstractmethod + def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]: + """Return the lineage of a given taxonomy identifier as ranks.""" + @abstractmethod def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: """Add a column for the taxon name to the given table.""" @@ -73,6 +77,10 @@ def add_identifier_lineage( ) -> DataFrame[ResultTable]: """Add a column for the taxon lineage as identifiers to the given table.""" + @abstractmethod + def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: + """Add a column for the taxon lineage as ranks to the given table.""" + @abstractmethod def summarise_at( self, profile: DataFrame[StandardProfile], rank: str diff --git a/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py b/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py index e8f8ae0..9635340 100644 --- a/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py +++ b/src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py @@ -84,7 +84,7 @@ def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]: return taxon.taxid_lineage def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]: - """Return the lineage of ranks of a given taxonomy identifier.""" + """Return the lineage of a given taxonomy identifier as ranks.""" try: taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db) except TaxidError: @@ -150,7 +150,7 @@ def _taxid_lineage_as_str(self, taxonomy_id: int) -> Optional[str]: return ";".join([str(tax_id) for tax_id in taxon.taxid_lineage]) def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]: - """Add a column for the taxon rank lineage to the given table.""" + """Add a column for the taxon lineage as ranks to the given table.""" result = table.copy() result.insert( 1, From b217f335b259140943f2bc5bbbcdd4d7da6b23b4 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 18:46:23 +0200 Subject: [PATCH 7/8] feat: add new CLI option `--add-rank-lineage` --- src/taxpasta/infrastructure/cli/merge.py | 17 ++++++++++++++++ .../infrastructure/cli/standardise.py | 20 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/taxpasta/infrastructure/cli/merge.py b/src/taxpasta/infrastructure/cli/merge.py index 79245f2..8319a89 100644 --- a/src/taxpasta/infrastructure/cli/merge.py +++ b/src/taxpasta/infrastructure/cli/merge.py @@ -302,6 +302,12 @@ def merge( help="Add the taxon's entire lineage to the output. These are taxon " "identifiers separated by semi-colons.", ), + rank_lineage: bool = typer.Option( # noqa: B008 + False, + "--add-rank-lineage", + help="Add the taxon's entire rank lineage to the output. These are taxon " + "ranks separated by semi-colons.", + ), ) -> None: """Standardise and merge two or more taxonomic profiles.""" # Perform input validation. @@ -381,6 +387,14 @@ def merge( ) raise typer.Exit(code=2) + if rank_lineage: + if taxonomy is None: + logger.critical( + "The '--add-rank-lineage' option requires a taxonomy. Please " + "provide one using the option '--taxonomy'." + ) + raise typer.Exit(code=2) + # Ensure that we can write to the output directory. try: output.parent.mkdir(parents=True, exist_ok=True) @@ -433,6 +447,9 @@ def merge( # The order of the following conditions is chosen specifically to yield a pleasant # output format. + if rank_lineage and valid_output_format is not WideObservationTableFileFormat.BIOM: + assert taxonomy_service is not None # nosec assert_used + result = taxonomy_service.add_rank_lineage(result) if id_lineage and valid_output_format is not WideObservationTableFileFormat.BIOM: assert taxonomy_service is not None # nosec assert_used diff --git a/src/taxpasta/infrastructure/cli/standardise.py b/src/taxpasta/infrastructure/cli/standardise.py index 7c7c9a2..2a2112d 100644 --- a/src/taxpasta/infrastructure/cli/standardise.py +++ b/src/taxpasta/infrastructure/cli/standardise.py @@ -158,6 +158,12 @@ def standardise( help="Add the taxon's entire lineage to the output. These are taxon " "identifiers separated by semi-colons.", ), + add_rank_lineage: bool = typer.Option( # noqa: B008 + False, + "--add-rank-lineage", + help="Add the taxon's entire rank lineage to the output. These are taxon " + "ranks separated by semi-colons.", + ), ) -> None: """Standardise a taxonomic profile.""" # Perform input validation. @@ -213,6 +219,14 @@ def standardise( ) raise typer.Exit(code=2) + if add_rank_lineage: + if taxonomy is None: + logger.critical( + "The '--add-rank-lineage' option requires a taxonomy. Please " + "provide one using the option '--taxonomy'." + ) + raise typer.Exit(code=2) + # Ensure that we can write to the output directory. try: output.parent.mkdir(parents=True, exist_ok=True) @@ -240,6 +254,12 @@ def standardise( # The order of the following conditions is chosen specifically to yield a pleasant # output format. + if add_rank_lineage: + assert taxonomy_service is not None # nosec assert_used + result = Sample( + name=result.name, + profile=taxonomy_service.add_rank_lineage(result.profile), + ) if add_id_lineage: assert taxonomy_service is not None # nosec assert_used From 33a89de0dbc4647fbb970f57859e1d612a65b202 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 19 Aug 2023 18:55:20 +0200 Subject: [PATCH 8/8] docs: add changelog entry [skip ci] --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a82adc..d4c4a92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added classifier support for KMCP profiles (#129). +- Added a command-line option `--add-rank-lineage` to the `standardise` and + `merge` commands, which inserts a new column `rank_lineage` to results that + contains semi-colon-separated strings with the ranks (#130). ## [0.4.1] - (2023-07-13)