taxprofiler · Midnighter · Aug 19, 2023 · Aug 2, 2023 · Aug 3, 2023 · Aug 3, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+-   Added classifier support for KMCP profiles (#129).
+
 ## [0.4.1] - (2023-07-13)
 
 ### Fixed

diff --git a/docs/supported_profilers/kmcp.md b/docs/supported_profilers/kmcp.md
@@ -0,0 +1,29 @@
+# KMCP
+
+> [KMCP](https://github.com/shenwei356/kmcp) uses genome coverage information by splitting the reference genomes into chunks and stores k-mers in a modified and optimized Compact Bit-Sliced Signature (COBS) index for fast alignment-free sequence searching. KMCP combines k-mer similarity and genome coverage information to reduce the false positive rate of k-mer-based taxonomic classification and profiling methods.
+
+## Profile Format
+
+Taxpasta expects a tab-separated file with seventeen columns. This is generated with the `kmcp profile` command. Taxpasta will interpret the columns as:
+
+| Column Header     | Description |
+| ----------------- | ----------- |
+| ref               |             |
+| percentage        |             |
+| coverage          | optional    |
+| score             |             |
+| chunksFrac        |             |
+| chunksRelDepth    |             |
+| chunksRelDepthStd | optional    |
+| reads             |             |
+| ureads            |             |
+| hicureads         |             |
+| refsize           |             |
+| refname           | optional    |
+| taxid             |             |
+| rank              | optional    |
+| taxname           | optional    |
+| taxpath           | optional    |
+| taxpathsn         | optional    |
+
+Please refer to the [KMCP documentation](https://bioinf.shenwei.me/kmcp/usage/#profile) for further description.
diff --git a/src/taxpasta/infrastructure/application/__init__.py b/src/taxpasta/infrastructure/application/__init__.py
@@ -41,6 +41,11 @@
     KaijuProfileReader,
     KaijuProfileStandardisationService,
 )
+from .kmcp import (
+    KMCPProfile,
+    KMCPProfileReader,
+    KMCPProfileStandardisationService,
+)
 from .kraken2 import (
     Kraken2Profile,
     Kraken2ProfileReader,

diff --git a/src/taxpasta/infrastructure/application/application_service_registry.py b/src/taxpasta/infrastructure/application/application_service_registry.py
@@ -64,6 +64,10 @@ def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
             from .kaiju import KaijuProfileReader
 
             return KaijuProfileReader
+        elif profiler is SupportedProfiler.kmcp:
+            from .kmcp import KMCPProfileReader
+
+            return KMCPProfileReader
         elif profiler is SupportedProfiler.kraken2:
             from .kraken2 import Kraken2ProfileReader
 
@@ -130,6 +134,10 @@ def profile_standardisation_service(
             from .ganon import GanonProfileStandardisationService
 
             return GanonProfileStandardisationService
+        elif profiler is SupportedProfiler.kmcp:
+            from .kmcp import KMCPProfileStandardisationService
+
+            return KMCPProfileStandardisationService
 
         else:
             raise ValueError("Unexpected")

diff --git a/src/taxpasta/infrastructure/application/kmcp/__init__.py b/src/taxpasta/infrastructure/application/kmcp/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2023 Moritz E. Beber
+# Copyright (c) 2023 Maxime Borry
+# Copyright (c) 2023 James A. Fellows Yates
+# Copyright (c) 2023 Sofia Stamouli.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .kmcp_profile import KMCPProfile
+from .kmcp_profile_reader import KMCPProfileReader
+from .kmcp_profile_standardisation_service import (
+    KMCPProfileStandardisationService,
+)
diff --git a/src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py b/src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2023 Moritz E. Beber
+# Copyright (c) 2023 Maxime Borry
+# Copyright (c) 2023 James A. Fellows Yates
+# Copyright (c) 2023 Sofia Stamouli.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Provide a description of the KMCP profile format."""
+
+
+import numpy as np
+import pandera as pa
+from pandera.typing import Series
+
+from taxpasta.infrastructure.helpers import BaseDataFrameModel
+
+
+class KMCPProfile(BaseDataFrameModel):
+    """Define the expected KMCP profile format."""
+
+    reference: Series[str] = pa.Field(alias="ref")
+    percentage: Series[float] = pa.Field(ge=0.0, le=100.0)
+    coverage: Series[float] = pa.Field(ge=0.0, nullable=True)
+    score: Series[float] = pa.Field(ge=0.0, le=100.0)
+    chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias="chunksFrac")
+    chunks_relative_depth: Series[str] = pa.Field(alias="chunksRelDepth")
+    chunks_relative_depth_std: Series[float] = pa.Field(
+        ge=0.0, nullable=True, alias="chunksRelDepthStd"
+    )
+    reads: Series[int] = pa.Field(ge=0)
+    unique_reads: Series[int] = pa.Field(ge=0, alias="ureads")
+    high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias="hicureads")
+    reference_size: Series[int] = pa.Field(ge=0, alias="refsize")
+    reference_name: Series[str] = pa.Field(nullable=True, alias="refname")
+    taxid: Series[int] = pa.Field(ge=0)
+    rank: Series[str] = pa.Field(nullable=True)
+    taxonomic_name: Series[str] = pa.Field(nullable=True, alias="taxname")
+    taxonomic_path: Series[str] = pa.Field(nullable=True, alias="taxpath")
+    taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias="taxpathsn")
+
+    @pa.check("percentage", name="compositionality")
+    def check_compositionality(cls, percentage: Series[float]) -> bool:
+        """Check that the percentages add up to a hundred."""
+        # KMCP profile reports percentages with sixth decimals
+        return percentage.empty or bool(np.isclose(percentage.sum(), 100.0, atol=1.0))
diff --git a/src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py b/src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2023 Moritz E. Beber
+# Copyright (c) 2023 Maxime Borry
+# Copyright (c) 2023 James A. Fellows Yates
+# Copyright (c) 2023 Sofia Stamouli.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Provide a reader for KMCP profiles."""
+
+
+import pandas as pd
+from pandera.typing import DataFrame
+
+from taxpasta.application.service import BufferOrFilepath, ProfileReader
+from taxpasta.infrastructure.helpers import raise_parser_warnings
+
+from .kmcp_profile import KMCPProfile
+
+
+class KMCPProfileReader(ProfileReader):
+    """Define a reader for KMCP profiles."""
+
+    @classmethod
+    @raise_parser_warnings
+    def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
+        """
+        Read a KMCP taxonomic profile from the given source.
+
+        Args:
+            profile: A source that contains a tab-separated taxonomic profile generated
+                by KMCP.
+
+        Returns:
+            A data frame representation of the KMCP profile.
+
+        """
+        result = pd.read_table(
+            filepath_or_buffer=profile,
+            sep="\t",
+            header=0,
+            index_col=False,
+            dtype={
+                KMCPProfile.chunks_relative_depth: str,
+            },
+        )
+        cls._check_num_columns(result, KMCPProfile)
+        return result
diff --git a/src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py b/src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2023 Moritz E. Beber
+# Copyright (c) 2023 Maxime Borry
+# Copyright (c) 2023 James A. Fellows Yates
+# Copyright (c) 2023 Sofia Stamouli.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Provide a standardisation service for KMCP profiles."""
+
+
+import logging
+
+import pandas as pd
+import pandera as pa
+from pandera.typing import DataFrame
+
+from taxpasta.application.service import ProfileStandardisationService
+from taxpasta.domain.model import StandardProfile
+
+from .kmcp_profile import KMCPProfile
+
+
+logger = logging.getLogger(__name__)
+
+
+class KMCPProfileStandardisationService(ProfileStandardisationService):
+    """Define a standardisation service for KMCP profiles."""
+
+    @classmethod
+    @pa.check_types(lazy=True)
+    def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
+        """
+        Tidy up and standardize a given KMCP profile.
+
+        Args:
+            profile: A taxonomic profile generated by KMCP.
+
+        Returns:
+            A standardized profile.
+
+        """
+        temp = (
+            profile[[KMCPProfile.taxid, KMCPProfile.reads]]
+            .copy()
+            .rename(
+                columns={
+                    KMCPProfile.taxid: StandardProfile.taxonomy_id,
+                    KMCPProfile.reads: StandardProfile.count,
+                }
+            )
+        )
+        result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
+        result[StandardProfile.taxonomy_id] = result[
+            StandardProfile.taxonomy_id
+        ].astype(int)
+        # Replace missing values (unclassified reads) with ID zero and sum reads.
+        return pd.concat(
+            [
+                result,
+                pd.DataFrame(
+                    {
+                        StandardProfile.taxonomy_id: [0],
+                        StandardProfile.count: [
+                            temp.loc[
+                                temp[StandardProfile.taxonomy_id].isna(),
+                                StandardProfile.count,
+                            ].sum()
+                        ],
+                    },
+                    dtype=int,
+                ),
+            ],
+            ignore_index=True,
+        )
diff --git a/src/taxpasta/infrastructure/application/supported_profiler.py b/src/taxpasta/infrastructure/application/supported_profiler.py
@@ -31,6 +31,7 @@ class SupportedProfiler(str, Enum):
     diamond = "diamond"
     ganon = "ganon"
     kaiju = "kaiju"
+    kmcp = "kmcp"
     kraken2 = "kraken2"
     krakenuniq = "krakenuniq"
     megan6 = "megan6"

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -95,3 +95,9 @@ def motus_data_dir(data_dir: Path) -> Path:
 def ganon_data_dir(data_dir: Path) -> Path:
     """Provide the path to the ganon data directory."""
     return data_dir / "ganon"
+
+
+@pytest.fixture(scope="session")
+def kmcp_data_dir(data_dir: Path) -> Path:
+    """Provide the path to the KMCP data directory."""
+    return data_dir / "kmcp"
diff --git a/tests/data/kmcp/2612_pe_ERR5766176_db1.kmcp_profile.profile b/tests/data/kmcp/2612_pe_ERR5766176_db1.kmcp_profile.profile
@@ -0,0 +1,2 @@
+ref	percentage	coverage	score	chunksFrac	chunksRelDepth	chunksRelDepthStd	reads	ureads	hicureads	refsize	refname	taxid	rank	taxname	taxpath	taxpathsn
+test_1	100.000000	1.238685	83.97	1.00	1.00	0.00	114	114	25	13897		2697049		SARS-CoV-2