Merge pull request #280 from monarch-initiative/relax-handling-of-str…

…ict-mtc-filtering Relax handling of the MTC filtering
monarch-initiative · Sep 23, 2024 · 620378e · 620378e
2 parents 4e9f7ab + 8a9dc2c
commit 620378e
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 27 deletions.
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
@@ -309,8 +309,37 @@ def __init__(
         self._corrected_pvals = (
             None if corrected_pvals is None else tuple(corrected_pvals)
         )
-        assert isinstance(gt_predicate, GenotypePolyPredicate)
         self._gt_predicate = gt_predicate
+        errors = self._check_sanity()
+        if errors:
+            raise ValueError(os.linesep.join(errors))
+
+    def _check_sanity(self) -> typing.Sequence[str]:
+        errors = []
+        # All sequences must have the same lengths ...
+        for seq, name in (
+            (self._n_usable, 'n_usable'),
+            (self._all_counts, 'all_counts'),
+            (self._pvals, 'pvals'),
+        ):
+            if len(self._pheno_predicates) != len(seq):
+                errors.append(
+                    f"`len(pheno_predicates)` must be the same as `len({name})` but "
+                    f"{len(self._pheno_predicates)}!={len(seq)}"
+                )
+
+        # ... including the optional corrected p values
+        if self._corrected_pvals is not None and len(self._pheno_predicates) != len(self._corrected_pvals):
+            errors.append(
+                f"`len(pheno_predicates)` must be the same as `len(corrected_pvals)` but "
+                f"{len(self._pheno_predicates)}!={len(self._corrected_pvals)}"
+            )
+
+        if not isinstance(self._gt_predicate, GenotypePolyPredicate):
+            errors.append(
+                "`gt_predicate` must be an instance of `GenotypePolyPredicate`"
+            )
+        return errors
 
     @property
     def gt_predicate(self) -> GenotypePolyPredicate:
@@ -428,6 +457,19 @@ def __init__(
         self._mtc_filter_name = mtc_filter_name
         self._mtc_filter_results = tuple(mtc_filter_results)
         self._mtc_name = mtc_name
+
+        errors = self._check_hpo_result_sanity()
+        if errors:
+            raise ValueError(os.linesep.join(errors))
+
+    def _check_hpo_result_sanity(self) -> typing.Sequence[str]:
+        errors = []
+        if len(self._pheno_predicates) != len(self._mtc_filter_results):
+            errors.append(
+                f"`len(pheno_predicates)` must be the same as `len(mtc_filter_results)` but "
+                f"{len(self._pheno_predicates)}!={len(self._mtc_filter_results)}"
+            )
+        return errors
 
     @property
     def mtc_filter_name(self) -> str:
@@ -515,25 +557,26 @@ def _compute_result(
             ph_predicates=pheno_predicates,
             counts=all_counts,
         )
-        mtc_mask = np.array([r.is_passed() for r in mtc_filter_results])
-        if not mtc_mask.any():
-            raise ValueError("No phenotypes are left for the analysis after MTC filtering step")
 
-        # 3 - Compute nominal p values
         pvals = np.full(shape=(len(n_usable),), fill_value=np.nan)
-        pvals[mtc_mask] = self._compute_nominal_pvals(
-            n_usable=slice_list_in_numpy_style(n_usable, mtc_mask),
-            all_counts=slice_list_in_numpy_style(all_counts, mtc_mask),
-        )
+        corrected_pvals = None
 
-        # 4 - Apply Multiple Testing Correction
-        if self._mtc_correction is None:
-            corrected_pvals = None
-        else:
-            corrected_pvals = np.full(shape=pvals.shape, fill_value=np.nan)
-            # Do not test the p values that have been filtered out.
-            corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask])
+        mtc_mask = np.array([r.is_passed() for r in mtc_filter_results])
+        if mtc_mask.any():
+            # We have at least one HPO term to test.
+
+            # 3 - Compute nominal p values
+            pvals[mtc_mask] = self._compute_nominal_pvals(
+                n_usable=slice_list_in_numpy_style(n_usable, mtc_mask),
+                all_counts=slice_list_in_numpy_style(all_counts, mtc_mask),
+            )
 
+            # 4 - Apply Multiple Testing Correction
+            if self._mtc_correction is not None:
+                corrected_pvals = np.full(shape=pvals.shape, fill_value=np.nan)
+                # Do not test the p values that have been filtered out.
+                corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask])
+
         return HpoTermAnalysisResult(
             pheno_predicates=pheno_predicates,
             n_usable=n_usable,

diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py
@@ -1,6 +1,7 @@
 import typing
 
 import hpotk
+import numpy as np
 import pytest
 
 from gpsea.model import Cohort
@@ -64,7 +65,7 @@ def test_compare_genotype_vs_phenotypes(
                 0.48571428571428565,
                 float("nan"),
                 0.1048951048951049,
-                1.,
+                1.0,
             ],
             nan_ok=True,
         )
@@ -80,21 +81,21 @@ def test_compare_genotype_vs_phenotypes(
             nan_ok=True,
         )
 
-    def test_compare_genotype_vs_phenotypes_explodes_if_no_phenotypes_are_left_after_mtc_filter(
+    def test_compare_genotype_vs_phenotypes_can_handle_if_no_phenotypes_are_left_after_mtc_filter(
         self,
         analysis: HpoTermAnalysis,
         degenerated_cohort: Cohort,
         suox_gt_predicate: GenotypePolyPredicate,
         suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
     ):
-        with pytest.raises(ValueError) as e:
-            analysis.compare_genotype_vs_phenotypes(
-                cohort=degenerated_cohort,
-                gt_predicate=suox_gt_predicate,
-                pheno_predicates=suox_pheno_predicates,
-            )
+        result = analysis.compare_genotype_vs_phenotypes(
+            cohort=degenerated_cohort,
+            gt_predicate=suox_gt_predicate,
+            pheno_predicates=suox_pheno_predicates,
+        )
 
         assert (
-            e.value.args[0]
-            == "No phenotypes are left for the analysis after MTC filtering step"
-        )
+            result.total_tests == 0
+        ), "No tests should have been done due to MTC filtering"
+        assert np.all(np.isnan(result.pvals)), "All p values should be NaN"
+        assert result.corrected_pvals is None