Merge pull request #4 from broadinstitute/qh/draft_toolbox
First draft of Toolbox use cases
jkgoodrich authored Dec 18, 2024
2 parents 8bde073 + c37ad6c commit ef83b20
Showing 17 changed files with 14,274 additions and 32 deletions.
31 changes: 31 additions & 0 deletions README.md
@@ -1,2 +1,33 @@
# gnomad-toolbox
This repository provides a set of Python functions to simplify working with gnomAD Hail Tables. It includes tools for data access, filtering, and analysis.

## Repository structure
```
gnomad_toolbox/
├── load_data.py # Functions to load gnomAD release Hail Tables.
├── filtering/
│ ├── __init__.py
│ ├── constraint.py # Functions to filter constraint metrics (e.g., observed/expected ratios).
│ ├── coverage.py # Functions to filter variants or regions based on coverage thresholds.
│ ├── frequency.py # Functions to filter variants by allele frequency thresholds.
│ ├── pext.py # Functions to filter variants using predicted expression (pext) scores.
│ ├── variant.py # Functions to filter to a specific variant or set of variants.
│ ├── vep.py # Functions to filter variants based on VEP (Variant Effect Predictor) annotations.
├── analysis/
│ ├── __init__.py
│ ├── general.py # General analysis functions, such as summarizing variant statistics.
├── notebooks/
│ ├── intro_to_release_data.ipynb # Jupyter notebook introducing the loading of gnomAD release data.
```

# TODO: Add fully detailed info about how to install and open the notebooks.
## Getting started
### Install

    pip install -r requirements.txt

### Opening the notebooks

    jupyter lab
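
### Example usage

Once the package is installed, the filtering and analysis helpers can be used from a Python session or notebook. The snippet below is a minimal sketch; the exact keyword arguments (for example, `data_type`) are assumptions about the loading helpers rather than a confirmed API.

```python
from gnomad_toolbox.analysis.general import get_variant_count_by_freq_bin
from gnomad_toolbox.filtering.frequency import get_ancestry_callstats

# Count PASS variants per frequency bin in a gnomAD exomes release table.
counts = get_variant_count_by_freq_bin(singletons=True, data_type="exomes")
print(counts)

# Call statistics for selected genetic ancestry groups.
afr_nfe_ht = get_ancestry_callstats(["afr", "nfe"], data_type="exomes")
afr_nfe_ht.show()
```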
45 changes: 14 additions & 31 deletions docs/generate_api_reference.py
@@ -21,16 +21,6 @@

EXCLUDE_PACKAGES = ["tests"]

EXCLUDE_TOP_LEVEL_PACKAGES = []
"""
List of packages/modules to exclude from API reference documentation.
This should be a list of strings where each string is the full name (from the top level)
of a package or module to exclude. For example, if 'gnomad_toolbox' includes a
'example_notebooks' that you want to exclude, you would add
'gnomad_toolbox.example_notebooks' to this list.
"""

PACKAGE_DOC_TEMPLATE = """{title}
{package_doc}
@@ -119,7 +109,11 @@ def write_module_doc(module_name):
write_file(doc_path, doc)


def write_package_doc(package_name):
def write_package_doc(
package_name,
package_doc=None,
doc_path=None,
):
"""Write API reference documentation file for a package."""
package = importlib.import_module(package_name)

@@ -139,32 +133,21 @@

doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title(package_name),
package_doc=package.__doc__ or "",
package_doc=package_doc or package.__doc__ or "",
module_links="\n ".join(module_links),
)

doc_path = package_doc_path(package)
doc_path = doc_path or package_doc_path(package)
write_file(doc_path, doc)


if __name__ == "__main__":
packages = setuptools.find_namespace_packages(
where=REPOSITORY_ROOT_PATH, include=["gnomad_toolbox.*"]
)
top_level_packages = [
pkg for pkg in packages if pkg.count(".") == 1 and pkg not in EXCLUDE_TOP_LEVEL_PACKAGES
]

for pkg in top_level_packages:
write_package_doc(pkg)

root_doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title("gnomad_toolbox"),
package_doc="",
module_links="\n ".join(
f"{pkg.split('.')[1]} <{pkg.split('.')[1]}/index>"
for pkg in top_level_packages
write_package_doc(
"gnomad_toolbox",
package_doc=(
"This repository provides a set of Python functions to simplify working "
"with gnomAD Hail Tables. It includes tools for data access, filtering, "
"and analysis."
),
doc_path=os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"),
)

write_file(os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"), root_doc)
1 change: 1 addition & 0 deletions gnomad_toolbox/analysis/__init__.py
@@ -0,0 +1 @@
# noqa: D104
128 changes: 128 additions & 0 deletions gnomad_toolbox/analysis/general.py
@@ -0,0 +1,128 @@
"""Set of general functions for gnomAD analysis."""

from typing import Dict, List, Optional, Tuple, Union

import hail as hl

from gnomad_toolbox.load_data import _get_gnomad_release


# TODO: Modify this function in gnomad_methods.
def freq_bin_expr(
freq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
index: int = 0,
ac_cutoffs: Optional[List[Union[int, Tuple[int, str]]]] = [
(0, "AC0"),
(1, "singleton"),
(2, "doubleton"),
],
af_cutoffs: Optional[List[Union[float, Tuple[float, str]]]] = [
(1e-4, "0.01%"),
(1e-3, "0.1%"),
(1e-2, "1%"),
(1e-1, "10%"),
],
upper_af: Optional[Union[float, Tuple[float, str]]] = (0.95, "95%"),
) -> hl.expr.StringExpression:
"""
Return frequency string annotations based on input AC or AF.
.. note::
- Default index is 0 because function assumes freq_expr was calculated with
`annotate_freq`.
- Frequency index 0 from `annotate_freq` is frequency for all pops calculated
on adj genotypes only.
:param freq_expr: Array of structs containing frequency information.
:param index: Which index of freq_expr to use for annotation. Default is 0.
:param ac_cutoffs: List of AC cutoffs to use for binning.
:param af_cutoffs: List of AF cutoffs to use for binning.
:param upper_af: Upper AF cutoff to use for binning.
:return: StringExpression containing bin name based on input AC or AF.
"""
if isinstance(freq_expr, hl.expr.ArrayExpression):
freq_expr = freq_expr[index]

if ac_cutoffs and isinstance(ac_cutoffs[0], int):
ac_cutoffs = [(c, f"AC{c}") for c in ac_cutoffs]

if af_cutoffs and isinstance(af_cutoffs[0], float):
af_cutoffs = [(f, f"{f*100}%") for f in af_cutoffs]

if isinstance(upper_af, float):
upper_af = (upper_af, f"{upper_af*100}%")

freq_bin_expr = hl.case().when(hl.is_missing(freq_expr.AC), "Missing")
prev_af = None
for ac, name in sorted(ac_cutoffs):
freq_bin_expr = freq_bin_expr.when(freq_expr.AC == ac, name)
prev_af = name

for af, name in sorted(af_cutoffs):
prev_af = "<" if prev_af is None else f"{prev_af} - "
freq_bin_expr = freq_bin_expr.when(freq_expr.AF < af, f"{prev_af}{name}")
prev_af = name

if upper_af:
freq_bin_expr = freq_bin_expr.when(
freq_expr.AF > upper_af[0], f">{upper_af[1]}"
)
default_af = "<" if prev_af is None else f"{prev_af} - "
default_af = f"{default_af}{upper_af[1]}"
else:
default_af = f">{prev_af}"

return freq_bin_expr.default(default_af)
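

# Example (illustrative sketch, not part of the original module): annotate a gnomAD
# sites Hail Table with its frequency bin and tally variants per bin, assuming `ht`
# is a release sites Table carrying the standard `freq` array of call-stat structs.
#
#     ht = ht.annotate(freq_bin=freq_bin_expr(ht.freq))
#     print(ht.aggregate(hl.agg.counter(ht.freq_bin)))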


def get_variant_count_by_freq_bin(
af_cutoffs: List[float] = [0.001, 0.01],
singletons: bool = False,
doubletons: bool = False,
pass_only: bool = True,
**kwargs,
) -> Dict[str, int]:
"""
Count variants by frequency bin.
By default, this function counts PASS variants that are AC0, AF < 0.01%, and
AF 0.01% - 0.1%.
The function can also include counts of singletons and doubletons, with or
without passing filters.
.. note::
This function works for gnomAD exomes and genomes data types, not yet for gnomAD
joint data type, since the HT schema is slightly different.
:param af_cutoffs: List of allele frequencies cutoffs.
:param singletons: Include singletons.
:param doubletons: Include doubletons.
:param pass_only: Include only PASS variants.
:param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
'ht', 'data_type', and 'version'.
:return: Dictionary with counts.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)

# Filter to PASS variants.
if pass_only:
ht = ht.filter(hl.len(ht.filters) == 0)

# Initialize allele count cutoffs with AC0.
ac_cutoffs = [(0, "AC0")]

if singletons:
ac_cutoffs.append((1, "singletons"))

if doubletons:
ac_cutoffs.append((2, "doubletons"))

freq_expr = freq_bin_expr(
ht.freq, ac_cutoffs=ac_cutoffs, af_cutoffs=af_cutoffs, upper_af=None
)

return ht.aggregate(hl.agg.counter(freq_expr))
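

# Example (illustrative sketch, not part of the original module): count PASS exome
# variants per frequency bin, including singletons. The 'data_type' keyword is
# forwarded to _get_gnomad_release, as documented above.
#
#     counts = get_variant_count_by_freq_bin(singletons=True, data_type="exomes")
#     print(counts)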
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/__init__.py
@@ -0,0 +1 @@
# noqa: D104
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/constraint.py
@@ -0,0 +1 @@
# noqa: D104, D100
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/coverage.py
@@ -0,0 +1 @@
# noqa: D104, D100
90 changes: 90 additions & 0 deletions gnomad_toolbox/filtering/frequency.py
@@ -0,0 +1,90 @@
"""Functions for filtering the gnomAD sites HT frequency data."""

from typing import List, Union

import hail as hl
from gnomad.utils.filtering import filter_arrays_by_meta

from gnomad_toolbox.filtering.variant import get_single_variant
from gnomad_toolbox.load_data import _get_gnomad_release


def get_ancestry_callstats(
gen_ancs: Union[str, List[str]],
**kwargs,
) -> hl.Table:
"""
Extract callstats for specified ancestry group(s).
:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to _get_gnomad_release.
:return: Table with callstats for the given ancestry groups and variant.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)

# Check if gen_ancs is a single ancestry group.
one_anc = isinstance(gen_ancs, str)

if one_anc:
gen_ancs = [gen_ancs]

# Format gen_ancs to lowercase and filter arrays by metadata.
gen_ancs = [gen_anc.lower() for gen_anc in gen_ancs]
gen_anc_label = (
"gen_anc" if any(["gen_anc" in m for m in hl.eval(ht.freq_meta)]) else "pop"
)
items_to_filter = {gen_anc_label: gen_ancs, "group": ["adj"]}
freq_meta, array_exprs = filter_arrays_by_meta(
ht.freq_meta,
{
**{a: ht[a] for a in ["freq"]},
"freq_meta_sample_count": ht.index_globals().freq_meta_sample_count,
},
items_to_filter=items_to_filter,
keep=True,
combine_operator="and",
exact_match=True,
)
ht = ht.select(
"filters",
**{
m[gen_anc_label]: array_exprs["freq"][i]
for i, m in enumerate(hl.eval(freq_meta))
},
)

# Select a subset of the globals.
sample_count = array_exprs["freq_meta_sample_count"]
if one_anc:
sample_count = sample_count[0]
else:
sample_count = hl.struct(
**{
m[gen_anc_label]: sample_count[i]
for i, m in enumerate(hl.eval(freq_meta))
}
)
ht = ht.select_globals("date", "version", sample_count=sample_count)

return ht
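

# Example (illustrative sketch, not part of the original module): adj call statistics
# for two genetic ancestry groups from a release sites Table. The 'data_type' value is
# an assumption about the options accepted by _get_gnomad_release.
#
#     afr_nfe_ht = get_ancestry_callstats(["afr", "nfe"], data_type="exomes")
#     afr_nfe_ht.show()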


def get_single_variant_ancestry_callstats(
gen_ancs: Union[str, List[str]],
**kwargs,
) -> hl.Table:
"""
Extract callstats for specified ancestry group(s) and a single variant.
:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to get_single_variant.
:return: Table with callstats for the given ancestry groups and variant.
"""
ht = get_single_variant(**kwargs)

return get_ancestry_callstats(gen_ancs, ht=ht)
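

# Example (illustrative sketch, not part of the original module): call statistics for
# one ancestry group at a single variant. The 'variant' keyword and the ID format are
# hypothetical placeholders; get_single_variant is defined in filtering/variant.py and
# is not shown in this diff.
#
#     eas_ht = get_single_variant_ancestry_callstats("eas", variant="chr1-12345-A-T")
#     eas_ht.show()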
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/pext.py
@@ -0,0 +1 @@
# noqa: D104, D100
