Merge pull request #4 from broadinstitute/qh/draft_toolbox
First draft of Toolbox use cases
jkgoodrich authored Dec 18, 2024
2 parents 8bde073 + c37ad6c commit ef83b20
Showing 17 changed files with 14,274 additions and 32 deletions.
31 changes: 31 additions & 0 deletions README.md
@@ -1,2 +1,33 @@
# gnomad-toolbox
This repository provides a set of Python functions to simplify working with gnomAD Hail Tables. It includes tools for data access, filtering, and analysis.

## Repository structure
```
gnomad_toolbox/
├── load_data.py # Functions to load gnomAD release Hail Tables.
├── filtering/
│ ├── __init__.py
│ ├── constraint.py # Functions to filter constraint metrics (e.g., observed/expected ratios).
│ ├── coverage.py # Functions to filter variants or regions based on coverage thresholds.
│ ├── frequency.py # Functions to filter variants by allele frequency thresholds.
│ ├── pext.py # Functions to filter variants using predicted expression (pext) scores.
│ ├── variant.py # Functions to filter to a specific variant or set of variants.
│ ├── vep.py # Functions to filter variants based on VEP (Variant Effect Predictor) annotations.
├── analysis/
│ ├── __init__.py
│ ├── general.py # General analysis functions, such as summarizing variant statistics.
├── notebooks/
│ ├── intro_to_release_data.ipynb # Jupyter notebook introducing the loading of gnomAD release data.
```

# TODO: Add fully detailed info about how to install and open the notebooks.
## Getting started
### Install

    pip install -r requirements.txt

### Opening the notebooks

    jupyter lab
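
### Example usage

Once the package is installed, the filtering and analysis helpers can be used from a Python session or notebook. The snippet below is a minimal sketch; the exact keyword arguments (for example, `data_type`) are assumptions about the loading helpers rather than a confirmed API.

```python
from gnomad_toolbox.analysis.general import get_variant_count_by_freq_bin
from gnomad_toolbox.filtering.frequency import get_ancestry_callstats

# Count PASS variants per frequency bin in a gnomAD exomes release table.
counts = get_variant_count_by_freq_bin(singletons=True, data_type="exomes")
print(counts)

# Call statistics for selected genetic ancestry groups.
afr_nfe_ht = get_ancestry_callstats(["afr", "nfe"], data_type="exomes")
afr_nfe_ht.show()
```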
45 changes: 14 additions & 31 deletions docs/generate_api_reference.py
@@ -21,16 +21,6 @@

EXCLUDE_PACKAGES = ["tests"]

EXCLUDE_TOP_LEVEL_PACKAGES = []
"""
List of packages/modules to exclude from API reference documentation.
This should be a list of strings where each string is the full name (from the top level)
of a package or module to exclude. For example, if 'gnomad_toolbox' includes a
'example_notebooks' that you want to exclude, you would add
'gnomad_toolbox.example_notebooks' to this list.
"""

PACKAGE_DOC_TEMPLATE = """{title}
{package_doc}
@@ -119,7 +109,11 @@ def write_module_doc(module_name):
write_file(doc_path, doc)


def write_package_doc(package_name):
def write_package_doc(
package_name,
package_doc=None,
doc_path=None,
):
"""Write API reference documentation file for a package."""
package = importlib.import_module(package_name)

@@ -139,32 +133,21 @@

doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title(package_name),
package_doc=package.__doc__ or "",
package_doc=package_doc or package.__doc__ or "",
module_links="\n ".join(module_links),
)

doc_path = package_doc_path(package)
doc_path = doc_path or package_doc_path(package)
write_file(doc_path, doc)


if __name__ == "__main__":
packages = setuptools.find_namespace_packages(
where=REPOSITORY_ROOT_PATH, include=["gnomad_toolbox.*"]
)
top_level_packages = [
pkg for pkg in packages if pkg.count(".") == 1 and pkg not in EXCLUDE_TOP_LEVEL_PACKAGES
]

for pkg in top_level_packages:
write_package_doc(pkg)

root_doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title("gnomad_toolbox"),
package_doc="",
module_links="\n ".join(
f"{pkg.split('.')[1]} <{pkg.split('.')[1]}/index>"
for pkg in top_level_packages
write_package_doc(
"gnomad_toolbox",
package_doc=(
"This repository provides a set of Python functions to simplify working "
"with gnomAD Hail Tables. It includes tools for data access, filtering, "
"and analysis."
),
doc_path=os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"),
)

write_file(os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"), root_doc)
1 change: 1 addition & 0 deletions gnomad_toolbox/analysis/__init__.py
@@ -0,0 +1 @@
# noqa: D104
128 changes: 128 additions & 0 deletions gnomad_toolbox/analysis/general.py
@@ -0,0 +1,128 @@
"""Set of general functions for gnomAD analysis."""

from typing import Dict, List, Optional, Tuple, Union

import hail as hl

from gnomad_toolbox.load_data import _get_gnomad_release


# TODO: Modify this function in gnomad_methods.
def freq_bin_expr(
freq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
index: int = 0,
ac_cutoffs: Optional[List[Union[int, Tuple[int, str]]]] = [
(0, "AC0"),
(1, "singleton"),
(2, "doubleton"),
],
af_cutoffs: Optional[List[Union[float, Tuple[float, str]]]] = [
(1e-4, "0.01%"),
(1e-3, "0.1%"),
(1e-2, "1%"),
(1e-1, "10%"),
],
upper_af: Optional[Union[float, Tuple[float, str]]] = (0.95, "95%"),
) -> hl.expr.StringExpression:
"""
Return frequency string annotations based on input AC or AF.
.. note::
- Default index is 0 because function assumes freq_expr was calculated with
`annotate_freq`.
- Frequency index 0 from `annotate_freq` is frequency for all pops calculated
on adj genotypes only.
:param freq_expr: Array of structs containing frequency information.
:param index: Which index of freq_expr to use for annotation. Default is 0.
:param ac_cutoffs: List of AC cutoffs to use for binning.
:param af_cutoffs: List of AF cutoffs to use for binning.
:param upper_af: Upper AF cutoff to use for binning.
:return: StringExpression containing bin name based on input AC or AF.
"""
if isinstance(freq_expr, hl.expr.ArrayExpression):
freq_expr = freq_expr[index]

if ac_cutoffs and isinstance(ac_cutoffs[0], int):
ac_cutoffs = [(c, f"AC{c}") for c in ac_cutoffs]

if af_cutoffs and isinstance(af_cutoffs[0], float):
af_cutoffs = [(f, f"{f*100}%") for f in af_cutoffs]

if isinstance(upper_af, float):
upper_af = (upper_af, f"{upper_af*100}%")

freq_bin_expr = hl.case().when(hl.is_missing(freq_expr.AC), "Missing")
prev_af = None
for ac, name in sorted(ac_cutoffs):
freq_bin_expr = freq_bin_expr.when(freq_expr.AC == ac, name)
prev_af = name

for af, name in sorted(af_cutoffs):
prev_af = "<" if prev_af is None else f"{prev_af} - "
freq_bin_expr = freq_bin_expr.when(freq_expr.AF < af, f"{prev_af}{name}")
prev_af = name

if upper_af:
freq_bin_expr = freq_bin_expr.when(
freq_expr.AF > upper_af[0], f">{upper_af[1]}"
)
default_af = "<" if prev_af is None else f"{prev_af} - "
default_af = f"{default_af}{upper_af[1]}"
else:
default_af = f">{prev_af}"

return freq_bin_expr.default(default_af)
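

# Example (illustrative sketch, not part of the original module): annotate a gnomAD
# sites Hail Table with its frequency bin and tally variants per bin, assuming `ht`
# is a release sites Table carrying the standard `freq` array of call-stat structs.
#
#     ht = ht.annotate(freq_bin=freq_bin_expr(ht.freq))
#     print(ht.aggregate(hl.agg.counter(ht.freq_bin)))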


def get_variant_count_by_freq_bin(
af_cutoffs: List[float] = [0.001, 0.01],
singletons: bool = False,
doubletons: bool = False,
pass_only: bool = True,
**kwargs,
) -> Dict[str, int]:
"""
Count variants by frequency bin.
By default, this function counts PASS variants that are AC0, AF < 0.01%, and
AF 0.01% - 0.1%.
The function can also include counts of singletons and doubletons, with or
without passing filters.
.. note::
This function works for gnomAD exomes and genomes data types, not yet for gnomAD
joint data type, since the HT schema is slightly different.
:param af_cutoffs: List of allele frequencies cutoffs.
:param singletons: Include singletons.
:param doubletons: Include doubletons.
:param pass_only: Include only PASS variants.
:param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
'ht', 'data_type', and 'version'.
:return: Dictionary with counts.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)

# Filter to PASS variants.
if pass_only:
ht = ht.filter(hl.len(ht.filters) == 0)

# Initialize allele count cutoffs with AC0.
ac_cutoffs = [(0, "AC0")]

if singletons:
ac_cutoffs.append((1, "singletons"))

if doubletons:
ac_cutoffs.append((2, "doubletons"))

freq_expr = freq_bin_expr(
ht.freq, ac_cutoffs=ac_cutoffs, af_cutoffs=af_cutoffs, upper_af=None
)

return ht.aggregate(hl.agg.counter(freq_expr))
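

# Example (illustrative sketch, not part of the original module): count PASS exome
# variants per frequency bin, including singletons. The 'data_type' keyword is
# forwarded to _get_gnomad_release, as documented above.
#
#     counts = get_variant_count_by_freq_bin(singletons=True, data_type="exomes")
#     print(counts)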
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/__init__.py
@@ -0,0 +1 @@
# noqa: D104
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/constraint.py
@@ -0,0 +1 @@
# noqa: D104, D100
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/coverage.py
@@ -0,0 +1 @@
# noqa: D104, D100
90 changes: 90 additions & 0 deletions gnomad_toolbox/filtering/frequency.py
@@ -0,0 +1,90 @@
"""Functions for filtering the gnomAD sites HT frequency data."""

from typing import List, Union

import hail as hl
from gnomad.utils.filtering import filter_arrays_by_meta

from gnomad_toolbox.filtering.variant import get_single_variant
from gnomad_toolbox.load_data import _get_gnomad_release


def get_ancestry_callstats(
gen_ancs: Union[str, List[str]],
**kwargs,
) -> hl.Table:
"""
Extract callstats for specified ancestry group(s).
:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to _get_gnomad_release.
:return: Table with callstats for the given ancestry groups and variant.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)

# Check if gen_ancs is a single ancestry group.
one_anc = isinstance(gen_ancs, str)

if one_anc:
gen_ancs = [gen_ancs]

# Format gen_ancs to lowercase and filter arrays by metadata.
gen_ancs = [gen_anc.lower() for gen_anc in gen_ancs]
gen_anc_label = (
"gen_anc" if any(["gen_anc" in m for m in hl.eval(ht.freq_meta)]) else "pop"
)
items_to_filter = {gen_anc_label: gen_ancs, "group": ["adj"]}
freq_meta, array_exprs = filter_arrays_by_meta(
ht.freq_meta,
{
**{a: ht[a] for a in ["freq"]},
"freq_meta_sample_count": ht.index_globals().freq_meta_sample_count,
},
items_to_filter=items_to_filter,
keep=True,
combine_operator="and",
exact_match=True,
)
ht = ht.select(
"filters",
**{
m[gen_anc_label]: array_exprs["freq"][i]
for i, m in enumerate(hl.eval(freq_meta))
},
)

# Select a subset of the globals.
sample_count = array_exprs["freq_meta_sample_count"]
if one_anc:
sample_count = sample_count[0]
else:
sample_count = hl.struct(
**{
m[gen_anc_label]: sample_count[i]
for i, m in enumerate(hl.eval(freq_meta))
}
)
ht = ht.select_globals("date", "version", sample_count=sample_count)

return ht
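

# Example (illustrative sketch, not part of the original module): adj call statistics
# for two genetic ancestry groups from a release sites Table. The 'data_type' value is
# an assumption about the options accepted by _get_gnomad_release.
#
#     afr_nfe_ht = get_ancestry_callstats(["afr", "nfe"], data_type="exomes")
#     afr_nfe_ht.show()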


def get_single_variant_ancestry_callstats(
gen_ancs: Union[str, List[str]],
**kwargs,
) -> hl.Table:
"""
Extract callstats for specified ancestry group(s) and a single variant.
:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to get_single_variant.
:return: Table with callstats for the given ancestry groups and variant.
"""
ht = get_single_variant(**kwargs)

return get_ancestry_callstats(gen_ancs, ht=ht)
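

# Example (illustrative sketch, not part of the original module): call statistics for
# one ancestry group at a single variant. The 'variant' keyword and the ID format are
# hypothetical placeholders; get_single_variant is defined in filtering/variant.py and
# is not shown in this diff.
#
#     eas_ht = get_single_variant_ancestry_callstats("eas", variant="chr1-12345-A-T")
#     eas_ht.show()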
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/pext.py
@@ -0,0 +1 @@
# noqa: D104, D100
