Merge pull request #4 from broadinstitute/qh/draft_toolbox
First draft of Toolbox use cases
Showing 17 changed files with 14,274 additions and 32 deletions.
@@ -1,2 +1,33 @@
# gnomad-toolbox
This repository provides a set of Python functions to simplify working with gnomAD Hail Tables. It includes tools for data access, filtering, and analysis.

## Repository structure
```
gnomad_toolbox/
│
├── load_data.py       # Functions to load gnomAD release Hail Tables.
│
├── filtering/
│   ├── __init__.py
│   ├── constraint.py  # Functions to filter constraint metrics (e.g., observed/expected ratios).
│   ├── coverage.py    # Functions to filter variants or regions based on coverage thresholds.
│   ├── frequency.py   # Functions to filter variants by allele frequency thresholds.
│   ├── pext.py        # Functions to filter variants using predicted expression (pext) scores.
│   ├── variant.py     # Functions to filter to a specific variant or set of variants.
│   ├── vep.py         # Functions to filter variants based on VEP (Variant Effect Predictor) annotations.
│
├── analysis/
│   ├── __init__.py
│   ├── general.py     # General analysis functions, such as summarizing variant statistics.
│
├── notebooks/
│   ├── intro_to_release_data.ipynb  # Jupyter notebook introducing the loading of gnomAD release data.
```

# TODO: Add fully detailed info about how to install and open the notebooks.
## Getting started
### Install
pip install -r requirements.txt

### Opening the notebooks
jupyter lab
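
A minimal sketch of calling the toolbox functions added in this PR from a Python session. The module paths follow the repository structure above; the `data_type` and `version` values are illustrative assumptions, not a fixed API:

```python
from gnomad_toolbox.analysis.general import get_variant_count_by_freq_bin
from gnomad_toolbox.filtering.frequency import get_ancestry_callstats

# Count PASS variants per frequency bin (AC0, singletons excluded by default).
# The data_type/version values here are assumptions for the example.
counts = get_variant_count_by_freq_bin(data_type="exomes", version="4.1")
print(counts)

# Pull adj callstats for two genetic ancestry groups from the same release.
callstats_ht = get_ancestry_callstats(["afr", "nfe"], data_type="exomes", version="4.1")
callstats_ht.show()
```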
@@ -0,0 +1 @@
# noqa: D104
@@ -0,0 +1,128 @@
"""Set of general functions for gnomAD analysis."""

from typing import Dict, List, Optional, Tuple, Union

import hail as hl

from gnomad_toolbox.load_data import _get_gnomad_release


# TODO: Modify this function in gnomad_methods.
def freq_bin_expr(
    freq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
    index: int = 0,
    ac_cutoffs: Optional[List[Union[int, Tuple[int, str]]]] = [
        (0, "AC0"),
        (1, "singleton"),
        (2, "doubleton"),
    ],
    af_cutoffs: Optional[List[Union[float, Tuple[float, str]]]] = [
        (1e-4, "0.01%"),
        (1e-3, "0.1%"),
        (1e-2, "1%"),
        (1e-1, "10%"),
    ],
    upper_af: Optional[Union[float, Tuple[float, str]]] = (0.95, "95%"),
) -> hl.expr.StringExpression:
    """
    Return frequency string annotations based on input AC or AF.

    .. note::

        - Default index is 0 because the function assumes freq_expr was calculated
          with `annotate_freq`.
        - Frequency index 0 from `annotate_freq` is the frequency for all pops
          calculated on adj genotypes only.

    :param freq_expr: Array of structs containing frequency information.
    :param index: Which index of freq_expr to use for annotation. Default is 0.
    :param ac_cutoffs: List of AC cutoffs to use for binning.
    :param af_cutoffs: List of AF cutoffs to use for binning.
    :param upper_af: Upper AF cutoff to use for binning.
    :return: StringExpression containing bin name based on input AC or AF.
    """
    if isinstance(freq_expr, hl.expr.ArrayExpression):
        freq_expr = freq_expr[index]

    if ac_cutoffs and isinstance(ac_cutoffs[0], int):
        ac_cutoffs = [(c, f"AC{c}") for c in ac_cutoffs]

    if af_cutoffs and isinstance(af_cutoffs[0], float):
        af_cutoffs = [(f, f"{f*100}%") for f in af_cutoffs]

    if isinstance(upper_af, float):
        upper_af = (upper_af, f"{upper_af*100}%")

    freq_bin_expr = hl.case().when(hl.is_missing(freq_expr.AC), "Missing")
    prev_af = None
    for ac, name in sorted(ac_cutoffs):
        freq_bin_expr = freq_bin_expr.when(freq_expr.AC == ac, name)
        prev_af = name

    for af, name in sorted(af_cutoffs):
        prev_af = "<" if prev_af is None else f"{prev_af} - "
        freq_bin_expr = freq_bin_expr.when(freq_expr.AF < af, f"{prev_af}{name}")
        prev_af = name

    if upper_af:
        freq_bin_expr = freq_bin_expr.when(
            freq_expr.AF > upper_af[0], f">{upper_af[1]}"
        )
        default_af = "<" if prev_af is None else f"{prev_af} - "
        default_af = f"{default_af}{upper_af[1]}"
    else:
        default_af = f">{prev_af}"

    return freq_bin_expr.default(default_af)
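
# Example (illustrative sketch, not part of this module): `freq_bin_expr` is
# typically used to label a release sites HT with a human-readable frequency bin:
#
#     ht = ht.annotate(freq_bin=freq_bin_expr(ht.freq))
#     ht.aggregate(hl.agg.counter(ht.freq_bin))
#
# Here `ht` is assumed to be a gnomAD release HT whose `freq` array follows the
# `annotate_freq` schema (index 0 = adj frequencies across all samples).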


def get_variant_count_by_freq_bin(
    af_cutoffs: List[float] = [0.001, 0.01],
    singletons: bool = False,
    doubletons: bool = False,
    pass_only: bool = True,
    **kwargs,
) -> Dict[str, int]:
    """
    Count variants by frequency bin.

    By default, this function counts PASS variants that are AC0, AF < 0.01%, and
    AF 0.01% - 0.1%.

    The function can also include counts of singletons and doubletons, with or
    without passing filters.

    .. note::

        This function works for the gnomAD exomes and genomes data types, but not
        yet for the gnomAD joint data type, since the HT schema is slightly
        different.

    :param af_cutoffs: List of allele frequency cutoffs.
    :param singletons: Include singletons.
    :param doubletons: Include doubletons.
    :param pass_only: Include only PASS variants.
    :param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
        'ht', 'data_type', and 'version'.
    :return: Dictionary with counts.
    """
    # Load the Hail Table if not provided.
    ht = _get_gnomad_release(dataset="variant", **kwargs)

    # Filter to PASS variants.
    if pass_only:
        ht = ht.filter(hl.len(ht.filters) == 0)

    # Initialize allele count cutoffs with AC0.
    ac_cutoffs = [(0, "AC0")]

    if singletons:
        ac_cutoffs.append((1, "singletons"))

    if doubletons:
        ac_cutoffs.append((2, "doubletons"))

    freq_expr = freq_bin_expr(
        ht.freq, ac_cutoffs=ac_cutoffs, af_cutoffs=af_cutoffs, upper_af=None
    )

    return ht.aggregate(hl.agg.counter(freq_expr))
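
# Example (illustrative sketch): count PASS variants, including singletons, for a
# given release. The `data_type` and `version` values below are assumptions for
# the example; any keyword accepted by `_get_gnomad_release` can be passed instead.
#
#     get_variant_count_by_freq_bin(
#         af_cutoffs=[0.0001, 0.001],
#         singletons=True,
#         data_type="exomes",
#         version="4.1",
#     )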
@@ -0,0 +1 @@
# noqa: D104
@@ -0,0 +1 @@
# noqa: D104, D100
@@ -0,0 +1 @@
# noqa: D104, D100
@@ -0,0 +1,90 @@
"""Functions for filtering the gnomAD sites HT frequency data."""

from typing import List, Union

import hail as hl
from gnomad.utils.filtering import filter_arrays_by_meta

from gnomad_toolbox.filtering.variant import get_single_variant
from gnomad_toolbox.load_data import _get_gnomad_release


def get_ancestry_callstats(
    gen_ancs: Union[str, List[str]],
    **kwargs,
) -> hl.Table:
    """
    Extract callstats for specified ancestry group(s).

    :param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
        'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
        ancestry groups.
    :param kwargs: Keyword arguments to pass to _get_gnomad_release.
    :return: Table with callstats for the given ancestry group(s).
    """
    # Load the Hail Table if not provided.
    ht = _get_gnomad_release(dataset="variant", **kwargs)

    # Check if gen_ancs is a single ancestry group.
    one_anc = isinstance(gen_ancs, str)

    if one_anc:
        gen_ancs = [gen_ancs]

    # Format gen_ancs to lowercase and filter arrays by metadata.
    gen_ancs = [gen_anc.lower() for gen_anc in gen_ancs]
    gen_anc_label = (
        "gen_anc" if any(["gen_anc" in m for m in hl.eval(ht.freq_meta)]) else "pop"
    )
    items_to_filter = {gen_anc_label: gen_ancs, "group": ["adj"]}
    freq_meta, array_exprs = filter_arrays_by_meta(
        ht.freq_meta,
        {
            **{a: ht[a] for a in ["freq"]},
            "freq_meta_sample_count": ht.index_globals().freq_meta_sample_count,
        },
        items_to_filter=items_to_filter,
        keep=True,
        combine_operator="and",
        exact_match=True,
    )
    ht = ht.select(
        "filters",
        **{
            m[gen_anc_label]: array_exprs["freq"][i]
            for i, m in enumerate(hl.eval(freq_meta))
        },
    )

    # Select a subset of the globals.
    sample_count = array_exprs["freq_meta_sample_count"]
    if one_anc:
        sample_count = sample_count[0]
    else:
        sample_count = hl.struct(
            **{
                m[gen_anc_label]: sample_count[i]
                for i, m in enumerate(hl.eval(freq_meta))
            }
        )
    ht = ht.select_globals("date", "version", sample_count=sample_count)

    return ht
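
# Example (illustrative sketch): adj callstats for two genetic ancestry groups.
# The `data_type`/`version` values are assumptions for the example.
#
#     get_ancestry_callstats(["afr", "nfe"], data_type="exomes", version="4.1").show()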


def get_single_variant_ancestry_callstats(
    gen_ancs: Union[str, List[str]],
    **kwargs,
) -> hl.Table:
    """
    Extract callstats for specified ancestry group(s) and a single variant.

    :param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
        'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
        ancestry groups.
    :param kwargs: Keyword arguments to pass to get_single_variant.
    :return: Table with callstats for the given ancestry groups and variant.
    """
    ht = get_single_variant(**kwargs)

    return get_ancestry_callstats(gen_ancs, ht=ht)
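
# Example (illustrative sketch): callstats for one ancestry group at a single
# variant. The keyword accepted by `get_single_variant` (shown here as a `variant`
# string) is an assumption, not confirmed by this module.
#
#     get_single_variant_ancestry_callstats("afr", variant="1-55051215-G-GA").show()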
@@ -0,0 +1 @@
# noqa: D104, D100