Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First draft of Toolbox use cases #4

Merged
merged 34 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
e835bf6
Add a draft notebook
KoalaQin Sep 27, 2024
c9f0e22
Add more use cases in notebook
KoalaQin Sep 30, 2024
a44e72d
Add example to get AF for one ancestry for one variant
KoalaQin Sep 30, 2024
4bc936c
Add table of contents and screenshot
KoalaQin Oct 1, 2024
2f1156e
Format gnomad_methods in requirements.txt
KoalaQin Oct 1, 2024
ff64d86
Remove setup.py
KoalaQin Oct 1, 2024
bc85e6f
Remove setup.py
KoalaQin Oct 1, 2024
e083fbf
Reorganize the notebook
KoalaQin Oct 1, 2024
b5101ba
Function to import data by version
KoalaQin Oct 29, 2024
2bd494e
Modify import_data function to match gnomad_methods repo version by d…
KoalaQin Oct 29, 2024
3cfb1bd
Formatting
KoalaQin Oct 29, 2024
8d904cf
Formatting
KoalaQin Oct 29, 2024
49b2cb0
Formatting
KoalaQin Oct 29, 2024
a7c8e53
Formatting
KoalaQin Oct 29, 2024
6d3471f
Formatting
KoalaQin Oct 29, 2024
1b847b4
Formatting
KoalaQin Oct 29, 2024
22c40f7
Formatting docstring block
KoalaQin Oct 29, 2024
66770ea
Function to extract callstats for one variant in one genetic ancestry…
KoalaQin Oct 29, 2024
71a33bd
Function to extract callstats for genetic ancestry groups
KoalaQin Oct 29, 2024
80b1f91
indent correctly
KoalaQin Oct 29, 2024
8dcce6c
Add functions to filter variants by gene symbol, interval and csqs
KoalaQin Nov 1, 2024
34fab76
Correct small errors
KoalaQin Nov 2, 2024
0ccdb5b
Update notebook
KoalaQin Nov 2, 2024
c183470
- Restructure files
jkgoodrich Dec 9, 2024
095a234
- Restructure files
jkgoodrich Dec 9, 2024
5fe3010
- Modifications to support setting a default data_type and version
jkgoodrich Dec 9, 2024
5330ea6
More clean-up of notebooks and functions
jkgoodrich Dec 11, 2024
b853fc2
Add notebooks to git
jkgoodrich Dec 11, 2024
710bdcf
Fix unterminated string error
KoalaQin Dec 13, 2024
4bfc305
Apply suggestions from code review
jkgoodrich Dec 18, 2024
ecb664b
format
jkgoodrich Dec 18, 2024
06de980
Put back setup.py
KoalaQin Dec 18, 2024
d09953c
Merge pull request #5 from broadinstitute/jg/draft_toolbox
jkgoodrich Dec 18, 2024
c37ad6c
move setup to the correct location
jkgoodrich Dec 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,33 @@
# gnomad-toolbox
This repository provides a set of Python functions to simplify working with gnomAD Hail Tables. It includes tools for data access, filtering, and analysis.

## Repository structure
```
ggnomad_toolbox/
├── load_data.py # Functions to load gnomAD release Hail Tables.
├── filtering/
│ ├── __init__.py
│ ├── constraint.py # Functions to filter constraint metrics (e.g., observed/expected ratios).
│ ├── coverage.py # Functions to filter variants or regions based on coverage thresholds.
│ ├── frequency.py # Functions to filter variants by allele frequency thresholds.
│ ├── pext.py # Functions to filter variants using predicted expression (pext) scores.
| ├── variant.py # Functions to filter to a specific variant or set of variants.
│ ├── vep.py # Functions to filter variants based on VEP (Variant Effect Predictor) annotations.
├── analysis/
│ ├── __init__.py
│ ├── general.py # General analysis functions, such as summarizing variant statistics.
├── notebooks/
│ ├── intro_to_release_data.ipynb # Jupyter notebook introducing the loading of gnomAD release data.
```

# TODO: Add fully detailed info about how to install and open the notebooks.
## Getting started
### Install
pip install -r requirements.txt

### Opening the notebooks
jupyter lab
45 changes: 14 additions & 31 deletions docs/generate_api_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,6 @@

EXCLUDE_PACKAGES = ["tests"]

EXCLUDE_TOP_LEVEL_PACKAGES = []
"""
List of packages/modules to exclude from API reference documentation.

This should be a list of strings where each string is the full name (from the top level)
of a package or module to exclude. For example, if 'gnomad_toolbox' includes a
'example_notebooks' that you want to exclude, you would add
'gnomad_toolbox.example_notebooks' to this list.
"""

PACKAGE_DOC_TEMPLATE = """{title}

{package_doc}
Expand Down Expand Up @@ -119,7 +109,11 @@ def write_module_doc(module_name):
write_file(doc_path, doc)


def write_package_doc(package_name):
def write_package_doc(
package_name,
package_doc = None,
doc_path = None,
):
"""Write API reference documentation file for a package."""
package = importlib.import_module(package_name)

Expand All @@ -139,32 +133,21 @@ def write_package_doc(package_name):

doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title(package_name),
package_doc=package.__doc__ or "",
package_doc= package_doc or package.__doc__ or "",
module_links="\n ".join(module_links),
)

doc_path = package_doc_path(package)
doc_path = doc_path or package_doc_path(package)
write_file(doc_path, doc)


if __name__ == "__main__":
packages = setuptools.find_namespace_packages(
where=REPOSITORY_ROOT_PATH, include=["gnomad_toolbox.*"]
)
top_level_packages = [
pkg for pkg in packages if pkg.count(".") == 1 and pkg not in EXCLUDE_TOP_LEVEL_PACKAGES
]

for pkg in top_level_packages:
write_package_doc(pkg)

root_doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title("gnomad_toolbox"),
package_doc="",
module_links="\n ".join(
f"{pkg.split('.')[1]} <{pkg.split('.')[1]}/index>"
for pkg in top_level_packages
write_package_doc(
"gnomad_toolbox",
package_doc=(
"This repository provides a set of Python functions to simplify working "
"with gnomAD Hail Tables. It includes tools for data access, filtering, "
"and analysis."
),
doc_path=os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"),
)

write_file(os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"), root_doc)
1 change: 1 addition & 0 deletions gnomad_toolbox/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104
128 changes: 128 additions & 0 deletions gnomad_toolbox/analysis/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Set of general functions for gnomAD analysis."""

from typing import Dict, List, Optional, Tuple, Union

import hail as hl

from gnomad_toolbox.load_data import _get_gnomad_release


# TODO: Modify this function in gnomad_methods.
def freq_bin_expr(
freq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
index: int = 0,
ac_cutoffs: Optional[List[Union[int, Tuple[int, str]]]] = [
(0, "AC0"),
(1, "singleton"),
(2, "doubleton"),
],
af_cutoffs: Optional[List[Union[float, Tuple[float, str]]]] = [
(1e-4, "0.01%"),
(1e-3, "0.1%"),
(1e-2, "1%"),
(1e-1, "10%"),
],
upper_af: Optional[Union[float, Tuple[float, str]]] = (0.95, "95%"),
) -> hl.expr.StringExpression:
"""
Return frequency string annotations based on input AC or AF.

.. note::

- Default index is 0 because function assumes freq_expr was calculated with
`annotate_freq`.
- Frequency index 0 from `annotate_freq` is frequency for all pops calculated
on adj genotypes only.

:param freq_expr: Array of structs containing frequency information.
:param index: Which index of freq_expr to use for annotation. Default is 0.
:param ac_cutoffs: List of AC cutoffs to use for binning.
:param af_cutoffs: List of AF cutoffs to use for binning.
:param upper_af: Upper AF cutoff to use for binning.
:return: StringExpression containing bin name based on input AC or AF.
"""
if isinstance(freq_expr, hl.expr.ArrayExpression):
freq_expr = freq_expr[index]

if ac_cutoffs and isinstance(ac_cutoffs[0], int):
ac_cutoffs = [(c, f"AC{c}") for c in ac_cutoffs]

if af_cutoffs and isinstance(af_cutoffs[0], float):
af_cutoffs = [(f, f"{f*100}%") for f in af_cutoffs]

if isinstance(upper_af, float):
upper_af = (upper_af, f"{upper_af*100}%")

freq_bin_expr = hl.case().when(hl.is_missing(freq_expr.AC), "Missing")
prev_af = None
for ac, name in sorted(ac_cutoffs):
freq_bin_expr = freq_bin_expr.when(freq_expr.AC == ac, name)
prev_af = name

for af, name in sorted(af_cutoffs):
prev_af = "<" if prev_af is None else f"{prev_af} - "
freq_bin_expr = freq_bin_expr.when(freq_expr.AF < af, f"{prev_af}{name}")
prev_af = name

if upper_af:
freq_bin_expr = freq_bin_expr.when(
freq_expr.AF > upper_af[0], f">{upper_af[1]}"
)
default_af = "<" if prev_af is None else f"{prev_af} - "
default_af = f"{default_af}{upper_af[1]}"
else:
default_af = f">{prev_af}"

return freq_bin_expr.default(default_af)


def get_variant_count_by_freq_bin(
af_cutoffs: List[float] = [0.001, 0.01],
singletons: bool = False,
doubletons: bool = False,
pass_only: bool = True,
**kwargs,
) -> Dict[str, int]:
"""
Count variants by frequency bin.

By default, this function counts PASS variants that are AC0, AF < 0.01%, and
AF 0.01% - 0.1%.

The function can also include counts of singletons and doubletons, with or
without passing filters.

.. note::

This function works for gnomAD exomes and genomes data types, not yet for gnomAD
joint data type, since the HT schema is slightly different.

:param af_cutoffs: List of allele frequencies cutoffs.
:param singletons: Include singletons.
:param doubletons: Include doubletons.
:param pass_only: Include only PASS variants.
:param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
'ht', 'data_type', and 'version'.
:return: Dictionary with counts.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)

# Filter to PASS variants.
if pass_only:
ht = ht.filter(hl.len(ht.filters) == 0)

# Initialize allele count cutoffs with AC0.
ac_cutoffs = [(0, "AC0")]

if singletons:
ac_cutoffs.append((1, "singletons"))

if doubletons:
ac_cutoffs.append((2, "doubletons"))

freq_expr = freq_bin_expr(
ht.freq, ac_cutoffs=ac_cutoffs, af_cutoffs=af_cutoffs, upper_af=None
)

return ht.aggregate(hl.agg.counter(freq_expr))
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/constraint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104, D100
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104, D100
90 changes: 90 additions & 0 deletions gnomad_toolbox/filtering/frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Functions for filtering the gnomAD sites HT frequency data."""

from typing import List, Union

import hail as hl
from gnomad.utils.filtering import filter_arrays_by_meta

from gnomad_toolbox.filtering.variant import get_single_variant
from gnomad_toolbox.load_data import _get_gnomad_release


def get_ancestry_callstats(
gen_ancs: Union[str, List[str]],
**kwargs,
) -> hl.Table:
"""
Extract callstats for specified ancestry group(s).

:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to _get_gnomad_release.
:return: Table with callstats for the given ancestry groups and variant.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)

# Check if gen_ancs is a single ancestry group.
one_anc = isinstance(gen_ancs, str)

if one_anc:
gen_ancs = [gen_ancs]

# Format gen_ancs to lowercase and filter arrays by metadata.
gen_ancs = [gen_anc.lower() for gen_anc in gen_ancs]
gen_anc_label = (
"gen_anc" if any(["gen_anc" in m for m in hl.eval(ht.freq_meta)]) else "pop"
)
items_to_filter = {gen_anc_label: gen_ancs, "group": ["adj"]}
freq_meta, array_exprs = filter_arrays_by_meta(
ht.freq_meta,
{
**{a: ht[a] for a in ["freq"]},
"freq_meta_sample_count": ht.index_globals().freq_meta_sample_count,
},
items_to_filter=items_to_filter,
keep=True,
combine_operator="and",
exact_match=True,
)
ht = ht.select(
"filters",
**{
m[gen_anc_label]: array_exprs["freq"][i]
for i, m in enumerate(hl.eval(freq_meta))
},
)

# Select a subset of the globals.
sample_count = array_exprs["freq_meta_sample_count"]
if one_anc:
sample_count = sample_count[0]
else:
sample_count = hl.struct(
**{
m[gen_anc_label]: sample_count[i]
for i, m in enumerate(hl.eval(freq_meta))
}
)
ht = ht.select_globals("date", "version", sample_count=sample_count)

return ht


def get_single_variant_ancestry_callstats(
gen_ancs: Union[str, List[str]],
**kwargs,
) -> hl.Table:
"""
Extract callstats for specified ancestry group(s) and a single variant.

:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to get_single_variant.
:return: Table with callstats for the given ancestry groups and variant.
"""
ht = get_single_variant(**kwargs)

return get_ancestry_callstats(gen_ancs, ht=ht)
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/pext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104, D100
Loading
Loading