Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Suggestions on the initial toolbox PR #5

Merged
merged 8 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,33 @@
# gnomad-toolbox
This repository provides a set of Python functions to simplify working with gnomAD Hail Tables. It includes tools for data access, filtering, and analysis.

## Repository structure
```
gnomad_toolbox/
├── load_data.py # Functions to load gnomAD release Hail Tables.
├── filtering/
│ ├── __init__.py
│ ├── constraint.py # Functions to filter constraint metrics (e.g., observed/expected ratios).
│ ├── coverage.py # Functions to filter variants or regions based on coverage thresholds.
│ ├── frequency.py # Functions to filter variants by allele frequency thresholds.
│ ├── pext.py # Functions to filter variants using predicted expression (pext) scores.
│   ├── variant.py   # Functions to filter to a specific variant or set of variants.
│ ├── vep.py # Functions to filter variants based on VEP (Variant Effect Predictor) annotations.
├── analysis/
│ ├── __init__.py
│ ├── general.py # General analysis functions, such as summarizing variant statistics.
├── notebooks/
│ ├── intro_to_release_data.ipynb # Jupyter notebook introducing the loading of gnomAD release data.
```

# TODO: Add fully detailed info about how to install and open the notebooks.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These steps should include:

  1. install miniconda;
  2. set up a conda env for a specific version of Hail (and update JAVA);
  3. pip install gnomad_toolbox;
  4. set up the service account;

I can write this part once your PR is merged into mine.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, but don't add anything yet. First step is to get a bunch of tickets in for todo's and we can decide who will tackle what

## Getting started
### Install
pip install -r requirements.txt

### Opening the notebooks
jupyter lab
45 changes: 14 additions & 31 deletions docs/generate_api_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,6 @@

EXCLUDE_PACKAGES = ["tests"]

EXCLUDE_TOP_LEVEL_PACKAGES = []
"""
List of packages/modules to exclude from API reference documentation.
This should be a list of strings where each string is the full name (from the top level)
of a package or module to exclude. For example, if 'gnomad_toolbox' includes a
'example_notebooks' that you want to exclude, you would add
'gnomad_toolbox.example_notebooks' to this list.
"""

PACKAGE_DOC_TEMPLATE = """{title}
{package_doc}
Expand Down Expand Up @@ -119,7 +109,11 @@ def write_module_doc(module_name):
write_file(doc_path, doc)


def write_package_doc(package_name):
def write_package_doc(
package_name,
package_doc = None,
doc_path = None,
):
"""Write API reference documentation file for a package."""
package = importlib.import_module(package_name)

Expand All @@ -139,32 +133,21 @@ def write_package_doc(package_name):

doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title(package_name),
package_doc=package.__doc__ or "",
package_doc= package_doc or package.__doc__ or "",
module_links="\n ".join(module_links),
)

doc_path = package_doc_path(package)
doc_path = doc_path or package_doc_path(package)
write_file(doc_path, doc)


if __name__ == "__main__":
packages = setuptools.find_namespace_packages(
where=REPOSITORY_ROOT_PATH, include=["gnomad_toolbox.*"]
)
top_level_packages = [
pkg for pkg in packages if pkg.count(".") == 1 and pkg not in EXCLUDE_TOP_LEVEL_PACKAGES
]

for pkg in top_level_packages:
write_package_doc(pkg)

root_doc = PACKAGE_DOC_TEMPLATE.format(
title=format_title("gnomad_toolbox"),
package_doc="",
module_links="\n ".join(
f"{pkg.split('.')[1]} <{pkg.split('.')[1]}/index>"
for pkg in top_level_packages
write_package_doc(
"gnomad_toolbox",
package_doc=(
"This repository provides a set of Python functions to simplify working "
"with gnomAD Hail Tables. It includes tools for data access, filtering, "
"and analysis."
),
doc_path=os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"),
)

write_file(os.path.join(DOCS_DIRECTORY, "api_reference", "index.rst"), root_doc)
File renamed without changes.
126 changes: 126 additions & 0 deletions gnomad_toolbox/analysis/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Set of general functions for gnomAD analysis."""

from typing import Dict, List, Optional, Tuple, Union

import hail as hl

from gnomad_toolbox.load_data import _get_gnomad_release


# TODO: Modify this function in gnomad_methods.
def freq_bin_expr(
    freq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
    index: int = 0,
    ac_cutoffs: Optional[List[Union[int, Tuple[int, str]]]] = None,
    af_cutoffs: Optional[List[Union[float, Tuple[float, str]]]] = None,
    upper_af: Optional[Union[float, Tuple[float, str]]] = (0.95, "95%"),
) -> hl.expr.StringExpression:
    """
    Return frequency string annotations based on input AC or AF.

    .. note::

        - Default index is 0 because function assumes freq_expr was calculated with
          `annotate_freq`.
        - Frequency index 0 from `annotate_freq` is frequency for all pops calculated
          on adj genotypes only.

    :param freq_expr: Array of structs containing frequency information.
    :param index: Which index of freq_expr to use for annotation. Default is 0.
    :param ac_cutoffs: Allele count cutoffs to bin by exact AC match. Each element
        is either a bare int (labeled "AC{c}") or a (cutoff, label) tuple. Default
        is AC0, singleton, and doubleton.
    :param af_cutoffs: Allele frequency cutoffs to bin by AF range. Each element
        is either a bare float (labeled "{c*100}%") or a (cutoff, label) tuple.
        Default is 0.01%, 0.1%, 1%, and 10%.
    :param upper_af: Upper AF cutoff for the top bin, either a bare float or a
        (cutoff, label) tuple. Default is (0.95, "95%").
    :return: StringExpression containing bin name based on input AC or AF.
    """
    # Use None sentinels instead of mutable default arguments; fill in the
    # documented defaults here.
    if ac_cutoffs is None:
        ac_cutoffs = [(0, "AC0"), (1, "singleton"), (2, "doubleton")]
    if af_cutoffs is None:
        af_cutoffs = [(1e-4, "0.01%"), (1e-3, "0.1%"), (1e-2, "1%"), (1e-1, "10%")]

    if isinstance(freq_expr, hl.expr.ArrayExpression):
        freq_expr = freq_expr[index]

    # Normalize bare cutoffs into (cutoff, label) tuples. The ":g" format
    # avoids float-noise labels such as "10.000000000000002%" for 1e-1.
    if ac_cutoffs and isinstance(ac_cutoffs[0], int):
        ac_cutoffs = [(c, f"AC{c}") for c in ac_cutoffs]

    if af_cutoffs and isinstance(af_cutoffs[0], float):
        af_cutoffs = [(c, f"{c * 100:g}%") for c in af_cutoffs]

    if isinstance(upper_af, float):
        upper_af = (upper_af, f"{upper_af * 100:g}%")

    # Build the case expression: missing AC first, then exact AC matches in
    # ascending order, then increasing AF ranges. `prev_af` carries the lower
    # bound label forward so AF bins read e.g. "doubleton - 0.01%".
    freq_bin_expr = hl.case().when(hl.is_missing(freq_expr.AC), "Missing")
    prev_af = None
    for ac, name in sorted(ac_cutoffs):
        freq_bin_expr = freq_bin_expr.when(freq_expr.AC == ac, name)
        prev_af = name

    for af, name in sorted(af_cutoffs):
        prev_af = "<" if prev_af is None else f"{prev_af} - "
        freq_bin_expr = freq_bin_expr.when(freq_expr.AF < af, f"{prev_af}{name}")
        prev_af = name

    if upper_af:
        freq_bin_expr = freq_bin_expr.when(
            freq_expr.AF > upper_af[0], f">{upper_af[1]}"
        )
        default_af = "<" if prev_af is None else f"{prev_af} - "
        default_af = f"{default_af}{upper_af[1]}"
    else:
        default_af = f">{prev_af}"

    return freq_bin_expr.default(default_af)


def get_variant_count_by_freq_bin(
    af_cutoffs: Optional[List[float]] = None,
    singletons: bool = False,
    doubletons: bool = False,
    pass_only: bool = True,
    **kwargs,
) -> Dict[str, int]:
    """
    Count variants by frequency bin.

    By default, this function counts PASS variants that are AC0, AF < 0.1%, and
    AF 0.1% - 1%.

    The function can also include counts of singletons and doubletons, with or
    without passing filters.

    .. note::

        This function works for gnomAD exomes and genomes data types, not yet for gnomAD
        joint data type, since the HT schema is slightly different.

    :param af_cutoffs: List of allele frequencies cutoffs. Default is
        [0.001, 0.01].
    :param singletons: Include singletons.
    :param doubletons: Include doubletons.
    :param pass_only: Include only PASS variants.
    :param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
        'ht', 'data_type', and 'version'.
    :return: Dictionary with counts.
    """
    # Use a None sentinel instead of a mutable default argument.
    if af_cutoffs is None:
        af_cutoffs = [0.001, 0.01]

    # Load the Hail Table if not provided.
    ht = _get_gnomad_release(dataset="variant", **kwargs)

    # Filter to PASS variants (an empty `filters` set means PASS).
    if pass_only:
        ht = ht.filter(hl.len(ht.filters) == 0)

    # Initialize allele count cutoffs with AC0.
    ac_cutoffs = [(0, "AC0")]

    if singletons:
        ac_cutoffs.append((1, "singletons"))

    if doubletons:
        ac_cutoffs.append((2, "doubletons"))

    # Bin each variant by AC/AF; no upper AF bin is needed for counting.
    freq_expr = freq_bin_expr(
        ht.freq, ac_cutoffs=ac_cutoffs, af_cutoffs=af_cutoffs, upper_af=None
    )

    return ht.aggregate(hl.agg.counter(freq_expr))
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/constraint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104, D100
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104, D100
90 changes: 90 additions & 0 deletions gnomad_toolbox/filtering/frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Functions for filtering the gnomAD sites HT frequency data."""

from typing import List, Union

import hail as hl
from gnomad.utils.filtering import filter_arrays_by_meta

from gnomad_toolbox.filtering.variant import get_single_variant
from gnomad_toolbox.load_data import _get_gnomad_release


def get_ancestry_callstats(
    gen_ancs: Union[str, List[str]],
    **kwargs,
) -> hl.Table:
    """
    Extract callstats for specified ancestry group(s).

    :param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
        'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
        ancestry groups.
    :param kwargs: Keyword arguments to pass to _get_gnomad_release.
    :return: Table with adj callstats for the given ancestry groups.
    """
    # Load the Hail Table if not provided
    ht = _get_gnomad_release(dataset="variant", **kwargs)

    # Check if gen_ancs is a single ancestry group.
    one_anc = isinstance(gen_ancs, str)

    if one_anc:
        gen_ancs = [gen_ancs]

    # Format gen_ancs to lowercase and filter arrays by metadata.
    # freq_meta entries may key ancestry by "gen_anc" or "pop" depending on the
    # release; detect which label this table uses.
    gen_ancs = [gen_anc.lower() for gen_anc in gen_ancs]
    gen_anc_label = (
        "gen_anc" if any(["gen_anc" in m for m in hl.eval(ht.freq_meta)]) else "pop"
    )
    # Keep only the adj-group freq entries for the requested ancestries, and
    # subset the parallel sample-count global to the same entries.
    items_to_filter = {gen_anc_label: gen_ancs, "group": ["adj"]}
    freq_meta, array_exprs = filter_arrays_by_meta(
        ht.freq_meta,
        {
            **{a: ht[a] for a in ["freq"]},
            "freq_meta_sample_count": ht.index_globals().freq_meta_sample_count,
        },
        items_to_filter=items_to_filter,
        keep=True,
        combine_operator="and",
        exact_match=True,
    )
    # One row field per ancestry group, holding its callstats struct.
    ht = ht.select(
        "filters",
        **{
            m[gen_anc_label]: array_exprs["freq"][i]
            for i, m in enumerate(hl.eval(freq_meta))
        },
    )

    # Select a subset of the globals.
    # For a single group, expose its count directly; otherwise a struct keyed
    # by ancestry group.
    sample_count = array_exprs["freq_meta_sample_count"]
    if one_anc:
        sample_count = sample_count[0]
    else:
        sample_count = hl.struct(
            **{
                m[gen_anc_label]: sample_count[i]
                for i, m in enumerate(hl.eval(freq_meta))
            }
        )
    ht = ht.select_globals("date", "version", sample_count=sample_count)

    return ht


def get_single_variant_ancestry_callstats(
    gen_ancs: Union[str, List[str]],
    **kwargs,
) -> hl.Table:
    """
    Extract callstats for specified ancestry group(s) and a single variant.

    :param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
        'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
        ancestry groups.
    :param kwargs: Keyword arguments to pass to get_single_variant.
    :return: Table with callstats for the given ancestry groups and variant.
    """
    # Restrict the table to the requested variant first, then pull the
    # ancestry-specific callstats from that filtered table.
    variant_ht = get_single_variant(**kwargs)
    return get_ancestry_callstats(gen_ancs, ht=variant_ht)
1 change: 1 addition & 0 deletions gnomad_toolbox/filtering/pext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# noqa: D104, D100
Loading
Loading