Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get pLOFs for each gene #11

Merged
merged 34 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f482dba
Draft code
KoalaQin Dec 20, 2024
80b1af4
Get relevant coverage table for each version
KoalaQin Jan 2, 2025
532fcc8
Revert notebook changes
KoalaQin Jan 3, 2025
c6de768
Merge branch 'development' into qh/get_pLOFs
KoalaQin Jan 3, 2025
80d33be
reformat
KoalaQin Jan 3, 2025
319d9fa
Make the big functions to smaller ones
KoalaQin Jan 8, 2025
93184af
Move gnomad_methods to setup.py
KoalaQin Jan 10, 2025
b271529
update notebook
KoalaQin Jan 10, 2025
bdb41d2
Move modified gnomad_methods back to requirements.txt
KoalaQin Jan 10, 2025
bf593fc
rename to gnomad
KoalaQin Jan 10, 2025
ae2acbc
Updates to loading data to support more data types and inclusion of c…
jkgoodrich Jan 14, 2025
8eb7296
Fixes during testing
jkgoodrich Jan 14, 2025
87906ee
Fix GnomADSession init
jkgoodrich Jan 14, 2025
95daf52
Fix table in `get_gnomad_release` docstring
jkgoodrich Jan 14, 2025
2853978
Fix use of `data_type` in `_get_dataset`
jkgoodrich Jan 14, 2025
23e0fe2
split with comma instead of tab
jkgoodrich Jan 14, 2025
41c6739
Use correct browser resource, browser_variant
jkgoodrich Jan 14, 2025
93cc5cd
Add additional datasets to the data exploration
jkgoodrich Jan 14, 2025
d9c0a6b
Update gnomad_toolbox/load_data.py
jkgoodrich Jan 14, 2025
82b9dbb
Add links for data download to notebook summary table
jkgoodrich Jan 14, 2025
1d081ea
browser_variant -> browser in notebook table
jkgoodrich Jan 14, 2025
8700831
Changes to some headers and added links to the notebook
jkgoodrich Jan 15, 2025
8833771
Add liftover and internal links to the notebook
jkgoodrich Jan 15, 2025
b143d27
Add comment explaining why we support a subset of the versions available
jkgoodrich Jan 15, 2025
5a71f8c
Apply suggestions from code review
jkgoodrich Jan 15, 2025
836994d
Filter to only canonical in filter_to_plofs
jkgoodrich Jan 15, 2025
e56464b
Don't need the MANE and canonical constraint config
jkgoodrich Jan 15, 2025
e3496e4
Merge branch 'jg/get_pLOFs' of https://github.com/broadinstitute/gnom…
jkgoodrich Jan 15, 2025
de240b5
Fix table in `get_gnomad_release` docstring
jkgoodrich Jan 15, 2025
ffbe8e8
Run explore_release_data.ipynb all the way through
jkgoodrich Jan 15, 2025
970b2dc
Fix browser change log link
jkgoodrich Jan 15, 2025
bacd5b5
Merge pull request #16 from broadinstitute/jg/get_pLOFs
jkgoodrich Jan 15, 2025
4e7b02a
Move pLOF function to contraint
KoalaQin Jan 15, 2025
2949779
Merge from development
KoalaQin Jan 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions gnomad_toolbox/analysis/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hail as hl
from gnomad.assessment.summary_stats import freq_bin_expr

from gnomad_toolbox.load_data import _get_gnomad_release
from gnomad_toolbox.load_data import _get_dataset


def get_variant_count_by_freq_bin(
Expand Down Expand Up @@ -33,12 +33,12 @@ def get_variant_count_by_freq_bin(
:param singletons: Include singletons.
:param doubletons: Include doubletons.
:param pass_only: Include only PASS variants.
:param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
'ht', 'data_type', and 'version'.
:param kwargs: Keyword arguments to pass to `_get_dataset`. Includes 'ht',
'data_type', and 'version'.
:return: Dictionary with counts.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)
ht = _get_dataset(dataset="variant", **kwargs)

# Filter to PASS variants.
if pass_only:
Expand Down
6 changes: 3 additions & 3 deletions gnomad_toolbox/filtering/frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from gnomad.utils.filtering import filter_arrays_by_meta

from gnomad_toolbox.filtering.variant import get_single_variant
from gnomad_toolbox.load_data import _get_gnomad_release
from gnomad_toolbox.load_data import _get_dataset


def get_ancestry_callstats(
Expand All @@ -19,11 +19,11 @@ def get_ancestry_callstats(
:param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
ancestry groups.
:param kwargs: Keyword arguments to pass to _get_gnomad_release.
:param kwargs: Keyword arguments to pass to _get_dataset.
:return: Table with callstats for the given ancestry groups and variant.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)
ht = _get_dataset(dataset="variant", **kwargs)

# Check if gen_ancs is a single ancestry group.
one_anc = isinstance(gen_ancs, str)
Expand Down
32 changes: 12 additions & 20 deletions gnomad_toolbox/filtering/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from typing import Optional, Union

import hail as hl
from gnomad.utils.filtering import filter_by_intervals as interval_filter
from gnomad.utils.filtering import filter_to_gencode_cds
from gnomad.utils.parse import parse_variant
from gnomad.utils.reference_genome import get_reference_genome

from gnomad_toolbox.load_data import _get_gnomad_release
from gnomad_toolbox.load_data import _get_dataset


def get_single_variant(
Expand All @@ -34,7 +35,7 @@ def get_single_variant(
:param position: Variant position. Required if `variant` is not provided.
:param ref: Reference allele. Required if `variant` is not provided.
:param alt: Alternate allele. Required if `variant` is not provided.
:param kwargs: Additional arguments to pass to `_get_gnomad_release`.
:param kwargs: Additional arguments to pass to `_get_dataset`.
:return: Table with the single variant.
"""
if not variant and not all([contig, position, ref, alt]):
Expand All @@ -44,7 +45,7 @@ def get_single_variant(
)

# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)
ht = _get_dataset(dataset="variant", **kwargs)

# Determine the reference genome build for the ht.
build = get_reference_genome(ht.locus).name
Expand Down Expand Up @@ -78,27 +79,18 @@ def filter_by_intervals(
:param intervals: Interval string or list of interval strings. The interval string
format has to be "contig:start-end", e.g.,"1:1000-2000" (GRCh37) or
"chr1:1000-2000" (GRCh38).
:param kwargs: Arguments to pass to `_get_gnomad_release`.
:param kwargs: Arguments to pass to `_get_dataset`.
:return: Table with variants in the interval(s).
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)
ht = _get_dataset(dataset="variant", **kwargs)

# Determine the reference genome build for the ht.
build = get_reference_genome(ht.locus).name

if isinstance(intervals, str):
intervals = [intervals]

if build == "GRCh38" and any([not i.startswith("chr") for i in intervals]):
raise ValueError("Interval must start with 'chr' for GRCh38 reference genome.")

ht = hl.filter_intervals(
ht, [hl.parse_locus_interval(i, reference_genome=build) for i in intervals]
return interval_filter(
ht,
intervals,
reference_genome=get_reference_genome(ht.locus).name,
)

return ht


# TODO: Add a pre-processing step to filter out these genes on chrY to
# match the gnomAD browser.
Expand All @@ -120,11 +112,11 @@ def filter_by_gene_symbol(gene: str, exon_padding_bp: int = 75, **kwargs) -> hl.
:param gene: Gene symbol.
:param exon_padding_bp: Number of base pairs to pad the CDS intervals. Default is
75bp.
:param kwargs: Arguments to pass to `_get_gnomad_release`.
:param kwargs: Arguments to pass to `_get_dataset`.
:return: Table with variants in the gene.
"""
# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)
ht = _get_dataset(dataset="variant", **kwargs)
ht = filter_to_gencode_cds(ht, genes=gene, padding_bp=exon_padding_bp)

return ht
187 changes: 166 additions & 21 deletions gnomad_toolbox/filtering/vep.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
"""Functions to filter gnomAD sites HT by VEP annotations."""

from typing import List, Optional

import hail as hl
from gnomad.utils.vep import LOF_CSQ_SET, filter_vep_transcript_csqs_expr
from gnomad.utils.filtering import filter_gencode_ht
from gnomad.utils.vep import (
LOF_CSQ_SET,
filter_vep_transcript_csqs,
filter_vep_transcript_csqs_expr,
)

from gnomad_toolbox.load_data import _get_gnomad_release
from gnomad_toolbox.load_data import (
CONSTRAINT_DATA,
_get_dataset,
get_compatible_dataset_versions,
)


# TODO: Check these csq sets, the ones in the code don't match what is listed on the
Expand Down Expand Up @@ -76,7 +87,7 @@ def filter_by_consequence_category(
:param synonymous: Whether to include synonymous variants.
:param other: Whether to include other variants.
:param pass_filters: Boolean if the variants pass the filters.
:param kwargs: Arguments to pass to _get_gnomad_release.
:param kwargs: Arguments to pass to `_get_dataset`.
:return: Table with variants with the specified consequences.
"""
if not any([plof, missense, synonymous, other]):
Expand All @@ -85,7 +96,7 @@ def filter_by_consequence_category(
)

# Load the Hail Table if not provided
ht = _get_gnomad_release(dataset="variant", **kwargs)
ht = _get_dataset(dataset="variant", **kwargs)

lof_csqs = list(LOF_CSQ_SET)
missense_csqs = ["missense_variant", "inframe_insertion", "inframe_deletion"]
Expand Down Expand Up @@ -116,23 +127,157 @@ def filter_by_consequence_category(
return ht.filter(filter_expr)


# TODO: The following was in one of the notebooks, and I think we should add a wrapper
# around this function to make it much simpler instead of using it in the notebook.
def get_gene_intervals(
gene_symbol: str, gencode_version: Optional[str] = None
) -> List[hl.utils.Interval]:
"""
Get the GENCODE genomic intervals for a given gene symbol.

:param gene_symbol: Gene symbol.
:param gencode_version: Optional GENCODE version. If not provided, uses the gencode
version associated with the gnomAD session.
:return: List of GENCODE intervals for the specified gene.
"""
# Load the Hail Table if not provided.
ht = _get_dataset(dataset="gencode", version=gencode_version)
gene_symbol = gene_symbol.upper()

intervals = filter_gencode_ht(gencode_ht=ht, feature="gene", genes=gene_symbol)
intervals = intervals.interval.collect()

if not intervals:
raise ValueError(f"No interval found for gene: {gene_symbol}")

return intervals


def filter_to_high_confidence_loftee(
gene_symbol: Optional[str] = None,
no_lof_flags: bool = False,
mane_select_only: bool = False,
canonical_only: bool = False,
version: Optional[str] = None,
**kwargs,
) -> hl.Table:
"""
Filter gnomAD variants to high-confidence LOFTEE variants for a gene.

:param gene_symbol: Optional gene symbol to filter by.
:param no_lof_flags: Whether to exclude variants with LOFTEE flags. Default is
False.
:param mane_select_only: Whether to include only MANE Select transcripts. Default
is False.
:param canonical_only: Whether to include only canonical transcripts. Default is
False.
:param kwargs: Additional arguments to pass to `_get_dataset`.
:return: Table with high-confidence LOFTEE variants.
"""
# Load the Hail Table if not provided.
ht = _get_dataset(dataset="variant", version=version, **kwargs)
gene_symbol = gene_symbol.upper() if gene_symbol else None

if gene_symbol:
gencode_version = get_compatible_dataset_versions("gencode", version)
ht = hl.filter_intervals(
ht, get_gene_intervals(gene_symbol, gencode_version=gencode_version)
)

return filter_vep_transcript_csqs(
ht,
synonymous=False,
canonical=canonical_only,
mane_select=mane_select_only,
genes=[gene_symbol],
match_by_gene_symbol=True,
loftee_labels=["HC"],
no_lof_flags=no_lof_flags,
)


# TODO: Let's move this function to constraint.py and change the name to something more
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
# descriptive, like maybe get_observed_plofs_for_gene_constraint.
def filter_to_plofs(
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
gene_symbol: str,
version: str = None,
variant_ht: hl.Table = None,
coverage_ht: hl.Table = None,
) -> hl.Table:
"""
Filter to observed pLoF variants used for gene constraint metrics.

The pLOF variant count displayed on the browser meets the following requirements:

- PASS variant QC
- SNV
- Allele frequency ≤ 0.1%
- High-confidence LOFTEE in the MANE Select or Canonical transcript
- ≥ a specified coverage threshold (depends on the version)

:param gene_symbol: Gene symbol.
:param version: Optional gnomAD dataset version. If not provided, uses the gnomAD
session version.
:param variant_ht: Optional Hail Table with variants. If not provided, uses the
exome variant Table for the gnomAD session version.
:param coverage_ht: Optional Hail Table with coverage data. If not provided, uses
the exome coverage Table for the gnomAD session version.
:return: Table with pLoF variants.
"""
if variant_ht is not None and coverage_ht is None:
raise ValueError("Variant Hail Table provided without coverage Hail Table.")

if coverage_ht is not None and variant_ht is None:
raise ValueError("Coverage Hail Table provided without variant Hail Table.")

# Load the variant exomes Hail Table if not provided.
variant_ht = _get_dataset(
dataset="variant",
ht=variant_ht,
data_type="exomes",
version=version,
)

# Filter to LOFTEE high-confidence variants for certain genes
# Determine the coverage version compatible with the variant version.
coverage_version = get_compatible_dataset_versions("coverage", version, "exomes")

# In this example, we are filtering to variants in ASH1L that are LOFTEE high-confidence
# (with no flags) in the MANE select transcript.
# Load the coverage Hail Table if not provided.
coverage_ht = _get_dataset(
dataset="coverage",
ht=coverage_ht,
data_type="exomes",
version=coverage_version,
)

# Get gene intervals and filter tables.
gencode_version = get_compatible_dataset_versions("gencode", version)
intervals = get_gene_intervals(gene_symbol, gencode_version=gencode_version)
variant_ht = hl.filter_intervals(variant_ht, intervals)
coverage_ht = hl.filter_intervals(coverage_ht, intervals)

# Determine constraint filters.
constraint_version = get_compatible_dataset_versions("constraint", version)
constraint_info = CONSTRAINT_DATA[constraint_version]
cov_field = constraint_info["exome_coverage_field"]
cov_cutoff = constraint_info["exome_coverage_cutoff"]
af_cutoff = constraint_info["af_cutoff"]

# Annotate the exome coverage.
variant_ht = variant_ht.annotate(
exome_coverage=coverage_ht[variant_ht.locus][cov_field]
)

# Apply constraint filters.
variant_ht = variant_ht.filter(
(hl.len(variant_ht.filters) == 0)
& (hl.is_snp(variant_ht.alleles[0], variant_ht.alleles[1]))
& (variant_ht.freq[0].AF <= af_cutoff)
& (variant_ht.exome_coverage >= cov_cutoff)
)

# Filter to high-confidence LOFTEE variants.
variant_ht = filter_to_high_confidence_loftee(
gene_symbol=gene_symbol,
ht=variant_ht,
canonical_only=True,
)

# from gnomad.utils.vep import filter_vep_transcript_csqs
# ht = get_gnomad_release(data_type='exomes', version='4.1')
# ht = filter_vep_transcript_csqs(
# ht,
# synonymous=False,
# mane_select=True,
# genes=["ASH1L"],
# match_by_gene_symbol=True,
# additional_filtering_criteria=[lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)],
# )
# ht.show()
# ht.count()
return variant_ht
Loading
Loading