Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get pLOFs for each gene #11

Merged
merged 34 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f482dba
Draft code
KoalaQin Dec 20, 2024
80b1af4
Get relevant coverage table for each version
KoalaQin Jan 2, 2025
532fcc8
Revert notebook changes
KoalaQin Jan 3, 2025
c6de768
Merge branch 'development' into qh/get_pLOFs
KoalaQin Jan 3, 2025
80d33be
reformat
KoalaQin Jan 3, 2025
319d9fa
Make the big functions to smaller ones
KoalaQin Jan 8, 2025
93184af
Move gnomad_methods to setup.py
KoalaQin Jan 10, 2025
b271529
update notebook
KoalaQin Jan 10, 2025
bdb41d2
Move modified gnomad_methods back to requirements.txt
KoalaQin Jan 10, 2025
bf593fc
rename to gnomad
KoalaQin Jan 10, 2025
ae2acbc
Updates to loading data to support more data types and inclusion of c…
jkgoodrich Jan 14, 2025
8eb7296
Fixes during testing
jkgoodrich Jan 14, 2025
87906ee
Fix GnomADSession init
jkgoodrich Jan 14, 2025
95daf52
Fix table in `get_gnomad_release` docstring
jkgoodrich Jan 14, 2025
2853978
Fix use of `data_type` in `_get_dataset`
jkgoodrich Jan 14, 2025
23e0fe2
split with comma instead of tab
jkgoodrich Jan 14, 2025
41c6739
Use correct browser resource, browser_variant
jkgoodrich Jan 14, 2025
93cc5cd
Add additional datasets to the data exploration
jkgoodrich Jan 14, 2025
d9c0a6b
Update gnomad_toolbox/load_data.py
jkgoodrich Jan 14, 2025
82b9dbb
Add links for data download to notebook summary table
jkgoodrich Jan 14, 2025
1d081ea
browser_variant -> browser in notebook table
jkgoodrich Jan 14, 2025
8700831
Changes to some headers and added links to the notebook
jkgoodrich Jan 15, 2025
8833771
Add liftover and internal links to the notebook
jkgoodrich Jan 15, 2025
b143d27
Add comment explaining why we support a subset of the versions available
jkgoodrich Jan 15, 2025
5a71f8c
Apply suggestions from code review
jkgoodrich Jan 15, 2025
836994d
Filter to only canonical in filter_to_plofs
jkgoodrich Jan 15, 2025
e56464b
Don't need the MANE and canonical constraint config
jkgoodrich Jan 15, 2025
e3496e4
Merge branch 'jg/get_pLOFs' of https://github.com/broadinstitute/gnom…
jkgoodrich Jan 15, 2025
de240b5
Fix table in `get_gnomad_release` docstring
jkgoodrich Jan 15, 2025
ffbe8e8
Run explore_release_data.ipynb all the way through
jkgoodrich Jan 15, 2025
970b2dc
Fix browser change log link
jkgoodrich Jan 15, 2025
bacd5b5
Merge pull request #16 from broadinstitute/jg/get_pLOFs
jkgoodrich Jan 15, 2025
4e7b02a
Move pLOF function to contraint
KoalaQin Jan 15, 2025
2949779
Merge from development
KoalaQin Jan 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 110 additions & 19 deletions gnomad_toolbox/filtering/vep.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
"""Functions to filter gnomAD sites HT by VEP annotations."""

from typing import List

import hail as hl
from gnomad.utils.vep import LOF_CSQ_SET, filter_vep_transcript_csqs_expr
from gnomad.resources.grch37.reference_data import gencode as grch37_gencode
from gnomad.resources.grch38.reference_data import gencode as grch38_gencode
from gnomad.utils.vep import (
LOF_CSQ_SET,
filter_vep_transcript_csqs,
filter_vep_transcript_csqs_expr,
)

from gnomad_toolbox.load_data import _get_gnomad_release
from gnomad_toolbox.load_data import (
_get_gnomad_release,
get_coverage_for_variant,
gnomad_session,
)


# TODO: Check these csq sets, the ones in the code don't match what is listed on the
Expand Down Expand Up @@ -116,23 +128,102 @@ def filter_by_consequence_category(
return ht.filter(filter_expr)


# TODO: The following was in one of the notebooks, and I think we should add a wrapper
# around this function to make it much simpler instead of using it in the notebook.
def get_gene_intervals(gene_symbol: str, version: str) -> List[hl.utils.Interval]:
"""
Get the genomic intervals for a given gene symbol.

:param gene_symbol: Gene symbol.
:param version: Dataset version.
:return: List of intervals for the specified gene.
"""
gen_ht = grch37_gencode.ht() if version.startswith("2.") else grch38_gencode.ht()
intervals = (
gen_ht.filter(
(gen_ht.feature == "gene")
& (gen_ht.gene_name.lower() == gene_symbol.lower())
)
.select()
.collect()
)
if not intervals:
raise ValueError(f"No interval found for gene: {gene_symbol}")
return [row["interval"] for row in intervals]

# Filter to LOFTEE high-confidence variants for certain genes

# In this example, we are filtering to variants in ASH1L that are LOFTEE high-confidence
# (with no flags) in the MANE select transcript.
def filter_hc_variants(var_ht: hl.Table, gene_symbol: str, version: str) -> hl.Table:
"""
Filter variants to high-confidence LOFTEE variants with optional transcript selection.

:param var_ht: Variants Table.
:param gene_symbol: Gene symbol.
:param version: Dataset version.
:return: Filtered variants Hail Table.
"""
return filter_vep_transcript_csqs(
var_ht,
synonymous=False,
mane_select=version.startswith("4."),
genes=[gene_symbol.upper()],
match_by_gene_symbol=True,
additional_filtering_criteria=[
lambda x: (x.lof == "HC")
& (hl.is_missing(x.lof_flags) | (x.lof_flags == ""))
],
)


def filter_to_plofs(
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
gene_symbol: str, select_fields: bool = False, **kwargs
) -> hl.Table:
"""
Filter to observed pLoF variants used for gene constraint metrics.

.. note::

pLOF variants meets the following requirements:
- High-confidence LOFTEE variants (without any flags),
- Only variants in the MANE Select transcript,
- PASS variants that are SNVs with MAF ≤ 0.1%,
- Exome median depth ≥ 30 (# TODO: This is changing in v4 constraint?)

:param gene_symbol: Gene symbol.
:param select_fields: Whether to limit the output to specific fields.
:return: Table with pLoF variants.
"""
var_version = kwargs.pop("version", gnomad_session.version)
var_ht = _get_gnomad_release(dataset="variant", version=var_version, **kwargs)
cov_ht = get_coverage_for_variant(var_version, **kwargs)

# Get gene intervals and filter tables
intervals = get_gene_intervals(gene_symbol, var_version)
var_ht = hl.filter_intervals(var_ht, intervals)
cov_ht = hl.filter_intervals(cov_ht, intervals)

# Filter to high-confidence LOFTEE variants
var_ht = filter_hc_variants(var_ht, gene_symbol, var_version)

# Version-specific expressions
if var_version.startswith("2."):
allele_type_expr = var_ht.allele_type
cov_cut_expr = cov_ht[var_ht.locus].median
else:
allele_type_expr = var_ht.allele_info.allele_type
cov_cut_expr = cov_ht[var_ht.locus].median_approx

# Apply final filters
var_ht = var_ht.filter(
(hl.len(var_ht.filters) == 0)
& (allele_type_expr == "snv")
& (var_ht.freq[0].AF <= 0.001)
& (cov_cut_expr >= 30)
)

# Select specific fields if requested
if select_fields:
var_ht = var_ht.select(
freq=var_ht.freq[0],
csq=var_ht.vep.transcript_consequences[0].consequence_terms,
coverage=cov_ht[var_ht.locus],
)

# from gnomad.utils.vep import filter_vep_transcript_csqs
# ht = get_gnomad_release(data_type='exomes', version='4.1')
# ht = filter_vep_transcript_csqs(
# ht,
# synonymous=False,
# mane_select=True,
# genes=["ASH1L"],
# match_by_gene_symbol=True,
# additional_filtering_criteria=[lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)],
# )
# ht.show()
# ht.count()
return var_ht
19 changes: 19 additions & 0 deletions gnomad_toolbox/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,22 @@ def get_gnomad_release(
:return: Hail Table for requested dataset, data type, and version.
"""
return _get_gnomad_release(dataset=dataset, data_type=data_type, version=version)


def get_coverage_for_variant(variant_version: str, **kwargs) -> hl.Table:
"""
Get the appropriate coverage table based on the provided version.

:param variant_version: Version of the variant dataset.
:return: Hail Table for the corresponding coverage dataset.
"""
if variant_version.startswith("4."):
return _get_gnomad_release(dataset="coverage", version="4.0", **kwargs)
elif variant_version.startswith("3."):
return _get_gnomad_release(dataset="coverage", version="3.0.1", **kwargs)
elif variant_version.startswith("2."):
return _get_gnomad_release(dataset="coverage", version="2.1", **kwargs)
else:
raise ValueError(
f"Unrecognized version: '{variant_version}'. Please specify a valid version."
)
Loading