broadinstitute · KoalaQin · Jan 15, 2025 · Dec 20, 2024 · Jan 2, 2025 · Jan 3, 2025
diff --git a/gnomad_toolbox/analysis/general.py b/gnomad_toolbox/analysis/general.py
@@ -5,7 +5,7 @@
 import hail as hl
 from gnomad.assessment.summary_stats import freq_bin_expr
 
-from gnomad_toolbox.load_data import _get_gnomad_release
+from gnomad_toolbox.load_data import _get_dataset
 
 
 def get_variant_count_by_freq_bin(
@@ -33,12 +33,12 @@ def get_variant_count_by_freq_bin(
     :param singletons: Include singletons.
     :param doubletons: Include doubletons.
     :param pass_only: Include only PASS variants.
-    :param kwargs: Keyword arguments to pass to _get_gnomad_release. Includes
-        'ht', 'data_type', and 'version'.
+    :param kwargs: Keyword arguments to pass to `_get_dataset`. Includes 'ht',
+        'data_type', and 'version'.
     :return: Dictionary with counts.
     """
     # Load the Hail Table if not provided
-    ht = _get_gnomad_release(dataset="variant", **kwargs)
+    ht = _get_dataset(dataset="variant", **kwargs)
 
     # Filter to PASS variants.
     if pass_only:

diff --git a/gnomad_toolbox/filtering/frequency.py b/gnomad_toolbox/filtering/frequency.py
@@ -6,7 +6,7 @@
 from gnomad.utils.filtering import filter_arrays_by_meta
 
 from gnomad_toolbox.filtering.variant import get_single_variant
-from gnomad_toolbox.load_data import _get_gnomad_release
+from gnomad_toolbox.load_data import _get_dataset
 
 
 def get_ancestry_callstats(
@@ -19,11 +19,11 @@ def get_ancestry_callstats(
     :param gen_ancs: Genetic ancestry group(s) (e.g., 'afr', 'amr', 'asj', 'eas',
         'fin', 'nfe', 'oth', 'sas'). Can be a single ancestry group or a list of
         ancestry groups.
-    :param kwargs: Keyword arguments to pass to _get_gnomad_release.
+    :param kwargs: Keyword arguments to pass to `_get_dataset`.
     :return: Table with callstats for the given ancestry groups and variant.
     """
     # Load the Hail Table if not provided
-    ht = _get_gnomad_release(dataset="variant", **kwargs)
+    ht = _get_dataset(dataset="variant", **kwargs)
 
     # Check if gen_ancs is a single ancestry group.
     one_anc = isinstance(gen_ancs, str)

diff --git a/gnomad_toolbox/filtering/variant.py b/gnomad_toolbox/filtering/variant.py
@@ -3,11 +3,12 @@
 from typing import Optional, Union
 
 import hail as hl
+from gnomad.utils.filtering import filter_by_intervals as interval_filter
 from gnomad.utils.filtering import filter_to_gencode_cds
 from gnomad.utils.parse import parse_variant
 from gnomad.utils.reference_genome import get_reference_genome
 
-from gnomad_toolbox.load_data import _get_gnomad_release
+from gnomad_toolbox.load_data import _get_dataset
 
 
 def get_single_variant(
@@ -34,7 +35,7 @@ def get_single_variant(
     :param position: Variant position. Required if `variant` is not provided.
     :param ref: Reference allele. Required if `variant` is not provided.
     :param alt: Alternate allele. Required if `variant` is not provided.
-    :param kwargs: Additional arguments to pass to `_get_gnomad_release`.
+    :param kwargs: Additional arguments to pass to `_get_dataset`.
     :return: Table with the single variant.
     """
     if not variant and not all([contig, position, ref, alt]):
@@ -44,7 +45,7 @@ def get_single_variant(
         )
 
     # Load the Hail Table if not provided
-    ht = _get_gnomad_release(dataset="variant", **kwargs)
+    ht = _get_dataset(dataset="variant", **kwargs)
 
     # Determine the reference genome build for the ht.
     build = get_reference_genome(ht.locus).name
@@ -78,27 +79,18 @@ def filter_by_intervals(
     :param intervals: Interval string or list of interval strings. The interval string
         format has to be "contig:start-end", e.g.,"1:1000-2000" (GRCh37) or
         "chr1:1000-2000" (GRCh38).
-    :param kwargs: Arguments to pass to `_get_gnomad_release`.
+    :param kwargs: Arguments to pass to `_get_dataset`.
     :return: Table with variants in the interval(s).
     """
     # Load the Hail Table if not provided
-    ht = _get_gnomad_release(dataset="variant", **kwargs)
+    ht = _get_dataset(dataset="variant", **kwargs)
 
-    # Determine the reference genome build for the ht.
-    build = get_reference_genome(ht.locus).name
-
-    if isinstance(intervals, str):
-        intervals = [intervals]
-
-    if build == "GRCh38" and any([not i.startswith("chr") for i in intervals]):
-        raise ValueError("Interval must start with 'chr' for GRCh38 reference genome.")
-
-    ht = hl.filter_intervals(
-        ht, [hl.parse_locus_interval(i, reference_genome=build) for i in intervals]
+    return interval_filter(
+        ht,
+        intervals,
+        reference_genome=get_reference_genome(ht.locus).name,
     )
 
-    return ht
-
 
 # TODO: Add a pre-processing step to filter out these genes on chrY to
 # match the gnomAD browser.
@@ -120,11 +112,11 @@ def filter_by_gene_symbol(gene: str, exon_padding_bp: int = 75, **kwargs) -> hl.
     :param gene: Gene symbol.
     :param exon_padding_bp: Number of base pairs to pad the CDS intervals. Default is
         75bp.
-    :param kwargs: Arguments to pass to `_get_gnomad_release`.
+    :param kwargs: Arguments to pass to `_get_dataset`.
     :return: Table with variants in the gene.
     """
     # Load the Hail Table if not provided
-    ht = _get_gnomad_release(dataset="variant", **kwargs)
+    ht = _get_dataset(dataset="variant", **kwargs)
     ht = filter_to_gencode_cds(ht, genes=gene, padding_bp=exon_padding_bp)
 
     return ht
diff --git a/gnomad_toolbox/filtering/vep.py b/gnomad_toolbox/filtering/vep.py
@@ -1,9 +1,20 @@
 """Functions to filter gnomAD sites HT by VEP annotations."""
 
+from typing import List, Optional
+
 import hail as hl
-from gnomad.utils.vep import LOF_CSQ_SET, filter_vep_transcript_csqs_expr
+from gnomad.utils.filtering import filter_gencode_ht
+from gnomad.utils.vep import (
+    LOF_CSQ_SET,
+    filter_vep_transcript_csqs,
+    filter_vep_transcript_csqs_expr,
+)
 
-from gnomad_toolbox.load_data import _get_gnomad_release
+from gnomad_toolbox.load_data import (
+    CONSTRAINT_DATA,
+    _get_dataset,
+    get_compatible_dataset_versions,
+)
 
 
 # TODO: Check these csq sets, the ones in the code don't match what is listed on the
@@ -76,7 +87,7 @@ def filter_by_consequence_category(
     :param synonymous: Whether to include synonymous variants.
     :param other: Whether to include other variants.
     :param pass_filters: Boolean if the variants pass the filters.
-    :param kwargs: Arguments to pass to _get_gnomad_release.
+    :param kwargs: Arguments to pass to `_get_dataset`.
     :return: Table with variants with the specified consequences.
     """
     if not any([plof, missense, synonymous, other]):
@@ -85,7 +96,7 @@ def filter_by_consequence_category(
         )
 
     # Load the Hail Table if not provided
-    ht = _get_gnomad_release(dataset="variant", **kwargs)
+    ht = _get_dataset(dataset="variant", **kwargs)
 
     lof_csqs = list(LOF_CSQ_SET)
     missense_csqs = ["missense_variant", "inframe_insertion", "inframe_deletion"]
@@ -116,23 +127,157 @@ def filter_by_consequence_category(
     return ht.filter(filter_expr)
 
 
-# TODO: The following was in one of the notebooks, and I think we should add a wrapper
-#  around this function to make it much simpler instead of using it in the notebook.
+def get_gene_intervals(
+    gene_symbol: str, gencode_version: Optional[str] = None
+) -> List[hl.utils.Interval]:
+    """
+    Return the GENCODE 'gene' feature intervals for a gene symbol.
+
+    :param gene_symbol: Gene symbol. It is upper-cased before the lookup.
+    :param gencode_version: Optional GENCODE version, passed to `_get_dataset` as
+        `version`. If None, `_get_dataset` chooses (presumably the session default).
+    :return: List of intervals collected from the filtered GENCODE Table.
+    :raises ValueError: If no interval is found for the gene.
+    """
+    gencode = _get_dataset(dataset="gencode", version=gencode_version)
+    gene_symbol = gene_symbol.upper()
+
+    # Keep only the 'gene' features that match the requested symbol.
+    gene_ht = filter_gencode_ht(gencode_ht=gencode, feature="gene", genes=gene_symbol)
+    gene_intervals = gene_ht.interval.collect()
+    if gene_intervals:
+        return gene_intervals
+
+    raise ValueError(f"No interval found for gene: {gene_symbol}")
+
+
+def filter_to_high_confidence_loftee(
+    gene_symbol: Optional[str] = None,
+    no_lof_flags: bool = False,
+    mane_select_only: bool = False,
+    canonical_only: bool = False,
+    version: Optional[str] = None,
+    **kwargs,
+) -> hl.Table:
+    """
+    Filter gnomAD variants to high-confidence LOFTEE variants, optionally for a gene.
+
+    :param gene_symbol: Optional gene symbol to filter by. If omitted, no gene filter
+        is applied.
+    :param no_lof_flags: Exclude variants with LOFTEE flags. Default is False.
+    :param mane_select_only: Include only MANE Select transcripts. Default is False.
+    :param canonical_only: Include only canonical transcripts. Default is False.
+    :param version: Optional version passed to `_get_dataset` and used to look up the
+        compatible GENCODE version.
+    :param kwargs: Additional arguments to pass to `_get_dataset`.
+    :return: Table with high-confidence LOFTEE variants.
+    """
+    ht = _get_dataset(dataset="variant", version=version, **kwargs)
+    gene_symbol = gene_symbol.upper() if gene_symbol else None
+
+    if gene_symbol:
+        gencode_version = get_compatible_dataset_versions("gencode", version)
+        ht = hl.filter_intervals(
+            ht, get_gene_intervals(gene_symbol, gencode_version=gencode_version)
+        )
+
+    # Pass `genes=None` when no gene was requested; `[None]` is not a valid gene list.
+    return filter_vep_transcript_csqs(
+        ht,
+        synonymous=False,
+        canonical=canonical_only,
+        mane_select=mane_select_only,
+        genes=[gene_symbol] if gene_symbol else None,
+        match_by_gene_symbol=True,
+        loftee_labels=["HC"],
+        no_lof_flags=no_lof_flags,
+    )
+
+
+# TODO: Let's move this function to constraint.py and change the name to something more
+#  descriptive, like maybe get_observed_plofs_for_gene_constraint.
+def filter_to_plofs(
+    gene_symbol: str,
+    version: Optional[str] = None,
+    variant_ht: Optional[hl.Table] = None,
+    coverage_ht: Optional[hl.Table] = None,
+) -> hl.Table:
+    """
+    Filter to observed pLoF variants used for gene constraint metrics.
+
+    The pLOF variant count displayed on the browser meets the following requirements:
+
+        - PASS variant QC
+        - SNV
+        - Allele frequency ≤ 0.1%
+        - High-confidence LOFTEE in the MANE Select or Canonical transcript
+        - ≥ a specified coverage threshold (depends on the version)
+
+    :param gene_symbol: Gene symbol.
+    :param version: Optional gnomAD dataset version. Defaults to the session version.
+    :param variant_ht: Optional Hail Table with variants. If not provided, uses the
+        exome variant Table for the gnomAD session version.
+    :param coverage_ht: Optional Hail Table with coverage data. If not provided, uses
+        the exome coverage Table for the gnomAD session version.
+    :return: Table with pLoF variants.
+    :raises ValueError: If only one of `variant_ht` and `coverage_ht` is provided.
+    """
+    if variant_ht is not None and coverage_ht is None:
+        raise ValueError("Variant Hail Table provided without coverage Hail Table.")
+    if coverage_ht is not None and variant_ht is None:
+        raise ValueError("Coverage Hail Table provided without variant Hail Table.")
+
+    # Load the variant exomes Hail Table if not provided.
+    variant_ht = _get_dataset(
+        dataset="variant",
+        ht=variant_ht,
+        data_type="exomes",
+        version=version,
+    )
 
-# Filter to LOFTEE high-confidence variants for certain genes
+    # Determine the coverage version compatible with the variant version.
+    coverage_version = get_compatible_dataset_versions("coverage", version, "exomes")
 
-# In this example, we are filtering to variants in ASH1L that are LOFTEE high-confidence
-# (with no flags) in the MANE select transcript.
+    # Load the coverage Hail Table if not provided.
+    coverage_ht = _get_dataset(
+        dataset="coverage",
+        ht=coverage_ht,
+        data_type="exomes",
+        version=coverage_version,
+    )
+
+    # Get gene intervals and filter tables.
+    gencode_version = get_compatible_dataset_versions("gencode", version)
+    intervals = get_gene_intervals(gene_symbol, gencode_version=gencode_version)
+    variant_ht = hl.filter_intervals(variant_ht, intervals)
+    coverage_ht = hl.filter_intervals(coverage_ht, intervals)
+
+    # Determine constraint filters.
+    constraint_version = get_compatible_dataset_versions("constraint", version)
+    constraint_info = CONSTRAINT_DATA[constraint_version]
+    cov_field = constraint_info["exome_coverage_field"]
+    cov_cutoff = constraint_info["exome_coverage_cutoff"]
+    af_cutoff = constraint_info["af_cutoff"]
+
+    # Annotate the exome coverage.
+    variant_ht = variant_ht.annotate(
+        exome_coverage=coverage_ht[variant_ht.locus][cov_field]
+    )
+
+    # Apply constraint filters.
+    variant_ht = variant_ht.filter(
+        (hl.len(variant_ht.filters) == 0)
+        & (hl.is_snp(variant_ht.alleles[0], variant_ht.alleles[1]))
+        & (variant_ht.freq[0].AF <= af_cutoff)
+        & (variant_ht.exome_coverage >= cov_cutoff)
+    )
+
+    # Keep high-confidence LOFTEE variants; forward `version` for the GENCODE lookup.
+    variant_ht = filter_to_high_confidence_loftee(
+        gene_symbol=gene_symbol,
+        ht=variant_ht,
+        canonical_only=True,
+        version=version,
+    )
 
-# from gnomad.utils.vep import filter_vep_transcript_csqs
-# ht = get_gnomad_release(data_type='exomes', version='4.1')
-# ht = filter_vep_transcript_csqs(
-#    ht,
-#    synonymous=False,
-#    mane_select=True,
-#    genes=["ASH1L"],
-#    match_by_gene_symbol=True,
-#    additional_filtering_criteria=[lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)],
-# )
-# ht.show()
-# ht.count()
+    return variant_ht