Revert "Squashed commit of the following:"

This reverts commit 5ce3e77.

endast committed May 13, 2024
1 parent 5ce3e77 commit f5b95f8

Showing 82 changed files with 90 additions and 1,804 deletions.
21 changes: 0 additions & 21 deletions .github/workflows/test-runner.yml
@@ -42,24 +42,3 @@ jobs:
       - name: Run pytest preprocessing
         run: pytest -v ${{ github.workspace }}/tests/preprocessing
         shell: micromamba-shell {0}
-
-  DeepRVAT-Tests-Runner-Annotations:
-    runs-on: ubuntu-latest
-    steps:
-
-      - name: Check out repository code
-        uses: actions/checkout@v4
-      - uses: mamba-org/setup-micromamba@v1.8.0
-        with:
-          environment-name: deeprvat-annotation-gh-action
-          environment-file: ${{ github.workspace }}/deeprvat_annotations.yml
-          cache-environment: true
-          cache-downloads: true
-
-      - name: Install DeepRVAT
-        run: pip install -e ${{ github.workspace }}
-        shell: micromamba-shell {0}
-
-      - name: Run pytest annotations
-        run: pytest -v ${{ github.workspace }}/tests/annotations
-        shell: micromamba-shell {0}
77 changes: 52 additions & 25 deletions deeprvat/annotations/annotations.py
@@ -1,7 +1,5 @@
 import logging
 import os
-
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
 import pickle
 import random
 import sys
@@ -1362,6 +1360,45 @@ def merge_deepsea_pcas(
     merged.to_parquet(out_file)


+@cli.command()
+@click.argument("in_variants", type=click.Path(exists=True))
+@click.argument("out_variants", type=click.Path())
+def process_annotations(in_variants: str, out_variants: str):
+    """
+    Process variant annotations: filter for canonical variants and aggregate consequences.
+    Parameters:
+    - in_variants (str): Path to the input variant annotation file in Parquet format.
+    - out_variants (str): Path to save the processed variant annotation file in Parquet format.
+    Returns:
+        None
+    Notes:
+    - The function reads the input variant annotation file.
+    - It filters for canonical variants, i.e. rows where the 'CANONICAL' column equals 'YES'.
+    - The 'Gene' column is renamed to 'gene_id'.
+    - Consequences for different alleles are aggregated by combining the variant ID with the gene ID.
+    - The processed variant annotations are saved to the specified output file.
+    Example:
+        $ python annotations.py process_annotations input_variants.parquet output_variants.parquet
+    """
+    variant_path = Path(in_variants)
+    variants = pd.read_parquet(variant_path)
+
+    logger.info("filtering for canonical variants")
+
+    variants = variants.loc[variants.CANONICAL == "YES"]
+    variants.rename(columns={"Gene": "gene_id"}, inplace=True)
+
+    logger.info("aggregating consequences for different alleles")
+
+    # combine the variant id with the gene id
+    variants["consequence_id"] = variants["id"].astype(str) + variants["gene_id"]
+    variants.to_parquet(out_variants, compression="zstd")


 def process_chunk_addids(chunk: pd.DataFrame, variants: pd.DataFrame) -> pd.DataFrame:
     """
     Process a chunk of data by adding identifiers from a variants dataframe.
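The restored process_annotations command boils down to three pandas steps: filter to canonical transcripts, rename Gene to gene_id, and build a per-variant-per-gene consequence key. A minimal, self-contained sketch of the same transformation on an invented toy table (all column values here are made up for illustration):

```python
import pandas as pd

# Toy variant table with the columns process_annotations expects
variants = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "Gene": ["ENSG0001", "ENSG0002", "ENSG0002"],
        "CANONICAL": ["YES", "NO", "YES"],
    }
)

# Keep canonical transcripts only
variants = variants.loc[variants.CANONICAL == "YES"]
variants = variants.rename(columns={"Gene": "gene_id"})

# One key per (variant, gene) pair, used downstream to aggregate consequences
variants["consequence_id"] = variants["id"].astype(str) + variants["gene_id"]

print(variants)  # id=1 -> "1ENSG0001", id=3 -> "3ENSG0002"
```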
@@ -1470,14 +1507,16 @@ def add_ids(annotation_file: str, variant_file: str, njobs: int, out_file: str):
 @cli.command()
 @click.argument("annotation_file", type=click.Path(exists=True))
 @click.argument("variant_file", type=click.Path(exists=True))
+@click.argument("njobs", type=int)
 @click.argument("out_file", type=click.Path())
-def add_ids_dask(annotation_file: str, variant_file: str, out_file: str):
+def add_ids_dask(annotation_file: str, variant_file: str, njobs: int, out_file: str):
     """
     Add identifiers from a variant file to an annotation file using Dask and save the result.
     Parameters:
     - annotation_file (str): Path to the input annotation file in Parquet format.
     - variant_file (str): Path to the input variant file in Parquet format.
+    - njobs (int): Number of parallel jobs to process the data.
     - out_file (str): Path to save the processed data in Parquet format.
     Returns:
@@ -1493,7 +1532,7 @@ def add_ids_dask(annotation_file: str, variant_file: str, out_file: str):
         $ python annotations.py add_ids_dask annotation_data.parquet variant_data.parquet 4 processed_data.parquet
     """
     data = dd.read_parquet(annotation_file, blocksize=25e9)
-    all_variants = pd.read_parquet(variant_file)
+    all_variants = pd.read_table(variant_file)
     data = data.rename(
         columns={
             "#CHROM": "chrom",
@@ -1666,7 +1705,7 @@ def merge_annotations(
logger.info("load variant_file")

logger.info(f"reading in {variant_file}")
variants = pd.read_parquet(variant_file)
variants = pd.read_csv(variant_file, sep="\t")

logger.info("merge vep to variants M:1")
ca = vep_df.merge(
@@ -1738,19 +1777,7 @@ def process_vep(
         vcf_file, names=["chrom", "pos", "#Uploaded_variation", "ref", "alt"]
     )
     if "#Uploaded_variation" in vep_file.columns:
-        vep_file = vep_file.merge(vcf_df, on="#Uploaded_variation", how="left")
-        if vep_file.chrom.isna().sum() > 0:
-            vep_file.loc[vep_file.chrom.isna(), ["chrom", "pos", "ref", "alt"]] = (
-                vep_file[vep_file["chrom"].isna()]["#Uploaded_variation"]
-                .str.replace("_", ":")
-                .str.replace("/", ":")
-                .str.split(":", expand=True)
-                .values
-            )
-        assert vep_file.chrom.isna().sum() == 0
-        assert vep_file.pos.isna().sum() == 0
-        assert vep_file.ref.isna().sum() == 0
-        assert vep_file.alt.isna().sum() == 0
+        vep_file = vep_file.merge(vcf_df, on="#Uploaded_variation")

     if "pos" in vep_file.columns:
         vep_file["pos"] = vep_file["pos"].astype(int)
@@ -1952,7 +1979,7 @@ def get_af_from_gt(genotype_file: str, variants_filepath: str, out_file: str):
"""
import h5py

variants = pd.read_parquet(variants_filepath)
variants = pd.read_table(variants_filepath)
max_variant_id = variants["id"].max()

logger.info("Computing allele frequencies")
@@ -2015,19 +2042,19 @@ def calculate_maf(annotations_path: str, out_file: str):


 @cli.command()
-@click.argument("gene_id_file", type=click.Path(exists=True))
+@click.argument("protein_id_file", type=click.Path(exists=True))
 @click.argument("annotations_path", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
-def add_gene_ids(gene_id_file: str, annotations_path: str, out_file: str):
+def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str):
     """
-    Add gene IDs to the annotations based on gene ID mapping file.
+    Add protein IDs to the annotations based on the protein ID mapping file.
     Parameters:
-    - gene_id_file (str): Path to the gene ID mapping file.
+    - protein_id_file (str): Path to the protein ID mapping file.
     - annotations_path (str): Path to the annotations file.
     - out_file (str): Path to the output file to save the annotations with protein IDs.
     """
-    genes = pd.read_parquet(gene_id_file)
+    genes = pd.read_parquet(protein_id_file)
     genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True)
     genes.drop(columns=["feature", "gene", "gene_name", "gene_type"], inplace=True)
     genes.rename(columns={"id": "gene_id"}, inplace=True)
@@ -2042,7 +2069,7 @@ def add_gene_ids(gene_id_file: str, annotations_path: str, out_file: str):
 @cli.command()
 @click.argument("gtf_filepath", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
-def create_gene_id_file(gtf_filepath: str, out_file: str):
+def create_protein_id_file(gtf_filepath: str, out_file: str):
     """
     Create a protein ID mapping file from the GTF file.
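The renamed add_protein_ids command splits versioned identifiers (e.g. 'ENSG00000123456.7') into a base ID and a version suffix before joining. A minimal sketch of that split, using an invented mapping table with only the columns the split itself needs:

```python
import pandas as pd

# Invented ID mapping table shaped like the one add_protein_ids reads
genes = pd.DataFrame(
    {
        "id": [101, 102],
        "gene": ["ENSG00000123456.7", "ENSG00000234567.2"],
    }
)

# Split "ENSG00000123456.7" into the base ID and its version suffix
genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True)

# Keep the unversioned base and rename the numeric id, as in the diff
genes = genes.drop(columns=["feature", "gene"]).rename(columns={"id": "gene_id"})
print(genes)
```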
1 change: 0 additions & 1 deletion deeprvat_annotations.yml
@@ -16,7 +16,6 @@ dependencies:
   - tensorflow=2.11.0
   - pyarrow=11.0.0
   - fastparquet=2023.4.0
-  - bioconda::pyranges=0.0.129
   #comment out lines below if you want to use preinstalled bcftools or samtools
   - bcftools=1.17
   - samtools=1.17
6 changes: 0 additions & 6 deletions docs/annotations.md
@@ -88,12 +88,6 @@ Data for VEP plugins and the CADD cache are stored in `annotation data`.

 ## Running the annotation pipeline
 ### Preconfiguration
-- Install the annotation environment
-  ```shell
-  mamba env create -f path/to/deeprvat/deeprvat_annotations.yml
-  mamba activate deeprvat_annotations
-  pip install -e path/to/deeprvat
-  ```
 - Clone the repositories mentioned in [requirements](#requirements) into `repo_dir` and install the needed conda environments with
   ```shell
   mamba env create -f repo_dir/absplice/environment.yaml