
Merge branch 'main' into fix-typo-cond-window
bfclarke authored Sep 24, 2024
2 parents 7dbdbae + d9b11b2 commit e6fd13b
Showing 72 changed files with 2,665 additions and 962 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/pipeline-tests.yml
@@ -3,6 +3,29 @@ run-name: DeepRVAT Pipeline Tests 🧬🧪💻🧑‍🔬
on: [ push ]

jobs:
# Config Setup
Smoke-GenerateConfig-Training:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/run_training.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_training_config.yaml ./example/

Smoke-GenerateConfig-Training-AssociationTesting:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/training_association_testing.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_config.yaml ./example/

Smoke-GenerateConfig-PreTrained:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/association_testing_pretrained.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_pretrained_models_config.yaml ./example/ && ln -s $GITHUB_WORKSPACE/pretrained_models ./example/


# Training Pipeline
Smoke-RunTraining:
uses: ./.github/workflows/run-pipeline.yml
@@ -158,6 +181,11 @@ jobs:
with:
pipeline_file: ./pipelines/annotations.snakefile
environment_file: ./deeprvat_annotations.yml
prerun_cmd: |
mkdir -pv ./example/preprocess/workdir/norm/variants && \
mkdir -pv ./example/preprocess/workdir/preprocessed && \
touch ./example/preprocess/workdir/preprocessed/genotypes.h5 && \
touch ./example/preprocess/workdir/norm/variants/variants.parquet
pipeline_config: ./example/config/deeprvat_annotation_config.yaml
pipeline_directory: ./example/annotations
download_fasta_data: true
8 changes: 5 additions & 3 deletions LICENSE
@@ -1,7 +1,8 @@
MIT License
The source code of DeepRVAT is under the MIT license. The pre-trained DeepRVAT models are under the CC BY NC 4.0 license for academic and non-commercial use. This is because DeepRVAT makes use of SpliceAI and PrimateAI scores, which are currently under the CC BY NC 4.0 license by Illumina.

Copyright (c) 2022, Eva Holtkamp, Brian Clarke, Hakime Öztürk, Felix Brechtmann,
Florian Hölzlwimmer, Julien Gagneur, Oliver Stegle
## MIT License

Copyright (c) 2022, Brian Clarke, Eva Holtkamp, Hakime Öztürk, Marcel Mück, Magnus Wahlberg, Kayla Meyer, Felix Munzlinger, Felix Brechtmann, Florian R. Hölzlwimmer, Jonas Lindner, Zhifen Chen, Julien Gagneur, Oliver Stegle

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -20,3 +21,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

16 changes: 11 additions & 5 deletions deeprvat/annotations/annotations.py
@@ -7,7 +7,6 @@
import sys
import time
from pathlib import Path
from typing import Optional
import dask.dataframe as dd
import numpy as np
import click
@@ -671,6 +670,9 @@ def filter_annotations_by_exon_distance(
)
del merged
len_after_filtering = len(filtered_merge)
assert (
len_after_filtering > 0
), "Data frame is empty after filtering on exon distance, abort."
logger.info(
f"filtered rows by exon distance ({max_dist}bp), dropped({len_bf_filtering - len_after_filtering} rows / {np.round(100*(len_bf_filtering - len_after_filtering)/len_bf_filtering)}%)"
)
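
For context, a minimal sketch of the fail-fast guard added in this hunk, using a toy data frame and a hypothetical distance column and threshold (not DeepRVAT's actual filtering logic):

```python
import pandas as pd

# Toy frame and hypothetical threshold; mirrors the assert added above.
max_dist = 300
merged = pd.DataFrame({"exon_dist": [10, 120, 5000]})
filtered_merge = merged[merged["exon_dist"] <= max_dist]
assert (
    len(filtered_merge) > 0
), "Data frame is empty after filtering on exon distance, abort."
```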
@@ -795,12 +797,12 @@ def deepsea_pca(

del X_std

logger.info(f"Writing values to data frame")
logger.info("Writing values to data frame")
pca_df = pd.DataFrame(
X_pca, columns=[f"DeepSEA_PC_{i}" for i in range(1, n_components + 1)]
)
del X_pca
logger.info(f"adding key values to data frame")
logger.info("adding key values to data frame")
pca_df = pd.concat([key_df, pca_df], axis=1)

logger.info("Sanity check of results")
@@ -1946,17 +1948,21 @@ def merge_af(annotations_path: str, af_df_path: str, out_file: str):
@cli.command()
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def calculate_maf(annotations_path: str, out_file: str):
@click.option("--af-column-name", type=str, default="af")
def calculate_maf(annotations_path: str, out_file: str, af_column_name: str):
"""
Calculate minor allele frequency (MAF) from allele frequency data in annotations.
Parameters:
- af-column-name (str): Name of the allele frequency column to calculate MAF from.
- annotations_path (str): Path to the annotations file containing allele frequency data.
- out_file (str): Path to the output file to save the calculated MAF data.
"""
logger.info(f"reading annotation file {annotations_path}")
annotation_file = pd.read_parquet(annotations_path)
af = annotation_file["af"]
af = annotation_file[af_column_name]
annotation_file = annotation_file.drop(
columns=["UKB_AF_MB", "UKB_MAF"], errors="ignore"
)
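
The MAF computation itself is outside this hunk; as a sketch of the standard formula the docstring refers to, assuming a plain allele-frequency column (the column name mirrors the new --af-column-name default):

```python
import numpy as np
import pandas as pd

# Illustrative only: MAF is the frequency of the less common allele.
annotation_file = pd.DataFrame({"af": [0.01, 0.60, 0.99]})
af = annotation_file["af"]
maf = np.minimum(af, 1.0 - af)  # min(AF, 1 - AF) per variant
```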
108 changes: 70 additions & 38 deletions deeprvat/cv_utils.py
@@ -1,9 +1,5 @@
import pandas as pd
import yaml
import os
import sys
from typing import Optional
import re

# import pickle
import logging
@@ -100,80 +96,117 @@ def generate_test_config(input_config, out_file, fold, n_folds):


@cli.command()
@click.option("--link-burdens", type=click.Path())
@click.option("--skip-burdens", is_flag=True)
@click.option("--burden-dirs", "-b", multiple=True)
@click.argument("out_dir", type=click.Path(), default="./")
@click.option("--xy-dirs", "-b", multiple=True)
@click.argument("out_dir_burdens", type=click.Path(), default="./")
@click.argument("out_dir_xy", type=click.Path(), default="./")
@click.argument("config_file", type=click.Path(exists=True))
def combine_test_set_burdens(
out_dir,
link_burdens,
out_dir_burdens,
out_dir_xy,
skip_burdens,
burden_dirs,
xy_dirs,
config_file,
):
assert len(burden_dirs) == len(xy_dirs)

with open(config_file) as f:
config = yaml.safe_load(f)
compression_level = 1
skip_burdens = link_burdens is not None
n_total_samples = []
for burden_dir in burden_dirs:
print(burden_dir)
this_y = zarr.open(f"{burden_dir}/y.zarr")
this_x = zarr.open(f"{burden_dir}/x.zarr")
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
logger.debug(xy_dir)
this_y = zarr.open(f"{xy_dir}/y.zarr")
this_x = zarr.open(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
# this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')

assert this_y.shape[0] == this_x.shape[0] # == this_burdens.shape[0]
assert this_y.shape[0] == this_x.shape[0]
n_total_samples.append(this_y.shape[0])

if not skip_burdens:
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
assert this_y.shape[0] == this_burdens.shape[0]
logger.debug(this_sample_ids_xy, this_sample_ids_burdens)
assert np.array_equal(this_sample_ids_xy, this_sample_ids_burdens)

n_total_samples = np.sum(n_total_samples)
print(f"Total number of samples {n_total_samples}")
logger.info(f"Total number of samples: {n_total_samples}")
if not skip_burdens:
this_burdens = zarr.open(
f"{burden_dir}/burdens.zarr"
) # any burden tensor (here from the last file), used to get the non-sample dims
burdens = zarr.open(
Path(out_dir) / "burdens.zarr",
Path(out_dir_burdens) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
chunks=(1000, 1000),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
print(f"burdens shape: {burdens.shape}")
else:
burdens = None
logger.info(f"burdens shape: {burdens.shape}")
sample_ids_burdens = zarr.open(
Path(out_dir_burdens) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

y = zarr.open(
Path(out_dir) / "y.zarr",
Path(out_dir_xy) / "y.zarr",
mode="a",
shape=(n_total_samples,) + this_y.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
x = zarr.open(
Path(out_dir) / "x.zarr",
Path(out_dir_xy) / "x.zarr",
mode="a",
shape=(n_total_samples,) + this_x.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids_xy = zarr.open(
Path(out_dir_xy) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

start_idx = 0

for burden_dir in burden_dirs:
this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
this_y = zarr.load(f"{xy_dir}/y.zarr")
end_idx = start_idx + this_y.shape[0]
this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
burdens[start_idx:end_idx] = this_burdens
print((start_idx, end_idx))
this_x = zarr.load(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
y[start_idx:end_idx] = this_y
x[start_idx:end_idx] = this_x
sample_ids_xy[start_idx:end_idx] = this_sample_ids_xy
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.load(f"{burden_dir}/burdens.zarr")
burdens[start_idx:end_idx] = this_burdens
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
sample_ids_burdens[start_idx:end_idx] = this_sample_ids_burdens
start_idx = end_idx

# sanity check
if not skip_burdens and not np.array_equal(sample_ids_xy[:], sample_ids_burdens[:]):
logger.error(
"sample_ids_xy, sample_ids_burdens do not match:\n"
+ f"sample_ids_xy: {sample_ids_xy[:]}"
+ f"sample_ids_burdens: {sample_ids_burdens[:]}"
)
raise RuntimeError("sample_ids_xy, sample_ids_burdens do not match")

y_transformation = config["association_testing_data"]["dataset_config"].get(
"y_transformation", None
)
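
To make the refactor above easier to follow, here is a compressed sketch of the concatenation pattern it implements: allocate an output zarr store sized for all samples, write each input directory's slice in turn, and check the combined sample IDs afterwards. Shapes and values are illustrative, not DeepRVAT's actual layout.

```python
import numpy as np
import zarr

# Stand-ins for per-directory sample IDs; mirrors the allocate-then-slice pattern above.
xy_parts = [np.arange(3), np.arange(3, 5)]
n_total = sum(len(p) for p in xy_parts)

sample_ids = zarr.zeros((n_total,), dtype="U200")  # combined output store

start_idx = 0
for part in xy_parts:
    end_idx = start_idx + len(part)
    sample_ids[start_idx:end_idx] = part.astype(str)  # write this directory's slice
    start_idx = end_idx

# Sanity check analogous to the xy-vs-burdens sample ID comparison above.
assert np.array_equal(sample_ids[:], np.arange(n_total).astype(str))
```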
@@ -202,17 +235,16 @@ def combine_test_set_burdens(
for col in range(this_y.shape[1]):
this_y[:, col] = standardize_series(this_y[:, col])
elif y_transformation == "quantile_transform":
logger.info(f" Quantile transforming combined target phenotype (y)")
logger.info(" Quantile transforming combined target phenotype (y)")
for col in range(this_y.shape[1]):
this_y[:, col] = my_quantile_transform(this_y[:, col])
y[:] = this_y

if not skip_burdens:
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir_burdens) / "genes.npy", genes)

print("done")
if link_burdens is not None:
source_path = Path(out_dir) / "burdens.zarr"
source_path.unlink(missing_ok=True)
source_path.symlink_to(link_burdens)
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir) / "genes.npy", genes)


if __name__ == "__main__":
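The `my_quantile_transform` helper used in the y-transformation branch is not shown in this diff; as a rough stand-in for that column-wise loop, one could use scikit-learn's quantile_transform (an assumption for illustration, not necessarily what DeepRVAT does):

```python
import numpy as np
from sklearn.preprocessing import quantile_transform

# Toy phenotype matrix; each column is transformed independently, as in the loop above.
this_y = np.random.default_rng(0).normal(size=(100, 2))
for col in range(this_y.shape[1]):
    this_y[:, col] = quantile_transform(
        this_y[:, [col]], output_distribution="normal", n_quantiles=100
    ).ravel()
```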

