
Merge branch 'main' into fix-typo-cond-window
bfclarke authored Sep 24, 2024
2 parents 7dbdbae + d9b11b2 commit e6fd13b
Showing 72 changed files with 2,665 additions and 962 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/pipeline-tests.yml
@@ -3,6 +3,29 @@ run-name: DeepRVAT Pipeline Tests 🧬🧪💻🧑‍🔬
on: [ push ]

jobs:
# Config Setup
Smoke-GenerateConfig-Training:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/run_training.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_training_config.yaml ./example/

Smoke-GenerateConfig-Training-AssociationTesting:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/training_association_testing.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_config.yaml ./example/

Smoke-GenerateConfig-PreTrained:
uses: ./.github/workflows/run-pipeline.yml
with:
pipeline_file: ./pipelines/association_testing_pretrained.snakefile
environment_file: ./deeprvat_env_no_gpu.yml
prerun_cmd: cp ./example/config/deeprvat_input_pretrained_models_config.yaml ./example/ && ln -s $GITHUB_WORKSPACE/pretrained_models ./example/


# Training Pipeline
Smoke-RunTraining:
uses: ./.github/workflows/run-pipeline.yml
@@ -158,6 +181,11 @@ jobs:
with:
pipeline_file: ./pipelines/annotations.snakefile
environment_file: ./deeprvat_annotations.yml
prerun_cmd: |
mkdir -pv ./example/preprocess/workdir/norm/variants && \
mkdir -pv ./example/preprocess/workdir/preprocessed && \
touch ./example/preprocess/workdir/preprocessed/genotypes.h5 && \
touch ./example/preprocess/workdir/norm/variants/variants.parquet
pipeline_config: ./example/config/deeprvat_annotation_config.yaml
pipeline_directory: ./example/annotations
download_fasta_data: true
8 changes: 5 additions & 3 deletions LICENSE
@@ -1,7 +1,8 @@
MIT License
The source code of DeepRVAT is under the MIT license. The pre-trained DeepRVAT models are under the CC BY NC 4.0 license for academic and non-commercial use. This is because DeepRVAT makes use of SpliceAI and PrimateAI scores, which are currently under the CC BY NC 4.0 license by Illumina.

Copyright (c) 2022, Eva Holtkamp, Brian Clarke, Hakime Öztürk, Felix Brechtmann,
Florian Hölzlwimmer, Julien Gagneur, Oliver Stegle
## MIT License

Copyright (c) 2022, Brian Clarke, Eva Holtkamp, Hakime Öztürk, Marcel Mück, Magnus Wahlberg, Kayla Meyer, Felix Munzlinger, Felix Brechtmann, Florian R. Hölzlwimmer, Jonas Lindner, Zhifen Chen, Julien Gagneur, Oliver Stegle

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -20,3 +21,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

16 changes: 11 additions & 5 deletions deeprvat/annotations/annotations.py
@@ -7,7 +7,6 @@
import sys
import time
from pathlib import Path
from typing import Optional
import dask.dataframe as dd
import numpy as np
import click
@@ -671,6 +670,9 @@ def filter_annotations_by_exon_distance(
)
del merged
len_after_filtering = len(filtered_merge)
assert (
len_after_filtering > 0
), "Data frame is empty after filtering on exon distance, abort."
logger.info(
f"filtered rows by exon distance ({max_dist}bp), dropped({len_bf_filtering - len_after_filtering} rows / {np.round(100*(len_bf_filtering - len_after_filtering)/len_bf_filtering)}%)"
)
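
For context, a minimal sketch of the fail-fast guard added in this hunk, using a toy data frame and a hypothetical distance column and threshold (not DeepRVAT's actual filtering logic):

```python
import pandas as pd

# Toy frame and hypothetical threshold; mirrors the assert added above.
max_dist = 300
merged = pd.DataFrame({"exon_dist": [10, 120, 5000]})
filtered_merge = merged[merged["exon_dist"] <= max_dist]
assert (
    len(filtered_merge) > 0
), "Data frame is empty after filtering on exon distance, abort."
```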
@@ -795,12 +797,12 @@ def deepsea_pca(

del X_std

logger.info(f"Writing values to data frame")
logger.info("Writing values to data frame")
pca_df = pd.DataFrame(
X_pca, columns=[f"DeepSEA_PC_{i}" for i in range(1, n_components + 1)]
)
del X_pca
logger.info(f"adding key values to data frame")
logger.info("adding key values to data frame")
pca_df = pd.concat([key_df, pca_df], axis=1)

logger.info("Sanity check of results")
@@ -1946,17 +1948,21 @@ def merge_af(annotations_path: str, af_df_path: str, out_file: str):
@cli.command()
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def calculate_maf(annotations_path: str, out_file: str):
@click.option("--af-column-name", type=str, default="af")
def calculate_maf(annotations_path: str, out_file: str, af_column_name: str):
"""
Calculate minor allele frequency (MAF) from allele frequency data in annotations.
Parameters:
- af-column-name (str): Name of the allele frequency column to calculate MAF from.
- annotations_path (str): Path to the annotations file containing allele frequency data.
- out_file (str): Path to the output file to save the calculated MAF data.
"""
logger.info(f"reading annotation file {annotations_path}")
annotation_file = pd.read_parquet(annotations_path)
af = annotation_file["af"]
af = annotation_file[af_column_name]
annotation_file = annotation_file.drop(
columns=["UKB_AF_MB", "UKB_MAF"], errors="ignore"
)
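
The MAF computation itself is outside this hunk; as a sketch of the standard formula the docstring refers to, assuming a plain allele-frequency column (the column name mirrors the new --af-column-name default):

```python
import numpy as np
import pandas as pd

# Illustrative only: MAF is the frequency of the less common allele.
annotation_file = pd.DataFrame({"af": [0.01, 0.60, 0.99]})
af = annotation_file["af"]
maf = np.minimum(af, 1.0 - af)  # min(AF, 1 - AF) per variant
```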
108 changes: 70 additions & 38 deletions deeprvat/cv_utils.py
@@ -1,9 +1,5 @@
import pandas as pd
import yaml
import os
import sys
from typing import Optional
import re

# import pickle
import logging
@@ -100,80 +96,117 @@ def generate_test_config(input_config, out_file, fold, n_folds):


@cli.command()
@click.option("--link-burdens", type=click.Path())
@click.option("--skip-burdens", is_flag=True)
@click.option("--burden-dirs", "-b", multiple=True)
@click.argument("out_dir", type=click.Path(), default="./")
@click.option("--xy-dirs", "-b", multiple=True)
@click.argument("out_dir_burdens", type=click.Path(), default="./")
@click.argument("out_dir_xy", type=click.Path(), default="./")
@click.argument("config_file", type=click.Path(exists=True))
def combine_test_set_burdens(
out_dir,
link_burdens,
out_dir_burdens,
out_dir_xy,
skip_burdens,
burden_dirs,
xy_dirs,
config_file,
):
assert len(burden_dirs) == len(xy_dirs)

with open(config_file) as f:
config = yaml.safe_load(f)
compression_level = 1
skip_burdens = link_burdens is not None
n_total_samples = []
for burden_dir in burden_dirs:
print(burden_dir)
this_y = zarr.open(f"{burden_dir}/y.zarr")
this_x = zarr.open(f"{burden_dir}/x.zarr")
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
logger.debug(xy_dir)
this_y = zarr.open(f"{xy_dir}/y.zarr")
this_x = zarr.open(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
# this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')

assert this_y.shape[0] == this_x.shape[0] # == this_burdens.shape[0]
assert this_y.shape[0] == this_x.shape[0]
n_total_samples.append(this_y.shape[0])

if not skip_burdens:
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
assert this_y.shape[0] == this_burdens.shape[0]
logger.debug(this_sample_ids_xy, this_sample_ids_burdens)
assert np.array_equal(this_sample_ids_xy, this_sample_ids_burdens)

n_total_samples = np.sum(n_total_samples)
print(f"Total number of samples {n_total_samples}")
logger.info(f"Total number of samples: {n_total_samples}")
if not skip_burdens:
this_burdens = zarr.open(
f"{burden_dir}/burdens.zarr"
) # any burden tensor (here from the last file), used to get the non-sample dims
burdens = zarr.open(
Path(out_dir) / "burdens.zarr",
Path(out_dir_burdens) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
chunks=(1000, 1000),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
print(f"burdens shape: {burdens.shape}")
else:
burdens = None
logger.info(f"burdens shape: {burdens.shape}")
sample_ids_burdens = zarr.open(
Path(out_dir_burdens) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

y = zarr.open(
Path(out_dir) / "y.zarr",
Path(out_dir_xy) / "y.zarr",
mode="a",
shape=(n_total_samples,) + this_y.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
x = zarr.open(
Path(out_dir) / "x.zarr",
Path(out_dir_xy) / "x.zarr",
mode="a",
shape=(n_total_samples,) + this_x.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids_xy = zarr.open(
Path(out_dir_xy) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

start_idx = 0

for burden_dir in burden_dirs:
this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
this_y = zarr.load(f"{xy_dir}/y.zarr")
end_idx = start_idx + this_y.shape[0]
this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
burdens[start_idx:end_idx] = this_burdens
print((start_idx, end_idx))
this_x = zarr.load(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
y[start_idx:end_idx] = this_y
x[start_idx:end_idx] = this_x
sample_ids_xy[start_idx:end_idx] = this_sample_ids_xy
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.load(f"{burden_dir}/burdens.zarr")
burdens[start_idx:end_idx] = this_burdens
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
sample_ids_burdens[start_idx:end_idx] = this_sample_ids_burdens
start_idx = end_idx

# sanity check
if not skip_burdens and not np.array_equal(sample_ids_xy[:], sample_ids_burdens[:]):
logger.error(
"sample_ids_xy, sample_ids_burdens do not match:\n"
+ f"sample_ids_xy: {sample_ids_xy[:]}"
+ f"sample_ids_burdens: {sample_ids_burdens[:]}"
)
raise RuntimeError("sample_ids_xy, sample_ids_burdens do not match")

y_transformation = config["association_testing_data"]["dataset_config"].get(
"y_transformation", None
)
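
To make the refactor above easier to follow, here is a compressed sketch of the concatenation pattern it implements: allocate an output zarr store sized for all samples, write each input directory's slice in turn, and check the combined sample IDs afterwards. Shapes and values are illustrative, not DeepRVAT's actual layout.

```python
import numpy as np
import zarr

# Stand-ins for per-directory sample IDs; mirrors the allocate-then-slice pattern above.
xy_parts = [np.arange(3), np.arange(3, 5)]
n_total = sum(len(p) for p in xy_parts)

sample_ids = zarr.zeros((n_total,), dtype="U200")  # combined output store

start_idx = 0
for part in xy_parts:
    end_idx = start_idx + len(part)
    sample_ids[start_idx:end_idx] = part.astype(str)  # write this directory's slice
    start_idx = end_idx

# Sanity check analogous to the xy-vs-burdens sample ID comparison above.
assert np.array_equal(sample_ids[:], np.arange(n_total).astype(str))
```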
@@ -202,17 +235,16 @@ def combine_test_set_burdens(
for col in range(this_y.shape[1]):
this_y[:, col] = standardize_series(this_y[:, col])
elif y_transformation == "quantile_transform":
logger.info(f" Quantile transforming combined target phenotype (y)")
logger.info(" Quantile transforming combined target phenotype (y)")
for col in range(this_y.shape[1]):
this_y[:, col] = my_quantile_transform(this_y[:, col])
y[:] = this_y

if not skip_burdens:
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir_burdens) / "genes.npy", genes)

print("done")
if link_burdens is not None:
source_path = Path(out_dir) / "burdens.zarr"
source_path.unlink(missing_ok=True)
source_path.symlink_to(link_burdens)
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir) / "genes.npy", genes)


if __name__ == "__main__":
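The `my_quantile_transform` helper used in the y-transformation branch is not shown in this diff; as a rough stand-in for that column-wise loop, one could use scikit-learn's quantile_transform (an assumption for illustration, not necessarily what DeepRVAT does):

```python
import numpy as np
from sklearn.preprocessing import quantile_transform

# Toy phenotype matrix; each column is transformed independently, as in the loop above.
this_y = np.random.default_rng(0).normal(size=(100, 2))
for col in range(this_y.shape[1]):
    this_y[:, col] = quantile_transform(
        this_y[:, [col]], output_distribution="normal", n_quantiles=100
    ).ravel()
```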

